From 26c9042ea0f0529f464435cbeef111f3e6d396a5 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 12 Jun 2023 17:06:52 +0000 Subject: [PATCH 0001/1081] Analyzer: support aliases in StorageMerge --- src/Analyzer/IQueryTreePass.h | 2 +- ...egateFunctionsArithmericOperationsPass.cpp | 2 +- ...gregateFunctionsArithmericOperationsPass.h | 2 +- src/Analyzer/Passes/ArrayExistsToHasPass.cpp | 2 +- src/Analyzer/Passes/ArrayExistsToHasPass.h | 2 +- src/Analyzer/Passes/AutoFinalOnQueryPass.cpp | 2 +- src/Analyzer/Passes/AutoFinalOnQueryPass.h | 2 +- .../Passes/ComparisonTupleEliminationPass.cpp | 2 +- .../Passes/ComparisonTupleEliminationPass.h | 2 +- .../Passes/ConvertOrLikeChainPass.cpp | 2 +- src/Analyzer/Passes/ConvertOrLikeChainPass.h | 2 +- src/Analyzer/Passes/ConvertQueryToCNFPass.cpp | 2 +- src/Analyzer/Passes/ConvertQueryToCNFPass.h | 2 +- src/Analyzer/Passes/CountDistinctPass.cpp | 2 +- src/Analyzer/Passes/CountDistinctPass.h | 2 +- src/Analyzer/Passes/CrossToInnerJoinPass.cpp | 2 +- src/Analyzer/Passes/CrossToInnerJoinPass.h | 2 +- .../Passes/FunctionToSubcolumnsPass.cpp | 2 +- .../Passes/FunctionToSubcolumnsPass.h | 2 +- src/Analyzer/Passes/FuseFunctionsPass.cpp | 2 +- src/Analyzer/Passes/FuseFunctionsPass.h | 2 +- .../Passes/GroupingFunctionsResolvePass.cpp | 2 +- .../Passes/GroupingFunctionsResolvePass.h | 2 +- src/Analyzer/Passes/IfChainToMultiIfPass.cpp | 2 +- src/Analyzer/Passes/IfChainToMultiIfPass.h | 2 +- .../Passes/IfConstantConditionPass.cpp | 2 +- src/Analyzer/Passes/IfConstantConditionPass.h | 2 +- .../Passes/IfTransformStringsToEnumPass.cpp | 2 +- .../Passes/IfTransformStringsToEnumPass.h | 2 +- .../Passes/LogicalExpressionOptimizerPass.cpp | 2 +- .../Passes/LogicalExpressionOptimizerPass.h | 2 +- src/Analyzer/Passes/MultiIfToIfPass.cpp | 2 +- src/Analyzer/Passes/MultiIfToIfPass.h | 2 +- .../Passes/NormalizeCountVariantsPass.cpp | 2 +- .../Passes/NormalizeCountVariantsPass.h | 2 +- .../OptimizeGroupByFunctionKeysPass.cpp | 2 +- .../Passes/OptimizeGroupByFunctionKeysPass.h | 2 +- ...ptimizeRedundantFunctionsInOrderByPass.cpp | 2 +- .../OptimizeRedundantFunctionsInOrderByPass.h | 2 +- ...OrderByLimitByDuplicateEliminationPass.cpp | 2 +- .../OrderByLimitByDuplicateEliminationPass.h | 2 +- .../Passes/OrderByTupleEliminationPass.cpp | 2 +- .../Passes/OrderByTupleEliminationPass.h | 2 +- src/Analyzer/Passes/QueryAnalysisPass.cpp | 15 ++- src/Analyzer/Passes/QueryAnalysisPass.h | 2 +- .../RewriteAggregateFunctionWithIfPass.cpp | 2 +- .../RewriteAggregateFunctionWithIfPass.h | 2 +- .../Passes/ShardNumColumnToFunctionPass.cpp | 2 +- .../Passes/ShardNumColumnToFunctionPass.h | 2 +- src/Analyzer/Passes/SumIfToCountIfPass.cpp | 2 +- src/Analyzer/Passes/SumIfToCountIfPass.h | 2 +- .../UniqInjectiveFunctionsEliminationPass.cpp | 2 +- .../UniqInjectiveFunctionsEliminationPass.h | 2 +- src/Planner/PlannerActionsVisitor.cpp | 4 +- src/Storages/StorageDistributed.cpp | 4 +- src/Storages/StorageMerge.cpp | 121 +++++++++++++++--- src/Storages/StorageMerge.h | 9 +- 57 files changed, 177 insertions(+), 80 deletions(-) diff --git a/src/Analyzer/IQueryTreePass.h b/src/Analyzer/IQueryTreePass.h index 4293934c32d..d4499c3271c 100644 --- a/src/Analyzer/IQueryTreePass.h +++ b/src/Analyzer/IQueryTreePass.h @@ -31,7 +31,7 @@ public: virtual String getDescription() = 0; /// Run pass over query tree - virtual void run(QueryTreeNodePtr query_tree_node, ContextPtr context) = 0; + virtual void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) = 0; }; diff --git 
a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp index 1476a66c892..2a69292ff78 100644 --- a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp +++ b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp @@ -201,7 +201,7 @@ private: } -void AggregateFunctionsArithmericOperationsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void AggregateFunctionsArithmericOperationsPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { AggregateFunctionsArithmericOperationsVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.h b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.h index a89d2f87ad9..d510b62f9be 100644 --- a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.h +++ b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "Extract arithmeric operations from aggregate functions."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/ArrayExistsToHasPass.cpp b/src/Analyzer/Passes/ArrayExistsToHasPass.cpp index c0f958588f1..63d417cd570 100644 --- a/src/Analyzer/Passes/ArrayExistsToHasPass.cpp +++ b/src/Analyzer/Passes/ArrayExistsToHasPass.cpp @@ -92,7 +92,7 @@ public: } -void RewriteArrayExistsToHasPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void RewriteArrayExistsToHasPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { RewriteArrayExistsToHasVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/ArrayExistsToHasPass.h b/src/Analyzer/Passes/ArrayExistsToHasPass.h index 8f4623116e3..4795b61c625 100644 --- a/src/Analyzer/Passes/ArrayExistsToHasPass.h +++ b/src/Analyzer/Passes/ArrayExistsToHasPass.h @@ -20,7 +20,7 @@ public: String getDescription() override { return "Rewrite arrayExists(func, arr) functions to has(arr, elem) when logically equivalent"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp b/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp index 15326ca1dc8..ee9e1023949 100644 --- a/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp +++ b/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp @@ -67,7 +67,7 @@ private: } -void AutoFinalOnQueryPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void AutoFinalOnQueryPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { auto visitor = AutoFinalOnQueryPassVisitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/AutoFinalOnQueryPass.h b/src/Analyzer/Passes/AutoFinalOnQueryPass.h index 3489597108c..d595b98d349 100644 --- a/src/Analyzer/Passes/AutoFinalOnQueryPass.h +++ b/src/Analyzer/Passes/AutoFinalOnQueryPass.h @@ -25,7 +25,7 @@ public: return "Automatically applies final modifier to table expressions in queries if it is supported and if user level final setting is set"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git 
a/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp b/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp index 4e0562a2fe8..57920065513 100644 --- a/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp +++ b/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp @@ -201,7 +201,7 @@ private: } -void ComparisonTupleEliminationPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void ComparisonTupleEliminationPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { ComparisonTupleEliminationPassVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/ComparisonTupleEliminationPass.h b/src/Analyzer/Passes/ComparisonTupleEliminationPass.h index 954a9d6a2f0..7f4245e2d95 100644 --- a/src/Analyzer/Passes/ComparisonTupleEliminationPass.h +++ b/src/Analyzer/Passes/ComparisonTupleEliminationPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "Rewrite tuples comparison into equivalent comparison of tuples arguments"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp b/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp index 7d7362fb742..0d2ddd20374 100644 --- a/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp +++ b/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp @@ -132,7 +132,7 @@ private: } -void ConvertOrLikeChainPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void ConvertOrLikeChainPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { auto or_function_resolver = FunctionFactory::instance().get("or", context); auto match_function_resolver = FunctionFactory::instance().get("multiMatchAny", context); diff --git a/src/Analyzer/Passes/ConvertOrLikeChainPass.h b/src/Analyzer/Passes/ConvertOrLikeChainPass.h index 0f734bfa73d..90bccaa0e8d 100644 --- a/src/Analyzer/Passes/ConvertOrLikeChainPass.h +++ b/src/Analyzer/Passes/ConvertOrLikeChainPass.h @@ -14,7 +14,7 @@ public: String getDescription() override { return "Replaces all the 'or's with {i}like to multiMatchAny"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp b/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp index 4d32c96b845..ecba2e28749 100644 --- a/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp +++ b/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp @@ -720,7 +720,7 @@ public: } -void ConvertLogicalExpressionToCNFPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void ConvertLogicalExpressionToCNFPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { const auto & settings = context->getSettingsRef(); if (!settings.convert_query_to_cnf) diff --git a/src/Analyzer/Passes/ConvertQueryToCNFPass.h b/src/Analyzer/Passes/ConvertQueryToCNFPass.h index 5ed874db006..60943c04d78 100644 --- a/src/Analyzer/Passes/ConvertQueryToCNFPass.h +++ b/src/Analyzer/Passes/ConvertQueryToCNFPass.h @@ -12,7 +12,7 @@ public: String getDescription() override { return "Convert logical expression to CNF and apply optimizations using constraints"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/CountDistinctPass.cpp b/src/Analyzer/Passes/CountDistinctPass.cpp 
index 945295f5cbc..eb2859020be 100644 --- a/src/Analyzer/Passes/CountDistinctPass.cpp +++ b/src/Analyzer/Passes/CountDistinctPass.cpp @@ -84,7 +84,7 @@ public: } -void CountDistinctPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void CountDistinctPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { CountDistinctVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/CountDistinctPass.h b/src/Analyzer/Passes/CountDistinctPass.h index cac5033c98f..33728b0228c 100644 --- a/src/Analyzer/Passes/CountDistinctPass.h +++ b/src/Analyzer/Passes/CountDistinctPass.h @@ -20,7 +20,7 @@ public: return "Optimize single countDistinct into count over subquery"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/CrossToInnerJoinPass.cpp b/src/Analyzer/Passes/CrossToInnerJoinPass.cpp index d4877d23f28..3283c163890 100644 --- a/src/Analyzer/Passes/CrossToInnerJoinPass.cpp +++ b/src/Analyzer/Passes/CrossToInnerJoinPass.cpp @@ -264,7 +264,7 @@ private: } -void CrossToInnerJoinPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void CrossToInnerJoinPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { CrossToInnerJoinVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/CrossToInnerJoinPass.h b/src/Analyzer/Passes/CrossToInnerJoinPass.h index 127d26dc41d..b0437c562ac 100644 --- a/src/Analyzer/Passes/CrossToInnerJoinPass.h +++ b/src/Analyzer/Passes/CrossToInnerJoinPass.h @@ -22,7 +22,7 @@ public: return "Replace CROSS JOIN with INNER JOIN"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp index 696483862e0..1b04136e6a4 100644 --- a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp +++ b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp @@ -202,7 +202,7 @@ private: } -void FunctionToSubcolumnsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void FunctionToSubcolumnsPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { FunctionToSubcolumnsVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/FunctionToSubcolumnsPass.h b/src/Analyzer/Passes/FunctionToSubcolumnsPass.h index 0e1d2583e7b..d4edcc5b922 100644 --- a/src/Analyzer/Passes/FunctionToSubcolumnsPass.h +++ b/src/Analyzer/Passes/FunctionToSubcolumnsPass.h @@ -24,7 +24,7 @@ public: String getDescription() override { return "Rewrite function to subcolumns, for example tupleElement(column, subcolumn) into column.subcolumn"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/FuseFunctionsPass.cpp b/src/Analyzer/Passes/FuseFunctionsPass.cpp index 14082697955..ef87528964c 100644 --- a/src/Analyzer/Passes/FuseFunctionsPass.cpp +++ b/src/Analyzer/Passes/FuseFunctionsPass.cpp @@ -254,7 +254,7 @@ void tryFuseQuantiles(QueryTreeNodePtr query_tree_node, ContextPtr context) } -void FuseFunctionsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void FuseFunctionsPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { 
tryFuseSumCountAvg(query_tree_node, context); tryFuseQuantiles(query_tree_node, context); diff --git a/src/Analyzer/Passes/FuseFunctionsPass.h b/src/Analyzer/Passes/FuseFunctionsPass.h index a92b77b1115..2fd85da4747 100644 --- a/src/Analyzer/Passes/FuseFunctionsPass.h +++ b/src/Analyzer/Passes/FuseFunctionsPass.h @@ -20,7 +20,7 @@ public: String getDescription() override { return "Replaces several calls of aggregate functions of the same family into one call"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/GroupingFunctionsResolvePass.cpp b/src/Analyzer/Passes/GroupingFunctionsResolvePass.cpp index 0cf5310a3ad..774014e5ffd 100644 --- a/src/Analyzer/Passes/GroupingFunctionsResolvePass.cpp +++ b/src/Analyzer/Passes/GroupingFunctionsResolvePass.cpp @@ -248,7 +248,7 @@ private: } -void GroupingFunctionsResolvePass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void GroupingFunctionsResolvePass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { GroupingFunctionsResolveVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/GroupingFunctionsResolvePass.h b/src/Analyzer/Passes/GroupingFunctionsResolvePass.h index 070c8dd9389..cd932f76977 100644 --- a/src/Analyzer/Passes/GroupingFunctionsResolvePass.h +++ b/src/Analyzer/Passes/GroupingFunctionsResolvePass.h @@ -24,7 +24,7 @@ public: String getDescription() override { return "Resolve GROUPING functions based on GROUP BY modifiers"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/IfChainToMultiIfPass.cpp b/src/Analyzer/Passes/IfChainToMultiIfPass.cpp index 1f97e012331..91a5709f142 100644 --- a/src/Analyzer/Passes/IfChainToMultiIfPass.cpp +++ b/src/Analyzer/Passes/IfChainToMultiIfPass.cpp @@ -73,7 +73,7 @@ private: } -void IfChainToMultiIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void IfChainToMultiIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { auto multi_if_function_ptr = FunctionFactory::instance().get("multiIf", context); IfChainToMultiIfPassVisitor visitor(std::move(multi_if_function_ptr), std::move(context)); diff --git a/src/Analyzer/Passes/IfChainToMultiIfPass.h b/src/Analyzer/Passes/IfChainToMultiIfPass.h index 43f3fb8831d..9e7335d93e4 100644 --- a/src/Analyzer/Passes/IfChainToMultiIfPass.h +++ b/src/Analyzer/Passes/IfChainToMultiIfPass.h @@ -18,7 +18,7 @@ public: String getDescription() override { return "Optimize if chain to multiIf"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/IfConstantConditionPass.cpp b/src/Analyzer/Passes/IfConstantConditionPass.cpp index 6f9cfe482f1..35c6718f018 100644 --- a/src/Analyzer/Passes/IfConstantConditionPass.cpp +++ b/src/Analyzer/Passes/IfConstantConditionPass.cpp @@ -49,7 +49,7 @@ public: } -void IfConstantConditionPass::run(QueryTreeNodePtr query_tree_node, ContextPtr) +void IfConstantConditionPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr) { IfConstantConditionVisitor visitor; visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/IfConstantConditionPass.h b/src/Analyzer/Passes/IfConstantConditionPass.h index 7817e67aa5e..7548fc702bc 100644 --- 
a/src/Analyzer/Passes/IfConstantConditionPass.h +++ b/src/Analyzer/Passes/IfConstantConditionPass.h @@ -21,7 +21,7 @@ public: String getDescription() override { return "Optimize if, multiIf for constant condition."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/IfTransformStringsToEnumPass.cpp b/src/Analyzer/Passes/IfTransformStringsToEnumPass.cpp index 562aff4cf05..32e3c3cda51 100644 --- a/src/Analyzer/Passes/IfTransformStringsToEnumPass.cpp +++ b/src/Analyzer/Passes/IfTransformStringsToEnumPass.cpp @@ -205,7 +205,7 @@ public: } -void IfTransformStringsToEnumPass::run(QueryTreeNodePtr query, ContextPtr context) +void IfTransformStringsToEnumPass::run(QueryTreeNodePtr & query, ContextPtr context) { ConvertStringsToEnumVisitor visitor(std::move(context)); visitor.visit(query); diff --git a/src/Analyzer/Passes/IfTransformStringsToEnumPass.h b/src/Analyzer/Passes/IfTransformStringsToEnumPass.h index a4a014967e0..522087aafae 100644 --- a/src/Analyzer/Passes/IfTransformStringsToEnumPass.h +++ b/src/Analyzer/Passes/IfTransformStringsToEnumPass.h @@ -33,7 +33,7 @@ public: String getDescription() override { return "Replaces string-type arguments in If and Transform to enum"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp index 13f8025f5ea..7e0b6b2f828 100644 --- a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp +++ b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp @@ -233,7 +233,7 @@ private: } }; -void LogicalExpressionOptimizerPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void LogicalExpressionOptimizerPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { LogicalExpressionOptimizerVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.h b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.h index 05c10ddc685..51d9968b48c 100644 --- a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.h +++ b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.h @@ -76,7 +76,7 @@ public: String getDescription() override { return "Transform equality chain to a single IN function or a constant if possible"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/MultiIfToIfPass.cpp b/src/Analyzer/Passes/MultiIfToIfPass.cpp index 4672351bcfb..5012aa7fa78 100644 --- a/src/Analyzer/Passes/MultiIfToIfPass.cpp +++ b/src/Analyzer/Passes/MultiIfToIfPass.cpp @@ -43,7 +43,7 @@ private: } -void MultiIfToIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void MultiIfToIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { auto if_function_ptr = FunctionFactory::instance().get("if", context); MultiIfToIfVisitor visitor(std::move(if_function_ptr), std::move(context)); diff --git a/src/Analyzer/Passes/MultiIfToIfPass.h b/src/Analyzer/Passes/MultiIfToIfPass.h index 2213f3713ed..e3c03913aaa 100644 --- a/src/Analyzer/Passes/MultiIfToIfPass.h +++ b/src/Analyzer/Passes/MultiIfToIfPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "Optimize multiIf 
with single condition to if."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp index d36be98751c..20b308c3af6 100644 --- a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp +++ b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp @@ -64,7 +64,7 @@ private: } -void NormalizeCountVariantsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void NormalizeCountVariantsPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { NormalizeCountVariantsVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/NormalizeCountVariantsPass.h b/src/Analyzer/Passes/NormalizeCountVariantsPass.h index 78a114f4a85..6cf9f34619a 100644 --- a/src/Analyzer/Passes/NormalizeCountVariantsPass.h +++ b/src/Analyzer/Passes/NormalizeCountVariantsPass.h @@ -20,7 +20,7 @@ public: String getDescription() override { return "Optimize count(literal), sum(1) into count()."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp index 5ed52f1210b..7c851d5fc35 100644 --- a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp +++ b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp @@ -130,7 +130,7 @@ private: } }; -void OptimizeGroupByFunctionKeysPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void OptimizeGroupByFunctionKeysPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { OptimizeGroupByFunctionKeysVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h index 632960c45bb..fd5eadcb796 100644 --- a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h +++ b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h @@ -16,7 +16,7 @@ public: String getDescription() override { return "Eliminates functions of other keys in GROUP BY section."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.cpp b/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.cpp index c6d312d0ecf..b6cc50caffe 100644 --- a/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.cpp +++ b/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.cpp @@ -124,7 +124,7 @@ private: } -void OptimizeRedundantFunctionsInOrderByPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void OptimizeRedundantFunctionsInOrderByPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { OptimizeRedundantFunctionsInOrderByVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h b/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h index 609a6360d27..4a63c78022b 100644 --- a/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h +++ b/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "If ORDER BY has argument x 
followed by f(x) transforms it to ORDER BY x."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp b/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp index 3632c41028b..26ca5984b49 100644 --- a/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp +++ b/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp @@ -70,7 +70,7 @@ private: } -void OrderByLimitByDuplicateEliminationPass::run(QueryTreeNodePtr query_tree_node, ContextPtr) +void OrderByLimitByDuplicateEliminationPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr) { OrderByLimitByDuplicateEliminationVisitor visitor; visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h b/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h index 11a025af5b9..de5e1898a4c 100644 --- a/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h +++ b/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h @@ -20,7 +20,7 @@ public: String getDescription() override { return "Remove duplicate columns from ORDER BY, LIMIT BY."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/OrderByTupleEliminationPass.cpp b/src/Analyzer/Passes/OrderByTupleEliminationPass.cpp index f70ec27ba5d..7c106082124 100644 --- a/src/Analyzer/Passes/OrderByTupleEliminationPass.cpp +++ b/src/Analyzer/Passes/OrderByTupleEliminationPass.cpp @@ -50,7 +50,7 @@ public: } -void OrderByTupleEliminationPass::run(QueryTreeNodePtr query_tree_node, ContextPtr) +void OrderByTupleEliminationPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr) { OrderByTupleEliminationVisitor visitor; visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/OrderByTupleEliminationPass.h b/src/Analyzer/Passes/OrderByTupleEliminationPass.h index 5665561e227..45c8a756795 100644 --- a/src/Analyzer/Passes/OrderByTupleEliminationPass.h +++ b/src/Analyzer/Passes/OrderByTupleEliminationPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "Remove tuple from ORDER BY."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index c454ad9f84f..1a76bc762a4 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -77,6 +77,8 @@ #include #include #include +#include +#include namespace ProfileEvents { @@ -1056,7 +1058,7 @@ private: class QueryAnalyzer { public: - void resolve(QueryTreeNodePtr node, const QueryTreeNodePtr & table_expression, ContextPtr context) + void resolve(QueryTreeNodePtr & node, const QueryTreeNodePtr & table_expression, ContextPtr context) { IdentifierResolveScope scope(node, nullptr /*parent_scope*/); @@ -1097,6 +1099,7 @@ public: { if (table_expression) { + LOG_DEBUG(&Poco::Logger::get("resolve"), "Table expression: {}", table_expression->dumpTree()); scope.expression_join_tree_node = table_expression; validateTableExpressionModifiers(scope.expression_join_tree_node, scope); initializeTableExpressionData(scope.expression_join_tree_node, scope); @@ -1106,6 +1109,7 @@ public: resolveExpressionNodeList(node, 
scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); else resolveExpressionNode(node, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); + LOG_DEBUG(&Poco::Logger::get("resolve"), "Result: {}", node->dumpTree()); break; } @@ -2677,6 +2681,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier */ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromTableColumns(const IdentifierLookup & identifier_lookup, IdentifierResolveScope & scope) { + LOG_DEBUG(&Poco::Logger::get("tryResolveIdentifierFromTableColumns"), "{} {}", scope.column_name_to_column_node.size(), !identifier_lookup.isExpressionLookup()); if (scope.column_name_to_column_node.empty() || !identifier_lookup.isExpressionLookup()) return {}; @@ -2836,11 +2841,14 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromTableExpression(const Id QueryTreeNodePtr result_expression; bool match_full_identifier = false; + LOG_DEBUG(&Poco::Logger::get("resolve_identifier_from_storage_or_throw"), "Looking for id: {}", identifier_without_column_qualifier.getFullName()); + auto it = table_expression_data.column_name_to_column_node.find(identifier_without_column_qualifier.getFullName()); if (it != table_expression_data.column_name_to_column_node.end()) { match_full_identifier = true; result_expression = it->second; + LOG_DEBUG(&Poco::Logger::get("resolve_identifier_from_storage_or_throw"), "Found: {}", result_expression->dumpTree()); } else { @@ -5389,6 +5397,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id auto unresolved_identifier = identifier_node.getIdentifier(); auto resolve_identifier_expression_result = tryResolveIdentifier({unresolved_identifier, IdentifierLookupContext::EXPRESSION}, scope); auto resolved_identifier_node = resolve_identifier_expression_result.resolved_identifier; + LOG_DEBUG(&Poco::Logger::get("resolveExpressionNode"), "Resolved: {}", resolved_identifier_node ? resolved_identifier_node->dumpTree() : "Not resolved"); if (resolved_identifier_node && result_projection_names.empty() && (resolve_identifier_expression_result.isResolvedFromJoinTree() || resolve_identifier_expression_result.isResolvedFromExpressionArguments())) @@ -5470,6 +5479,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id } node = std::move(resolved_identifier_node); + LOG_DEBUG(&Poco::Logger::get("resolveExpressionNode"), "Result node: {}", node ? 
node->dumpTree() : "Not resolved"); if (node->getNodeType() == QueryTreeNodeType::LIST) { @@ -6173,6 +6183,7 @@ void QueryAnalyzer::initializeTableExpressionData(const QueryTreeNodePtr & table table_expression_data.should_qualify_columns = false; } + LOG_DEBUG(&Poco::Logger::get("Analyzer"), "Table data: {}", table_expression_data.dump()); scope.table_expression_node_to_data.emplace(table_expression_node, std::move(table_expression_data)); } @@ -7152,7 +7163,7 @@ QueryAnalysisPass::QueryAnalysisPass(QueryTreeNodePtr table_expression_) : table_expression(std::move(table_expression_)) {} -void QueryAnalysisPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void QueryAnalysisPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { QueryAnalyzer analyzer; analyzer.resolve(query_tree_node, table_expression, context); diff --git a/src/Analyzer/Passes/QueryAnalysisPass.h b/src/Analyzer/Passes/QueryAnalysisPass.h index fa8778ebf76..5d335d3e712 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.h +++ b/src/Analyzer/Passes/QueryAnalysisPass.h @@ -89,7 +89,7 @@ public: return "Resolve type for each query expression. Replace identifiers, matchers with query expressions. Perform constant folding. Evaluate scalar subqueries."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; private: QueryTreeNodePtr table_expression; diff --git a/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp b/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp index de264948d4c..2fe5a89578b 100644 --- a/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp +++ b/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp @@ -108,7 +108,7 @@ private: } -void RewriteAggregateFunctionWithIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void RewriteAggregateFunctionWithIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { RewriteAggregateFunctionWithIfVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.h b/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.h index be8ad3ac34d..0a2fc1ba423 100644 --- a/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.h +++ b/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.h @@ -20,7 +20,7 @@ public: return "Rewrite aggregate functions with if expression as argument when logically equivalent"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/ShardNumColumnToFunctionPass.cpp b/src/Analyzer/Passes/ShardNumColumnToFunctionPass.cpp index b28816e8ff3..c273aecc9b5 100644 --- a/src/Analyzer/Passes/ShardNumColumnToFunctionPass.cpp +++ b/src/Analyzer/Passes/ShardNumColumnToFunctionPass.cpp @@ -58,7 +58,7 @@ public: } -void ShardNumColumnToFunctionPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void ShardNumColumnToFunctionPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { ShardNumColumnToFunctionVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/ShardNumColumnToFunctionPass.h b/src/Analyzer/Passes/ShardNumColumnToFunctionPass.h index 71a038bcf39..248f4e29bbe 100644 --- a/src/Analyzer/Passes/ShardNumColumnToFunctionPass.h +++ b/src/Analyzer/Passes/ShardNumColumnToFunctionPass.h @@ -17,7 +17,7 @@ public: String getDescription() 
override { return "Rewrite _shard_num column into shardNum() function"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/SumIfToCountIfPass.cpp b/src/Analyzer/Passes/SumIfToCountIfPass.cpp index d55af278152..04d6c134d10 100644 --- a/src/Analyzer/Passes/SumIfToCountIfPass.cpp +++ b/src/Analyzer/Passes/SumIfToCountIfPass.cpp @@ -180,7 +180,7 @@ private: } -void SumIfToCountIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void SumIfToCountIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { SumIfToCountIfVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/SumIfToCountIfPass.h b/src/Analyzer/Passes/SumIfToCountIfPass.h index f3ba47f1c2c..439d80c6306 100644 --- a/src/Analyzer/Passes/SumIfToCountIfPass.h +++ b/src/Analyzer/Passes/SumIfToCountIfPass.h @@ -23,7 +23,7 @@ public: String getDescription() override { return "Rewrite sum(if) and sumIf into countIf"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp index 5c4484457e8..e256934010d 100644 --- a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp +++ b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp @@ -87,7 +87,7 @@ public: } -void UniqInjectiveFunctionsEliminationPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void UniqInjectiveFunctionsEliminationPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { UniqInjectiveFunctionsEliminationVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.h b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.h index a0f07dfb7b5..c143fe2c39c 100644 --- a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.h +++ b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "Remove injective functions from uniq functions arguments."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Planner/PlannerActionsVisitor.cpp b/src/Planner/PlannerActionsVisitor.cpp index c64d82299ca..e9fa72f925d 100644 --- a/src/Planner/PlannerActionsVisitor.cpp +++ b/src/Planner/PlannerActionsVisitor.cpp @@ -494,8 +494,8 @@ PlannerActionsVisitorImpl::NodeNameAndNodeMinLevel PlannerActionsVisitorImpl::vi return visitFunction(node); throw Exception(ErrorCodes::UNSUPPORTED_METHOD, - "Expected column, constant, function. Actual {}", - node->formatASTForErrorMessage()); + "Expected column, constant, function. 
Actual {} with type: {}", + node->formatASTForErrorMessage(), node_type); } PlannerActionsVisitorImpl::NodeNameAndNodeMinLevel PlannerActionsVisitorImpl::visitColumn(const QueryTreeNodePtr & node) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index b91ad0b963a..9f9f0fda9e2 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -30,6 +30,7 @@ #include #include #include +#include "Analyzer/IQueryTreeNode.h" #include #include @@ -937,7 +938,8 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, table_function_node->setTableExpressionModifiers(*table_expression_modifiers); QueryAnalysisPass query_analysis_pass; - query_analysis_pass.run(table_function_node, query_context); + QueryTreeNodePtr node = table_function_node; + query_analysis_pass.run(node, query_context); replacement_table_expression = std::move(table_function_node); } diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index b0ed242d14d..a49155ac2d9 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -27,9 +27,18 @@ #include #include #include +#include "Common/logger_useful.h" #include #include +#include "Analyzer/ColumnNode.h" +#include "Analyzer/IQueryTreeNode.h" +#include "Analyzer/Identifier.h" +#include "Analyzer/IdentifierNode.h" +#include "Analyzer/Passes/QueryAnalysisPass.h" +#include "Analyzer/QueryTreeBuilder.h" +#include "Core/NamesAndTypes.h" #include "DataTypes/IDataType.h" +#include "Planner/PlannerActionsVisitor.h" #include #include #include @@ -42,6 +51,7 @@ #include #include #include +#include namespace @@ -464,8 +474,8 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, context); - auto modified_query_info = getModifiedQueryInfo(query_info, context, table, nested_storage_snaphsot); Names column_names_as_aliases; + auto modified_query_info = getModifiedQueryInfo(context, table, nested_storage_snaphsot, column_names_as_aliases); if (!context->getSettingsRef().allow_experimental_analyzer) { @@ -553,10 +563,10 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu pipeline.addResources(std::move(resources)); } -SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const SelectQueryInfo & query_info, - const ContextPtr & modified_context, +SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_context, const StorageWithLockAndName & storage_with_lock_and_name, - const StorageSnapshotPtr & storage_snapshot) + const StorageSnapshotPtr & storage_snapshot, + Names & column_names_as_aliases) const { const auto & [database_name, storage, storage_lock, table_name] = storage_with_lock_and_name; const StorageID current_storage_id = storage->getStorageID(); @@ -586,6 +596,47 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const SelectQueryInfo & quer if (!storage_snapshot->tryGetColumn(get_column_options, "_database")) column_name_to_node.emplace("_database", std::make_shared(current_storage_id.database_name)); + auto storage_columns = storage_snapshot->metadata->getColumns(); + + bool with_aliases = /* common_processed_stage == QueryProcessingStage::FetchColumns && */ !storage_columns.getAliases().empty(); + if (with_aliases) + { + auto filter_actions_dag = std::make_shared(); + for (const auto & column : column_names) + { + const 
auto column_default = storage_columns.getDefault(column); + bool is_alias = column_default && column_default->kind == ColumnDefaultKind::Alias; + + QueryTreeNodePtr column_node; + + if (is_alias) + { + column_node = buildQueryTree(column_default->expression, modified_context); + + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT before: {}\n{}", column_node->dumpTree(), modified_query_info.table_expression->dumpTree()); + + column_node->setAlias(column); + + QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); + query_analysis_pass.run(column_node, modified_context); + + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT after: {}", column_node->dumpTree()); + + column_name_to_node.emplace(column, column_node); + } + else + { + column_node = std::make_shared(NameAndTypePair{column, storage_columns.getColumn(get_column_options, column).type }, modified_query_info.table_expression); + } + + + PlannerActionsVisitor actions_visitor(modified_query_info.planner_context, false /*use_column_identifier_as_action_node_name*/); + actions_visitor.visit(filter_actions_dag, column_node); + } + column_names_as_aliases = filter_actions_dag->getRequiredColumnsNames(); + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "Required names: {}", toString(column_names_as_aliases)); + } + if (!column_name_to_node.empty()) { replaceColumns(modified_query_info.query_tree, @@ -594,6 +645,7 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const SelectQueryInfo & quer } modified_query_info.query = queryNodeToSelectQuery(modified_query_info.query_tree); + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "Modified query: {}", modified_query_info.query->formatForLogging()); } else { @@ -640,6 +692,8 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( modified_select.setFinal(); } + LOG_DEBUG(&Poco::Logger::get("createSources"), "real_column_names: {}", toString(real_column_names)); + bool allow_experimental_analyzer = modified_context->getSettingsRef().allow_experimental_analyzer; auto storage_stage = storage->getQueryProcessingStage(modified_context, @@ -783,7 +837,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( /// Subordinary tables could have different but convertible types, like numeric types of different width. /// We must return streams with structure equals to structure of Merge table. 
- convertingSourceStream(header, storage_snapshot->metadata, aliases, modified_context, *builder, processed_stage); + convertingSourceStream(header, modified_query_info, storage_snapshot->metadata, aliases, modified_context, *builder, processed_stage); } return builder; @@ -957,9 +1011,10 @@ void StorageMerge::alter( void ReadFromMerge::convertingSourceStream( const Block & header, + SelectQueryInfo & modified_query_info, const StorageMetadataPtr & metadata_snapshot, const Aliases & aliases, - ContextPtr local_context, + ContextMutablePtr local_context, QueryPipelineBuilder & builder, const QueryProcessingStage::Enum & processed_stage) { @@ -968,21 +1023,49 @@ void ReadFromMerge::convertingSourceStream( auto storage_sample_block = metadata_snapshot->getSampleBlock(); auto pipe_columns = builder.getHeader().getNamesAndTypesList(); - for (const auto & alias : aliases) + if (local_context->getSettingsRef().allow_experimental_analyzer) { - pipe_columns.emplace_back(NameAndTypePair(alias.name, alias.type)); - ASTPtr expr = alias.expression; - auto syntax_result = TreeRewriter(local_context).analyze(expr, pipe_columns); - auto expression_analyzer = ExpressionAnalyzer{alias.expression, syntax_result, local_context}; - - auto dag = std::make_shared(pipe_columns); - auto actions_dag = expression_analyzer.getActionsDAG(true, false); - auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - - builder.addSimpleTransform([&](const Block & stream_header) + for (const auto & alias : aliases) { - return std::make_shared(stream_header, actions); - }); + pipe_columns.emplace_back(NameAndTypePair(alias.name, alias.type)); + + auto actions_dag = std::make_shared(); + + QueryTreeNodePtr query_tree = buildQueryTree(alias.expression, local_context); + query_tree->setAlias(alias.name); + + QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); + query_analysis_pass.run(query_tree, local_context); + + PlannerActionsVisitor actions_visitor(modified_query_info.planner_context, false /*use_column_identifier_as_action_node_name*/); + actions_visitor.visit(actions_dag, query_tree); + + auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); + + builder.addSimpleTransform([&](const Block & stream_header) + { + return std::make_shared(stream_header, actions); + }); + } + } + else + { + for (const auto & alias : aliases) + { + pipe_columns.emplace_back(NameAndTypePair(alias.name, alias.type)); + ASTPtr expr = alias.expression; + auto syntax_result = TreeRewriter(local_context).analyze(expr, pipe_columns); + auto expression_analyzer = ExpressionAnalyzer{alias.expression, syntax_result, local_context}; + + auto dag = std::make_shared(pipe_columns); + auto actions_dag = expression_analyzer.getActionsDAG(true, false); + auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); + + builder.addSimpleTransform([&](const Block & stream_header) + { + return std::make_shared(stream_header, actions); + }); + } } ActionsDAG::MatchColumnsMode convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Name; diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index babf0dd92e8..739d6831f6f 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -177,10 +177,10 @@ private: using Aliases = std::vector; - static SelectQueryInfo getModifiedQueryInfo(const 
SelectQueryInfo & query_info, - const ContextPtr & modified_context, + SelectQueryInfo getModifiedQueryInfo(const ContextPtr & modified_context, const StorageWithLockAndName & storage_with_lock_and_name, - const StorageSnapshotPtr & storage_snapshot); + const StorageSnapshotPtr & storage_snapshot, + Names & column_names_as_aliases) const; QueryPipelineBuilderPtr createSources( const StorageSnapshotPtr & storage_snapshot, @@ -197,9 +197,10 @@ private: static void convertingSourceStream( const Block & header, + SelectQueryInfo & modified_query_info, const StorageMetadataPtr & metadata_snapshot, const Aliases & aliases, - ContextPtr context, + ContextMutablePtr context, QueryPipelineBuilder & builder, const QueryProcessingStage::Enum & processed_stage); }; From fc9ee3eb4e1e4c4b145bc39bc7ce507cf05b9d1d Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 13 Jun 2023 15:01:31 +0000 Subject: [PATCH 0002/1081] Correctly build the ActionsDAG --- src/Storages/StorageMerge.cpp | 28 +++++++++++++++++++++------- src/Storages/StorageMerge.h | 3 ++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index a49155ac2d9..d036eaa9f25 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -475,7 +475,7 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, context); Names column_names_as_aliases; - auto modified_query_info = getModifiedQueryInfo(context, table, nested_storage_snaphsot, column_names_as_aliases); + auto modified_query_info = getModifiedQueryInfo(context, table, nested_storage_snaphsot, column_names_as_aliases, aliases); if (!context->getSettingsRef().allow_experimental_analyzer) { @@ -566,7 +566,8 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_context, const StorageWithLockAndName & storage_with_lock_and_name, const StorageSnapshotPtr & storage_snapshot, - Names & column_names_as_aliases) const + Names & column_names_as_aliases, + Aliases & aliases) const { const auto & [database_name, storage, storage_lock, table_name] = storage_with_lock_and_name; const StorageID current_storage_id = storage->getStorageID(); @@ -611,18 +612,23 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ if (is_alias) { - column_node = buildQueryTree(column_default->expression, modified_context); + // column_node = buildQueryTree(column_default->expression, modified_context); + column_node = std::make_shared(Identifier{column}); LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT before: {}\n{}", column_node->dumpTree(), modified_query_info.table_expression->dumpTree()); - column_node->setAlias(column); - QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); query_analysis_pass.run(column_node, modified_context); LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT after: {}", column_node->dumpTree()); + auto * resolved_column = column_node->as(); + if (!resolved_column || !resolved_column->getExpression()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Alias column is not resolved"); + + column_node = resolved_column->getExpression(); column_name_to_node.emplace(column, column_node); + aliases.push_back({ .name = column, .type = resolved_column->getResultType(), .expression = column_node->toAST() }); } else { @@ -634,6 
+640,9 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ actions_visitor.visit(filter_actions_dag, column_node); } column_names_as_aliases = filter_actions_dag->getRequiredColumnsNames(); + if (column_names_as_aliases.empty()) + column_names_as_aliases.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()).name); + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "Required names: {}", toString(column_names_as_aliases)); } @@ -1029,7 +1038,7 @@ void ReadFromMerge::convertingSourceStream( { pipe_columns.emplace_back(NameAndTypePair(alias.name, alias.type)); - auto actions_dag = std::make_shared(); + auto actions_dag = std::make_shared(pipe_columns); QueryTreeNodePtr query_tree = buildQueryTree(alias.expression, local_context); query_tree->setAlias(alias.name); @@ -1038,7 +1047,12 @@ void ReadFromMerge::convertingSourceStream( query_analysis_pass.run(query_tree, local_context); PlannerActionsVisitor actions_visitor(modified_query_info.planner_context, false /*use_column_identifier_as_action_node_name*/); - actions_visitor.visit(actions_dag, query_tree); + const auto & nodes = actions_visitor.visit(actions_dag, query_tree); + + if (nodes.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected to have 1 output but got {}", nodes.size()); + + actions_dag->addOrReplaceInOutputs(actions_dag->addAlias(*nodes.front(), alias.name)); auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 739d6831f6f..987869e5de3 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -180,7 +180,8 @@ private: SelectQueryInfo getModifiedQueryInfo(const ContextPtr & modified_context, const StorageWithLockAndName & storage_with_lock_and_name, const StorageSnapshotPtr & storage_snapshot, - Names & column_names_as_aliases) const; + Names & column_names_as_aliases, + Aliases & aliases) const; QueryPipelineBuilderPtr createSources( const StorageSnapshotPtr & storage_snapshot, From 55b81a5a5e7ad73a3e53aee0d0b83731ff8e76ed Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 13 Jun 2023 23:13:18 +0000 Subject: [PATCH 0003/1081] Fix style --- src/Storages/StorageMerge.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index d036eaa9f25..e2a27d4e20e 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -80,6 +80,7 @@ namespace DB namespace ErrorCodes { + extern const int LOGICAL_ERROR; extern const int BAD_ARGUMENTS; extern const int NOT_IMPLEMENTED; extern const int ILLEGAL_PREWHERE; From 6489922dc19a0fda86bdcc8e08c108812dc4aebf Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Fri, 16 Jun 2023 18:49:59 +0000 Subject: [PATCH 0004/1081] Fix for column aliases that use other aliases --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 9 ------ src/Storages/StorageMerge.cpp | 38 ++++++++++++++++++++--- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 1a76bc762a4..309f067c4c0 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -1099,7 +1099,6 @@ public: { if (table_expression) { - LOG_DEBUG(&Poco::Logger::get("resolve"), "Table expression: {}", table_expression->dumpTree()); scope.expression_join_tree_node = 
table_expression; validateTableExpressionModifiers(scope.expression_join_tree_node, scope); initializeTableExpressionData(scope.expression_join_tree_node, scope); @@ -1109,7 +1108,6 @@ public: resolveExpressionNodeList(node, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); else resolveExpressionNode(node, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); - LOG_DEBUG(&Poco::Logger::get("resolve"), "Result: {}", node->dumpTree()); break; } @@ -2681,7 +2679,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier */ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromTableColumns(const IdentifierLookup & identifier_lookup, IdentifierResolveScope & scope) { - LOG_DEBUG(&Poco::Logger::get("tryResolveIdentifierFromTableColumns"), "{} {}", scope.column_name_to_column_node.size(), !identifier_lookup.isExpressionLookup()); if (scope.column_name_to_column_node.empty() || !identifier_lookup.isExpressionLookup()) return {}; @@ -2841,14 +2838,11 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromTableExpression(const Id QueryTreeNodePtr result_expression; bool match_full_identifier = false; - LOG_DEBUG(&Poco::Logger::get("resolve_identifier_from_storage_or_throw"), "Looking for id: {}", identifier_without_column_qualifier.getFullName()); - auto it = table_expression_data.column_name_to_column_node.find(identifier_without_column_qualifier.getFullName()); if (it != table_expression_data.column_name_to_column_node.end()) { match_full_identifier = true; result_expression = it->second; - LOG_DEBUG(&Poco::Logger::get("resolve_identifier_from_storage_or_throw"), "Found: {}", result_expression->dumpTree()); } else { @@ -5397,7 +5391,6 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id auto unresolved_identifier = identifier_node.getIdentifier(); auto resolve_identifier_expression_result = tryResolveIdentifier({unresolved_identifier, IdentifierLookupContext::EXPRESSION}, scope); auto resolved_identifier_node = resolve_identifier_expression_result.resolved_identifier; - LOG_DEBUG(&Poco::Logger::get("resolveExpressionNode"), "Resolved: {}", resolved_identifier_node ? resolved_identifier_node->dumpTree() : "Not resolved"); if (resolved_identifier_node && result_projection_names.empty() && (resolve_identifier_expression_result.isResolvedFromJoinTree() || resolve_identifier_expression_result.isResolvedFromExpressionArguments())) @@ -5479,7 +5472,6 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id } node = std::move(resolved_identifier_node); - LOG_DEBUG(&Poco::Logger::get("resolveExpressionNode"), "Result node: {}", node ? 
node->dumpTree() : "Not resolved"); if (node->getNodeType() == QueryTreeNodeType::LIST) { @@ -6183,7 +6175,6 @@ void QueryAnalyzer::initializeTableExpressionData(const QueryTreeNodePtr & table table_expression_data.should_qualify_columns = false; } - LOG_DEBUG(&Poco::Logger::get("Analyzer"), "Table data: {}", table_expression_data.dump()); scope.table_expression_node_to_data.emplace(table_expression_node, std::move(table_expression_data)); } diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index e2a27d4e20e..13548a84826 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -34,6 +34,7 @@ #include "Analyzer/IQueryTreeNode.h" #include "Analyzer/Identifier.h" #include "Analyzer/IdentifierNode.h" +#include "Analyzer/InDepthQueryTreeVisitor.h" #include "Analyzer/Passes/QueryAnalysisPass.h" #include "Analyzer/QueryTreeBuilder.h" #include "Core/NamesAndTypes.h" @@ -564,6 +565,26 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu pipeline.addResources(std::move(resources)); } +namespace +{ + +class ApplyAliasColumnExpressionsVisitor : public InDepthQueryTreeVisitor +{ +public: + ApplyAliasColumnExpressionsVisitor() = default; + + void visitImpl(QueryTreeNodePtr & node) + { + if (auto * column = node->as(); + column != nullptr && column->hasExpression()) + { + node = column->getExpressionOrThrow(); + } + } +}; + +} + SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_context, const StorageWithLockAndName & storage_with_lock_and_name, const StorageSnapshotPtr & storage_snapshot, @@ -611,23 +632,28 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ QueryTreeNodePtr column_node; + if (is_alias) { // column_node = buildQueryTree(column_default->expression, modified_context); - column_node = std::make_shared(Identifier{column}); + QueryTreeNodePtr fake_node = std::make_shared(Identifier{column}); - LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT before: {}\n{}", column_node->dumpTree(), modified_query_info.table_expression->dumpTree()); + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT before: {}\n{}", fake_node->dumpTree(), modified_query_info.table_expression->dumpTree()); QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); - query_analysis_pass.run(column_node, modified_context); + query_analysis_pass.run(fake_node, modified_context); + + auto * resolved_column = fake_node->as(); + + column_node = fake_node; + ApplyAliasColumnExpressionsVisitor visitor; + visitor.visit(column_node); LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT after: {}", column_node->dumpTree()); - auto * resolved_column = column_node->as(); if (!resolved_column || !resolved_column->getExpression()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Alias column is not resolved"); - column_node = resolved_column->getExpression(); column_name_to_node.emplace(column, column_node); aliases.push_back({ .name = column, .type = resolved_column->getResultType(), .expression = column_node->toAST() }); } @@ -1095,6 +1121,8 @@ void ReadFromMerge::convertingSourceStream( std::move(convert_actions_dag), ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); + LOG_DEBUG(&Poco::Logger::get("convertingSourceStream"), "The header: {}", builder.getHeader().dumpStructure()); + builder.addSimpleTransform([&](const Block & stream_header) { return std::make_shared(stream_header, actions); From 
f9e67fe0427ee2d698d2b946a8286e228d47b0ec Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 19 Jun 2023 15:10:29 +0000 Subject: [PATCH 0005/1081] Update broken_tests.txt --- tests/broken_tests.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index d49b4f391e5..1635c8740cc 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -24,7 +24,6 @@ 01173_transaction_control_queries 01211_optimize_skip_unused_shards_type_mismatch 01213_optimize_skip_unused_shards_DISTINCT -01214_test_storage_merge_aliases_with_where 01231_distributed_aggregation_memory_efficient_mix_levels 01244_optimize_distributed_group_by_sharding_key 01247_optimize_distributed_group_by_sharding_key_dist_on_dist @@ -68,7 +67,6 @@ 01890_materialized_distributed_join 01901_in_literal_shard_prune 01925_join_materialized_columns -01925_test_storage_merge_aliases 01930_optimize_skip_unused_shards_rewrite_in 01947_mv_subquery 01951_distributed_push_down_limit From dcdadd5f639def096bd330f987609d0c5740ca83 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 19 Jun 2023 15:18:04 +0000 Subject: [PATCH 0006/1081] Update broken_tests.txt --- tests/broken_tests.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index 1635c8740cc..8b11c5f5413 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -99,7 +99,6 @@ 02494_optimize_group_by_function_keys_and_alias_columns 02521_aggregation_by_partitions 02554_fix_grouping_sets_predicate_push_down -02575_merge_prewhere_different_default_kind 02713_array_low_cardinality_string 02707_skip_index_with_in 02241_join_rocksdb_bs From 20c752fb787a05f9180f791401afe56bf372acfc Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 19 Jun 2023 15:44:01 +0000 Subject: [PATCH 0007/1081] Fix generated query --- src/Storages/StorageMerge.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 13548a84826..22308c1d901 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -614,7 +614,11 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ std::unordered_map column_name_to_node; if (!storage_snapshot->tryGetColumn(get_column_options, "_table")) - column_name_to_node.emplace("_table", std::make_shared(current_storage_id.table_name)); + { + auto table_name_node = std::make_shared(current_storage_id.table_name); + table_name_node->setAlias("_table"); + column_name_to_node.emplace("_table", table_name_node); + } if (!storage_snapshot->tryGetColumn(get_column_options, "_database")) column_name_to_node.emplace("_database", std::make_shared(current_storage_id.database_name)); From 118b84703bb0f08aa622b956b1207d9092f5f2d7 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 21 Jun 2023 01:51:34 +0200 Subject: [PATCH 0008/1081] WIP on StorageMerge and distributed JOIN --- src/Analyzer/ColumnNode.h | 5 ++ src/Storages/StorageMerge.cpp | 86 ++++++++++++++++++++++++++++++++--- src/Storages/StorageMerge.h | 2 +- 3 files changed, 86 insertions(+), 7 deletions(-) diff --git a/src/Analyzer/ColumnNode.h b/src/Analyzer/ColumnNode.h index b320df788c5..46e7c8eb500 100644 --- a/src/Analyzer/ColumnNode.h +++ b/src/Analyzer/ColumnNode.h @@ -108,6 +108,11 @@ public: */ QueryTreeNodePtr getColumnSourceOrNull() const; + void setColumnSource(const QueryTreeNodePtr & source) + { + getSourceWeakPointer() = source; + } + QueryTreeNodeType getNodeType() const override 
{ return QueryTreeNodeType::COLUMN; diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 22308c1d901..85ec21b4765 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -51,6 +52,7 @@ #include #include #include +#include #include #include @@ -583,6 +585,76 @@ public: } }; +bool hasUnknownColumn(const QueryTreeNodePtr & node, + QueryTreeNodePtr original_table_expression, + QueryTreeNodePtr replacement_table_expression) +{ + QueryTreeNodes stack = { node }; + while (!stack.empty()) + { + auto current = stack.back(); + stack.pop_back(); + + switch (current->getNodeType()) + { + case QueryTreeNodeType::CONSTANT: + break; + case QueryTreeNodeType::COLUMN: + { + auto * column_node = current->as(); + auto source = column_node->getColumnSourceOrNull(); + if (source != original_table_expression) + return true; + else + column_node->setColumnSource(replacement_table_expression); + break; + } + default: + { + for (const auto & child : node->getChildren()) + { + if (child) + stack.push_back(child); + } + } + } + } + return false; +} + +QueryTreeNodePtr removeJoin( + QueryTreeNodePtr query, + QueryTreeNodePtr original_table_expression, + QueryTreeNodePtr replacement_table_expression) +{ + auto * query_node = query->as(); + auto modified_query = query_node->cloneAndReplace(query_node->getJoinTree(), replacement_table_expression); + + query_node = modified_query->as(); + query_node->getGroupBy().getNodes().clear(); + query_node->getHaving() = {}; + query_node->getOrderBy().getNodes().clear(); + + auto & projection = query_node->getProjection().getNodes(); + auto projection_columns = query_node->getProjectionColumns(); + for (size_t i = 0; i < projection.size();) + { + if (hasUnknownColumn(projection[i], original_table_expression, replacement_table_expression)) + { + projection.erase(projection.begin() + i); + projection_columns.erase(projection_columns.begin() + i); + continue; + } + ++i; + } + + query_node->resolveProjectionColumns(std::move(projection_columns)); + + LOG_DEBUG(&Poco::Logger::get("removeJoin"), "Query without JOIN:\n{}", modified_query->dumpTree()); + + return modified_query; +} + } SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_context, @@ -602,8 +674,9 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ if (query_info.table_expression_modifiers) replacement_table_expression->setTableExpressionModifiers(*query_info.table_expression_modifiers); - modified_query_info.query_tree = modified_query_info.query_tree->cloneAndReplace(modified_query_info.table_expression, - replacement_table_expression); + modified_query_info.query_tree = removeJoin(modified_query_info.query_tree, modified_query_info.table_expression, replacement_table_expression); + // modified_query_info.query_tree = modified_query_info.query_tree->cloneAndReplace(modified_query_info.table_expression, + // replacement_table_expression); modified_query_info.table_expression = replacement_table_expression; modified_query_info.planner_context->getOrCreateTableExpressionData(replacement_table_expression); @@ -877,7 +950,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( /// Subordinary tables could have different but convertible types, like numeric types of different width. /// We must return streams with structure equals to structure of Merge table. 
- convertingSourceStream(header, modified_query_info, storage_snapshot->metadata, aliases, modified_context, *builder, processed_stage); + convertingSourceStream(header, modified_query_info, storage_snapshot, aliases, modified_context, *builder, processed_stage); } return builder; @@ -1052,7 +1125,7 @@ void StorageMerge::alter( void ReadFromMerge::convertingSourceStream( const Block & header, SelectQueryInfo & modified_query_info, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & snapshot, const Aliases & aliases, ContextMutablePtr local_context, QueryPipelineBuilder & builder, @@ -1060,7 +1133,7 @@ void ReadFromMerge::convertingSourceStream( { Block before_block_header = builder.getHeader(); - auto storage_sample_block = metadata_snapshot->getSampleBlock(); + auto storage_sample_block = snapshot->metadata->getSampleBlock(); auto pipe_columns = builder.getHeader().getNamesAndTypesList(); if (local_context->getSettingsRef().allow_experimental_analyzer) @@ -1115,7 +1188,8 @@ void ReadFromMerge::convertingSourceStream( ActionsDAG::MatchColumnsMode convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Name; - if (local_context->getSettingsRef().allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns) + if (local_context->getSettingsRef().allow_experimental_analyzer + && (processed_stage != QueryProcessingStage::FetchColumns || dynamic_cast(&snapshot->storage) != nullptr)) convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Position; auto convert_actions_dag = ActionsDAG::makeConvertingActions(builder.getHeader().getColumnsWithTypeAndName(), diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 987869e5de3..de9480292f9 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -199,7 +199,7 @@ private: static void convertingSourceStream( const Block & header, SelectQueryInfo & modified_query_info, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & snapshot, const Aliases & aliases, ContextMutablePtr context, QueryPipelineBuilder & builder, From 88fe30254a280286ac2bd2b6bcdc71865ec2aed2 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 21 Jun 2023 17:55:14 +0000 Subject: [PATCH 0009/1081] Small fixup --- src/Storages/StorageMerge.cpp | 12 +++++++++--- tests/broken_tests.txt | 1 - 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 85ec21b4765..d1ac3f57ae1 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -631,6 +631,10 @@ QueryTreeNodePtr removeJoin( auto modified_query = query_node->cloneAndReplace(query_node->getJoinTree(), replacement_table_expression); query_node = modified_query->as(); + + //TODO: change the predicates to make it valid and execute it on shards. 
+ query_node->getPrewhere() = {}; + query_node->getWhere() = {}; query_node->getGroupBy().getNodes().clear(); query_node->getHaving() = {}; query_node->getOrderBy().getNodes().clear(); @@ -675,8 +679,6 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ replacement_table_expression->setTableExpressionModifiers(*query_info.table_expression_modifiers); modified_query_info.query_tree = removeJoin(modified_query_info.query_tree, modified_query_info.table_expression, replacement_table_expression); - // modified_query_info.query_tree = modified_query_info.query_tree->cloneAndReplace(modified_query_info.table_expression, - // replacement_table_expression); modified_query_info.table_expression = replacement_table_expression; modified_query_info.planner_context->getOrCreateTableExpressionData(replacement_table_expression); @@ -694,7 +696,11 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ } if (!storage_snapshot->tryGetColumn(get_column_options, "_database")) - column_name_to_node.emplace("_database", std::make_shared(current_storage_id.database_name)); + { + auto database_name_node = std::make_shared(current_storage_id.database_name); + database_name_node->setAlias("_database"); + column_name_to_node.emplace("_database", database_name_node); + } auto storage_columns = storage_snapshot->metadata->getColumns(); diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index e6b5fb4f631..f6e21a29eed 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -38,7 +38,6 @@ 01527_dist_sharding_key_dictGet_reload 01528_allow_nondeterministic_optimize_skip_unused_shards 01540_verbatim_partition_pruning -01560_merge_distributed_join 01563_distributed_query_finish 01576_alias_column_rewrite 01583_const_column_in_set_index From 47fafdc32c320464bbd65468208bbc8e5b7ac62f Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 21 Jun 2023 18:06:24 +0000 Subject: [PATCH 0010/1081] Code cleanup --- src/Storages/StorageDistributed.cpp | 1 - src/Storages/StorageMerge.cpp | 35 ++++++++--------------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 9f9f0fda9e2..b948ca946c3 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -30,7 +30,6 @@ #include #include #include -#include "Analyzer/IQueryTreeNode.h" #include #include diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index d1ac3f57ae1..1a0376edbf5 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -28,19 +28,17 @@ #include #include #include -#include "Common/logger_useful.h" #include #include -#include "Analyzer/ColumnNode.h" -#include "Analyzer/IQueryTreeNode.h" -#include "Analyzer/Identifier.h" -#include "Analyzer/IdentifierNode.h" -#include "Analyzer/InDepthQueryTreeVisitor.h" -#include "Analyzer/Passes/QueryAnalysisPass.h" -#include "Analyzer/QueryTreeBuilder.h" -#include "Core/NamesAndTypes.h" -#include "DataTypes/IDataType.h" -#include "Planner/PlannerActionsVisitor.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include @@ -52,7 +50,6 @@ #include #include #include -#include #include #include @@ -654,8 +651,6 @@ QueryTreeNodePtr removeJoin( query_node->resolveProjectionColumns(std::move(projection_columns)); - LOG_DEBUG(&Poco::Logger::get("removeJoin"), "Query without JOIN:\n{}", modified_query->dumpTree()); - return 
modified_query; } @@ -718,11 +713,8 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ if (is_alias) { - // column_node = buildQueryTree(column_default->expression, modified_context); QueryTreeNodePtr fake_node = std::make_shared(Identifier{column}); - LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT before: {}\n{}", fake_node->dumpTree(), modified_query_info.table_expression->dumpTree()); - QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); query_analysis_pass.run(fake_node, modified_context); @@ -732,8 +724,6 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ ApplyAliasColumnExpressionsVisitor visitor; visitor.visit(column_node); - LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT after: {}", column_node->dumpTree()); - if (!resolved_column || !resolved_column->getExpression()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Alias column is not resolved"); @@ -752,8 +742,6 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ column_names_as_aliases = filter_actions_dag->getRequiredColumnsNames(); if (column_names_as_aliases.empty()) column_names_as_aliases.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()).name); - - LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "Required names: {}", toString(column_names_as_aliases)); } if (!column_name_to_node.empty()) @@ -764,7 +752,6 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ } modified_query_info.query = queryNodeToSelectQuery(modified_query_info.query_tree); - LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "Modified query: {}", modified_query_info.query->formatForLogging()); } else { @@ -811,8 +798,6 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( modified_select.setFinal(); } - LOG_DEBUG(&Poco::Logger::get("createSources"), "real_column_names: {}", toString(real_column_names)); - bool allow_experimental_analyzer = modified_context->getSettingsRef().allow_experimental_analyzer; auto storage_stage = storage->getQueryProcessingStage(modified_context, @@ -1205,8 +1190,6 @@ void ReadFromMerge::convertingSourceStream( std::move(convert_actions_dag), ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - LOG_DEBUG(&Poco::Logger::get("convertingSourceStream"), "The header: {}", builder.getHeader().dumpStructure()); - builder.addSimpleTransform([&](const Block & stream_header) { return std::make_shared(stream_header, actions); From 97a1ea01badaba10235ab0b01777f324b2f8365e Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 22 Jun 2023 15:10:53 +0000 Subject: [PATCH 0011/1081] Fix removeJoin --- src/Storages/StorageMerge.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 1a0376edbf5..fd7c0aae479 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -625,7 +626,8 @@ QueryTreeNodePtr removeJoin( QueryTreeNodePtr replacement_table_expression) { auto * query_node = query->as(); - auto modified_query = query_node->cloneAndReplace(query_node->getJoinTree(), replacement_table_expression); + auto join_tree = query_node->getJoinTree(); + auto modified_query = query_node->cloneAndReplace(join_tree, replacement_table_expression); query_node = modified_query->as(); 
@@ -636,20 +638,23 @@ QueryTreeNodePtr removeJoin( query_node->getHaving() = {}; query_node->getOrderBy().getNodes().clear(); - auto & projection = query_node->getProjection().getNodes(); - auto projection_columns = query_node->getProjectionColumns(); - for (size_t i = 0; i < projection.size();) + if (join_tree->as() == nullptr && join_tree->as() == nullptr) { - if (hasUnknownColumn(projection[i], original_table_expression, replacement_table_expression)) + auto & projection = query_node->getProjection().getNodes(); + auto projection_columns = query_node->getProjectionColumns(); + for (size_t i = 0; i < projection.size();) { - projection.erase(projection.begin() + i); - projection_columns.erase(projection_columns.begin() + i); - continue; + if (hasUnknownColumn(projection[i], original_table_expression, replacement_table_expression)) + { + projection.erase(projection.begin() + i); + projection_columns.erase(projection_columns.begin() + i); + continue; + } + ++i; } - ++i; - } - query_node->resolveProjectionColumns(std::move(projection_columns)); + query_node->resolveProjectionColumns(std::move(projection_columns)); + } return modified_query; } From 83022b77714a204ef4025d0b5081fbc127f2a586 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sat, 2 Sep 2023 21:56:36 +0200 Subject: [PATCH 0012/1081] Added support for parameterized view with analyzer by analyzing the select part with default values --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 47 +++++++++++++++++++++ src/Analyzer/TableFunctionNode.cpp | 7 +++ src/Analyzer/TableFunctionNode.h | 3 ++ src/Interpreters/InterpreterCreateQuery.cpp | 38 +++++++++++++++-- 4 files changed, 92 insertions(+), 3 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 348189854e8..c82d3079118 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -77,6 +77,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include namespace ProfileEvents { @@ -6210,8 +6216,49 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, QueryExpressionsAliasVisitor & expressions_visitor, bool nested_table_function) { + + String database_name = scope.context->getCurrentDatabase(); + String table_name = table_function_node->getOriginalAST()->as()->name; + + if (table_function_node->getOriginalAST()->as()->is_compound_name) + { + std::vector parts; + splitInto<'.'>(parts, table_function_node->getOriginalAST()->as()->name); + + if (parts.size() == 2) + { + database_name = parts[0]; + table_name = parts[1]; + } + } + auto & table_function_node_typed = table_function_node->as(); + StoragePtr table = DatabaseCatalog::instance().tryGetTable({database_name, table_name}, scope.context->getQueryContext()); + if (table) + { + if (table.get()->isView() && table->as() && table->as()->isParameterizedView()) + { + auto query = table->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_function_node->getOriginalAST()); + StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); + + ASTCreateQuery create; + create.select = query->as(); + auto sample_block = InterpreterSelectWithUnionQuery::getSampleBlock(query, scope.context); + auto res = std::make_shared(StorageID(database_name, table_name), + create, + ColumnsDescription(sample_block.getNamesAndTypesList()), + /* comment */ "", + /* 
is_parameterized_view */ true); + res->startup(); + table_function_node->getOriginalAST()->as()->prefer_subquery_to_function_formatting = true; + table_function_node_typed.resolve(std::move(res), scope.context); + return; + } + } + + if (!nested_table_function) expressions_visitor.visit(table_function_node_typed.getArgumentsNode()); diff --git a/src/Analyzer/TableFunctionNode.cpp b/src/Analyzer/TableFunctionNode.cpp index e5158a06373..f4ffe7f4ee5 100644 --- a/src/Analyzer/TableFunctionNode.cpp +++ b/src/Analyzer/TableFunctionNode.cpp @@ -36,6 +36,13 @@ void TableFunctionNode::resolve(TableFunctionPtr table_function_value, StoragePt unresolved_arguments_indexes = std::move(unresolved_arguments_indexes_); } +void TableFunctionNode::resolve(StoragePtr storage_value, ContextPtr context) +{ + storage = std::move(storage_value); + storage_id = storage->getStorageID(); + storage_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), context); +} + const StorageID & TableFunctionNode::getStorageID() const { if (!storage) diff --git a/src/Analyzer/TableFunctionNode.h b/src/Analyzer/TableFunctionNode.h index 69237ac8416..ed1a26c4dd4 100644 --- a/src/Analyzer/TableFunctionNode.h +++ b/src/Analyzer/TableFunctionNode.h @@ -100,6 +100,9 @@ public: /// Resolve table function with table function, storage and context void resolve(TableFunctionPtr table_function_value, StoragePtr storage_value, ContextPtr context, std::vector unresolved_arguments_indexes_); + /// Resolve table function as parameterized view with storage and context + void resolve(StoragePtr storage_value, ContextPtr context); + /// Get storage id, throws exception if function node is not resolved const StorageID & getStorageID() const; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 92d74f4f18a..58b6722aae9 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -76,7 +76,8 @@ #include #include - +#include +#include namespace DB { @@ -745,12 +746,43 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti } else if (create.select) { - Block as_select_sample; if (getContext()->getSettingsRef().allow_experimental_analyzer) { - as_select_sample = InterpreterSelectQueryAnalyzer::getSampleBlock(create.select->clone(), getContext()); + if (create.isParameterizedView()) + { + auto select = create.select->clone(); + + ///Get all query parameters + const auto parameters = analyzeReceiveQueryParamsWithType(select); + NameToNameMap parameter_values; + + for (const auto & parameter : parameters) + { + const auto data_type = DataTypeFactory::instance().get(parameter.second); + /// Todo improve getting default values & include more datatypes + if (data_type->isValueRepresentedByNumber() || parameter.second == "String") + parameter_values[parameter.first] = "1"; + else if (parameter.second.starts_with("Array") || parameter.second.starts_with("Map")) + parameter_values[parameter.first] = "[]"; + else + parameter_values[parameter.first] = " "; + LOG_INFO(&Poco::Logger::get("InterpreterCreateQuery"), "parameter = {} = {} ", parameter.first, parameter_values[parameter.first]); + + } + + /// Replace with default parameters + ReplaceQueryParameterVisitor visitor(parameter_values); + visitor.visit(select); + + as_select_sample = InterpreterSelectQueryAnalyzer::getSampleBlock(select, getContext()); + } + else + { + as_select_sample = InterpreterSelectQueryAnalyzer::getSampleBlock(create.select->clone(), 
getContext()); + } + } else { From 2dfda84da0e16c594df7df4eb2b05ee1baba1193 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sat, 2 Sep 2023 21:57:57 +0200 Subject: [PATCH 0013/1081] Removed parameterized view tests from analyzer_tech_debt.txt --- tests/analyzer_tech_debt.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt index 15d46403da9..5521234495f 100644 --- a/tests/analyzer_tech_debt.txt +++ b/tests/analyzer_tech_debt.txt @@ -88,7 +88,6 @@ 02402_merge_engine_with_view 02404_memory_bound_merging 02426_orc_bug -02428_parameterized_view 02458_use_structure_from_insertion_table 02479_race_condition_between_insert_and_droppin_mv 02493_inconsistent_hex_and_binary_number @@ -123,7 +122,6 @@ 02581_share_big_sets_between_mutation_tasks_long 02581_share_big_sets_between_multiple_mutations_tasks_long 00992_system_parts_race_condition_zookeeper_long -02818_parameterized_view_with_cte_multiple_usage 02790_optimize_skip_unused_shards_join 01940_custom_tld_sharding_key 02815_range_dict_no_direct_join From 59195e1199d5c8ed31f4243b58f3186771219295 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 4 Sep 2023 19:03:23 +0200 Subject: [PATCH 0014/1081] Removed log for each parameter --- src/Interpreters/InterpreterCreateQuery.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 58b6722aae9..66c219dcd56 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -768,8 +768,6 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti parameter_values[parameter.first] = "[]"; else parameter_values[parameter.first] = " "; - LOG_INFO(&Poco::Logger::get("InterpreterCreateQuery"), "parameter = {} = {} ", parameter.first, parameter_values[parameter.first]); - } /// Replace with default parameters From eb7aad00160b1418ed96ecc83770b62ce3bfaaf0 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 14 Nov 2023 11:35:54 +0100 Subject: [PATCH 0015/1081] Do not consider parts broken if only projections are broken --- src/Interpreters/MutationsInterpreter.cpp | 14 + src/Interpreters/MutationsInterpreter.h | 1 + src/Storages/MergeTree/IMergeTreeDataPart.cpp | 61 +++- src/Storages/MergeTree/IMergeTreeDataPart.h | 19 +- src/Storages/MergeTree/MergeTreeData.cpp | 26 +- src/Storages/MergeTree/MergeTreeData.h | 11 +- .../MergeTree/MergeTreeDataPartChecksum.h | 2 + src/Storages/MergeTree/MutateTask.cpp | 8 +- .../ReplicatedMergeTreePartCheckThread.cpp | 31 +- .../ReplicatedMergeTreePartCheckThread.h | 4 +- src/Storages/MergeTree/checkDataPart.cpp | 80 ++++- src/Storages/MergeTree/checkDataPart.h | 4 +- src/Storages/StorageMergeTree.cpp | 5 +- src/Storages/StorageReplicatedMergeTree.cpp | 3 +- src/Storages/System/StorageSystemDisks.cpp | 2 +- .../System/StorageSystemPartsBase.cpp | 8 +- src/Storages/System/StorageSystemPartsBase.h | 2 +- .../System/StorageSystemProjectionParts.cpp | 48 ++- .../StorageSystemProjectionPartsColumns.cpp | 21 +- .../02916_broken_projection.reference | 224 ++++++++++++++ .../0_stateless/02916_broken_projection.sh | 283 ++++++++++++++++++ 21 files changed, 795 insertions(+), 62 deletions(-) create mode 100644 tests/queries/0_stateless/02916_broken_projection.reference create mode 100755 tests/queries/0_stateless/02916_broken_projection.sh diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 
0ace0a8b79c..a9a5d4f33d0 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -305,6 +305,11 @@ bool MutationsInterpreter::Source::hasProjection(const String & name) const return part && part->hasProjection(name); } +bool MutationsInterpreter::Source::hasBrokenProjection(const String & name) const +{ + return part && part->hasBrokenProjection(name); +} + bool MutationsInterpreter::Source::isCompactPart() const { return part && part->getType() == MergeTreeDataPartType::Compact; @@ -922,6 +927,15 @@ void MutationsInterpreter::prepare(bool dry_run) materialized_indices.insert(index.name); } + /// Always rebuild broken projections. + for (const auto & projection : metadata_snapshot->getProjections()) + { + if (!source.hasBrokenProjection(projection.name)) + continue; + + materialized_projections.insert(projection.name); + } + for (const auto & projection : metadata_snapshot->getProjections()) { if (!source.hasProjection(projection.name)) diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index c53b86ddb5e..33b8021a653 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -122,6 +122,7 @@ public: bool materializeTTLRecalculateOnly() const; bool hasSecondaryIndex(const String & name) const; bool hasProjection(const String & name) const; + bool hasBrokenProjection(const String & name) const; bool isCompactPart() const; void read( diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 9bc72577b25..bc81758675e 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -732,7 +732,23 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch else { auto part = getProjectionPartBuilder(projection.name).withPartFormatFromDisk().build(); - part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency); + + try + { + part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency); + } + catch (...) + { + if (isRetryableException(std::current_exception())) + throw; + + LOG_ERROR(&Poco::Logger::get("IMergeTreeDataPart"), + "Cannot load projection {}, will consider it broken", projection.name); + + addBrokenProjectionPart(projection.name, std::move(part), getCurrentExceptionMessage(false), getCurrentExceptionCode()); + continue; + } + addProjectionPart(projection.name, std::move(part)); } } @@ -1129,7 +1145,8 @@ void IMergeTreeDataPart::loadChecksums(bool require) /// Check the data while we are at it. LOG_WARNING(storage.log, "Checksums for part {} not found. 
Will calculate them from data on disk.", name); - checksums = checkDataPart(shared_from_this(), false); + bool noop; + checksums = checkDataPart(shared_from_this(), false, noop, /* is_cancelled */{}, /* throw_on_broken_projection */false); writeChecksums(checksums, {}); bytes_on_disk = checksums.getTotalSizeOnDisk(); @@ -2130,6 +2147,46 @@ std::optional IMergeTreeDataPart::getStreamNameForColumn( return getStreamNameOrHash(stream_name, extension, storage_); } +void IMergeTreeDataPart::addBrokenProjectionPart( + const String & projection_name, + std::shared_ptr projection_part, + const String & message, + int code) +{ + projection_part->setBrokenReason(message, code); + bool inserted = broken_projection_parts.emplace(projection_name, projection_part).second; + if (!inserted) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Projection part {} in part {} is already added to a broken projection parts list", projection_name, name); +} + +void IMergeTreeDataPart::markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const +{ + std::lock_guard lock(broken_projections_mutex); + + auto it = projection_parts.find(projection_name); + if (it == projection_parts.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no projection part '{}'", projection_name); + + it->second->setBrokenReason(message, code); + + broken_projection_parts.emplace(projection_name, it->second); + projection_parts.erase(it); +} + +void IMergeTreeDataPart::setBrokenReason(const String & message, int code) +{ + std::lock_guard lock(broken_projections_mutex); + is_broken = true; + exception = message; + exception_code = code; +} + +bool IMergeTreeDataPart::hasBrokenProjection(const String & projection_name) const +{ + std::lock_guard lock(broken_projections_mutex); + return broken_projection_parts.contains(projection_name); +} + bool isCompactPart(const MergeTreeDataPartPtr & data_part) { return (data_part && data_part->getType() == MergeTreeDataPartType::Compact); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index a9659d2f5f4..52a1541e15f 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -255,6 +255,12 @@ public: /// Frozen by ALTER TABLE ... FREEZE ... It is used for information purposes in system.parts table. mutable std::atomic is_frozen {false}; + /// If it is a projection part, it can be broken sometimes. 
+ mutable std::atomic is_broken {false}; + mutable std::string exception; + mutable int exception_code = 0; + mutable std::mutex broken_projections_mutex; + /// Indicates that the part was marked Outdated by PartCheckThread because the part was not committed to ZooKeeper mutable bool is_unexpected_local_part = false; @@ -405,12 +411,20 @@ public: const std::map> & getProjectionParts() const { return projection_parts; } + const std::map> & getBrokenProjectionParts() const { return broken_projection_parts; } + MergeTreeDataPartBuilder getProjectionPartBuilder(const String & projection_name, bool is_temp_projection = false); void addProjectionPart(const String & projection_name, std::shared_ptr && projection_part); + void addBrokenProjectionPart(const String & projection_name, std::shared_ptr projection_part, const String & message, int code); + + void markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const; + bool hasProjection(const String & projection_name) const { return projection_parts.contains(projection_name); } + bool hasBrokenProjection(const String & projection_name) const; + void loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false); /// Return set of metadata file names without checksums. For example, @@ -564,7 +578,8 @@ protected: const IMergeTreeDataPart * parent_part; String parent_part_name; - std::map> projection_parts; + mutable std::map> projection_parts; + mutable std::map> broken_projection_parts; mutable PartMetadataManagerPtr metadata_manager; @@ -678,6 +693,8 @@ private: void incrementStateMetric(MergeTreeDataPartState state) const; void decrementStateMetric(MergeTreeDataPartState state) const; + void setBrokenReason(const String & message, int code); + /// This ugly flag is needed for debug assertions only mutable bool part_is_probably_removed_from_disk = false; }; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 1c0f9208fef..152c386e188 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5737,7 +5737,7 @@ MergeTreeData::getDataPartsVectorForInternalUsage(const DataPartStates & afforda } MergeTreeData::ProjectionPartsVector -MergeTreeData::getProjectionPartsVectorForInternalUsage(const DataPartStates & affordable_states, DataPartStateVector * out_states) const +MergeTreeData::getProjectionPartsVectorForInternalUsage(const DataPartStates & affordable_states, bool fill_states) const { auto lock = lockParts(); ProjectionPartsVector res; @@ -5749,14 +5749,20 @@ MergeTreeData::getProjectionPartsVectorForInternalUsage(const DataPartStates & a res.data_parts.push_back(part); for (const auto & [_, projection_part] : part->getProjectionParts()) res.projection_parts.push_back(projection_part); + for (const auto & [_, projection_part] : part->getBrokenProjectionParts()) + res.broken_projection_parts.push_back(projection_part); } } - if (out_states != nullptr) + if (fill_states) { - out_states->resize(res.projection_parts.size()); + res.projection_parts_states.resize(res.projection_parts.size()); for (size_t i = 0; i < res.projection_parts.size(); ++i) - (*out_states)[i] = res.projection_parts[i]->getParentPart()->getState(); + (res.projection_parts_states)[i] = res.projection_parts[i]->getParentPart()->getState(); + + res.broken_projection_parts_states.resize(res.broken_projection_parts.size()); + for (size_t i = 0; i < res.broken_projection_parts.size(); ++i) + 
(res.broken_projection_parts_states)[i] = res.broken_projection_parts[i]->getParentPart()->getState(); } return res; @@ -5809,7 +5815,7 @@ bool MergeTreeData::supportsLightweightDelete() const return true; } -MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(MergeTreeData::DataPartStateVector * out_states) const +MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(bool fill_states) const { ProjectionPartsVector res; auto lock = lockParts(); @@ -5820,11 +5826,15 @@ MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector( res.projection_parts.push_back(projection_part); } - if (out_states != nullptr) + if (fill_states) { - out_states->resize(res.projection_parts.size()); + res.projection_parts_states.resize(res.projection_parts.size()); for (size_t i = 0; i < res.projection_parts.size(); ++i) - (*out_states)[i] = res.projection_parts[i]->getParentPart()->getState(); + (res.projection_parts_states)[i] = res.projection_parts[i]->getParentPart()->getState(); + + res.broken_projection_parts_states.resize(res.broken_projection_parts.size()); + for (size_t i = 0; i < res.broken_projection_parts.size(); ++i) + (res.broken_projection_parts_states)[i] = res.broken_projection_parts[i]->getParentPart()->getState(); } return res; } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 54104849fe4..4ef3b75988b 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -468,8 +468,13 @@ public: struct ProjectionPartsVector { - DataPartsVector projection_parts; DataPartsVector data_parts; + + DataPartsVector projection_parts; + DataPartStateVector projection_parts_states; + + DataPartsVector broken_projection_parts; + DataPartStateVector broken_projection_parts_states; }; /// Returns a copy of the list so that the caller shouldn't worry about locks. @@ -484,7 +489,7 @@ public: const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr) const; /// Same as above but only returns projection parts ProjectionPartsVector getProjectionPartsVectorForInternalUsage( - const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr) const; + const DataPartStates & affordable_states, bool fill_states = false) const; /// Returns absolutely all parts (and snapshot of their states) @@ -496,7 +501,7 @@ public: size_t getTotalMarksCount() const; /// Same as above but only returns projection parts - ProjectionPartsVector getAllProjectionPartsVector(MergeTreeData::DataPartStateVector * out_states = nullptr) const; + ProjectionPartsVector getAllProjectionPartsVector(bool fill_states = false) const; /// Returns parts in Active state DataParts getDataPartsForInternalUsage() const; diff --git a/src/Storages/MergeTree/MergeTreeDataPartChecksum.h b/src/Storages/MergeTree/MergeTreeDataPartChecksum.h index 8e5e8c8c448..3595ce38db5 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartChecksum.h +++ b/src/Storages/MergeTree/MergeTreeDataPartChecksum.h @@ -54,6 +54,8 @@ struct MergeTreeDataPartChecksums bool has(const String & file_name) const { return files.find(file_name) != files.end(); } + bool remove(const String & file_name) { return files.erase(file_name); } + bool empty() const { return files.empty(); } /// Checks that the set of columns and their checksums are the same. If not, throws an exception. 
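Note on the checksum change above: the new MergeTreeDataPartChecksums::remove() exists so that a broken projection's "<name>.proj" entry can be dropped from the parent part's checksums instead of the whole part being declared broken. The following is a minimal, self-contained sketch of that control flow only; PartChecksums, ProjectionPart, checkProjections and the validate callback are simplified stand-ins invented for this illustration, not ClickHouse's real classes or API.

#include <exception>
#include <functional>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

// Simplified stand-in for MergeTreeDataPartChecksums: file name -> checksum.
struct PartChecksums
{
    std::map<std::string, std::string> files;
    bool remove(const std::string & file_name) { return files.erase(file_name) != 0; }
};

// Simplified stand-in for a projection part stored inside a data part.
struct ProjectionPart
{
    std::string name;
    bool is_broken = false;
    std::string exception;
};

// Check every projection of a part. A failing projection is marked broken and its
// ".proj" entry is removed from the parent checksums; the part itself survives.
void checkProjections(std::vector<ProjectionPart> & projections,
                      PartChecksums & parent_checksums,
                      const std::function<void(const ProjectionPart &)> & validate)
{
    for (auto & projection : projections)
    {
        const std::string projection_file = projection.name + ".proj";
        try
        {
            validate(projection);
        }
        catch (const std::exception & e)
        {
            projection.is_broken = true;             // remember why it is broken
            projection.exception = e.what();
            parent_checksums.remove(projection_file); // keep the part, drop the projection entry
            std::cerr << "Projection " << projection.name << " is broken: " << e.what() << '\n';
        }
    }
}

int main()
{
    PartChecksums checksums;
    checksums.files = {{"data.bin", "abc"}, {"proj_a.proj", "def"}};

    std::vector<ProjectionPart> projections = {{"proj_a"}, {"proj_b"}};

    // Hypothetical validation callback: pretend proj_a fails its checksum check.
    checkProjections(projections, checksums, [](const ProjectionPart & p)
    {
        if (p.name == "proj_a")
            throw std::runtime_error("checksum mismatch in " + p.name + ".proj");
    });

    std::cout << "remaining checksum entries: " << checksums.files.size() << '\n'; // only data.bin remains
    return 0;
}

In the actual patch this pattern is carried by loadProjections() and checkDataPart() against the real IMergeTreeDataPart API, and the MutateTask/MutationsInterpreter changes later in this commit rebuild any projection that was marked broken during a subsequent mutation.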
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 911b25de2ad..8ef1621b647 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -513,7 +513,9 @@ static std::set getProjectionsToRecalculate( { bool need_recalculate = materialized_projections.contains(projection.name) - || (!is_full_part_storage && source_part->hasProjection(projection.name)); + || (!is_full_part_storage + && (source_part->hasProjection(projection.name) + || source_part->hasBrokenProjection(projection.name))); if (need_recalculate) projections_to_recalc.insert(&projection); @@ -1367,7 +1369,9 @@ private: bool need_recalculate = ctx->materialized_projections.contains(projection.name) - || (!is_full_part_storage && ctx->source_part->hasProjection(projection.name)); + || (!is_full_part_storage + && (ctx->source_part->hasProjection(projection.name) + || ctx->source_part->hasBrokenProjection(projection.name))); if (need_recalculate) { diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index b1875464725..4468cf8e3bf 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -274,7 +274,7 @@ std::pair ReplicatedMergeTreePartCheckThread::findLo return std::make_pair(exists_in_zookeeper, part); } -ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const String & part_name) +ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const String & part_name, bool throw_on_broken_projection) { ReplicatedCheckResult result; auto [exists_in_zookeeper, part] = findLocalPart(part_name); @@ -341,6 +341,7 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St /// before the ReplicatedMergeTreePartHeader was introduced. String part_path = storage.replica_path + "/parts/" + part_name; String part_znode = zookeeper->get(part_path); + bool is_broken_projection = false; try { @@ -362,8 +363,10 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St checkDataPart( part, - true, - [this] { return need_stop.load(); }); + /* require_checksums */true, + is_broken_projection, + [this] { return need_stop.load(); }, + throw_on_broken_projection); if (need_stop) { @@ -384,12 +387,22 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St tryLogCurrentException(log, __PRETTY_FUNCTION__); - auto message = PreformattedMessage::create("Part {} looks broken. Removing it and will try to fetch.", part_name); - LOG_ERROR(log, message); + PreformattedMessage message; + if (is_broken_projection) + { + message = PreformattedMessage::create("Part {} has a broken projection. It will be ignored.", part_name); + LOG_DEBUG(log, message); + result.action = ReplicatedCheckResult::DoNothing; + } + else + { + message = PreformattedMessage::create("Part {} looks broken. Removing it and will try to fetch.", part_name); + LOG_ERROR(log, message); + result.action = ReplicatedCheckResult::TryFetchMissing; + } /// Part is broken, let's try to find it and fetch. 
result.status = {part_name, false, message}; - result.action = ReplicatedCheckResult::TryFetchMissing; return result; } @@ -419,12 +432,12 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St } -CheckResult ReplicatedMergeTreePartCheckThread::checkPartAndFix(const String & part_name, std::optional * recheck_after) +CheckResult ReplicatedMergeTreePartCheckThread::checkPartAndFix(const String & part_name, std::optional * recheck_after, bool throw_on_broken_projection) { LOG_INFO(log, "Checking part {}", part_name); ProfileEvents::increment(ProfileEvents::ReplicatedPartChecks); - ReplicatedCheckResult result = checkPartImpl(part_name); + ReplicatedCheckResult result = checkPartImpl(part_name, throw_on_broken_projection); switch (result.action) { case ReplicatedCheckResult::None: UNREACHABLE(); @@ -577,7 +590,7 @@ void ReplicatedMergeTreePartCheckThread::run() } std::optional recheck_after; - checkPartAndFix(selected->name, &recheck_after); + checkPartAndFix(selected->name, &recheck_after, /* throw_on_broken_projection */false); if (need_stop) return; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h index 68dc6ca3d1d..26c4bfe9384 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h @@ -65,9 +65,9 @@ public: size_t size() const; /// Check part by name - CheckResult checkPartAndFix(const String & part_name, std::optional * recheck_after = nullptr); + CheckResult checkPartAndFix(const String & part_name, std::optional * recheck_after = nullptr, bool throw_on_broken_projection = true); - ReplicatedCheckResult checkPartImpl(const String & part_name); + ReplicatedCheckResult checkPartImpl(const String & part_name, bool throw_on_broken_projection); std::unique_lock pausePartsCheck(); diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index a75df00e8a7..74af7cbb77c 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -111,7 +111,9 @@ static IMergeTreeDataPart::Checksums checkDataPart( const NameSet & files_without_checksums, const ReadSettings & read_settings, bool require_checksums, - std::function is_cancelled) + std::function is_cancelled, + bool & is_broken_projection, + bool throw_on_broken_projection) { /** Responsibility: * - read list of columns from columns.txt; @@ -120,6 +122,7 @@ static IMergeTreeDataPart::Checksums checkDataPart( */ CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedChecks}; + Poco::Logger * log = &Poco::Logger::get("checkDataPart"); NamesAndTypesList columns_txt; @@ -269,23 +272,68 @@ static IMergeTreeDataPart::Checksums checkDataPart( } } - for (const auto & [name, projection] : data_part->getProjectionParts()) + auto check_projection = [&](const String & name, std::shared_ptr projection) { - if (is_cancelled()) - return {}; - auto projection_file = name + ".proj"; - auto projection_checksums = checkDataPart( - projection, *data_part_storage.getProjection(projection_file), - projection->getColumns(), projection->getType(), - projection->getFileNamesWithoutChecksums(), - read_settings, require_checksums, is_cancelled); + if (!throw_on_broken_projection && projection->is_broken) + { + projections_on_disk.erase(projection_file); + checksums_txt.remove(projection_file); + return; + } + + IMergeTreeDataPart::Checksums projection_checksums; + try + { + bool noop; + 
projection_checksums = checkDataPart( + projection, *data_part_storage.getProjection(projection_file), + projection->getColumns(), projection->getType(), + projection->getFileNamesWithoutChecksums(), + read_settings, require_checksums, is_cancelled, noop, /* throw_on_broken_projection */false); + } + catch (...) + { + if (isRetryableException(std::current_exception())) + throw; + + LOG_TEST(log, "Marking projection {} as broken ({})", name, projection_file); + + if (!data_part->hasBrokenProjection(name)) + data_part->markProjectionPartAsBroken(name, getCurrentExceptionMessage(false), getCurrentExceptionCode()); + + is_broken_projection = true; + if (throw_on_broken_projection) + throw; + + projections_on_disk.erase(projection_file); + checksums_txt.remove(projection_file); + return; + } checksums_data.files[projection_file] = IMergeTreeDataPart::Checksums::Checksum( projection_checksums.getTotalSizeOnDisk(), projection_checksums.getTotalChecksumUInt128()); projections_on_disk.erase(projection_file); + }; + + auto broken_projection_parts = data_part->getBrokenProjectionParts(); /// Iterate over copy + for (const auto & [name, projection] : broken_projection_parts) + { + if (is_cancelled()) + return {}; + else + check_projection(name, projection); + } + + auto projection_parts = data_part->getProjectionParts(); /// Iterate over copy + for (const auto & [name, projection] : projection_parts) + { + if (is_cancelled()) + return {}; + else + check_projection(name, projection); } if (require_checksums && !projections_on_disk.empty()) @@ -315,7 +363,9 @@ IMergeTreeDataPart::Checksums checkDataPartInMemory(const DataPartInMemoryPtr & IMergeTreeDataPart::Checksums checkDataPart( MergeTreeData::DataPartPtr data_part, bool require_checksums, - std::function is_cancelled) + bool & is_broken_projection, + std::function is_cancelled, + bool throw_on_broken_projection) { if (auto part_in_memory = asInMemoryPart(data_part)) return checkDataPartInMemory(part_in_memory); @@ -357,7 +407,9 @@ IMergeTreeDataPart::Checksums checkDataPart( data_part->getFileNamesWithoutChecksums(), read_settings, require_checksums, - is_cancelled); + is_cancelled, + is_broken_projection, + throw_on_broken_projection); }; try @@ -371,7 +423,9 @@ IMergeTreeDataPart::Checksums checkDataPart( data_part->getFileNamesWithoutChecksums(), read_settings, require_checksums, - is_cancelled); + is_cancelled, + is_broken_projection, + throw_on_broken_projection); } catch (...) { diff --git a/src/Storages/MergeTree/checkDataPart.h b/src/Storages/MergeTree/checkDataPart.h index d0e48b6f80a..a01978f4efe 100644 --- a/src/Storages/MergeTree/checkDataPart.h +++ b/src/Storages/MergeTree/checkDataPart.h @@ -10,7 +10,9 @@ namespace DB IMergeTreeDataPart::Checksums checkDataPart( MergeTreeData::DataPartPtr data_part, bool require_checksums, - std::function is_cancelled = []{ return false; }); + bool & is_broken_projection, + std::function is_cancelled = []{ return false; }, + bool throw_on_broken_projection = false); bool isNotEnoughMemoryErrorCode(int code); bool isRetryableException(const std::exception_ptr exception_ptr); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index e9a0dd5fbf3..74277616e95 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -2242,11 +2242,12 @@ std::optional StorageMergeTree::checkDataNext(DataValidationTasksPt { /// If the checksums file is not present, calculate the checksums and write them to disk. 
static constexpr auto checksums_path = "checksums.txt"; + bool noop; if (part->isStoredOnDisk() && !part->getDataPartStorage().exists(checksums_path)) { try { - auto calculated_checksums = checkDataPart(part, false); + auto calculated_checksums = checkDataPart(part, false, noop, /* is_cancelled */{}, /* throw_on_broken_projection */true); calculated_checksums.checkEqual(part->checksums, true); auto & part_mutable = const_cast(*part); @@ -2267,7 +2268,7 @@ std::optional StorageMergeTree::checkDataNext(DataValidationTasksPt { try { - checkDataPart(part, true); + checkDataPart(part, true, noop, /* is_cancelled */{}, /* throw_on_broken_projection */true); return CheckResult(part->name, true, ""); } catch (...) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 74821a9186c..1859fa03094 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -8690,12 +8690,11 @@ IStorage::DataValidationTasksPtr StorageReplicatedMergeTree::getCheckTaskList( std::optional StorageReplicatedMergeTree::checkDataNext(DataValidationTasksPtr & check_task_list) { - if (auto part = assert_cast(check_task_list.get())->next()) { try { - return CheckResult(part_check_thread.checkPartAndFix(part->name)); + return part_check_thread.checkPartAndFix(part->name, /* recheck_after */nullptr, /* throw_on_broken_projection */true); } catch (const Exception & ex) { diff --git a/src/Storages/System/StorageSystemDisks.cpp b/src/Storages/System/StorageSystemDisks.cpp index 23a00cc7ae5..250fcdba641 100644 --- a/src/Storages/System/StorageSystemDisks.cpp +++ b/src/Storages/System/StorageSystemDisks.cpp @@ -63,7 +63,7 @@ Pipe StorageSystemDisks::read( for (const auto & [disk_name, disk_ptr] : context->getDisksMap()) { col_name->insert(disk_name); - col_path->insert(disk_ptr->getPath()); + col_path->insert(fs::absolute(disk_ptr->getPath()).string()); col_free->insert(disk_ptr->getAvailableSpace().value_or(std::numeric_limits::max())); col_total->insert(disk_ptr->getTotalSpace().value_or(std::numeric_limits::max())); col_unreserved->insert(disk_ptr->getUnreservedSpace().value_or(std::numeric_limits::max())); diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp index 513af6cfc46..e97c13b1fed 100644 --- a/src/Storages/System/StorageSystemPartsBase.cpp +++ b/src/Storages/System/StorageSystemPartsBase.cpp @@ -64,7 +64,7 @@ StoragesInfo::getParts(MergeTreeData::DataPartStateVector & state, bool has_stat } MergeTreeData::ProjectionPartsVector -StoragesInfo::getProjectionParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const +StoragesInfo::getProjectionParts(bool fill_states, bool has_state_column) const { if (data->getInMemoryMetadataPtr()->projections.empty()) return {}; @@ -74,12 +74,12 @@ StoragesInfo::getProjectionParts(MergeTreeData::DataPartStateVector & state, boo { /// If has_state_column is requested, return all states. 
if (!has_state_column) - return data->getProjectionPartsVectorForInternalUsage({State::Active, State::Outdated}, &state); + return data->getProjectionPartsVectorForInternalUsage({State::Active, State::Outdated}, fill_states); - return data->getAllProjectionPartsVector(&state); + return data->getAllProjectionPartsVector(fill_states); } - return data->getProjectionPartsVectorForInternalUsage({State::Active}, &state); + return data->getProjectionPartsVectorForInternalUsage({State::Active}, fill_states); } StoragesInfoStream::StoragesInfoStream(const SelectQueryInfo & query_info, ContextPtr context) diff --git a/src/Storages/System/StorageSystemPartsBase.h b/src/Storages/System/StorageSystemPartsBase.h index c3d2e64b303..e0e81f0d24d 100644 --- a/src/Storages/System/StorageSystemPartsBase.h +++ b/src/Storages/System/StorageSystemPartsBase.h @@ -25,7 +25,7 @@ struct StoragesInfo explicit operator bool() const { return storage != nullptr; } MergeTreeData::DataPartsVector getParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const; - MergeTreeData::ProjectionPartsVector getProjectionParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const; + MergeTreeData::ProjectionPartsVector getProjectionParts(bool fill_states, bool has_state_column) const; }; /** A helper class that enumerates the storages that match given query. */ diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index 213865a8d61..44bdb294a2d 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -83,7 +83,11 @@ StorageSystemProjectionParts::StorageSystemProjectionParts(const StorageID & tab {"rows_where_ttl_info.expression", std::make_shared(std::make_shared())}, {"rows_where_ttl_info.min", std::make_shared(std::make_shared())}, - {"rows_where_ttl_info.max", std::make_shared(std::make_shared())} + {"rows_where_ttl_info.max", std::make_shared(std::make_shared())}, + + {"is_broken", std::make_shared()}, + {"exception_code", std::make_shared()}, + {"exception", std::make_shared()}, } ) { @@ -93,15 +97,14 @@ void StorageSystemProjectionParts::processNextStorage( ContextPtr, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) { using State = MergeTreeDataPartState; - MergeTreeData::DataPartStateVector all_parts_state; - MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(all_parts_state, has_state_column); - for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) + MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(true, has_state_column); + auto fill_part_info = [&](size_t part_number, const MergeTreeData::DataPartsVector & parts, const MergeTreeData::DataPartStateVector & states) { - const auto & part = all_parts.projection_parts[part_number]; + const auto & part = parts[part_number]; const auto * parent_part = part->getParentPart(); chassert(parent_part); - auto part_state = all_parts_state[part_number]; + auto part_state = states[part_number]; ColumnSize columns_size = part->getTotalColumnsSize(); ColumnSize parent_columns_size = parent_part->getTotalColumnsSize(); @@ -278,10 +281,43 @@ void StorageSystemProjectionParts::processNextStorage( add_ttl_info_map(part->ttl_infos.group_by_ttl); add_ttl_info_map(part->ttl_infos.rows_where_ttl); + { + if (columns_mask[src_index++]) + 
columns[res_index++]->insert(part->is_broken.load(std::memory_order_relaxed)); + + if (part->is_broken) + { + std::lock_guard lock(part->broken_projections_mutex); + if (columns_mask[src_index++]) + columns[res_index++]->insert(part->exception_code); + if (columns_mask[src_index++]) + columns[res_index++]->insert(part->exception); + } + else + { + if (columns_mask[src_index++]) + columns[res_index++]->insertDefault(); + if (columns_mask[src_index++]) + columns[res_index++]->insertDefault(); + } + } + /// _state column should be the latest. /// Do not use part->getState*, it can be changed from different thread if (has_state_column) columns[res_index++]->insert(IMergeTreeDataPart::stateString(part_state)); + }; + + for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) + { + auto part = all_parts.projection_parts[part_number]; + fill_part_info(part_number, all_parts.projection_parts, all_parts.projection_parts_states); + } + + for (size_t part_number = 0; part_number < all_parts.broken_projection_parts.size(); ++part_number) + { + auto part = all_parts.broken_projection_parts[part_number]; + fill_part_info(part_number, all_parts.broken_projection_parts, all_parts.broken_projection_parts_states); } } diff --git a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp index 06becc6d91c..3f4224e46bb 100644 --- a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp +++ b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp @@ -103,15 +103,14 @@ void StorageSystemProjectionPartsColumns::processNextStorage( } /// Go through the list of projection parts. - MergeTreeData::DataPartStateVector all_parts_state; - MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(all_parts_state, has_state_column); - for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) + MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(true, has_state_column); + auto fill_part_info = [&](size_t part_number, const MergeTreeData::DataPartsVector & parts, const MergeTreeData::DataPartStateVector & states) { - const auto & part = all_parts.projection_parts[part_number]; + const auto & part = parts[part_number]; const auto * parent_part = part->getParentPart(); chassert(parent_part); - auto part_state = all_parts_state[part_number]; + auto part_state = states[part_number]; auto columns_size = part->getTotalColumnsSize(); auto parent_columns_size = parent_part->getTotalColumnsSize(); @@ -260,6 +259,18 @@ void StorageSystemProjectionPartsColumns::processNextStorage( if (has_state_column) columns[res_index++]->insert(part->stateString()); } + }; + + for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) + { + auto part = all_parts.projection_parts[part_number]; + fill_part_info(part_number, all_parts.projection_parts, all_parts.projection_parts_states); + } + + for (size_t part_number = 0; part_number < all_parts.broken_projection_parts.size(); ++part_number) + { + auto part = all_parts.broken_projection_parts[part_number]; + fill_part_info(part_number, all_parts.broken_projection_parts, all_parts.broken_projection_parts_states); } } diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference new file mode 100644 index 00000000000..d0b07e081db --- /dev/null +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -0,0 
+1,224 @@ +insert new part +insert new part +insert new part +insert new part +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +1 +0 +broke metadata of part 'proj' (parent part: all_2_2_0) +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +16 +12 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +0 +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +check table full +all_0_0_0 1 +all_1_1_0 1 +all_3_3_0 1 +all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. +0 +broke data of part 'proj_2' (parent part: all_2_2_0) +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +FILE_DOESNT_EXIST +check table +0 +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +all_2_2_0 proj_2 NO_FILE_IN_DATA_PART +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 [] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +16 +12 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +0 +0 +broke data of part 'proj_2' (parent part: all_3_3_0) +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +all_2_2_0 proj_2 NO_FILE_IN_DATA_PART +insert new part +insert new part +optimize +0 +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +all_2_2_0 proj_2 NO_FILE_IN_DATA_PART +all_3_3_0 proj_2 NO_FILE_IN_DATA_PART +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 [] +all_3_3_0 0 ['proj'] +all_3_5_1 1 ['proj'] +all_4_4_0 0 ['proj','proj_2'] +all_5_5_0 0 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +16 +12 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +0 +0 +broke metadata of part 'proj' (parent part: all_1_1_0) +Detach - Attach +broken projections info +all_1_1_0 proj NO_FILE_IN_DATA_PART +all_2_2_0 proj NO_FILE_IN_DATA_PART +all_2_2_0 proj_2 FILE_DOESNT_EXIST +all_3_3_0 proj_2 FILE_DOESNT_EXIST +0 +broke data of part 'proj_2' (parent part: all_1_1_0) +Detach - Attach +broken projections info +all_1_1_0 proj NO_FILE_IN_DATA_PART +all_1_1_0 proj_2 FILE_DOESNT_EXIST +all_2_2_0 proj NO_FILE_IN_DATA_PART +all_2_2_0 proj_2 FILE_DOESNT_EXIST +all_3_3_0 proj_2 FILE_DOESNT_EXIST +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 [] +all_2_2_0 1 [] +all_3_3_0 0 ['proj'] +all_3_5_1 1 ['proj'] +all_4_4_0 0 
['proj','proj_2'] +all_5_5_0 0 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +16 +12 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +0 +check table full +all_3_5_1 1 +all_0_0_0 1 +all_1_1_0 0 Part all_1_1_0 has a broken projection. It will be ignored. +all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. +materialize projection proj +check table full +all_3_5_1_6 1 +all_0_0_0_6 1 +all_2_2_0_6 1 +all_1_1_0_6 1 +system.parts +all_0_0_0 0 ['proj','proj_2'] +all_0_0_0_6 1 ['proj','proj_2'] +all_1_1_0 0 [] +all_1_1_0_6 1 ['proj','proj_2'] +all_2_2_0 0 [] +all_2_2_0_6 1 ['proj','proj_2'] +all_3_3_0 0 ['proj'] +all_3_5_1 0 ['proj'] +all_3_5_1_6 1 ['proj'] +all_4_4_0 0 ['proj','proj_2'] +all_5_5_0 0 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +1 +materialize projection proj_2 +check table full +all_3_5_1_7 1 +all_0_0_0_7 1 +all_2_2_0_7 1 +all_1_1_0_7 1 +system.parts +all_0_0_0 0 ['proj','proj_2'] +all_0_0_0_6 0 ['proj','proj_2'] +all_0_0_0_7 1 ['proj','proj_2'] +all_1_1_0 0 [] +all_1_1_0_6 0 ['proj','proj_2'] +all_1_1_0_7 1 ['proj','proj_2'] +all_2_2_0 0 [] +all_2_2_0_6 0 ['proj','proj_2'] +all_2_2_0_7 1 ['proj','proj_2'] +all_3_3_0 0 ['proj'] +all_3_5_1 0 ['proj'] +all_3_5_1_6 0 ['proj'] +all_3_5_1_7 1 ['proj','proj_2'] +all_4_4_0 0 ['proj','proj_2'] +all_5_5_0 0 ['proj','proj_2'] +select from projection 'proj' +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +16 +12 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +1 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh new file mode 100755 index 00000000000..81adfe6e49d --- /dev/null +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -0,0 +1,283 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -nm -q " +DROP TABLE IF EXISTS test SYNC; +CREATE TABLE test +( + a String, + b String, + c Int32, + d Int32, + e Int32, + + PROJECTION proj + ( + SELECT c ORDER BY d + ), + PROJECTION proj_2 + ( + SELECT d ORDER BY c + ) +) +ENGINE = ReplicatedMergeTree('/test2/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) +SETTINGS min_bytes_for_wide_part = 0, + max_parts_to_merge_at_once=3, + enable_vertical_merge_algorithm=1, + vertical_merge_algorithm_min_rows_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1; +" + +table_uuid=$($CLICKHOUSE_CLIENT -q "SELECT uuid FROM system.tables WHERE table='test' and database=currentDatabase()") + +function random() +{ + cat /dev/urandom | LC_ALL=C tr -dc 'a-zA-Z' | fold -w ${1:-8} | head -n 1 +} + +function insert() +{ + offset=$1 + size=$2 + echo 'insert new part' + $CLICKHOUSE_CLIENT -q "INSERT INTO test SELECT number, number, number, number, number%2 FROM numbers($offset, $size);" +} + +function break_projection() +{ + part_name=$1 + parent_name=$2 + break_type=$3 + + read -r disk_name part_path <<< $($CLICKHOUSE_CLIENT -nm -q " + SELECT disk_name, path + FROM system.projection_parts + WHERE table='test' + AND database=currentDatabase() + AND active=1 + AND part_name='$part_name' + AND parent_name='$parent_name' + LIMIT 1; + ") + + path=$($CLICKHOUSE_CLIENT -q "SELECT path FROM system.disks WHERE name='$disk_name'") + + # make sure path is absolute + $CLICKHOUSE_CLIENT -q "select throwIf(substring('$path', 1, 1) != '/', 'Path is relative: $path')" || exit + + if [ "$break_type" = "data" ] + then + rm "$path/$part_path/d.bin" + rm "$path/$part_path/c.bin" + echo "broke data of part '$part_name' (parent part: $parent_name)" + else + rm "$path/$part_path/columns.txt" + echo "broke metadata of part '$part_name' (parent part: $parent_name)" + fi +} + +function broken_projections_info() +{ + echo 'broken projections info' + $CLICKHOUSE_CLIENT -q " + SELECT parent_name, name, errors.name FROM + ( + SELECT parent_name, name, exception_code + FROM system.projection_parts + WHERE table='test' + AND database=currentDatabase() + AND is_broken = 1 + ) AS parts_info + INNER JOIN system.errors AS errors + ON parts_info.exception_code = errors.code + ORDER BY parent_name, name +" +} + +function check() +{ + expect_broken_part="" + expected_error="" + if [ $# -ne 0 ]; then + expect_broken_part=$1 + expected_error=$2 + fi + + echo 'system.parts' + $CLICKHOUSE_CLIENT -q " + SELECT name, active, projections + FROM system.parts + WHERE table='test' AND database=currentDatabase() + ORDER BY name;" + + echo "select from projection 'proj'" + query_id=$(random 8) + + if [ "$expect_broken_part" = "proj" ] + then + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12;" 2>&1 | grep -o $expected_error + else + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16;" + echo 'used projections' + $CLICKHOUSE_CLIENT -nm -q " + SYSTEM FLUSH LOGS; + SELECT query, projections FROM system.query_log WHERE query_id='$query_id' and type='QueryFinish' + " + fi + + echo "select from projection 'proj_2'" + query_id=$(random 8) + + if [ "$expect_broken_part" = "proj_2" ] + then + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12;" 2>&1 | grep -o $expected_error + else + $CLICKHOUSE_CLIENT 
--optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16;" + echo 'used projections' + $CLICKHOUSE_CLIENT -nm -q " + SYSTEM FLUSH LOGS; + SELECT query, projections FROM system.query_log WHERE query_id='$query_id' and type='QueryFinish' + " + fi + + echo 'check table' + $CLICKHOUSE_CLIENT -q "CHECK TABLE test" +} + +function optimize_no_wait() +{ + echo 'optimize' + $CLICKHOUSE_CLIENT -nm -q "OPTIMIZE TABLE test SETTINGS alter_sync=0;" +} + +function reattach() +{ + echo 'Detach - Attach' + $CLICKHOUSE_CLIENT -nm -q " + DETACH TABLE test; + ATTACH TABLE test; + " +} + +function materialize_projection +{ + projection=$1 + echo "materialize projection $projection" + $CLICKHOUSE_CLIENT -q "ALTER TABLE test MATERIALIZE PROJECTION $projection SETTINGS mutations_sync=2" +} + +function check_table_full() +{ + echo 'check table full' + $CLICKHOUSE_CLIENT -q "CHECK TABLE test SETTINGS check_query_single_value_result = 0" +} + + +insert 0 5 + +insert 5 5 + +insert 10 5 + +insert 15 5 + +check + +# Break metadata file of projection 'proj' +break_projection proj all_2_2_0 metadata + +# Do select and after "check table" query. +# Select works because it does not read columns.txt. +check + +# Projection 'proj' from part all_2_2_0 will now appear in broken parts info +# because it was marked broken during "check table" query. +# TODO: try to mark it during select as well +broken_projections_info + +# Check table query will also show a list of parts which have broken projections. +check_table_full + +# Break data file of projection 'proj_2' for part all_2_2_0 +break_projection proj_2 all_2_2_0 data + +# It will not yet appear in broken projections info. +broken_projections_info + +# Select now fails with error "File doesn't exist" +check "proj_2" "FILE_DOESNT_EXIST" + +# Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. +broken_projections_info + +# Second select works, because projection is now marked as broken. +check + +# Break data file of projection 'proj_2' for part all_3_3_0 +break_projection proj_2 all_3_3_0 data + +# It will not yet appear in broken projections info. +broken_projections_info + +insert 20 5 + +insert 25 5 + +# Part all_3_3_0 has 'proj' and 'proj_2' projections, but 'proj_2' is broken and server does NOT know it yet. +# Parts all_4_4_0 and all_5_5_0 have both non-broken projections. +# So a merge will be create for future part all_3_5_1. +# During merge it will fail to read from 'proj_2' of part all_3_3_0 and proj_2 will be marked broken. +# Merge will be retried and on second attempt it will succeed. +# The result part all_3_5_1 will have only 1 projection - 'proj', because +# it will skip 'proj_2' as it will see that one part does not have it anymore in the set of valid projections. +optimize_no_wait +sleep 2 + +$CLICKHOUSE_CLIENT -nm -q " +SYSTEM FLUSH LOGS; +SELECT count() FROM system.text_log +WHERE level='Error' +AND logger_name='MergeTreeBackgroundExecutor' +AND message like 'Exception while executing background task {$table_uuid:all_3_5_1}%Cannot open file%proj_2.proj/c.bin%' +" + +# Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. 
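# Illustration only (not part of the test's expected output): the broken_projections_info helper above
# boils down to reading the new is_broken / exception_code columns of system.projection_parts, e.g.
#   SELECT parent_name, name, exception_code
#   FROM system.projection_parts
#   WHERE table = 'test' AND database = currentDatabase() AND is_broken = 1
# joining exception_code against system.errors, as the helper does, only turns the numeric code into a name.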
+broken_projections_info + +check + +break_projection proj all_1_1_0 metadata + +reattach + +broken_projections_info + +break_projection proj_2 all_1_1_0 data + +reattach + +broken_projections_info + +check + +check_table_full + +materialize_projection proj + +check_table_full + +check + +materialize_projection proj_2 + +check_table_full + +check + +$CLICKHOUSE_CLIENT -nm -q " +DROP TABLE test; +" From 6c42a3fad6b58efdf91115c3b80f267f1f604c62 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 16 Nov 2023 16:43:34 +0100 Subject: [PATCH 0016/1081] Better --- src/Interpreters/MutationsInterpreter.cpp | 18 ++-- .../Optimizations/projectionsCommon.cpp | 2 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 45 +++------ src/Storages/MergeTree/IMergeTreeDataPart.h | 11 +-- src/Storages/MergeTree/MergeTask.cpp | 5 +- src/Storages/MergeTree/MergeTreeData.cpp | 57 ++++++----- src/Storages/MergeTree/MergeTreeData.h | 4 +- src/Storages/MergeTree/MutateTask.cpp | 8 +- src/Storages/MergeTree/checkDataPart.cpp | 34 ++----- .../System/StorageSystemPartsBase.cpp | 8 +- src/Storages/System/StorageSystemPartsBase.h | 2 +- .../System/StorageSystemProjectionParts.cpp | 30 +++--- .../StorageSystemProjectionPartsColumns.cpp | 21 +--- .../02916_broken_projection.reference | 95 ++++++++----------- .../0_stateless/02916_broken_projection.sh | 12 +-- 15 files changed, 146 insertions(+), 206 deletions(-) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index a9a5d4f33d0..237bffe4a67 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -753,7 +753,7 @@ void MutationsInterpreter::prepare(bool dry_run) { mutation_kind.set(MutationKind::MUTATE_INDEX_PROJECTION); const auto & projection = projections_desc.get(command.projection_name); - if (!source.hasProjection(projection.name)) + if (!source.hasProjection(projection.name) || source.hasBrokenProjection(projection.name)) { for (const auto & column : projection.required_columns) dependencies.emplace(column, ColumnDependency::PROJECTION); @@ -927,20 +927,18 @@ void MutationsInterpreter::prepare(bool dry_run) materialized_indices.insert(index.name); } - /// Always rebuild broken projections. - for (const auto & projection : metadata_snapshot->getProjections()) - { - if (!source.hasBrokenProjection(projection.name)) - continue; - - materialized_projections.insert(projection.name); - } - for (const auto & projection : metadata_snapshot->getProjections()) { if (!source.hasProjection(projection.name)) continue; + /// Always rebuild broken projections. 
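    /// In other words, a projection already marked as broken is re-materialized by the mutation
    /// unconditionally, even when it was not explicitly requested and need_rebuild_projections is false.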
+ if (source.hasBrokenProjection(projection.name)) + { + materialized_projections.insert(projection.name); + continue; + } + if (need_rebuild_projections) { materialized_projections.insert(projection.name); diff --git a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp index c3b3449857b..9ebd5aaa32f 100644 --- a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp +++ b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp @@ -224,7 +224,7 @@ bool analyzeProjectionCandidate( { const auto & created_projections = part_with_ranges.data_part->getProjectionParts(); auto it = created_projections.find(candidate.projection->name); - if (it != created_projections.end()) + if (it != created_projections.end() && !it->second->is_broken) { projection_parts.push_back(it->second); } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index bc81758675e..85ce112d9a1 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -745,8 +745,7 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch LOG_ERROR(&Poco::Logger::get("IMergeTreeDataPart"), "Cannot load projection {}, will consider it broken", projection.name); - addBrokenProjectionPart(projection.name, std::move(part), getCurrentExceptionMessage(false), getCurrentExceptionCode()); - continue; + part->setBrokenReason(getCurrentExceptionMessage(false), getCurrentExceptionCode()); } addProjectionPart(projection.name, std::move(part)); @@ -2147,44 +2146,30 @@ std::optional IMergeTreeDataPart::getStreamNameForColumn( return getStreamNameOrHash(stream_name, extension, storage_); } -void IMergeTreeDataPart::addBrokenProjectionPart( - const String & projection_name, - std::shared_ptr projection_part, - const String & message, - int code) -{ - projection_part->setBrokenReason(message, code); - bool inserted = broken_projection_parts.emplace(projection_name, projection_part).second; - if (!inserted) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Projection part {} in part {} is already added to a broken projection parts list", projection_name, name); -} - void IMergeTreeDataPart::markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const { - std::lock_guard lock(broken_projections_mutex); - auto it = projection_parts.find(projection_name); if (it == projection_parts.end()) throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no projection part '{}'", projection_name); - it->second->setBrokenReason(message, code); - - broken_projection_parts.emplace(projection_name, it->second); - projection_parts.erase(it); -} - -void IMergeTreeDataPart::setBrokenReason(const String & message, int code) -{ - std::lock_guard lock(broken_projections_mutex); - is_broken = true; - exception = message; - exception_code = code; } bool IMergeTreeDataPart::hasBrokenProjection(const String & projection_name) const { - std::lock_guard lock(broken_projections_mutex); - return broken_projection_parts.contains(projection_name); + auto it = projection_parts.find(projection_name); + if (it == projection_parts.end()) + return false; + return it->second->is_broken; +} + +void IMergeTreeDataPart::setBrokenReason(const String & message, int code) const +{ + std::lock_guard lock(broken_reason_mutex); + if (is_broken) + return; + is_broken = true; + exception = message; + exception_code = code; } bool isCompactPart(const 
MergeTreeDataPartPtr & data_part) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 52a1541e15f..9af2c16f1e8 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -259,7 +259,7 @@ public: mutable std::atomic is_broken {false}; mutable std::string exception; mutable int exception_code = 0; - mutable std::mutex broken_projections_mutex; + mutable std::mutex broken_reason_mutex; /// Indicates that the part was marked Outdated by PartCheckThread because the part was not committed to ZooKeeper mutable bool is_unexpected_local_part = false; @@ -411,14 +411,10 @@ public: const std::map> & getProjectionParts() const { return projection_parts; } - const std::map> & getBrokenProjectionParts() const { return broken_projection_parts; } - MergeTreeDataPartBuilder getProjectionPartBuilder(const String & projection_name, bool is_temp_projection = false); void addProjectionPart(const String & projection_name, std::shared_ptr && projection_part); - void addBrokenProjectionPart(const String & projection_name, std::shared_ptr projection_part, const String & message, int code); - void markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const; bool hasProjection(const String & projection_name) const { return projection_parts.contains(projection_name); } @@ -427,6 +423,8 @@ public: void loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false); + void setBrokenReason(const String & message, int code) const; + /// Return set of metadata file names without checksums. For example, /// columns.txt or checksums.txt itself. NameSet getFileNamesWithoutChecksums() const; @@ -579,7 +577,6 @@ protected: String parent_part_name; mutable std::map> projection_parts; - mutable std::map> broken_projection_parts; mutable PartMetadataManagerPtr metadata_manager; @@ -693,8 +690,6 @@ private: void incrementStateMetric(MergeTreeDataPartState state) const; void decrementStateMetric(MergeTreeDataPartState state) const; - void setBrokenReason(const String & message, int code); - /// This ugly flag is needed for debug assertions only mutable bool part_is_probably_removed_from_disk = false; }; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index adb1ca72e46..53ba1a57b27 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -696,8 +696,9 @@ bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() c MergeTreeData::DataPartsVector projection_parts; for (const auto & part : global_ctx->future_part->parts) { - auto it = part->getProjectionParts().find(projection.name); - if (it != part->getProjectionParts().end()) + auto actual_projection_parts = part->getProjectionParts(); + auto it = actual_projection_parts.find(projection.name); + if (it != actual_projection_parts.end() && !it->second->is_broken) projection_parts.push_back(it->second); } if (projection_parts.size() < global_ctx->future_part->parts.size()) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 152c386e188..0725c3cbf32 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5737,7 +5737,7 @@ MergeTreeData::getDataPartsVectorForInternalUsage(const DataPartStates & afforda } MergeTreeData::ProjectionPartsVector -MergeTreeData::getProjectionPartsVectorForInternalUsage(const 
DataPartStates & affordable_states, bool fill_states) const +MergeTreeData::getProjectionPartsVectorForInternalUsage(const DataPartStates & affordable_states, DataPartStateVector * out_states) const { auto lock = lockParts(); ProjectionPartsVector res; @@ -5749,20 +5749,14 @@ MergeTreeData::getProjectionPartsVectorForInternalUsage(const DataPartStates & a res.data_parts.push_back(part); for (const auto & [_, projection_part] : part->getProjectionParts()) res.projection_parts.push_back(projection_part); - for (const auto & [_, projection_part] : part->getBrokenProjectionParts()) - res.broken_projection_parts.push_back(projection_part); } } - if (fill_states) + if (out_states != nullptr) { - res.projection_parts_states.resize(res.projection_parts.size()); + out_states->resize(res.projection_parts.size()); for (size_t i = 0; i < res.projection_parts.size(); ++i) - (res.projection_parts_states)[i] = res.projection_parts[i]->getParentPart()->getState(); - - res.broken_projection_parts_states.resize(res.broken_projection_parts.size()); - for (size_t i = 0; i < res.broken_projection_parts.size(); ++i) - (res.broken_projection_parts_states)[i] = res.broken_projection_parts[i]->getParentPart()->getState(); + (*out_states)[i] = res.projection_parts[i]->getParentPart()->getState(); } return res; @@ -5815,7 +5809,7 @@ bool MergeTreeData::supportsLightweightDelete() const return true; } -MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(bool fill_states) const +MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(MergeTreeData::DataPartStateVector * out_states) const { ProjectionPartsVector res; auto lock = lockParts(); @@ -5826,15 +5820,11 @@ MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector( res.projection_parts.push_back(projection_part); } - if (fill_states) + if (out_states != nullptr) { - res.projection_parts_states.resize(res.projection_parts.size()); + out_states->resize(res.projection_parts.size()); for (size_t i = 0; i < res.projection_parts.size(); ++i) - (res.projection_parts_states)[i] = res.projection_parts[i]->getParentPart()->getState(); - - res.broken_projection_parts_states.resize(res.broken_projection_parts.size()); - for (size_t i = 0; i < res.broken_projection_parts.size(); ++i) - (res.broken_projection_parts_states)[i] = res.broken_projection_parts[i]->getParentPart()->getState(); + (*out_states)[i] = res.projection_parts[i]->getParentPart()->getState(); } return res; } @@ -7050,8 +7040,7 @@ std::pair MergeTreeData::cloneAn } } - auto projections = src_part->getProjectionParts(); - for (const auto & [name, projection_part] : projections) + for (const auto & [name, projection_part] : src_part->getProjectionParts()) { const auto & projection_storage = projection_part->getDataPartStorage(); for (auto it = projection_storage.iterate(); it->isValid(); it->next()) @@ -7654,21 +7643,39 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & bool MergeTreeData::partsContainSameProjections(const DataPartPtr & left, const DataPartPtr & right, String & out_reason) { - if (left->getProjectionParts().size() != right->getProjectionParts().size()) + auto remove_broken_parts = [](auto & parts) + { + std::set broken_projection_parts; + for (const auto & [name, part] : parts) + { + if (part->is_broken) + broken_projection_parts.emplace(name); + } + for (const auto & name : broken_projection_parts) + parts.erase(name); + }; + + auto left_projection_parts = left->getProjectionParts(); + 
auto right_projection_parts = right->getProjectionParts(); + + remove_broken_parts(left_projection_parts); + remove_broken_parts(right_projection_parts); + + if (left_projection_parts.size() != right_projection_parts.size()) { out_reason = fmt::format( "Parts have different number of projections: {} in part '{}' and {} in part '{}'", - left->getProjectionParts().size(), + left_projection_parts.size(), left->name, - right->getProjectionParts().size(), + right_projection_parts.size(), right->name ); return false; } - for (const auto & [name, _] : left->getProjectionParts()) + for (const auto & [name, _] : left_projection_parts) { - if (!right->hasProjection(name)) + if (!right_projection_parts.contains(name)) { out_reason = fmt::format( "The part '{}' doesn't have projection '{}' while part '{}' does", right->name, name, left->name diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 4ef3b75988b..18087c6b059 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -489,7 +489,7 @@ public: const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr) const; /// Same as above but only returns projection parts ProjectionPartsVector getProjectionPartsVectorForInternalUsage( - const DataPartStates & affordable_states, bool fill_states = false) const; + const DataPartStates & affordable_states, MergeTreeData::DataPartStateVector * out_states) const; /// Returns absolutely all parts (and snapshot of their states) @@ -501,7 +501,7 @@ public: size_t getTotalMarksCount() const; /// Same as above but only returns projection parts - ProjectionPartsVector getAllProjectionPartsVector(bool fill_states = false) const; + ProjectionPartsVector getAllProjectionPartsVector(MergeTreeData::DataPartStateVector * out_states = nullptr) const; /// Returns parts in Active state DataParts getDataPartsForInternalUsage() const; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 8ef1621b647..6a1ceec1cd3 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -514,8 +514,8 @@ static std::set getProjectionsToRecalculate( bool need_recalculate = materialized_projections.contains(projection.name) || (!is_full_part_storage - && (source_part->hasProjection(projection.name) - || source_part->hasBrokenProjection(projection.name))); + && source_part->hasProjection(projection.name) + && !source_part->hasBrokenProjection(projection.name)); if (need_recalculate) projections_to_recalc.insert(&projection); @@ -1370,8 +1370,8 @@ private: bool need_recalculate = ctx->materialized_projections.contains(projection.name) || (!is_full_part_storage - && (ctx->source_part->hasProjection(projection.name) - || ctx->source_part->hasBrokenProjection(projection.name))); + && ctx->source_part->hasProjection(projection.name) + && !ctx->source_part->hasBrokenProjection(projection.name)); if (need_recalculate) { diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index 74af7cbb77c..8feabf344b5 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -272,14 +272,16 @@ static IMergeTreeDataPart::Checksums checkDataPart( } } - auto check_projection = [&](const String & name, std::shared_ptr projection) + for (const auto & [name, projection] : data_part->getProjectionParts()) { + if (is_cancelled()) + return {}; + auto projection_file = name + ".proj"; if 
(!throw_on_broken_projection && projection->is_broken) { projections_on_disk.erase(projection_file); checksums_txt.remove(projection_file); - return; } IMergeTreeDataPart::Checksums projection_checksums; @@ -297,10 +299,11 @@ static IMergeTreeDataPart::Checksums checkDataPart( if (isRetryableException(std::current_exception())) throw; - LOG_TEST(log, "Marking projection {} as broken ({})", name, projection_file); - - if (!data_part->hasBrokenProjection(name)) - data_part->markProjectionPartAsBroken(name, getCurrentExceptionMessage(false), getCurrentExceptionCode()); + if (!projection->is_broken) + { + LOG_TEST(log, "Marking projection {} as broken ({})", name, projection_file); + projection->setBrokenReason(getCurrentExceptionMessage(false), getCurrentExceptionCode()); + } is_broken_projection = true; if (throw_on_broken_projection) @@ -308,7 +311,6 @@ static IMergeTreeDataPart::Checksums checkDataPart( projections_on_disk.erase(projection_file); checksums_txt.remove(projection_file); - return; } checksums_data.files[projection_file] = IMergeTreeDataPart::Checksums::Checksum( @@ -316,24 +318,6 @@ static IMergeTreeDataPart::Checksums checkDataPart( projection_checksums.getTotalChecksumUInt128()); projections_on_disk.erase(projection_file); - }; - - auto broken_projection_parts = data_part->getBrokenProjectionParts(); /// Iterate over copy - for (const auto & [name, projection] : broken_projection_parts) - { - if (is_cancelled()) - return {}; - else - check_projection(name, projection); - } - - auto projection_parts = data_part->getProjectionParts(); /// Iterate over copy - for (const auto & [name, projection] : projection_parts) - { - if (is_cancelled()) - return {}; - else - check_projection(name, projection); } if (require_checksums && !projections_on_disk.empty()) diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp index e97c13b1fed..513af6cfc46 100644 --- a/src/Storages/System/StorageSystemPartsBase.cpp +++ b/src/Storages/System/StorageSystemPartsBase.cpp @@ -64,7 +64,7 @@ StoragesInfo::getParts(MergeTreeData::DataPartStateVector & state, bool has_stat } MergeTreeData::ProjectionPartsVector -StoragesInfo::getProjectionParts(bool fill_states, bool has_state_column) const +StoragesInfo::getProjectionParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const { if (data->getInMemoryMetadataPtr()->projections.empty()) return {}; @@ -74,12 +74,12 @@ StoragesInfo::getProjectionParts(bool fill_states, bool has_state_column) const { /// If has_state_column is requested, return all states. 
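        /// (i.e. without the _state column this branch returns only Active and Outdated projection parts;
        /// with it, parts in every state are included)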
if (!has_state_column) - return data->getProjectionPartsVectorForInternalUsage({State::Active, State::Outdated}, fill_states); + return data->getProjectionPartsVectorForInternalUsage({State::Active, State::Outdated}, &state); - return data->getAllProjectionPartsVector(fill_states); + return data->getAllProjectionPartsVector(&state); } - return data->getProjectionPartsVectorForInternalUsage({State::Active}, fill_states); + return data->getProjectionPartsVectorForInternalUsage({State::Active}, &state); } StoragesInfoStream::StoragesInfoStream(const SelectQueryInfo & query_info, ContextPtr context) diff --git a/src/Storages/System/StorageSystemPartsBase.h b/src/Storages/System/StorageSystemPartsBase.h index e0e81f0d24d..c3d2e64b303 100644 --- a/src/Storages/System/StorageSystemPartsBase.h +++ b/src/Storages/System/StorageSystemPartsBase.h @@ -25,7 +25,7 @@ struct StoragesInfo explicit operator bool() const { return storage != nullptr; } MergeTreeData::DataPartsVector getParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const; - MergeTreeData::ProjectionPartsVector getProjectionParts(bool fill_states, bool has_state_column) const; + MergeTreeData::ProjectionPartsVector getProjectionParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const; }; /** A helper class that enumerates the storages that match given query. */ diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index 44bdb294a2d..3dbe6823dac 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -97,14 +97,15 @@ void StorageSystemProjectionParts::processNextStorage( ContextPtr, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) { using State = MergeTreeDataPartState; - MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(true, has_state_column); - auto fill_part_info = [&](size_t part_number, const MergeTreeData::DataPartsVector & parts, const MergeTreeData::DataPartStateVector & states) + MergeTreeData::DataPartStateVector all_parts_state; + MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(all_parts_state, has_state_column); + for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) { - const auto & part = parts[part_number]; + const auto & part = all_parts.projection_parts[part_number]; const auto * parent_part = part->getParentPart(); chassert(parent_part); - auto part_state = states[part_number]; + auto part_state = all_parts_state[part_number]; ColumnSize columns_size = part->getTotalColumnsSize(); ColumnSize parent_columns_size = parent_part->getTotalColumnsSize(); @@ -275,7 +276,12 @@ void StorageSystemProjectionParts::processNextStorage( add_ttl_info_map(part->ttl_infos.moves_ttl); if (columns_mask[src_index++]) - columns[res_index++]->insert(queryToString(part->default_codec->getCodecDesc())); + { + if (part->default_codec) + columns[res_index++]->insert(queryToString(part->default_codec->getCodecDesc())); + else + columns[res_index++]->insertDefault(); + } add_ttl_info_map(part->ttl_infos.recompression_ttl); add_ttl_info_map(part->ttl_infos.group_by_ttl); @@ -287,7 +293,7 @@ void StorageSystemProjectionParts::processNextStorage( if (part->is_broken) { - std::lock_guard lock(part->broken_projections_mutex); + std::lock_guard lock(part->broken_reason_mutex); if (columns_mask[src_index++]) 
columns[res_index++]->insert(part->exception_code); if (columns_mask[src_index++]) @@ -306,18 +312,6 @@ void StorageSystemProjectionParts::processNextStorage( /// Do not use part->getState*, it can be changed from different thread if (has_state_column) columns[res_index++]->insert(IMergeTreeDataPart::stateString(part_state)); - }; - - for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) - { - auto part = all_parts.projection_parts[part_number]; - fill_part_info(part_number, all_parts.projection_parts, all_parts.projection_parts_states); - } - - for (size_t part_number = 0; part_number < all_parts.broken_projection_parts.size(); ++part_number) - { - auto part = all_parts.broken_projection_parts[part_number]; - fill_part_info(part_number, all_parts.broken_projection_parts, all_parts.broken_projection_parts_states); } } diff --git a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp index 3f4224e46bb..06becc6d91c 100644 --- a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp +++ b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp @@ -103,14 +103,15 @@ void StorageSystemProjectionPartsColumns::processNextStorage( } /// Go through the list of projection parts. - MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(true, has_state_column); - auto fill_part_info = [&](size_t part_number, const MergeTreeData::DataPartsVector & parts, const MergeTreeData::DataPartStateVector & states) + MergeTreeData::DataPartStateVector all_parts_state; + MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(all_parts_state, has_state_column); + for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) { - const auto & part = parts[part_number]; + const auto & part = all_parts.projection_parts[part_number]; const auto * parent_part = part->getParentPart(); chassert(parent_part); - auto part_state = states[part_number]; + auto part_state = all_parts_state[part_number]; auto columns_size = part->getTotalColumnsSize(); auto parent_columns_size = parent_part->getTotalColumnsSize(); @@ -259,18 +260,6 @@ void StorageSystemProjectionPartsColumns::processNextStorage( if (has_state_column) columns[res_index++]->insert(part->stateString()); } - }; - - for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) - { - auto part = all_parts.projection_parts[part_number]; - fill_part_info(part_number, all_parts.projection_parts, all_parts.projection_parts_states); - } - - for (size_t part_number = 0; part_number < all_parts.broken_projection_parts.size(); ++part_number) - { - auto part = all_parts.broken_projection_parts[part_number]; - fill_part_info(part_number, all_parts.broken_projection_parts, all_parts.broken_projection_parts_states); } } diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index d0b07e081db..62966036eed 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -8,15 +8,15 @@ all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' 12 16 used 
projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 1 0 @@ -27,23 +27,20 @@ all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' -16 12 +16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 0 broken projections info all_2_2_0 proj FILE_DOESNT_EXIST check table full -all_0_0_0 1 -all_1_1_0 1 -all_3_3_0 1 all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. 0 broke data of part 'proj_2' (parent part: all_2_2_0) @@ -52,13 +49,13 @@ all_2_2_0 proj FILE_DOESNT_EXIST system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj_2'] +all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' FILE_DOESNT_EXIST check table @@ -69,18 +66,18 @@ all_2_2_0 proj_2 NO_FILE_IN_DATA_PART system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 [] +all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' -16 12 +16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 0 0 @@ -99,21 +96,21 @@ all_3_3_0 proj_2 NO_FILE_IN_DATA_PART system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 [] -all_3_3_0 0 ['proj'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' -16 12 +16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 0 0 @@ -135,76 +132,66 @@ all_2_2_0 proj_2 FILE_DOESNT_EXIST all_3_3_0 proj_2 FILE_DOESNT_EXIST system.parts all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 [] -all_2_2_0 1 [] -all_3_3_0 0 ['proj'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' -16 12 +16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 
OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 0 check table full -all_3_5_1 1 -all_0_0_0 1 -all_1_1_0 0 Part all_1_1_0 has a broken projection. It will be ignored. all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. +all_1_1_0 0 Part all_1_1_0 has a broken projection. It will be ignored. materialize projection proj check table full -all_3_5_1_6 1 -all_0_0_0_6 1 -all_2_2_0_6 1 -all_1_1_0_6 1 system.parts all_0_0_0 0 ['proj','proj_2'] all_0_0_0_6 1 ['proj','proj_2'] -all_1_1_0 0 [] +all_1_1_0 0 ['proj','proj_2'] all_1_1_0_6 1 ['proj','proj_2'] -all_2_2_0 0 [] +all_2_2_0 0 ['proj','proj_2'] all_2_2_0_6 1 ['proj','proj_2'] -all_3_3_0 0 ['proj'] +all_3_3_0 0 ['proj','proj_2'] all_3_5_1 0 ['proj'] all_3_5_1_6 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 1 materialize projection proj_2 check table full -all_3_5_1_7 1 -all_0_0_0_7 1 -all_2_2_0_7 1 -all_1_1_0_7 1 system.parts all_0_0_0 0 ['proj','proj_2'] all_0_0_0_6 0 ['proj','proj_2'] all_0_0_0_7 1 ['proj','proj_2'] -all_1_1_0 0 [] +all_1_1_0 0 ['proj','proj_2'] all_1_1_0_6 0 ['proj','proj_2'] all_1_1_0_7 1 ['proj','proj_2'] -all_2_2_0 0 [] +all_2_2_0 0 ['proj','proj_2'] all_2_2_0_6 0 ['proj','proj_2'] all_2_2_0_7 1 ['proj','proj_2'] -all_3_3_0 0 ['proj'] +all_3_3_0 0 ['proj','proj_2'] all_3_5_1 0 ['proj'] all_3_5_1_6 0 ['proj'] all_3_5_1_7 1 ['proj','proj_2'] @@ -214,11 +201,11 @@ select from projection 'proj' 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' -16 12 +16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 1 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 81adfe6e49d..4748506d9cf 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -23,7 +23,7 @@ CREATE TABLE test SELECT d ORDER BY c ) ) -ENGINE = ReplicatedMergeTree('/test2/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) +ENGINE = ReplicatedMergeTree('/test3/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) SETTINGS min_bytes_for_wide_part = 0, max_parts_to_merge_at_once=3, enable_vertical_merge_algorithm=1, @@ -119,9 +119,9 @@ function check() if [ "$expect_broken_part" = "proj" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 ORDER BY c;" 2>&1 | grep -o $expected_error else - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16;" + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c;" echo 'used projections' 
$CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; @@ -134,9 +134,9 @@ function check() if [ "$expect_broken_part" = "proj_2" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 ORDER BY d;" 2>&1 | grep -o $expected_error else - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16;" + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d;" echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; @@ -173,7 +173,7 @@ function materialize_projection function check_table_full() { echo 'check table full' - $CLICKHOUSE_CLIENT -q "CHECK TABLE test SETTINGS check_query_single_value_result = 0" + $CLICKHOUSE_CLIENT -q "CHECK TABLE test SETTINGS check_query_single_value_result = 0" | grep "broken" } From 8ea4e302a50db872a798c6cd39c6f5edb255ec49 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 16 Nov 2023 19:43:32 +0100 Subject: [PATCH 0017/1081] Fix style check --- .../0_stateless/02916_broken_projection.sh | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 4748506d9cf..bf0ec61fd76 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# shellcheck disable=SC2046 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -125,7 +126,7 @@ function check() echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; - SELECT query, projections FROM system.query_log WHERE query_id='$query_id' and type='QueryFinish' + SELECT query, projections FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi @@ -140,7 +141,7 @@ function check() echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; - SELECT query, projections FROM system.query_log WHERE query_id='$query_id' and type='QueryFinish' + SELECT query, projections FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi @@ -148,10 +149,20 @@ function check() $CLICKHOUSE_CLIENT -q "CHECK TABLE test" } -function optimize_no_wait() +function optimize() { + final=$1 + no_wait=$2 + echo 'optimize' - $CLICKHOUSE_CLIENT -nm -q "OPTIMIZE TABLE test SETTINGS alter_sync=0;" + query="OPTIMIZE TABLE test" + + if [ $final -eq 1 ]; then + query="$query FINAL" + if [ $no_wait -eq 1 ]; then + query="$query SETTINGS alter_sync=0" + + $CLICKHOUSE_CLIENT -nm -q $query } function reattach() @@ -234,7 +245,7 @@ insert 25 5 # Merge will be retried and on second attempt it will succeed. # The result part all_3_5_1 will have only 1 projection - 'proj', because # it will skip 'proj_2' as it will see that one part does not have it anymore in the set of valid projections. 
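As printed above, the new optimize() helper is missing the closing fi statements for its two conditions; a sketch of the presumably intended form (with $query also quoted to avoid word splitting):

function optimize()
{
    final=$1
    no_wait=$2

    echo 'optimize'
    query="OPTIMIZE TABLE test"

    if [ $final -eq 1 ]; then
        query="$query FINAL"
    fi
    if [ $no_wait -eq 1 ]; then
        query="$query SETTINGS alter_sync=0"
    fi

    $CLICKHOUSE_CLIENT -nm -q "$query"
}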
-optimize_no_wait +optimize 0 1 sleep 2 $CLICKHOUSE_CLIENT -nm -q " @@ -276,6 +287,16 @@ materialize_projection proj_2 check_table_full +break_projection proj all_3_5_1_7 data + +insert 30 5 + +optimize 1 0 + +insert 35 5 + +optimize 1 0 + check $CLICKHOUSE_CLIENT -nm -q " From 961bf074daf0c901a3e9d14b6caa4ba6cb37cc7c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 20 Nov 2023 10:56:10 +0100 Subject: [PATCH 0018/1081] Initial draft version of adding backup support to AzureBlobStorage --- src/Backups/BackupFactory.cpp | 2 + src/Backups/BackupIO_AzureBlobStorage.cpp | 336 ++++++++++++++++++ src/Backups/BackupIO_AzureBlobStorage.h | 69 ++++ src/Backups/BackupImpl.cpp | 8 +- .../registerBackupEngineAzureBlobStorage.cpp | 134 +++++++ src/CMakeLists.txt | 3 + src/Common/ProfileEvents.cpp | 4 + .../copyAzureBlobStorageFile.cpp | 324 +++++++++++++++++ .../copyAzureBlobStorageFile.h | 58 +++ src/Storages/StorageAzureBlob.cpp | 11 + src/Storages/StorageAzureBlob.h | 1 + .../__init__.py | 1 + .../configs/config.xml | 11 + .../configs/disable_profilers.xml | 13 + .../configs/users.xml | 8 + .../test.py | 151 ++++++++ 16 files changed, 1132 insertions(+), 2 deletions(-) create mode 100644 src/Backups/BackupIO_AzureBlobStorage.cpp create mode 100644 src/Backups/BackupIO_AzureBlobStorage.h create mode 100644 src/Backups/registerBackupEngineAzureBlobStorage.cpp create mode 100644 src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp create mode 100644 src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h create mode 100644 tests/integration/test_backup_restore_azure_blob_storage/__init__.py create mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml create mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml create mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml create mode 100644 tests/integration/test_backup_restore_azure_blob_storage/test.py diff --git a/src/Backups/BackupFactory.cpp b/src/Backups/BackupFactory.cpp index 898ac7bc490..31e87a21fc2 100644 --- a/src/Backups/BackupFactory.cpp +++ b/src/Backups/BackupFactory.cpp @@ -33,11 +33,13 @@ void BackupFactory::registerBackupEngine(const String & engine_name, const Creat void registerBackupEnginesFileAndDisk(BackupFactory &); void registerBackupEngineS3(BackupFactory &); +void registerBackupEngineAzureBlobStorage(BackupFactory &); void registerBackupEngines(BackupFactory & factory) { registerBackupEnginesFileAndDisk(factory); registerBackupEngineS3(factory); + registerBackupEngineAzureBlobStorage(factory); } BackupFactory::BackupFactory() diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp new file mode 100644 index 00000000000..d41d23e3c36 --- /dev/null +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -0,0 +1,336 @@ +#include + +#if USE_AZURE_BLOB_STORAGE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + + +namespace fs = std::filesystem; + +namespace DB +{ +namespace ErrorCodes +{ + extern const int AZURE_BLOB_STORAGE_ERROR; + extern const int LOGICAL_ERROR; +} + +//using AzureClientPtr = std::shared_ptr; + +BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( + StorageAzureBlob::Configuration configuration_, + const ReadSettings & read_settings_, + const WriteSettings & write_settings_, + const ContextPtr & context_) + : BackupReaderDefault(read_settings_, write_settings_, 
&Poco::Logger::get("BackupReaderAzureBlobStorage")) + , data_source_description{DataSourceType::AzureBlobStorage, "AzureBlobStorage", false, false} + , configuration(configuration_) +{ + client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); + settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); + auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); + object_storage = std::make_unique("BackupReaderAzureBlobStorage", + std::make_unique(*client.get()), + std::move(settings_as_unique_ptr)); +} + +BackupReaderAzureBlobStorage::~BackupReaderAzureBlobStorage() = default; + +bool BackupReaderAzureBlobStorage::fileExists(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + return object_storage->exists(StoredObject(key)); +} + +UInt64 BackupReaderAzureBlobStorage::getFileSize(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + ObjectMetadata object_metadata = object_storage->getObjectMetadata(key); + return object_metadata.size_bytes; +} + +std::unique_ptr BackupReaderAzureBlobStorage::readFile(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + return std::make_unique( + client, key, read_settings, settings->max_single_read_retries, + settings->max_single_download_retries); +} + +void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) +{ + LOG_INFO(&Poco::Logger::get("BackupReaderAzureBlobStorage"), "Enter copyFileToDisk"); + + /// Use the native copy as a more optimal way to copy a file from AzureBlobStorage to AzureBlobStorage if it's possible. + /// We don't check for `has_throttling` here because the native copy almost doesn't use network. + auto destination_data_source_description = destination_disk->getDataSourceDescription(); + if (destination_data_source_description.sameKind(data_source_description) + && (destination_data_source_description.is_encrypted == encrypted_in_backup)) + { + LOG_TRACE(log, "Copying {} from AzureBlobStorage to disk {}", path_in_backup, destination_disk->getName()); + auto write_blob_function = [&](const Strings & blob_path, WriteMode mode, const std::optional & object_attributes) -> size_t + { + /// Object storage always uses mode `Rewrite` because it simulates append using metadata and different files. 
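            /// A two-element blob_path is interpreted as {object key, container}: blob_path[0] is used as the
            /// blob path and blob_path[1] as the container name when the destination client is constructed below.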
+ if (blob_path.size() != 2 || mode != WriteMode::Rewrite) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Blob writing function called with unexpected blob_path.size={} or mode={}", + blob_path.size(), mode); + + std::shared_ptr dest_client; + if (configuration.container == blob_path[1]) + { + dest_client = client; + } + else + { + StorageAzureBlob::Configuration dest_configuration = configuration; + dest_configuration.container = blob_path[1]; + dest_configuration.blob_path = blob_path[0]; + dest_client = StorageAzureBlob::createClient(dest_configuration, /* is_read_only */ false); + } + + + copyAzureBlobStorageFile( + client, + dest_client, + configuration.container, + fs::path(configuration.blob_path) / path_in_backup, + 0, + file_size, + /* dest_bucket= */ blob_path[1], + /* dest_key= */ blob_path[0], + settings, + read_settings, + object_attributes, + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupReaderAzureBlobStorage"), + /* for_disk_azure_blob_storage= */ true); + + return file_size; + }; + + destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function); + return; /// copied! + } + + /// Fallback to copy through buffers. + BackupReaderDefault::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode); +} + + +BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( + StorageAzureBlob::Configuration configuration_, + const ReadSettings & read_settings_, + const WriteSettings & write_settings_, + const ContextPtr & context_) + : BackupWriterDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupWriterAzureBlobStorage")) + , data_source_description{DataSourceType::AzureBlobStorage, "AzureBlobStorage", false, false} + , configuration(configuration_) +{ + client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); + settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); + auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); + object_storage = std::make_unique("BackupWriterAzureBlobStorage", + std::make_unique(*client.get()), + std::move(settings_as_unique_ptr)); +} + +void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) +{ + /// Use the native copy as a more optimal way to copy a file from AzureBlobStorage to AzureBlobStorage if it's possible. + auto source_data_source_description = src_disk->getDataSourceDescription(); + if (source_data_source_description.sameKind(data_source_description) && (source_data_source_description.is_encrypted == copy_encrypted)) + { + /// getBlobPath() can return more than 3 elements if the file is stored as multiple objects in AzureBlobStorage bucket. + /// In this case we can't use the native copy. 
+ if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 2) + { + + std::shared_ptr src_client; + if (configuration.container == blob_path[1]) + { + src_client = client; + } + else + { + StorageAzureBlob::Configuration src_configuration = configuration; + src_configuration.container = blob_path[1]; + src_configuration.blob_path = blob_path[0]; + src_client = StorageAzureBlob::createClient(src_configuration, /* is_read_only */ false); + } + + LOG_TRACE(log, "Copying file {} from disk {} to AzureBlobStorag", src_path, src_disk->getName()); + copyAzureBlobStorageFile( + src_client, + client, + /* src_bucket */ blob_path[1], + /* src_key= */ blob_path[0], + start_pos, + length, + configuration.container, + fs::path(configuration.blob_path) / path_in_backup, + settings, + read_settings, + {}, + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterAzureBlobStorage")); + return; /// copied! + } + } + + /// Fallback to copy through buffers. + BackupWriterDefault::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); +} + +void BackupWriterAzureBlobStorage::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) +{ + copyDataToAzureBlobStorageFile(create_read_buffer, start_pos, length, client, configuration.container, path_in_backup, settings, {}, + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterAzureBlobStorage")); +} + +BackupWriterAzureBlobStorage::~BackupWriterAzureBlobStorage() = default; + +bool BackupWriterAzureBlobStorage::fileExists(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + LOG_INFO(&Poco::Logger::get("BackupWriterAzureBlobStorage"), "Result fileExists {} ", object_storage->exists(StoredObject(key))); + + return object_storage->exists(StoredObject(key)); +} + +UInt64 BackupWriterAzureBlobStorage::getFileSize(const String & file_name) +{ + LOG_INFO(&Poco::Logger::get("BackupWriterAzureBlobStorage"), "Enter getFileSize"); + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + RelativePathsWithMetadata children; + object_storage->listObjects(key,children,/*max_keys*/0); + if (children.empty()) + throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Object {} must exist"); + return children[0].metadata.size_bytes; +} + +std::unique_ptr BackupWriterAzureBlobStorage::readFile(const String & file_name, size_t /*expected_file_size*/) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + + return std::make_unique( + client, key, read_settings, settings->max_single_read_retries, + settings->max_single_download_retries); +} + +std::unique_ptr BackupWriterAzureBlobStorage::writeFile(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + return std::make_unique( + client, + key, + settings->max_single_part_upload_size, + DBMS_DEFAULT_BUFFER_SIZE, + write_settings); +} + +void BackupWriterAzureBlobStorage::removeFile(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + StoredObject object(key); + 
object_storage->removeObjectIfExists(object); +} + +void BackupWriterAzureBlobStorage::removeFiles(const Strings & keys) +{ + StoredObjects objects; + for (const auto & key : keys) + objects.emplace_back(key); + + object_storage->removeObjectsIfExist(objects); + +} + +void BackupWriterAzureBlobStorage::removeFilesBatch(const Strings & keys) +{ + StoredObjects objects; + for (const auto & key : keys) + objects.emplace_back(key); + + object_storage->removeObjectsIfExist(objects); +} + +} + +#endif diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h new file mode 100644 index 00000000000..6ef66fc432d --- /dev/null +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -0,0 +1,69 @@ +#pragma once + +#include "config.h" + +#if USE_AZURE_BLOB_STORAGE +#include +#include +#include +#include + + +namespace DB +{ + +// using AzureClientPtr = std::shared_ptr; + +/// Represents a backup stored to Azure + class BackupReaderAzureBlobStorage : public BackupReaderDefault + { + public: + BackupReaderAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); + ~BackupReaderAzureBlobStorage() override; + + bool fileExists(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; + std::unique_ptr readFile(const String & file_name) override; + + void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; + + private: + const DataSourceDescription data_source_description; + std::shared_ptr client; + StorageAzureBlob::Configuration configuration; + std::unique_ptr object_storage; + std::shared_ptr settings; + }; + + + class BackupWriterAzureBlobStorage : public BackupWriterDefault + { + public: + BackupWriterAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); + ~BackupWriterAzureBlobStorage() override; + + bool fileExists(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; + std::unique_ptr writeFile(const String & file_name) override; + + void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) override; + void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) override; + + void removeFile(const String & file_name) override; + void removeFiles(const Strings & file_names) override; + + private: + std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; + void removeFilesBatch(const Strings & file_names); + const DataSourceDescription data_source_description; + std::shared_ptr client; + StorageAzureBlob::Configuration configuration; + std::unique_ptr object_storage; + std::shared_ptr settings; + }; + +} + +#endif diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index bb97335d8fb..9363ca5e7a7 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -492,6 +492,7 @@ void BackupImpl::checkBackupDoesntExist() const else file_name_to_check_existence = ".backup"; + LOG_INFO(&Poco::Logger::get("BackupImpl"), "checkBackupDoesntExist 1"); if (writer->fileExists(file_name_to_check_existence)) throw 
Exception(ErrorCodes::BACKUP_ALREADY_EXISTS, "Backup {} already exists", backup_name_for_logging); @@ -499,6 +500,7 @@ void BackupImpl::checkBackupDoesntExist() const if (!is_internal_backup) { assert(!lock_file_name.empty()); + LOG_INFO(&Poco::Logger::get("BackupImpl"), "checkBackupDoesntExist 2"); if (writer->fileExists(lock_file_name)) throw Exception(ErrorCodes::BACKUP_ALREADY_EXISTS, "Backup {} is being written already", backup_name_for_logging); } @@ -522,6 +524,8 @@ bool BackupImpl::checkLockFile(bool throw_if_failed) const if (throw_if_failed) { + LOG_INFO(&Poco::Logger::get("BackupImpl"), "checkLockFile"); + if (!writer->fileExists(lock_file_name)) { throw Exception( @@ -886,12 +890,12 @@ void BackupImpl::writeFile(const BackupFileInfo & info, BackupEntryPtr entry) } else if (src_disk && from_immutable_file) { - LOG_TRACE(log, "Writing backup for file {} from {} (disk {}): data file #{}", info.data_file_name, src_file_desc, src_disk->getName(), info.data_file_index); + LOG_INFO(log, "Writing backup for file {} from {} (disk {}): data file #{}", info.data_file_name, src_file_desc, src_disk->getName(), info.data_file_index); writer->copyFileFromDisk(info.data_file_name, src_disk, src_file_path, info.encrypted_by_disk, info.base_size, info.size - info.base_size); } else { - LOG_TRACE(log, "Writing backup for file {} from {}: data file #{}", info.data_file_name, src_file_desc, info.data_file_index); + LOG_INFO(log, "Writing backup for file {} from {}: data file #{}", info.data_file_name, src_file_desc, info.data_file_index); auto create_read_buffer = [entry, read_settings = writer->getReadSettings()] { return entry->getReadBuffer(read_settings); }; writer->copyDataToFile(info.data_file_name, create_read_buffer, info.base_size, info.size - info.base_size); } diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp new file mode 100644 index 00000000000..6f7b5f38c28 --- /dev/null +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -0,0 +1,134 @@ +#include "config.h" + +#include +#include + +#if USE_AZURE_BLOB_STORAGE +#include +#include +#include +#include +#include +#include +#include +#endif + + +namespace DB +{ +namespace fs = std::filesystem; + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int SUPPORT_IS_DISABLED; +} + +#if USE_AZURE_BLOB_STORAGE +namespace +{ + String removeFileNameFromURL(String & url) + { + Poco::URI url2{url}; + String path = url2.getPath(); + size_t slash_pos = path.find_last_of('/'); + String file_name = path.substr(slash_pos + 1); + path.resize(slash_pos + 1); + url2.setPath(path); + url = url2.toString(); + return file_name; + } +} +#endif + + +void registerBackupEngineAzureBlobStorage(BackupFactory & factory) +{ + auto creator_fn = []([[maybe_unused]] const BackupFactory::CreateParams & params) -> std::unique_ptr + { +#if USE_AZURE_BLOB_STORAGE + const String & id_arg = params.backup_info.id_arg; + const auto & args = params.backup_info.args; + + LOG_INFO(&Poco::Logger::get("registerBackupEngineAzureBlobStorage"), "Begin id_arg={} args.size={}", id_arg, args.size()); + + StorageAzureBlob::Configuration configuration; + + if (args.size() == 4) + { + configuration.connection_url = args[0].safeGet(); + configuration.is_connection_string = true; + + configuration.container = args[1].safeGet(); + configuration.blob_path = args[2].safeGet(); + configuration.format = args[3].safeGet(); + + LOG_TRACE(&Poco::Logger::get("registerBackupEngineAzureBlobStorage"), 
"configuration.connection_url = {}" + "configuration.container = {}" + "configuration.blob_path = {}" + "configuration.format = {}", + configuration.connection_url, configuration.container, configuration.blob_path, configuration.format); + } + + + BackupImpl::ArchiveParams archive_params; + if (hasRegisteredArchiveFileExtension(configuration.blob_path)) + { + if (params.is_internal_backup) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Using archives with backups on clusters is disabled"); + + archive_params.archive_name = removeFileNameFromURL(configuration.blob_path); + archive_params.compression_method = params.compression_method; + archive_params.compression_level = params.compression_level; + archive_params.password = params.password; + } + else + { + if (!params.password.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Password is not applicable, backup cannot be encrypted"); + } + + + if (params.open_mode == IBackup::OpenMode::READ) + { + auto reader = std::make_shared(configuration, + params.read_settings, + params.write_settings, + params.context); + + return std::make_unique( + params.backup_info, + archive_params, + params.base_backup_info, + reader, + params.context, + /*params.use_same_s3_credentials_for_base_backup*/ false); + } + else + { + auto writer = std::make_shared(configuration, + params.read_settings, + params.write_settings, + params.context); + + return std::make_unique( + params.backup_info, + archive_params, + params.base_backup_info, + writer, + params.context, + params.is_internal_backup, + params.backup_coordination, + params.backup_uuid, + params.deduplicate_files, + /*params.use_same_s3_credentials_for_base_backup*/ false); + } +#else + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "AzureBlobStorage support is disabled"); +#endif + }; + + factory.registerBackupEngine("AzureBlobStorage", creator_fn); +} + +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0257b7d329b..984594a6541 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -87,6 +87,7 @@ add_headers_and_sources(clickhouse_common_io IO) add_headers_and_sources(clickhouse_common_io IO/Archives) add_headers_and_sources(clickhouse_common_io IO/Resource) add_headers_and_sources(clickhouse_common_io IO/S3) +add_headers_and_sources(clickhouse_common_io IO/AzureBlobStorage) list (REMOVE_ITEM clickhouse_common_io_sources Common/malloc.cpp Common/new_delete.cpp) @@ -139,6 +140,7 @@ endif() if (TARGET ch_contrib::azure_sdk) add_headers_and_sources(dbms Disks/ObjectStorages/AzureBlobStorage) + add_headers_and_sources(dbms IO/AzureBlobStorage) endif() if (TARGET ch_contrib::hdfs) @@ -485,6 +487,7 @@ if (TARGET ch_contrib::aws_s3) endif() if (TARGET ch_contrib::azure_sdk) + target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::azure_sdk) dbms_target_link_libraries (PRIVATE ch_contrib::azure_sdk) endif() diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 58e860ebcaf..1655d19986a 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -361,6 +361,10 @@ The server successfully detected this situation and will download merged part fr M(S3PutObject, "Number of S3 API PutObject calls.") \ M(S3GetObject, "Number of S3 API GetObject calls.") \ \ + M(AzureUploadPart, "Number of Azure blob storage API UploadPart calls") \ + M(DiskAzureUploadPart, "Number of Disk Azure blob storage API UploadPart calls") \ + M(AzureCopyObject, "Number of Azure blob storage API CopyObject calls") \ + M(DiskAzureCopyObject, "Number of Disk Azure blob 
storage API CopyObject calls") \ M(AzureDeleteObjects, "Number of Azure blob storage API DeleteObject(s) calls.") \ M(AzureListObjects, "Number of Azure blob storage API ListObjects calls.") \ \ diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp new file mode 100644 index 00000000000..bf0bcac664b --- /dev/null +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -0,0 +1,324 @@ +#include + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event AzureCopyObject; + extern const Event AzureUploadPart; + + extern const Event DiskAzureCopyObject; + extern const Event DiskAzureUploadPart; +} + + +namespace DB +{ + +size_t max_single_operation_copy_size = 256 * 1024 * 1024; + + +namespace +{ + class UploadHelper + { + public: + UploadHelper( + const CreateReadBuffer & create_read_buffer_, + std::shared_ptr client_, + size_t offset_, + size_t total_size_, + const String & dest_bucket_, + const String & dest_key_, + std::shared_ptr settings_, + const std::optional> & object_metadata_, + ThreadPoolCallbackRunner schedule_, + bool for_disk_azure_blob_storage_) + : create_read_buffer(create_read_buffer_) + , client(client_) + , offset (offset_) + , total_size (total_size_) + , dest_bucket(dest_bucket_) + , dest_key(dest_key_) + , settings(settings_) + , object_metadata(object_metadata_) + , schedule(schedule_) + , for_disk_azure_blob_storage(for_disk_azure_blob_storage_) + , log(&Poco::Logger::get("azureBlobStorageUploadHelper")) + , max_single_part_upload_size(settings_.get()->max_single_part_upload_size) + { + } + + ~UploadHelper() {} + + protected: + std::function()> create_read_buffer; + std::shared_ptr client; + size_t offset; + size_t total_size; + const String & dest_bucket; + const String & dest_key; + std::shared_ptr settings; + const std::optional> & object_metadata; + ThreadPoolCallbackRunner schedule; + bool for_disk_azure_blob_storage; + const Poco::Logger * log; + size_t max_single_part_upload_size; + + struct UploadPartTask + { + char *data = nullptr; + size_t size = 0; + std::string block_id; + bool is_finished = false; + std::exception_ptr exception; + + ~UploadPartTask() + { + if (data != nullptr) + free(data); + } + }; + + size_t normal_part_size; + std::vector block_ids; + + std::list TSA_GUARDED_BY(bg_tasks_mutex) bg_tasks; + int num_added_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; + int num_finished_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; + std::mutex bg_tasks_mutex; + std::condition_variable bg_tasks_condvar; + + public: + void performCopy() + { + performMultipartUpload(); + } + + void completeMultipartUpload() + { + auto block_blob_client = client->GetBlockBlobClient(dest_key); + block_blob_client.CommitBlockList(block_ids); + } + + void performMultipartUpload() + { + normal_part_size = 1024; + + size_t position = offset; + size_t end_position = offset + total_size; + + try + { + while (position < end_position) + { + size_t next_position = std::min(position + normal_part_size, end_position); + size_t part_size = next_position - position; /// `part_size` is either `normal_part_size` or smaller if it's the final part. + + uploadPart(position, part_size); + + position = next_position; + } + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + waitForAllBackgroundTasks(); + throw; + } + + waitForAllBackgroundTasks(); + completeMultipartUpload(); + } + + + void uploadPart(size_t part_offset, size_t part_size) + { + LOG_TRACE(log, "Writing part. Bucket: {}, Key: {}, Size: {}", dest_bucket, dest_key, part_size); + + if (!part_size) + { + LOG_TRACE(log, "Skipping writing an empty part."); + return; + } + + if (schedule) + { + UploadPartTask * task = nullptr; + + { + std::lock_guard lock(bg_tasks_mutex); + task = &bg_tasks.emplace_back(); + ++num_added_bg_tasks; + } + + /// Notify waiting thread when task finished + auto task_finish_notify = [this, task]() + { + std::lock_guard lock(bg_tasks_mutex); + task->is_finished = true; + ++num_finished_bg_tasks; + + /// Notification under mutex is important here. + /// Otherwise, WriteBuffer could be destroyed in between + /// Releasing lock and condvar notification. + bg_tasks_condvar.notify_one(); + }; + + try + { + auto read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); + auto buffer = std::make_unique(std::move(read_buffer), part_size); + task->data = new char[part_size]; + task->size = part_size; + buffer->read(task->data,part_size); + task->block_id = getRandomASCIIString(64); + + schedule([this, task, task_finish_notify]() + { + try + { + processUploadTask(*task); + } + catch (...) + { + task->exception = std::current_exception(); + } + task_finish_notify(); + }, Priority{}); + } + catch (...) + { + task_finish_notify(); + throw; + } + } + else + { + UploadPartTask task; + auto read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); + auto buffer = std::make_unique(std::move(read_buffer), part_size); + task.data = new char[part_size]; + buffer->read(task.data,part_size); + task.size = part_size; + processUploadTask(task); + block_ids.emplace_back(task.block_id); + } + } + + void processUploadTask(UploadPartTask & task) + { + auto block_id = processUploadPartRequest(task); + + std::lock_guard lock(bg_tasks_mutex); /// Protect bg_tasks from race + task.block_id = block_id; + LOG_TRACE(log, "Writing part finished. 
Bucket: {}, Key: {}, block_id: {}, Parts: {}", dest_bucket, dest_key, block_id, bg_tasks.size()); + } + + String processUploadPartRequest(UploadPartTask & task) + { + ProfileEvents::increment(ProfileEvents::AzureUploadPart); + if (for_disk_azure_blob_storage) + ProfileEvents::increment(ProfileEvents::DiskAzureUploadPart); + + auto block_blob_client = client->GetBlockBlobClient(dest_key); + task.block_id = getRandomASCIIString(64); + Azure::Core::IO::MemoryBodyStream memory(reinterpret_cast(task.data), task.size); + block_blob_client.StageBlock(task.block_id, memory); + + return task.block_id; + } + + + void waitForAllBackgroundTasks() + { + if (!schedule) + return; + + std::unique_lock lock(bg_tasks_mutex); + /// Suppress warnings because bg_tasks_mutex is actually hold, but tsa annotations do not understand std::unique_lock + bg_tasks_condvar.wait(lock, [this]() {return TSA_SUPPRESS_WARNING_FOR_READ(num_added_bg_tasks) == TSA_SUPPRESS_WARNING_FOR_READ(num_finished_bg_tasks); }); + + auto & tasks = TSA_SUPPRESS_WARNING_FOR_WRITE(bg_tasks); + for (auto & task : tasks) + { + if (task.exception) + std::rethrow_exception(task.exception); + block_ids.emplace_back(task.block_id); + } + } + }; +} + + +void copyDataToAzureBlobStorageFile( + const std::function()> & create_read_buffer, + size_t offset, + size_t size, + std::shared_ptr & dest_client, + const String & dest_bucket, + const String & dest_key, + std::shared_ptr settings, + const std::optional> & object_metadata, + ThreadPoolCallbackRunner schedule, + bool for_disk_azure_blob_storage) +{ + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage}; + helper.performCopy(); +} + + +void copyAzureBlobStorageFile( + std::shared_ptr src_client, + std::shared_ptr dest_client, + const String & src_bucket, + const String & src_key, + size_t offset, + size_t size, + const String & dest_bucket, + const String & dest_key, + std::shared_ptr settings, + const ReadSettings & read_settings, + const std::optional> & object_metadata, + ThreadPoolCallbackRunner schedule, + bool for_disk_azure_blob_storage) +{ + + if (size < max_single_operation_copy_size) + { + ProfileEvents::increment(ProfileEvents::AzureCopyObject); + if (for_disk_azure_blob_storage) + ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); + auto block_blob_client_src = src_client->GetBlockBlobClient(src_key); + auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_key); + auto uri = block_blob_client_src.GetUrl(); + block_blob_client_dest.CopyFromUri(uri); + } + else + { + LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Bucket: {}, Key: {}", src_bucket, src_key); + auto create_read_buffer = [&] + { + return std::make_unique(src_client, src_key, read_settings, settings->max_single_read_retries, + settings->max_single_download_retries); + }; + + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage}; + helper.performCopy(); + } +} + +} + +#endif diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h new file mode 100644 index 00000000000..31228fbcb23 --- /dev/null +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -0,0 +1,58 @@ +#pragma once + +#include "config.h" + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include +#include +#include + + +namespace DB +{ 
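Before the header that follows, it may help to see the overall copy strategy of copyAzureBlobStorageFile above in isolation: blobs smaller than max_single_operation_copy_size (256 MiB) are copied server-side with a single CopyFromUri call, while larger ones are re-uploaded as staged blocks that are committed at the end. The sketch below is only an illustration under those assumptions; the function name, the part size, and the block-id scheme are invented here, and it omits the thread-pool scheduling and random block ids that UploadHelper uses:

```cpp
#include <azure/storage/blobs.hpp>
#include <azure/core/io/body_stream.hpp>
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

using Azure::Storage::Blobs::BlobContainerClient;

/// Illustrative sketch: copy `size` bytes (already available in `data`) from one blob to another.
void copyBlobSketch(
    BlobContainerClient & src, BlobContainerClient & dest,
    const std::string & src_key, const std::string & dest_key,
    const uint8_t * data, size_t size)
{
    auto src_blob = src.GetBlockBlobClient(src_key);
    auto dest_blob = dest.GetBlockBlobClient(dest_key);

    if (size < 256 * 1024 * 1024) /// max_single_operation_copy_size in the patch
    {
        /// Small blob: one server-side copy, no payload passes through the client.
        dest_blob.CopyFromUri(src_blob.GetUrl());
        return;
    }

    /// Large blob: stage fixed-size blocks, then commit the whole block list at once.
    const size_t part_size = 8 * 1024 * 1024; /// illustrative part size
    std::vector<std::string> block_ids;
    for (size_t offset = 0; offset < size; offset += part_size)
    {
        size_t n = std::min(part_size, size - offset);
        char id[17];
        std::snprintf(id, sizeof(id), "%016zu", block_ids.size()); /// fixed-length, base64-safe id
        Azure::Core::IO::MemoryBodyStream stream(data + offset, n);
        dest_blob.StageBlock(id, stream);
        block_ids.emplace_back(id);
    }
    dest_blob.CommitBlockList(block_ids);
}
```

In the patch itself, the large-blob path is driven by UploadHelper, which schedules each StageBlock on the backups IO thread pool and only commits after waitForAllBackgroundTasks has observed every part finish.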
+class SeekableReadBuffer; + +using CreateReadBuffer = std::function()>; + +/// Copies a file from AzureBlobStorage to AzureBlobStorage. +/// The parameters `src_offset` and `src_size` specify a part in the source to copy. +void copyAzureBlobStorageFile( + std::shared_ptr src_client, + std::shared_ptr dest_client, + const String & src_bucket, + const String & src_key, + size_t src_offset, + size_t src_size, + const String & dest_bucket, + const String & dest_key, + std::shared_ptr settings, + const ReadSettings & read_settings, + const std::optional> & object_metadata = std::nullopt, + ThreadPoolCallbackRunner schedule_ = {}, + bool for_disk_azure_blob_storage = false); + + +/// Copies data from any seekable source to AzureBlobStorage. +/// The same functionality can be done by using the function copyData() and the class WriteBufferFromS3 +/// however copyDataToS3File() is faster and spends less memory. +/// The callback `create_read_buffer` can be called from multiple threads in parallel, so that should be thread-safe. +/// The parameters `offset` and `size` specify a part in the source to copy. +void copyDataToAzureBlobStorageFile( + const std::function()> & create_read_buffer, + size_t offset, + size_t size, + std::shared_ptr & client, + const String & dest_bucket, + const String & dest_key, + std::shared_ptr settings, + const std::optional> & object_metadata = std::nullopt, + ThreadPoolCallbackRunner schedule_ = {}, + bool for_disk_azure_blob_storage = false); + +} + +#endif diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 2e0703a8df3..e36604cfb1a 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -258,6 +258,17 @@ AzureObjectStorage::SettingsPtr StorageAzureBlob::createSettings(ContextPtr loca return settings_ptr; } +std::shared_ptr StorageAzureBlob::createSettingsAsSharedPtr(ContextPtr local_context) +{ + const auto & context_settings = local_context->getSettingsRef(); + auto settings_ptr = std::make_shared(); + settings_ptr->max_single_part_upload_size = context_settings.azure_max_single_part_upload_size; + settings_ptr->max_single_read_retries = context_settings.azure_max_single_read_retries; + settings_ptr->list_object_keys_size = static_cast(context_settings.azure_list_object_keys_size); + + return settings_ptr; +} + void registerStorageAzureBlob(StorageFactory & factory) { factory.registerStorage("AzureBlobStorage", [](const StorageFactory::Arguments & args) diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index b97dee0caed..570e4124d73 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -80,6 +80,7 @@ public: static AzureClientPtr createClient(StorageAzureBlob::Configuration configuration, bool is_read_only); static AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); + static std::shared_ptr createSettingsAsSharedPtr(ContextPtr local_context); static void processNamedCollectionResult(StorageAzureBlob::Configuration & configuration, const NamedCollection & collection); diff --git a/tests/integration/test_backup_restore_azure_blob_storage/__init__.py b/tests/integration/test_backup_restore_azure_blob_storage/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml 
b/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml new file mode 100644 index 00000000000..5725dce40cd --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml @@ -0,0 +1,11 @@ + + 1 + 0 + 0.0 + 0 + 1 + 1 + 0 + 16 + 16 + \ No newline at end of file diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml b/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml new file mode 100644 index 00000000000..b74bb1502ce --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml @@ -0,0 +1,13 @@ + + + + + 0 + 0 + 0 + 1000 + 1 + 1 + + + diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml b/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml new file mode 100644 index 00000000000..c12eb2f79f4 --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml @@ -0,0 +1,8 @@ + + + + + default + + + diff --git a/tests/integration/test_backup_restore_azure_blob_storage/test.py b/tests/integration/test_backup_restore_azure_blob_storage/test.py new file mode 100644 index 00000000000..2ecf08a4f40 --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/test.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 + +import gzip +import json +import logging +import os +import io +import random +import threading +import time + +from azure.storage.blob import BlobServiceClient +import helpers.client +import pytest +from helpers.cluster import ClickHouseCluster, ClickHouseInstance +from helpers.network import PartitionManager +from helpers.mock_servers import start_mock_servers +from helpers.test_tools import exec_query_with_retry + + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node", + main_configs=["configs/config.xml"], + user_configs=["configs/disable_profilers.xml", "configs/users.xml"], + with_azurite=True, + ) + cluster.start() + + yield cluster + finally: + cluster.shutdown() + + +def azure_query( + node, query, expect_error="false", try_num=10, settings={}, query_on_retry=None +): + for i in range(try_num): + try: + if expect_error == "true": + return node.query_and_get_error(query, settings=settings) + else: + return node.query(query, settings=settings) + except Exception as ex: + retriable_errors = [ + "DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response", + "DB::Exception: Azure::Core::Http::TransportException: Connection closed before getting full response or response is less than expected", + "DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response", + "DB::Exception: Azure::Core::Http::TransportException: Error while polling for socket ready read", + "Azure::Core::Http::TransportException, e.what() = Connection was closed by the server while trying to read a response", + "Azure::Core::Http::TransportException, e.what() = Connection closed before getting full response or response is less than expected", + "Azure::Core::Http::TransportException, e.what() = Connection was closed by the server while trying to read a response", + "Azure::Core::Http::TransportException, e.what() = Error while polling for socket ready read", + ] + retry = False + for error in retriable_errors: + if error in str(ex): + retry = 
True + print(f"Try num: {i}. Having retriable error: {ex}") + time.sleep(i) + break + if not retry or i == try_num - 1: + raise Exception(ex) + if query_on_retry is not None: + node.query(query_on_retry) + continue + + +def get_azure_file_content(filename, port): + container_name = "cont" + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string( + str(connection_string) + ) + container_client = blob_service_client.get_container_client(container_name) + blob_client = container_client.get_blob_client(filename) + download_stream = blob_client.download_blob() + return download_stream.readall().decode("utf-8") + + +def put_azure_file_content(filename, port, data): + container_name = "cont" + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + try: + container_client = blob_service_client.create_container(container_name) + except: + container_client = blob_service_client.get_container_client(container_name) + + blob_client = container_client.get_blob_client(filename) + buf = io.BytesIO(data) + blob_client.upload_blob(buf) + +@pytest.fixture(autouse=True, scope="function") +def delete_all_files(cluster): + port = cluster.env_variables["AZURITE_PORT"] + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + containers = blob_service_client.list_containers() + for container in containers: + container_client = blob_service_client.get_container_client(container) + blob_list = container_client.list_blobs() + for blob in blob_list: + print(blob) + blob_client = container_client.get_blob_client(blob) + blob_client.delete_blob() + + assert len(list(container_client.list_blobs())) == 0 + + yield + + +def test_create_table_connection_string(cluster): + node = cluster.instances["node"] + azure_query( + node, + f"CREATE TABLE test_create_table_conn_string (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_create_connection_string', 'CSV')", + ) + +def test_backup_restore(cluster): + node = cluster.instances["node"] + port = cluster.env_variables["AZURITE_PORT"] + azure_query( + node, + f"CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_c.csv', 'CSV')", + ) + azure_query(node, f"INSERT INTO test_simple_write_connection_string VALUES (1, 'a')") + print(get_azure_file_content("test_simple_write_c.csv", port)) + assert get_azure_file_content("test_simple_write_c.csv", port) == '1,"a"\n' + + backup_destination = f"AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_c_backup.csv', 'CSV')" + azure_query(node,f"BACKUP TABLE 
test_simple_write_connection_string TO {backup_destination}") + print (get_azure_file_content("test_simple_write_c_backup.csv.backup", port)) + azure_query(node, f"RESTORE TABLE test_simple_write_connection_string AS test_simple_write_connection_string_restored FROM {backup_destination};") + assert(azure_query(node,f"SELECT * from test_simple_write_connection_string_restored") == "1\ta\n") \ No newline at end of file From 05b608cd76da8995086887f812e1ab3fceb99551 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 20 Nov 2023 10:12:45 +0000 Subject: [PATCH 0019/1081] Automatic style fix --- .../test.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_backup_restore_azure_blob_storage/test.py b/tests/integration/test_backup_restore_azure_blob_storage/test.py index 2ecf08a4f40..cda3cab07e4 100644 --- a/tests/integration/test_backup_restore_azure_blob_storage/test.py +++ b/tests/integration/test_backup_restore_azure_blob_storage/test.py @@ -18,7 +18,6 @@ from helpers.mock_servers import start_mock_servers from helpers.test_tools import exec_query_with_retry - @pytest.fixture(scope="module") def cluster(): try: @@ -103,6 +102,7 @@ def put_azure_file_content(filename, port, data): buf = io.BytesIO(data) blob_client.upload_blob(buf) + @pytest.fixture(autouse=True, scope="function") def delete_all_files(cluster): port = cluster.env_variables["AZURITE_PORT"] @@ -133,6 +133,7 @@ def test_create_table_connection_string(cluster): f"CREATE TABLE test_create_table_conn_string (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_create_connection_string', 'CSV')", ) + def test_backup_restore(cluster): node = cluster.instances["node"] port = cluster.env_variables["AZURITE_PORT"] @@ -140,12 +141,23 @@ def test_backup_restore(cluster): node, f"CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_c.csv', 'CSV')", ) - azure_query(node, f"INSERT INTO test_simple_write_connection_string VALUES (1, 'a')") + azure_query( + node, f"INSERT INTO test_simple_write_connection_string VALUES (1, 'a')" + ) print(get_azure_file_content("test_simple_write_c.csv", port)) assert get_azure_file_content("test_simple_write_c.csv", port) == '1,"a"\n' backup_destination = f"AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_c_backup.csv', 'CSV')" - azure_query(node,f"BACKUP TABLE test_simple_write_connection_string TO {backup_destination}") - print (get_azure_file_content("test_simple_write_c_backup.csv.backup", port)) - azure_query(node, f"RESTORE TABLE test_simple_write_connection_string AS test_simple_write_connection_string_restored FROM {backup_destination};") - assert(azure_query(node,f"SELECT * from test_simple_write_connection_string_restored") == "1\ta\n") \ No newline at end of file + azure_query( + node, + f"BACKUP TABLE test_simple_write_connection_string TO {backup_destination}", + ) + print(get_azure_file_content("test_simple_write_c_backup.csv.backup", port)) + azure_query( + node, + f"RESTORE TABLE test_simple_write_connection_string AS test_simple_write_connection_string_restored FROM {backup_destination};", + ) + assert ( + azure_query(node, f"SELECT * from test_simple_write_connection_string_restored") + == "1\ta\n" + ) From 6dfb1c25ec6a4a61a4fe329191c10263eb19ad07 Mon Sep 17 00:00:00 2001 From: 
Smita Kulkarni Date: Mon, 20 Nov 2023 11:37:06 +0100 Subject: [PATCH 0020/1081] Added docs --- docs/en/operations/backup.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index 6068b185ede..15d953249a0 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -451,3 +451,24 @@ To disallow concurrent backup/restore, you can use these settings respectively. The default value for both is true, so by default concurrent backup/restores are allowed. When these settings are false on a cluster, only 1 backup/restore is allowed to run on a cluster at a time. + +## Configuring BACKUP/RESTORE to use an AzureBlobStorage Endpoint + +To write backups to an AzureBlobStorage container you need the following pieces of information: +- AzureBlobStorage endpoint connection string / url, +- Container, +- Path, +- Account name (if url is specified) +- Account Key (if url is specified) + +The destination for a backup will be specified like this: +``` +AzureBlobStorage('/', '', '', '', ') +``` + +```sql +BACKUP TABLE data TO AzureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', + 'test_container', 'data_backup'); +RESTORE TABLE data AS data_restored FROM AzureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', + 'test_container', 'data_backup'); +``` From 96c4b6bc35ee818afd2d2963dec7afdb5583969c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 20 Nov 2023 14:41:14 +0100 Subject: [PATCH 0021/1081] Updated to not analyze create parameterized view for analyzer & old analyzer --- src/Interpreters/InterpreterCreateQuery.cpp | 48 +++++-------------- src/Storages/StorageView.cpp | 3 +- .../0_stateless/02428_parameterized_view.sh | 2 +- 3 files changed, 14 insertions(+), 39 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 747c0be009e..4ee666e2a9a 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -649,6 +649,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (!attach && !is_restore_from_backup && context_->getSettingsRef().flatten_nested) res.flattenNested(); + if (res.getAllPhysical().empty()) throw Exception(ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED, "Cannot CREATE table without physical columns"); @@ -755,49 +756,22 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti { Block as_select_sample; - if (getContext()->getSettingsRef().allow_experimental_analyzer) + if (!create.isParameterizedView()) { - if (create.isParameterizedView()) - { - auto select = create.select->clone(); - - ///Get all query parameters - const auto parameters = analyzeReceiveQueryParamsWithType(select); - NameToNameMap parameter_values; - - for (const auto & parameter : parameters) - { - const auto data_type = DataTypeFactory::instance().get(parameter.second); - /// Todo improve getting default values & include more datatypes - if (data_type->isValueRepresentedByNumber() || parameter.second == "String") - parameter_values[parameter.first] = "1"; - else if (parameter.second.starts_with("Array") || 
parameter.second.starts_with("Map")) - parameter_values[parameter.first] = "[]"; - else - parameter_values[parameter.first] = " "; - } - - /// Replace with default parameters - ReplaceQueryParameterVisitor visitor(parameter_values); - visitor.visit(select); - - as_select_sample = InterpreterSelectQueryAnalyzer::getSampleBlock(select, getContext()); - } - else + if (getContext()->getSettingsRef().allow_experimental_analyzer) { as_select_sample = InterpreterSelectQueryAnalyzer::getSampleBlock(create.select->clone(), getContext()); } + else + { + as_select_sample = InterpreterSelectWithUnionQuery::getSampleBlock(create.select->clone(), + getContext(), + false /* is_subquery */, + create.isParameterizedView()); + } + properties.columns = ColumnsDescription(as_select_sample.getNamesAndTypesList()); } - else - { - as_select_sample = InterpreterSelectWithUnionQuery::getSampleBlock(create.select->clone(), - getContext(), - false /* is_subquery */, - create.isParameterizedView()); - } - - properties.columns = ColumnsDescription(as_select_sample.getNamesAndTypesList()); } else if (create.as_table_function) { diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index f0f9b9540de..2f7267e3701 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -112,7 +112,8 @@ StorageView::StorageView( : IStorage(table_id_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (is_parameterized_view_ && !query.isParameterizedView()) + storage_metadata.setColumns(columns_); storage_metadata.setComment(comment); if (!query.select) diff --git a/tests/queries/0_stateless/02428_parameterized_view.sh b/tests/queries/0_stateless/02428_parameterized_view.sh index ad9c672f4c5..499b8697ffc 100755 --- a/tests/queries/0_stateless/02428_parameterized_view.sh +++ b/tests/queries/0_stateless/02428_parameterized_view.sh @@ -37,7 +37,7 @@ $CLICKHOUSE_CLIENT -q "CREATE VIEW test_02428_pv1 AS SELECT * FROM test_02428_Ca $CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1(price=20)" $CLICKHOUSE_CLIENT -q "SELECT Price FROM \`test_02428_pv1\`(price=20)" -$CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1" 2>&1 | grep -Fq "UNKNOWN_QUERY_PARAMETER" && echo 'ERROR' || echo 'OK' +$CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1" 2>&1 | grep -q "UNKNOWN_QUERY_PARAMETER\|UNKNOWN_IDENTIFIER" && echo 'ERROR' || echo 'OK' $CLICKHOUSE_CLIENT --param_p 10 -q "SELECT Price FROM test_02428_pv1(price={p:UInt64})" $CLICKHOUSE_CLIENT --param_l 1 -q "SELECT Price FROM test_02428_pv1(price=50) LIMIT ({l:UInt64})" From 42b2fe9adcf4596e8e36231068911c5dbdc4948f Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 21 Nov 2023 13:21:35 +0100 Subject: [PATCH 0022/1081] Fxi --- src/Common/ErrorCodes.cpp | 1 + .../ReplicatedMergeTreePartCheckThread.cpp | 10 +- src/Storages/MergeTree/checkDataPart.cpp | 17 +++- .../02916_broken_projection.reference | 93 ++++++++++--------- .../0_stateless/02916_broken_projection.sh | 39 ++++---- 5 files changed, 94 insertions(+), 66 deletions(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 28f8e6c6021..9c3aab5ad01 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -586,6 +586,7 @@ M(704, CANNOT_USE_QUERY_CACHE_WITH_NONDETERMINISTIC_FUNCTIONS) \ M(705, TABLE_NOT_EMPTY) \ M(706, LIBSSH_ERROR) \ + M(707, BROKEN_PROJECTION) \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ M(1001, STD_EXCEPTION) \ diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp 
b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index 4468cf8e3bf..ba4d4869025 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -63,7 +63,7 @@ void ReplicatedMergeTreePartCheckThread::enqueuePart(const String & name, time_t if (parts_set.contains(name)) return; - LOG_TRACE(log, "Enqueueing {} for check after after {}s", name, delay_to_check_seconds); + LOG_TRACE(log, "Enqueueing {} for check after {}s", name, delay_to_check_seconds); parts_queue.emplace_back(name, std::chrono::steady_clock::now() + std::chrono::seconds(delay_to_check_seconds)); parts_set.insert(name); task->schedule(); @@ -385,17 +385,19 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St if (isRetryableException(std::current_exception())) throw; - tryLogCurrentException(log, __PRETTY_FUNCTION__); - PreformattedMessage message; if (is_broken_projection) { - message = PreformattedMessage::create("Part {} has a broken projection. It will be ignored.", part_name); + message = PreformattedMessage::create( + "Part {} has a broken projections. It will be ignored. Broken projections info: \n{}", + part_name, getCurrentExceptionMessage(false)); LOG_DEBUG(log, message); result.action = ReplicatedCheckResult::DoNothing; } else { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + message = PreformattedMessage::create("Part {} looks broken. Removing it and will try to fetch.", part_name); LOG_ERROR(log, message); result.action = ReplicatedCheckResult::TryFetchMissing; diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index 8feabf344b5..3bb6f763c8b 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -42,6 +42,7 @@ namespace ErrorCodes extern const int NO_FILE_IN_DATA_PART; extern const int NETWORK_ERROR; extern const int SOCKET_TIMEOUT; + extern const int BROKEN_PROJECTION; } @@ -272,6 +273,7 @@ static IMergeTreeDataPart::Checksums checkDataPart( } } + std::string broken_projections_message; for (const auto & [name, projection] : data_part->getProjectionParts()) { if (is_cancelled()) @@ -307,7 +309,15 @@ static IMergeTreeDataPart::Checksums checkDataPart( is_broken_projection = true; if (throw_on_broken_projection) - throw; + { + if (!broken_projections_message.empty()) + broken_projections_message += "\n"; + + broken_projections_message += fmt::format( + "Part {} has a broken projection {} (error: {})", + data_part->name, name, getCurrentExceptionMessage(false)); + continue; + } projections_on_disk.erase(projection_file); checksums_txt.remove(projection_file); @@ -320,6 +330,11 @@ static IMergeTreeDataPart::Checksums checkDataPart( projections_on_disk.erase(projection_file); } + if (throw_on_broken_projection && !broken_projections_message.empty()) + { + throw Exception(ErrorCodes::BROKEN_PROJECTION, broken_projections_message.data()); + } + if (require_checksums && !projections_on_disk.empty()) { throw Exception(ErrorCodes::UNEXPECTED_FILE_IN_DATA_PART, diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index 62966036eed..aee18a21fb8 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -7,42 +7,40 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] 
-select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 1 -0 broke metadata of part 'proj' (parent part: all_2_2_0) system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 0 broken projections info all_2_2_0 proj FILE_DOESNT_EXIST check table full -all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. -0 +all_2_2_0 0 Part all_2_2_0 has a broken projections. It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_2_2_0 has a broken projection proj (error: Code: 107. DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_2_2_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1)). 
(BROKEN_PROJECTION) (version 23.11.1.1) broke data of part 'proj_2' (parent part: all_2_2_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -51,13 +49,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: proj_2 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' -FILE_DOESNT_EXIST +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: proj_2 check table 0 broken projections info @@ -68,19 +65,18 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 0 -0 broke data of part 'proj_2' (parent part: all_3_3_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -88,6 +84,7 @@ all_2_2_0 proj_2 NO_FILE_IN_DATA_PART insert new part insert new part optimize +OPTIMIZE TABLE test SETTINGS alter_sync=0 0 broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -101,19 +98,18 @@ all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 0 -0 broke metadata of part 'proj' (parent part: all_1_1_0) Detach - Attach broken projections info @@ -121,7 +117,6 @@ all_1_1_0 proj NO_FILE_IN_DATA_PART all_2_2_0 proj NO_FILE_IN_DATA_PART all_2_2_0 proj_2 FILE_DOESNT_EXIST all_3_3_0 proj_2 FILE_DOESNT_EXIST -0 broke data of part 'proj_2' (parent part: all_1_1_0) Detach - Attach broken projections info @@ -138,21 +133,21 @@ all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 0 check table full -all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. -all_1_1_0 0 Part all_1_1_0 has a broken projection. It will be ignored. 
+all_2_2_0 0 Part all_2_2_0 has a broken projections. It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_2_2_0 has a broken projection proj (error: Code: 107. DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_2_2_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1))\nPart all_2_2_0 has a broken projection proj_2 (error: Code: 226. DB::Exception: There is no file for column \'c\' in data part \'proj_2\'. (NO_FILE_IN_DATA_PART) (version 23.11.1.1)). (BROKEN_PROJECTION) (version 23.11.1.1) +all_1_1_0 0 Part all_1_1_0 has a broken projections. It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_1_1_0 has a broken projection proj (error: Code: 107. DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_1_1_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1))\nPart all_1_1_0 has a broken projection proj_2 (error: Code: 226. DB::Exception: There is no file for column \'c\' in data part \'proj_2\'. (NO_FILE_IN_DATA_PART) (version 23.11.1.1)). (BROKEN_PROJECTION) (version 23.11.1.1) materialize projection proj check table full system.parts @@ -167,45 +162,55 @@ all_3_5_1 0 ['proj'] all_3_5_1_6 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 1 materialize projection proj_2 check table full +broke data of part 'proj' (parent part: all_3_5_1_7) +insert new part +optimize +OPTIMIZE TABLE test FINAL +insert new part +optimize +OPTIMIZE TABLE test FINAL system.parts all_0_0_0 0 ['proj','proj_2'] all_0_0_0_6 0 ['proj','proj_2'] -all_0_0_0_7 1 ['proj','proj_2'] +all_0_0_0_7 0 ['proj','proj_2'] +all_0_8_2_7 1 ['proj_2'] all_1_1_0 0 ['proj','proj_2'] all_1_1_0_6 0 ['proj','proj_2'] -all_1_1_0_7 1 ['proj','proj_2'] +all_1_1_0_7 0 ['proj','proj_2'] all_2_2_0 0 ['proj','proj_2'] all_2_2_0_6 0 ['proj','proj_2'] -all_2_2_0_7 1 ['proj','proj_2'] +all_2_2_0_7 0 ['proj','proj_2'] all_3_3_0 0 ['proj','proj_2'] all_3_5_1 0 ['proj'] all_3_5_1_6 0 ['proj'] -all_3_5_1_7 1 ['proj','proj_2'] +all_3_5_1_7 0 ['proj','proj_2'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj' +all_8_8_0 0 ['proj','proj_2'] +all_9_9_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj_2'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 1 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 
bf0ec61fd76..bf382624787 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -24,7 +24,7 @@ CREATE TABLE test SELECT d ORDER BY c ) ) -ENGINE = ReplicatedMergeTree('/test3/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) +ENGINE = ReplicatedMergeTree('/test4/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) SETTINGS min_bytes_for_wide_part = 0, max_parts_to_merge_at_once=3, enable_vertical_merge_algorithm=1, @@ -65,18 +65,13 @@ function break_projection() LIMIT 1; ") - path=$($CLICKHOUSE_CLIENT -q "SELECT path FROM system.disks WHERE name='$disk_name'") - - # make sure path is absolute - $CLICKHOUSE_CLIENT -q "select throwIf(substring('$path', 1, 1) != '/', 'Path is relative: $path')" || exit - if [ "$break_type" = "data" ] then - rm "$path/$part_path/d.bin" - rm "$path/$part_path/c.bin" + rm "$part_path/d.bin" + rm "$part_path/c.bin" echo "broke data of part '$part_name' (parent part: $parent_name)" else - rm "$path/$part_path/columns.txt" + rm "$part_path/columns.txt" echo "broke metadata of part '$part_name' (parent part: $parent_name)" fi } @@ -115,12 +110,12 @@ function check() WHERE table='test' AND database=currentDatabase() ORDER BY name;" - echo "select from projection 'proj'" + echo "select from projection 'proj', expect error: $expect_broken_part" query_id=$(random 8) if [ "$expect_broken_part" = "proj" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 ORDER BY c;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --send_logs_level 'fatal' --query_id $query_id -q "SELECT c FROM test WHERE d == 12 ORDER BY c;" 2>&1 | grep -o $expected_error else $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c;" echo 'used projections' @@ -130,12 +125,12 @@ function check() " fi - echo "select from projection 'proj_2'" + echo "select from projection 'proj_2', expect error: $expect_broken_part" query_id=$(random 8) if [ "$expect_broken_part" = "proj_2" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 ORDER BY d;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --send_logs_level 'fatal' --query_id $query_id -q "SELECT d FROM test WHERE c == 12 ORDER BY d;" 2>&1 | grep -o $expected_error else $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d;" echo 'used projections' @@ -146,7 +141,9 @@ function check() fi echo 'check table' - $CLICKHOUSE_CLIENT -q "CHECK TABLE test" + $CLICKHOUSE_CLIENT -nm -q " + SET send_logs_level='fatal'; + CHECK TABLE test;" } function optimize() @@ -159,16 +156,21 @@ function optimize() if [ $final -eq 1 ]; then query="$query FINAL" + fi if [ $no_wait -eq 1 ]; then query="$query SETTINGS alter_sync=0" + fi - $CLICKHOUSE_CLIENT -nm -q $query + echo $query + + $CLICKHOUSE_CLIENT -q "$query" } function reattach() { echo 'Detach - Attach' $CLICKHOUSE_CLIENT -nm -q " + SET send_logs_level='fatal'; DETACH TABLE test; ATTACH TABLE test; " @@ -184,7 +186,10 @@ function materialize_projection function check_table_full() { echo 'check table full' - $CLICKHOUSE_CLIENT -q "CHECK TABLE test SETTINGS check_query_single_value_result = 0" | grep "broken" + $CLICKHOUSE_CLIENT -nm -q " + SET send_logs_level='fatal'; + CHECK TABLE test SETTINGS 
check_query_single_value_result = 0; +" | grep "broken" } @@ -300,5 +305,5 @@ optimize 1 0 check $CLICKHOUSE_CLIENT -nm -q " -DROP TABLE test; +DROP TABLE test SYNC; " From bcc87c01771414806fca705b5c9b5e0e925dea5f Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 21 Nov 2023 17:17:36 +0100 Subject: [PATCH 0023/1081] Better test --- .../ReplicatedMergeTreePartCheckThread.cpp | 3 +- src/Storages/System/StorageSystemDisks.cpp | 2 +- .../02916_broken_projection.reference | 43 +++++++++---------- .../0_stateless/02916_broken_projection.sh | 27 ++++++++---- 4 files changed, 42 insertions(+), 33 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index ba4d4869025..d058113e134 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -388,8 +388,9 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St PreformattedMessage message; if (is_broken_projection) { + WriteBufferFromOwnString wb; message = PreformattedMessage::create( - "Part {} has a broken projections. It will be ignored. Broken projections info: \n{}", + "Part {} has a broken projections. It will be ignored. Broken projections info: {}", part_name, getCurrentExceptionMessage(false)); LOG_DEBUG(log, message); result.action = ReplicatedCheckResult::DoNothing; diff --git a/src/Storages/System/StorageSystemDisks.cpp b/src/Storages/System/StorageSystemDisks.cpp index 250fcdba641..23a00cc7ae5 100644 --- a/src/Storages/System/StorageSystemDisks.cpp +++ b/src/Storages/System/StorageSystemDisks.cpp @@ -63,7 +63,7 @@ Pipe StorageSystemDisks::read( for (const auto & [disk_name, disk_ptr] : context->getDisksMap()) { col_name->insert(disk_name); - col_path->insert(fs::absolute(disk_ptr->getPath()).string()); + col_path->insert(disk_ptr->getPath()); col_free->insert(disk_ptr->getAvailableSpace().value_or(std::numeric_limits::max())); col_total->insert(disk_ptr->getTotalSpace().value_or(std::numeric_limits::max())); col_unreserved->insert(disk_ptr->getUnreservedSpace().value_or(std::numeric_limits::max())); diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index aee18a21fb8..1b84ca96840 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -11,12 +11,12 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 broke metadata of part 'proj' (parent part: all_2_2_0) @@ -29,18 +29,18 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 broken projections info all_2_2_0 proj FILE_DOESNT_EXIST -check table full -all_2_2_0 0 Part 
all_2_2_0 has a broken projections. It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_2_2_0 has a broken projection proj (error: Code: 107. DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_2_2_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1)). (BROKEN_PROJECTION) (version 23.11.1.1) +check table full (all_2_2_0) +all_2_2_0 broke data of part 'proj_2' (parent part: all_2_2_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -53,7 +53,7 @@ select from projection 'proj', expect error: proj_2 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: proj_2 check table 0 @@ -69,12 +69,12 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 broke data of part 'proj_2' (parent part: all_3_3_0) @@ -102,12 +102,12 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 broke metadata of part 'proj' (parent part: all_1_1_0) @@ -137,19 +137,18 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 -check table full -all_2_2_0 0 Part all_2_2_0 has a broken projections. It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_2_2_0 has a broken projection proj (error: Code: 107. DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_2_2_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1))\nPart all_2_2_0 has a broken projection proj_2 (error: Code: 226. DB::Exception: There is no file for column \'c\' in data part \'proj_2\'. (NO_FILE_IN_DATA_PART) (version 23.11.1.1)). (BROKEN_PROJECTION) (version 23.11.1.1) -all_1_1_0 0 Part all_1_1_0 has a broken projections. It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_1_1_0 has a broken projection proj (error: Code: 107. DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_1_1_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1))\nPart all_1_1_0 has a broken projection proj_2 (error: Code: 226. 
DB::Exception: There is no file for column \'c\' in data part \'proj_2\'. (NO_FILE_IN_DATA_PART) (version 23.11.1.1)). (BROKEN_PROJECTION) (version 23.11.1.1) +check table full (all_1_1_0) +all_1_1_0 materialize projection proj -check table full +check table full () system.parts all_0_0_0 0 ['proj','proj_2'] all_0_0_0_6 1 ['proj','proj_2'] @@ -166,16 +165,16 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 materialize projection proj_2 -check table full +check table full () broke data of part 'proj' (parent part: all_3_5_1_7) insert new part optimize @@ -206,11 +205,11 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj_2'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj_2 select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index bf382624787..a522de42c89 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -121,7 +121,7 @@ function check() echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; - SELECT query, projections FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' + SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi @@ -136,7 +136,7 @@ function check() echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; - SELECT query, projections FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' + SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi @@ -185,11 +185,20 @@ function materialize_projection function check_table_full() { - echo 'check table full' - $CLICKHOUSE_CLIENT -nm -q " - SET send_logs_level='fatal'; - CHECK TABLE test SETTINGS check_query_single_value_result = 0; -" | grep "broken" + echo "check table full ($1)" + expect_broken_part=$1 + if [ "$expect_broken_part" = "" ] + then + $CLICKHOUSE_CLIENT -nm -q " + SET send_logs_level='fatal'; + CHECK TABLE test SETTINGS check_query_single_value_result = 0; + " | grep "broken" + else + $CLICKHOUSE_CLIENT -nm -q " + SET send_logs_level='fatal'; + CHECK TABLE test SETTINGS check_query_single_value_result = 0; + " | grep "broken" | grep -o $expect_broken_part | head -n 1 + fi } @@ -216,7 +225,7 @@ check broken_projections_info # Check table query will also show a list of parts which have broken projections. 
-check_table_full +check_table_full "all_2_2_0" # Break data file of projection 'proj_2' for part all_2_2_0 break_projection proj_2 all_2_2_0 data @@ -280,7 +289,7 @@ broken_projections_info check -check_table_full +check_table_full all_1_1_0 materialize_projection proj From e8d99cb29654645c5a89d6cb15856b48a55d7bdf Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 22 Nov 2023 12:34:31 +0100 Subject: [PATCH 0024/1081] Fix style check --- tests/queries/0_stateless/02916_broken_projection.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index a522de42c89..6ed92e2e06e 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -54,8 +54,8 @@ function break_projection() parent_name=$2 break_type=$3 - read -r disk_name part_path <<< $($CLICKHOUSE_CLIENT -nm -q " - SELECT disk_name, path + read -r part_path <<< $($CLICKHOUSE_CLIENT -nm -q " + SELECT path FROM system.projection_parts WHERE table='test' AND database=currentDatabase() From a57e612cf2ef657801cdeefb8410caf5cab804a2 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 24 Nov 2023 16:08:49 +0100 Subject: [PATCH 0025/1081] Fxi tests --- src/Storages/StorageMergeTree.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 74277616e95..84b48bb650b 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -2247,7 +2247,7 @@ std::optional StorageMergeTree::checkDataNext(DataValidationTasksPt { try { - auto calculated_checksums = checkDataPart(part, false, noop, /* is_cancelled */{}, /* throw_on_broken_projection */true); + auto calculated_checksums = checkDataPart(part, false, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */true); calculated_checksums.checkEqual(part->checksums, true); auto & part_mutable = const_cast(*part); @@ -2268,7 +2268,7 @@ std::optional StorageMergeTree::checkDataNext(DataValidationTasksPt { try { - checkDataPart(part, true, noop, /* is_cancelled */{}, /* throw_on_broken_projection */true); + checkDataPart(part, true, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */true); return CheckResult(part->name, true, ""); } catch (...) 
From 8ebbc8d85dc3f1e37d109ddb1ad1a05a55283a79 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 24 Nov 2023 18:37:40 +0100 Subject: [PATCH 0026/1081] Update 02117_show_create_table_system.reference --- .../0_stateless/02117_show_create_table_system.reference | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 9ed905a0df8..e122de8ef6c 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -685,6 +685,9 @@ CREATE TABLE system.projection_parts `rows_where_ttl_info.expression` Array(String), `rows_where_ttl_info.min` Array(DateTime), `rows_where_ttl_info.max` Array(DateTime), + `is_broken` UInt8, + `exception_code` Int32, + `exception` String, `bytes` UInt64 ALIAS bytes_on_disk, `marks_size` UInt64 ALIAS marks_bytes, `part_name` String ALIAS name From b4dab194954845b76d1ce9a6bf8b18dded059d74 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 27 Nov 2023 12:42:09 +0100 Subject: [PATCH 0027/1081] Fix test --- .../0_stateless/02916_broken_projection.reference | 1 + tests/queries/0_stateless/02916_broken_projection.sh | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index 1b84ca96840..1f072e207a7 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -55,6 +55,7 @@ select from projection 'proj', expect error: proj_2 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: proj_2 +FILE_DOESNT_EXIST check table 0 broken projections info diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 6ed92e2e06e..80805330577 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -115,7 +115,10 @@ function check() if [ "$expect_broken_part" = "proj" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --send_logs_level 'fatal' --query_id $query_id -q "SELECT c FROM test WHERE d == 12 ORDER BY c;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " +SET send_logs_level='fatal'; +SELECT c FROM test WHERE d == 12 ORDER BY c; +" 2>&1 | grep -oF "$expected_error" else $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c;" echo 'used projections' @@ -130,7 +133,10 @@ function check() if [ "$expect_broken_part" = "proj_2" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --send_logs_level 'fatal' --query_id $query_id -q "SELECT d FROM test WHERE c == 12 ORDER BY d;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " +SET send_logs_level='fatal'; +SELECT d FROM test WHERE c == 12 ORDER BY d; +" 2>&1 | grep -oF "$expected_error" else $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d;" echo 'used projections' @@ -234,7 +240,7 @@ break_projection proj_2 all_2_2_0 data broken_projections_info # Select now fails with error "File doesn't exist" -check "proj_2" 
"FILE_DOESNT_EXIST" +check "proj_2" FILE_DOESNT_EXIST # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. broken_projections_info From a6972e7c90fd8ff775855cac13f47f9cd46b2da1 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 28 Nov 2023 10:22:10 +0100 Subject: [PATCH 0028/1081] Fxi --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 85ce112d9a1..be665a64f1c 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1145,7 +1145,7 @@ void IMergeTreeDataPart::loadChecksums(bool require) LOG_WARNING(storage.log, "Checksums for part {} not found. Will calculate them from data on disk.", name); bool noop; - checksums = checkDataPart(shared_from_this(), false, noop, /* is_cancelled */{}, /* throw_on_broken_projection */false); + checksums = checkDataPart(shared_from_this(), false, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */false); writeChecksums(checksums, {}); bytes_on_disk = checksums.getTotalSizeOnDisk(); From 0e11eeaea546dd41231a4f180b877ada1291a23d Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 30 Nov 2023 13:52:08 +0100 Subject: [PATCH 0029/1081] Allow to backup and restore parts with broken projections --- src/Backups/BackupSettings.cpp | 2 + src/Backups/BackupSettings.h | 6 + .../MergeTree/DataPartStorageOnDiskBase.cpp | 35 +- .../MergeTree/DataPartStorageOnDiskBase.h | 4 +- src/Storages/MergeTree/IDataPartStorage.h | 4 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 15 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 3 +- src/Storages/MergeTree/MergeTreeData.cpp | 9 +- .../02916_broken_projection.reference | 226 +++++++++- .../0_stateless/02916_broken_projection.sh | 426 +++++++++++++----- 10 files changed, 588 insertions(+), 142 deletions(-) diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index 68d825e9468..51d713f03e1 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -32,6 +32,8 @@ namespace ErrorCodes M(UInt64, shard_num) \ M(UInt64, replica_num) \ M(Bool, check_parts) \ + M(Bool, check_projection_parts) \ + M(Bool, allow_backup_broken_projections) \ M(Bool, internal) \ M(String, host_id) \ M(OptionalUUID, backup_uuid) diff --git a/src/Backups/BackupSettings.h b/src/Backups/BackupSettings.h index f26b992b348..ec430905f51 100644 --- a/src/Backups/BackupSettings.h +++ b/src/Backups/BackupSettings.h @@ -62,6 +62,12 @@ struct BackupSettings /// Check checksums of the data parts before writing them to a backup. bool check_parts = true; + /// Check checksums of the projection data parts before writing them to a backup. + bool check_projection_parts = true; + + /// Allow to create backup with broken projections. + bool allow_backup_broken_projections = false; + /// Internal, should not be specified by user. /// Whether this backup is a part of a distributed backup created by BACKUP ON CLUSTER. 
bool internal = false; diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index 7fc8187aee5..6e5cbdde355 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -334,7 +334,9 @@ void DataPartStorageOnDiskBase::backup( const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const + TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const { fs::path part_path_on_disk = fs::path{root_path} / part_dir; fs::path part_path_in_backup = fs::path{path_in_backup} / part_dir; @@ -376,7 +378,7 @@ void DataPartStorageOnDiskBase::backup( bool copy_encrypted = !backup_settings.decrypt_files_from_encrypted_disks; - for (const auto & filepath : files_to_backup) + auto backup_file = [&](const String & filepath) { auto filepath_on_disk = part_path_on_disk / filepath; auto filepath_in_backup = part_path_in_backup / filepath; @@ -384,8 +386,10 @@ void DataPartStorageOnDiskBase::backup( if (files_without_checksums.contains(filepath)) { backup_entries.emplace_back(filepath_in_backup, std::make_unique(disk, filepath_on_disk, read_settings, copy_encrypted)); - continue; + return; } + else if (is_projection_part && allow_backup_broken_projection && !disk->exists(filepath_on_disk)) + return; if (make_temporary_hard_links) { @@ -410,6 +414,31 @@ void DataPartStorageOnDiskBase::backup( backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries.emplace_back(filepath_in_backup, std::move(backup_entry)); + }; + + auto * log = &Poco::Logger::get("DataPartStorageOnDiskBase::backup"); + + for (const auto & filepath : files_to_backup) + { + if (is_projection_part && allow_backup_broken_projection) + { + try + { + backup_file(filepath); + } + catch (Exception & e) + { + if (e.code() != ErrorCodes::FILE_DOESNT_EXIST) + throw; + + LOG_ERROR(log, "Cannot backup file {} of projection part {}. Will try to ignore it", filepath, part_dir); + continue; + } + } + else + { + backup_file(filepath); + } } } diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h index 1826e84c28d..6176a13c27b 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h @@ -58,7 +58,9 @@ public: const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const override; + TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const override; MutableDataPartStoragePtr freeze( const std::string & to, diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 072cb29626e..b3a6ab203d5 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -222,7 +222,9 @@ public: const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const = 0; + TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const = 0; /// Creates hardlinks into 'to/dir_path' for every file in data part. /// Callback is called after hardlinks are created, but before 'delete-on-destroy.txt' marker is removed. 
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index be665a64f1c..940b3991067 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -647,13 +647,14 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks loadIndex(); /// Must be called after loadIndexGranularity as it uses the value of `index_granularity` loadRowsCount(); /// Must be called after loadIndexGranularity() as it uses the value of `index_granularity`. loadPartitionAndMinMaxIndex(); + bool has_broken_projections = false; if (!parent_part) { loadTTLInfos(); - loadProjections(require_columns_checksums, check_consistency, false /* if_not_loaded */); + has_broken_projections = !loadProjections(require_columns_checksums, check_consistency, false /* if_not_loaded */); } - if (check_consistency) + if (check_consistency && !has_broken_projections) checkConsistency(require_columns_checksums); loadDefaultCompressionCodec(); @@ -715,9 +716,10 @@ void IMergeTreeDataPart::addProjectionPart( projection_parts[projection_name] = std::move(projection_part); } -void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded) +bool IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded) { auto metadata_snapshot = storage.getInMemoryMetadataPtr(); + bool has_broken_projection = false; for (const auto & projection : metadata_snapshot->projections) { auto path = projection.name + ".proj"; @@ -742,16 +744,19 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch if (isRetryableException(std::current_exception())) throw; + auto message = getCurrentExceptionMessage(true); LOG_ERROR(&Poco::Logger::get("IMergeTreeDataPart"), - "Cannot load projection {}, will consider it broken", projection.name); + "Cannot load projection {}, will consider it broken. Reason: {}", projection.name, message); - part->setBrokenReason(getCurrentExceptionMessage(false), getCurrentExceptionCode()); + has_broken_projection = true; + part->setBrokenReason(message, getCurrentExceptionCode()); } addProjectionPart(projection.name, std::move(part)); } } } + return has_broken_projection; } void IMergeTreeDataPart::loadIndexGranularity() diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 9af2c16f1e8..6e276284f4c 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -421,7 +421,8 @@ public: bool hasBrokenProjection(const String & projection_name) const; - void loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false); + /// Return true, if all projections were loaded successfully and none was marked as broken. 
+ bool loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false); void setBrokenReason(const String & message, int code) const; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 8683e4293e9..c95aee88aee 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5113,7 +5113,7 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( if (hold_table_lock && !table_lock) table_lock = lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); - if (backup_settings.check_parts) + if (backup_settings.check_projection_parts) part->checkConsistencyWithProjections(/* require_part_metadata= */ true); BackupEntries backup_entries_from_part; @@ -5125,7 +5125,8 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( read_settings, make_temporary_hard_links, backup_entries_from_part, - &temp_dirs); + &temp_dirs, + false, false); auto projection_parts = part->getProjectionParts(); for (const auto & [projection_name, projection_part] : projection_parts) @@ -5138,7 +5139,9 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( read_settings, make_temporary_hard_links, backup_entries_from_part, - &temp_dirs); + &temp_dirs, + projection_part->is_broken, + backup_settings.allow_backup_broken_projections); } if (hold_storage_and_part_ptrs) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index 1f072e207a7..4c4901ae99f 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -19,6 +19,7 @@ used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 +0 broke metadata of part 'proj' (parent part: all_2_2_0) system.parts all_0_0_0 1 ['proj','proj_2'] @@ -39,8 +40,9 @@ check table 0 broken projections info all_2_2_0 proj FILE_DOESNT_EXIST -check table full (all_2_2_0) +check table full (test - all_2_2_0) all_2_2_0 +0 broke data of part 'proj_2' (parent part: all_2_2_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -78,6 +80,7 @@ used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 +0 broke data of part 'proj_2' (parent part: all_3_3_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -111,6 +114,7 @@ used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 +0 broke metadata of part 'proj' (parent part: all_1_1_0) Detach - Attach broken projections info @@ -118,6 +122,7 @@ all_1_1_0 proj NO_FILE_IN_DATA_PART all_2_2_0 proj NO_FILE_IN_DATA_PART all_2_2_0 proj_2 FILE_DOESNT_EXIST all_3_3_0 proj_2 FILE_DOESNT_EXIST +0 broke data of part 'proj_2' (parent part: all_1_1_0) Detach - Attach broken projections info @@ -146,10 +151,10 @@ used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 -check table full (all_1_1_0) +check table full (test - all_1_1_0) all_1_1_0 materialize projection proj -check table full () +check table full (test - ) system.parts all_0_0_0 0 ['proj','proj_2'] all_0_0_0_6 1 ['proj','proj_2'] @@ -175,7 +180,8 @@ SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 materialize projection proj_2 -check table full () +check table full (test - ) +0 broke data of part 'proj' (parent part: all_3_5_1_7) insert new part optimize @@ -214,3 
+220,215 @@ used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 +insert new part +insert new part +insert new part +insert new part +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +used projections +SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +used projections +SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +system.parts +all_1_1_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +used projections +SELECT c FROM test2_replica WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +used projections +SELECT d FROM test2_replica WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +0 +broke data of part 'proj' (parent part: all_0_0_0) +check table full (test2 - all_0_0_0) +all_0_0_0 +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +used projections +SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +used projections +SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broke data of part 'all_0_0_0' +check table full (test2 - all_0_0_0) +all_0_0_0 +system.parts +all_0_0_0 0 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +used projections +SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +used projections +SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +used projections +SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +used projections +SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +insert new part +insert new part +insert new part +insert new part +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +0 +broke data of part 'proj' (parent part: all_2_2_0) +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +FILE_DOESNT_EXIST +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broken projections info +all_2_2_0 proj NO_FILE_IN_DATA_PART +BACKUP_CREATED +RESTORED +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +FILE_DOESNT_EXIST +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broken projections info +all_2_2_0 proj NO_FILE_IN_DATA_PART +0 +broke all data of part 
'proj' (parent part: all_2_2_0) +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broken projections info +all_2_2_0 proj NO_FILE_IN_DATA_PART +FILE_DOESNT_EXIST +materialize projection proj +system.parts +all_0_0_0 0 ['proj','proj_2'] +all_0_0_0_4 1 ['proj','proj_2'] +all_1_1_0 0 ['proj','proj_2'] +all_1_1_0_4 1 ['proj','proj_2'] +all_2_2_0 0 ['proj','proj_2'] +all_2_2_0_4 1 ['proj','proj_2'] +all_3_3_0 0 ['proj','proj_2'] +all_3_3_0_4 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +broken projections info +all_2_2_0 proj NO_FILE_IN_DATA_PART +BACKUP_CREATED +RESTORED +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +0 +broke all data of part 'proj' (parent part: all_2_2_0) +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +BACKUP_CREATED +RESTORED +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broken projections info diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 80805330577..1555139e16f 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -5,35 +5,40 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -nm -q " -DROP TABLE IF EXISTS test SYNC; -CREATE TABLE test -( - a String, - b String, - c Int32, - d Int32, - e Int32, +function create_table() +{ + test_id=$1 + name=$2 + replica=$3 + $CLICKHOUSE_CLIENT -nm -q " + DROP TABLE IF EXISTS $name SYNC; + CREATE TABLE $name + ( + a String, + b String, + c Int64, + d Int64, + e Int64, - PROJECTION proj - ( - SELECT c ORDER BY d - ), - PROJECTION proj_2 - ( - SELECT d ORDER BY c + PROJECTION proj + ( + SELECT c ORDER BY d + ), + PROJECTION proj_2 + ( + SELECT d ORDER BY c + ) ) -) -ENGINE = ReplicatedMergeTree('/test4/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) -SETTINGS min_bytes_for_wide_part = 0, - max_parts_to_merge_at_once=3, - enable_vertical_merge_algorithm=1, - vertical_merge_algorithm_min_rows_to_activate = 1, - vertical_merge_algorithm_min_columns_to_activate = 1, - vertical_merge_algorithm_min_columns_to_activate = 1; -" - -table_uuid=$($CLICKHOUSE_CLIENT -q "SELECT uuid FROM system.tables WHERE table='test' and database=currentDatabase()") + ENGINE = ReplicatedMergeTree('/test_broken_projection_24_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a + SETTINGS min_bytes_for_wide_part = 0, + max_parts_to_merge_at_once=3, + enable_vertical_merge_algorithm=1, + vertical_merge_algorithm_min_rows_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1, + compress_primary_key=0; + " +} function random() { @@ -42,49 +47,88 @@ function random() function insert() { - offset=$1 - size=$2 + table=$1 + offset=$2 + size=$3 echo 'insert new part' - $CLICKHOUSE_CLIENT -q "INSERT INTO test SELECT number, number, number, number, number%2 FROM numbers($offset, $size);" + $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size);" } function break_projection() { - part_name=$1 - parent_name=$2 - break_type=$3 + table=$1 + part_name=$2 + parent_name=$3 + break_type=$4 read -r part_path <<< $($CLICKHOUSE_CLIENT -nm -q " SELECT path FROM system.projection_parts - WHERE table='test' + WHERE table='$table' AND database=currentDatabase() AND active=1 AND part_name='$part_name' AND parent_name='$parent_name' + ORDER BY modification_time DESC LIMIT 1; ") + $CLICKHOUSE_CLIENT -q "select throwIf(substring('$part_path', 1, 1) != '/', 'Path is relative: $part_path')" || exit + if [ "$break_type" = "data" ] then rm "$part_path/d.bin" rm "$part_path/c.bin" echo "broke data of part '$part_name' (parent part: $parent_name)" - else + fi + if [ "$break_type" = "metadata" ] + then rm "$part_path/columns.txt" echo "broke metadata of part '$part_name' (parent part: $parent_name)" fi + if [ "$break_type" = "part" ] + then + rm -r "$part_path" + echo "broke all data of part '$part_name' (parent part: $parent_name)" + fi +} + +function break_part() +{ + table=$1 + part_name=$2 + + read -r part_path <<< $($CLICKHOUSE_CLIENT -nm -q " + SELECT path + FROM system.parts + WHERE table='$table' + AND database=currentDatabase() + AND active=1 + AND part_name='$part_name' + ORDER BY modification_time DESC + LIMIT 1; + ") + + if [ "$part_path" = "" ] + then + echo "Part path is empty" + exit + fi + + rm $part_path/columns.txt + echo "broke data of part '$part_name'" } function broken_projections_info() { + table=$1 echo 'broken projections info' $CLICKHOUSE_CLIENT -q " SELECT parent_name, name, errors.name FROM ( SELECT parent_name, name, exception_code FROM 
system.projection_parts - WHERE table='test' + WHERE table='$table' AND database=currentDatabase() AND is_broken = 1 ) AS parts_info @@ -96,18 +140,19 @@ function broken_projections_info() function check() { + table=$1 expect_broken_part="" expected_error="" - if [ $# -ne 0 ]; then - expect_broken_part=$1 - expected_error=$2 + if [ $# -gt 1 ]; then + expect_broken_part=$2 + expected_error=$3 fi echo 'system.parts' $CLICKHOUSE_CLIENT -q " SELECT name, active, projections FROM system.parts - WHERE table='test' AND database=currentDatabase() + WHERE table='$table' AND database=currentDatabase() ORDER BY name;" echo "select from projection 'proj', expect error: $expect_broken_part" @@ -117,10 +162,10 @@ function check() then $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " SET send_logs_level='fatal'; -SELECT c FROM test WHERE d == 12 ORDER BY c; +SELECT c FROM $table WHERE d == 12 ORDER BY c; " 2>&1 | grep -oF "$expected_error" else - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c;" + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM $table WHERE d == 12 OR d == 16 ORDER BY c;" echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; @@ -135,10 +180,10 @@ SELECT c FROM test WHERE d == 12 ORDER BY c; then $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " SET send_logs_level='fatal'; -SELECT d FROM test WHERE c == 12 ORDER BY d; +SELECT d FROM $table WHERE c == 12 ORDER BY d; " 2>&1 | grep -oF "$expected_error" else - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d;" + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM $table WHERE c == 12 OR c == 16 ORDER BY d;" echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; @@ -149,7 +194,7 @@ SELECT d FROM test WHERE c == 12 ORDER BY d; echo 'check table' $CLICKHOUSE_CLIENT -nm -q " SET send_logs_level='fatal'; - CHECK TABLE test;" + CHECK TABLE $table;" } function optimize() @@ -184,141 +229,274 @@ function reattach() function materialize_projection { - projection=$1 + table=$1 + projection=$2 echo "materialize projection $projection" - $CLICKHOUSE_CLIENT -q "ALTER TABLE test MATERIALIZE PROJECTION $projection SETTINGS mutations_sync=2" + $CLICKHOUSE_CLIENT -q "ALTER TABLE $table MATERIALIZE PROJECTION $projection SETTINGS mutations_sync=2" } function check_table_full() { - echo "check table full ($1)" - expect_broken_part=$1 + table=$1 + expect_broken_part=$2 + echo "check table full ($1 - $2)" if [ "$expect_broken_part" = "" ] then $CLICKHOUSE_CLIENT -nm -q " SET send_logs_level='fatal'; - CHECK TABLE test SETTINGS check_query_single_value_result = 0; + CHECK TABLE $table SETTINGS check_query_single_value_result = 0; " | grep "broken" else $CLICKHOUSE_CLIENT -nm -q " SET send_logs_level='fatal'; - CHECK TABLE test SETTINGS check_query_single_value_result = 0; + CHECK TABLE $table SETTINGS check_query_single_value_result = 0; " | grep "broken" | grep -o $expect_broken_part | head -n 1 fi } +function test1() +{ + create_table test1 test 1 -insert 0 5 + table_uuid=$($CLICKHOUSE_CLIENT -q "SELECT uuid FROM system.tables WHERE table='test' and database=currentDatabase()") -insert 5 5 + insert test 0 5 -insert 10 5 + insert test 5 5 -insert 15 5 + insert test 10 5 -check + insert test 15 5 -# Break metadata file of projection 'proj' 
-break_projection proj all_2_2_0 metadata + check test -# Do select and after "check table" query. -# Select works because it does not read columns.txt. -check + # Break metadata file of projection 'proj' + break_projection test proj all_2_2_0 metadata -# Projection 'proj' from part all_2_2_0 will now appear in broken parts info -# because it was marked broken during "check table" query. -# TODO: try to mark it during select as well -broken_projections_info + # Do select and after "check table" query. + # Select works because it does not read columns.txt. + check test -# Check table query will also show a list of parts which have broken projections. -check_table_full "all_2_2_0" + # Projection 'proj' from part all_2_2_0 will now appear in broken parts info + # because it was marked broken during "check table" query. + # TODO: try to mark it during select as well + broken_projections_info test -# Break data file of projection 'proj_2' for part all_2_2_0 -break_projection proj_2 all_2_2_0 data + # Check table query will also show a list of parts which have broken projections. + check_table_full test "all_2_2_0" -# It will not yet appear in broken projections info. -broken_projections_info + # Break data file of projection 'proj_2' for part all_2_2_0 + break_projection test proj_2 all_2_2_0 data -# Select now fails with error "File doesn't exist" -check "proj_2" FILE_DOESNT_EXIST + # It will not yet appear in broken projections info. + broken_projections_info test -# Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. -broken_projections_info + # Select now fails with error "File doesn't exist" + check test "proj_2" FILE_DOESNT_EXIST -# Second select works, because projection is now marked as broken. -check + # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. + broken_projections_info test -# Break data file of projection 'proj_2' for part all_3_3_0 -break_projection proj_2 all_3_3_0 data + # Second select works, because projection is now marked as broken. + check test -# It will not yet appear in broken projections info. -broken_projections_info + # Break data file of projection 'proj_2' for part all_3_3_0 + break_projection test proj_2 all_3_3_0 data -insert 20 5 + # It will not yet appear in broken projections info. + broken_projections_info test -insert 25 5 + insert test 20 5 -# Part all_3_3_0 has 'proj' and 'proj_2' projections, but 'proj_2' is broken and server does NOT know it yet. -# Parts all_4_4_0 and all_5_5_0 have both non-broken projections. -# So a merge will be create for future part all_3_5_1. -# During merge it will fail to read from 'proj_2' of part all_3_3_0 and proj_2 will be marked broken. -# Merge will be retried and on second attempt it will succeed. -# The result part all_3_5_1 will have only 1 projection - 'proj', because -# it will skip 'proj_2' as it will see that one part does not have it anymore in the set of valid projections. -optimize 0 1 -sleep 2 + insert test 25 5 -$CLICKHOUSE_CLIENT -nm -q " -SYSTEM FLUSH LOGS; -SELECT count() FROM system.text_log -WHERE level='Error' -AND logger_name='MergeTreeBackgroundExecutor' -AND message like 'Exception while executing background task {$table_uuid:all_3_5_1}%Cannot open file%proj_2.proj/c.bin%' -" + # Part all_3_3_0 has 'proj' and 'proj_2' projections, but 'proj_2' is broken and server does NOT know it yet. + # Parts all_4_4_0 and all_5_5_0 have both non-broken projections. + # So a merge will be create for future part all_3_5_1. 
+ # During merge it will fail to read from 'proj_2' of part all_3_3_0 and proj_2 will be marked broken. + # Merge will be retried and on second attempt it will succeed. + # The result part all_3_5_1 will have only 1 projection - 'proj', because + # it will skip 'proj_2' as it will see that one part does not have it anymore in the set of valid projections. + optimize 0 1 + sleep 2 -# Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. -broken_projections_info + $CLICKHOUSE_CLIENT -nm -q " + SYSTEM FLUSH LOGS; + SELECT count() FROM system.text_log + WHERE level='Error' + AND logger_name='MergeTreeBackgroundExecutor' + AND message like 'Exception while executing background task {$table_uuid:all_3_5_1}%Cannot open file%proj_2.proj/c.bin%' + " -check + # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. + broken_projections_info test -break_projection proj all_1_1_0 metadata + check test -reattach + break_projection test proj all_1_1_0 metadata -broken_projections_info + reattach -break_projection proj_2 all_1_1_0 data + broken_projections_info test -reattach + break_projection test proj_2 all_1_1_0 data -broken_projections_info + reattach -check + broken_projections_info test -check_table_full all_1_1_0 + check test -materialize_projection proj + check_table_full test all_1_1_0 -check_table_full + materialize_projection test proj -check + check_table_full test -materialize_projection proj_2 + check test -check_table_full + materialize_projection test proj_2 -break_projection proj all_3_5_1_7 data + check_table_full test -insert 30 5 + break_projection test proj all_3_5_1_7 data -optimize 1 0 + insert test 30 5 -insert 35 5 + optimize 1 0 -optimize 1 0 + insert test 35 5 -check + optimize 1 0 -$CLICKHOUSE_CLIENT -nm -q " -DROP TABLE test SYNC; -" + check test +} + +function test2() +{ + create_table test2 test2 1 + + insert test2 0 5 + + insert test2 5 5 + + insert test 10 5 + + insert test 15 5 + + check test2 + + create_table test2 test2_replica 2 + + check test2_replica + + break_projection test2 proj all_0_0_0 data + + check_table_full test2 all_0_0_0 + + check test2 + + break_part test2 all_0_0_0 + + check_table_full test2 all_0_0_0 + + check test2 + + $CLICKHOUSE_CLIENT -q "SYSTEM SYNC REPLICA test2;" + + check test2 +} + +CLICKHOUSE_TEST_UNIQUE_NAME="gghhhhhhhhhhhhhhhhhhh" +function test3() +{ + create_table test3 test 1 + + insert test 0 5 + + insert test 5 5 + + insert test 10 5 + + insert test 15 5 + + check test + + break_projection test proj all_2_2_0 data + + check test proj FILE_DOESNT_EXIST + + broken_projections_info test + + ${CLICKHOUSE_CLIENT} -nm --query " + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false; + " | grep -o "BACKUP_CREATED" + + ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " + drop table test sync; + restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); + " | grep -o "RESTORED" + + check test proj FILE_DOESNT_EXIST + + broken_projections_info test + + break_projection test proj all_2_2_0 part + + check test proj + + broken_projections_info test + + ${CLICKHOUSE_CLIENT} -nm --query " + set send_logs_level='fatal'; + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') + " 2>&1 | grep -o "FILE_DOESNT_EXIST" + + materialize_projection test proj + + check test proj + + broken_projections_info test + + ${CLICKHOUSE_CLIENT} -nm --query " + backup 
table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3') + " | grep -o "BACKUP_CREATED" + + ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " + drop table test sync; + restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3'); + " | grep -o "RESTORED" + + check test proj + + break_projection test proj all_2_2_0 part + + check test proj FILE_DOESNT_EXIST + + broken_projections_info test + + ${CLICKHOUSE_CLIENT} -nm --query " + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') + settings check_projection_parts=false, allow_backup_broken_projections=true; + " | grep -o "BACKUP_CREATED" + + ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " + drop table test sync; + restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2'); + " | grep -o "RESTORED" + + check test + + broken_projections_info test +} + +test1 +test2 +test3 + + +#$CLICKHOUSE_CLIENT -nm -q " +#DROP TABLE test SYNC; +#DROP TABLE test2 SYNC; +#DROP TABLE test2_replica SYNC; +#" From 6632589d72ed270626e012c86a78a8f0c8411fb3 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 30 Nov 2023 13:54:22 +0100 Subject: [PATCH 0030/1081] Review fix --- src/Storages/MergeTree/MergeTreeData.cpp | 6 +++--- tests/queries/0_stateless/02916_broken_projection.sh | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index c95aee88aee..1ba4153bc3e 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7669,7 +7669,7 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & bool MergeTreeData::partsContainSameProjections(const DataPartPtr & left, const DataPartPtr & right, String & out_reason) { - auto remove_broken_parts = [](auto & parts) + auto remove_broken_parts_from_consideration = [](auto & parts) { std::set broken_projection_parts; for (const auto & [name, part] : parts) @@ -7684,8 +7684,8 @@ bool MergeTreeData::partsContainSameProjections(const DataPartPtr & left, const auto left_projection_parts = left->getProjectionParts(); auto right_projection_parts = right->getProjectionParts(); - remove_broken_parts(left_projection_parts); - remove_broken_parts(right_projection_parts); + remove_broken_parts_from_consideration(left_projection_parts); + remove_broken_parts_from_consideration(right_projection_parts); if (left_projection_parts.size() != right_projection_parts.size()) { diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 1555139e16f..60b21216d1a 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -495,8 +495,8 @@ test2 test3 -#$CLICKHOUSE_CLIENT -nm -q " -#DROP TABLE test SYNC; -#DROP TABLE test2 SYNC; -#DROP TABLE test2_replica SYNC; -#" +$CLICKHOUSE_CLIENT -nm -q " +DROP TABLE IF EXISTS test SYNC; +DROP TABLE IF EXISTS test2 SYNC; +DROP TABLE IF EXISTS test2_replica SYNC; +" From caf4dc7e14e594da3c254822b345b79c57e76d19 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Fri, 1 Dec 2023 12:21:47 +0100 Subject: [PATCH 0031/1081] Fix style check --- src/Common/ErrorCodes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 
b0ed754536d..57aa82f3639 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -587,7 +587,7 @@ M(705, TABLE_NOT_EMPTY) \ M(706, LIBSSH_ERROR) \ M(707, GCP_ERROR) \ - M(708, ILLEGAL_STATISTIC) \ + M(708, ILLEGAL_STATISTIC) \ M(709, BROKEN_PROJECTION) \ \ M(999, KEEPER_EXCEPTION) \ From f609c44eb83fc769ba9e8fc5875bbc10e3e17b9b Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Fri, 1 Dec 2023 13:38:28 +0100 Subject: [PATCH 0032/1081] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 60b21216d1a..bd141d1a122 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -459,11 +459,13 @@ function test3() broken_projections_info test ${CLICKHOUSE_CLIENT} -nm --query " + set send_logs_level='fatal'; backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3') " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; + set send_logs_level='fatal'; restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3'); " | grep -o "RESTORED" @@ -476,12 +478,14 @@ function test3() broken_projections_info test ${CLICKHOUSE_CLIENT} -nm --query " + set send_logs_level='fatal'; backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') settings check_projection_parts=false, allow_backup_broken_projections=true; " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; + set send_logs_level='fatal'; restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2'); " | grep -o "RESTORED" From d0827e3ea77ff432c4a6a66145827428bcd62b5e Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 1 Dec 2023 17:45:23 +0000 Subject: [PATCH 0033/1081] Add a test. 
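
The test introduced below exercises a MergeTree TTL DELETE clause whose WHERE condition contains an IN subquery, the case that the follow-up "support subqueries in TTL" commits in this series then implement. As a minimal illustrative sketch of that pattern (table and column names here are hypothetical and not part of the patch):

CREATE TABLE events
(
    id UInt32,
    ts DateTime
)
ENGINE = MergeTree
ORDER BY id
-- expired rows are removed only if id is present in the allow_list table
TTL ts + INTERVAL 1 DAY WHERE id IN (SELECT id FROM allow_list);

The regression test added by this commit follows the same shape, filling a helper table first and then checking that the TTL with the subquery can be evaluated during merges.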
--- .../0_stateless/02932_set_ttl_where.reference | 0 .../0_stateless/02932_set_ttl_where.sql | 22 +++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 tests/queries/0_stateless/02932_set_ttl_where.reference create mode 100644 tests/queries/0_stateless/02932_set_ttl_where.sql diff --git a/tests/queries/0_stateless/02932_set_ttl_where.reference b/tests/queries/0_stateless/02932_set_ttl_where.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02932_set_ttl_where.sql b/tests/queries/0_stateless/02932_set_ttl_where.sql new file mode 100644 index 00000000000..85fddf613e8 --- /dev/null +++ b/tests/queries/0_stateless/02932_set_ttl_where.sql @@ -0,0 +1,22 @@ +create or replace table temp ( + a UInt32 +) +engine = MergeTree +order by a; + +insert into temp select number from system.numbers limit 100_000; + +create or replace table t_temp ( + a UInt32, + timestamp DateTime +) +engine = MergeTree +order by a +TTL timestamp + INTERVAL 2 SECOND WHERE a in (select a from temp); + +select sleep(1); +insert into t_temp select rand(), now() from system.numbers limit 1_000_000; +select sleep(1); +insert into t_temp select rand(), now() from system.numbers limit 1_000_000; +select sleep(1); +optimize table t_temp final; From 508046e6922c0cb163ce5611f1e6ef6a22f8b7f1 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 1 Dec 2023 20:31:26 +0000 Subject: [PATCH 0034/1081] Attempt to support subqueries in TTL. --- src/Interpreters/PreparedSets.cpp | 8 ++- src/Interpreters/PreparedSets.h | 1 + src/Processors/TTL/ITTLAlgorithm.cpp | 5 +- src/Processors/TTL/ITTLAlgorithm.h | 9 ++- .../TTL/TTLAggregationAlgorithm.cpp | 11 ++-- src/Processors/TTL/TTLAggregationAlgorithm.h | 1 + src/Processors/TTL/TTLColumnAlgorithm.cpp | 5 +- src/Processors/TTL/TTLColumnAlgorithm.h | 1 + src/Processors/TTL/TTLDeleteAlgorithm.cpp | 10 +-- src/Processors/TTL/TTLDeleteAlgorithm.h | 2 +- src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp | 5 +- src/Processors/TTL/TTLUpdateInfoAlgorithm.h | 1 + src/Processors/Transforms/TTLTransform.cpp | 2 +- .../MergeTree/MergeTreeDataWriter.cpp | 19 +++--- src/Storages/StorageInMemoryMetadata.cpp | 21 +++---- src/Storages/TTLDescription.cpp | 62 ++++++++++++------- src/Storages/TTLDescription.h | 15 ++++- 17 files changed, 116 insertions(+), 62 deletions(-) diff --git a/src/Interpreters/PreparedSets.cpp b/src/Interpreters/PreparedSets.cpp index 955d8892284..ea8d9a62b8b 100644 --- a/src/Interpreters/PreparedSets.cpp +++ b/src/Interpreters/PreparedSets.cpp @@ -189,11 +189,17 @@ SetPtr FutureSetFromSubquery::buildOrderedSetInplace(const ContextPtr & context) } } + set_and_key->set->fillSetElements(); + + return buildSetInplace(context); +} + +SetPtr FutureSetFromSubquery::buildSetInplace(const ContextPtr & context) +{ auto plan = build(context); if (!plan) return nullptr; - set_and_key->set->fillSetElements(); auto builder = plan->buildQueryPipeline(QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); auto pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); pipeline.complete(std::make_shared(Block())); diff --git a/src/Interpreters/PreparedSets.h b/src/Interpreters/PreparedSets.h index e237789c63c..3e751d309ba 100644 --- a/src/Interpreters/PreparedSets.h +++ b/src/Interpreters/PreparedSets.h @@ -107,6 +107,7 @@ public: SetPtr get() const override; DataTypes getTypes() const override; SetPtr buildOrderedSetInplace(const ContextPtr & context) override; + SetPtr 
buildSetInplace(const ContextPtr & context); std::unique_ptr build(const ContextPtr & context); diff --git a/src/Processors/TTL/ITTLAlgorithm.cpp b/src/Processors/TTL/ITTLAlgorithm.cpp index 79140137df8..af6c4e4ac35 100644 --- a/src/Processors/TTL/ITTLAlgorithm.cpp +++ b/src/Processors/TTL/ITTLAlgorithm.cpp @@ -11,8 +11,9 @@ namespace ErrorCodes } ITTLAlgorithm::ITTLAlgorithm( - const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) - : description(description_) + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) + : ttl_expressions(ttl_expressions_) + , description(description_) , old_ttl_info(old_ttl_info_) , current_time(current_time_) , force(force_) diff --git a/src/Processors/TTL/ITTLAlgorithm.h b/src/Processors/TTL/ITTLAlgorithm.h index 49cd2c46d9d..6e73286b564 100644 --- a/src/Processors/TTL/ITTLAlgorithm.h +++ b/src/Processors/TTL/ITTLAlgorithm.h @@ -8,6 +8,12 @@ namespace DB { +struct TTlExpressions +{ + ExpressionActionsPtr expression; + ExpressionActionsPtr where_expression; +}; + /** * Represents the actions, which are required to do * with data, when TTL is expired: delete, aggregate, etc. @@ -18,7 +24,7 @@ public: using TTLInfo = IMergeTreeDataPart::TTLInfo; using MutableDataPartPtr = MergeTreeMutableDataPartPtr; - ITTLAlgorithm(const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); + ITTLAlgorithm(const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); virtual ~ITTLAlgorithm() = default; virtual void execute(Block & block) = 0; @@ -39,6 +45,7 @@ protected: bool isTTLExpired(time_t ttl) const; UInt32 getTimestampByIndex(const IColumn * column, size_t index) const; + const TTlExpressions ttl_expressions; const TTLDescription description; const TTLInfo old_ttl_info; const time_t current_time; diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.cpp b/src/Processors/TTL/TTLAggregationAlgorithm.cpp index fa3436ec55d..ab2ba5f58fc 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.cpp +++ b/src/Processors/TTL/TTLAggregationAlgorithm.cpp @@ -5,13 +5,14 @@ namespace DB { TTLAggregationAlgorithm::TTLAggregationAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_, const Block & header_, const MergeTreeData & storage_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) , header(header_) { current_key_value.resize(description.group_by_keys.size()); @@ -73,8 +74,8 @@ void TTLAggregationAlgorithm::execute(Block & block) const auto & column_names = header.getNames(); MutableColumns aggregate_columns = header.cloneEmptyColumns(); - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); - auto where_column = executeExpressionAndGetColumn(description.where_expression, block, description.where_result_column); + auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); + auto where_column = executeExpressionAndGetColumn(ttl_expressions.where_expression, block, description.where_result_column); size_t rows_aggregated = 0; size_t current_key_start = 0; @@ -145,8 +146,8 @@ void 
TTLAggregationAlgorithm::execute(Block & block) /// If some rows were aggregated we have to recalculate ttl info's if (some_rows_were_aggregated) { - auto ttl_column_after_aggregation = executeExpressionAndGetColumn(description.expression, block, description.result_column); - auto where_column_after_aggregation = executeExpressionAndGetColumn(description.where_expression, block, description.where_result_column); + auto ttl_column_after_aggregation = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); + auto where_column_after_aggregation = executeExpressionAndGetColumn(ttl_expressions.where_expression, block, description.where_result_column); for (size_t i = 0; i < block.rows(); ++i) { bool where_filter_passed = !where_column_after_aggregation || where_column_after_aggregation->getBool(i); diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.h b/src/Processors/TTL/TTLAggregationAlgorithm.h index 0e4bf092ed6..9fd074efba8 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.h +++ b/src/Processors/TTL/TTLAggregationAlgorithm.h @@ -13,6 +13,7 @@ class TTLAggregationAlgorithm final : public ITTLAlgorithm { public: TTLAggregationAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLColumnAlgorithm.cpp b/src/Processors/TTL/TTLColumnAlgorithm.cpp index 04c4d7b9348..cb99dcf99b1 100644 --- a/src/Processors/TTL/TTLColumnAlgorithm.cpp +++ b/src/Processors/TTL/TTLColumnAlgorithm.cpp @@ -4,6 +4,7 @@ namespace DB { TTLColumnAlgorithm::TTLColumnAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, @@ -12,7 +13,7 @@ TTLColumnAlgorithm::TTLColumnAlgorithm( const ExpressionActionsPtr & default_expression_, const String & default_column_name_, bool is_compact_part_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) , column_name(column_name_) , default_expression(default_expression_) , default_column_name(default_column_name_) @@ -49,7 +50,7 @@ void TTLColumnAlgorithm::execute(Block & block) if (default_column) default_column = default_column->convertToFullColumnIfConst(); - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); + auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); auto & column_with_type = block.getByName(column_name); const IColumn * values_column = column_with_type.column.get(); diff --git a/src/Processors/TTL/TTLColumnAlgorithm.h b/src/Processors/TTL/TTLColumnAlgorithm.h index 30de77dcc2a..efcd7c74454 100644 --- a/src/Processors/TTL/TTLColumnAlgorithm.h +++ b/src/Processors/TTL/TTLColumnAlgorithm.h @@ -11,6 +11,7 @@ class TTLColumnAlgorithm final : public ITTLAlgorithm { public: TTLColumnAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLDeleteAlgorithm.cpp b/src/Processors/TTL/TTLDeleteAlgorithm.cpp index f176df2d003..6a172e9c3c3 100644 --- a/src/Processors/TTL/TTLDeleteAlgorithm.cpp +++ b/src/Processors/TTL/TTLDeleteAlgorithm.cpp @@ -4,8 +4,8 @@ namespace DB { TTLDeleteAlgorithm::TTLDeleteAlgorithm( - const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, 
bool force_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) { if (!isMinTTLExpired()) new_ttl_info = old_ttl_info; @@ -19,8 +19,8 @@ void TTLDeleteAlgorithm::execute(Block & block) if (!block || !isMinTTLExpired()) return; - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); - auto where_column = executeExpressionAndGetColumn(description.where_expression, block, description.where_result_column); + auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); + auto where_column = executeExpressionAndGetColumn(ttl_expressions.where_expression, block, description.where_result_column); MutableColumns result_columns; const auto & column_names = block.getNames(); @@ -54,7 +54,7 @@ void TTLDeleteAlgorithm::execute(Block & block) void TTLDeleteAlgorithm::finalize(const MutableDataPartPtr & data_part) const { - if (description.where_expression) + if (ttl_expressions.where_expression) data_part->ttl_infos.rows_where_ttl[description.result_column] = new_ttl_info; else data_part->ttl_infos.table_ttl = new_ttl_info; diff --git a/src/Processors/TTL/TTLDeleteAlgorithm.h b/src/Processors/TTL/TTLDeleteAlgorithm.h index 292a29bfa27..23389070774 100644 --- a/src/Processors/TTL/TTLDeleteAlgorithm.h +++ b/src/Processors/TTL/TTLDeleteAlgorithm.h @@ -10,7 +10,7 @@ namespace DB class TTLDeleteAlgorithm final : public ITTLAlgorithm { public: - TTLDeleteAlgorithm(const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); + TTLDeleteAlgorithm(const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); void execute(Block & block) override; void finalize(const MutableDataPartPtr & data_part) const override; diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp index eba364aa2b8..34c0cad70ea 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp @@ -4,13 +4,14 @@ namespace DB { TTLUpdateInfoAlgorithm::TTLUpdateInfoAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLUpdateField ttl_update_field_, const String ttl_update_key_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) , ttl_update_field(ttl_update_field_) , ttl_update_key(ttl_update_key_) { @@ -21,7 +22,7 @@ void TTLUpdateInfoAlgorithm::execute(Block & block) if (!block) return; - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); + auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); for (size_t i = 0; i < block.rows(); ++i) { UInt32 cur_ttl = ITTLAlgorithm::getTimestampByIndex(ttl_column.get(), i); diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h index 45eecbde3d0..e9bcfcdec88 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h @@ 
-20,6 +20,7 @@ class TTLUpdateInfoAlgorithm : public ITTLAlgorithm { public: TTLUpdateInfoAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLUpdateField ttl_update_field_, const String ttl_update_key_, diff --git a/src/Processors/Transforms/TTLTransform.cpp b/src/Processors/Transforms/TTLTransform.cpp index 7cde86098c7..d3d45f68d46 100644 --- a/src/Processors/Transforms/TTLTransform.cpp +++ b/src/Processors/Transforms/TTLTransform.cpp @@ -36,7 +36,7 @@ TTLTransform::TTLTransform( rows_ttl, old_ttl_infos.table_ttl, current_time_, force_); /// Skip all data if table ttl is expired for part - if (algorithm->isMaxTTLExpired() && !rows_ttl.where_expression) + if (algorithm->isMaxTTLExpired() && !rows_ttl.where_expression_ast) all_data_dropped = true; delete_algorithm = algorithm.get(); diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 2a381afa805..d080240b066 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -125,13 +125,18 @@ void buildScatterSelector( /// Computes ttls and updates ttl infos void updateTTL( + const ContextPtr context, const TTLDescription & ttl_entry, IMergeTreeDataPart::TTLInfos & ttl_infos, DB::MergeTreeDataPartTTLInfo & ttl_info, const Block & block, bool update_part_min_max_ttls) { - auto ttl_column = ITTLAlgorithm::executeExpressionAndGetColumn(ttl_entry.expression, block, ttl_entry.result_column); + auto expr_and_set = ttl_entry.buildExpression(); + for (auto & subquery : expr_and_set.sets->getSubqueries()) + subquery->buildSetInplace(context); + + auto ttl_column = ITTLAlgorithm::executeExpressionAndGetColumn(expr_and_set.expression, block, ttl_entry.result_column); if (const ColumnUInt16 * column_date = typeid_cast(ttl_column.get())) { @@ -488,7 +493,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( DB::IMergeTreeDataPart::TTLInfos move_ttl_infos; const auto & move_ttl_entries = metadata_snapshot->getMoveTTLs(); for (const auto & ttl_entry : move_ttl_entries) - updateTTL(ttl_entry, move_ttl_infos, move_ttl_infos.moves_ttl[ttl_entry.result_column], block, false); + updateTTL(context, ttl_entry, move_ttl_infos, move_ttl_infos.moves_ttl[ttl_entry.result_column], block, false); ReservationPtr reservation = data.reserveSpacePreferringTTLRules(metadata_snapshot, expected_size, move_ttl_infos, time(nullptr), 0, true); VolumePtr volume = data.getStoragePolicy()->getVolume(0); @@ -543,20 +548,20 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( } if (metadata_snapshot->hasRowsTTL()) - updateTTL(metadata_snapshot->getRowsTTL(), new_data_part->ttl_infos, new_data_part->ttl_infos.table_ttl, block, true); + updateTTL(context, metadata_snapshot->getRowsTTL(), new_data_part->ttl_infos, new_data_part->ttl_infos.table_ttl, block, true); for (const auto & ttl_entry : metadata_snapshot->getGroupByTTLs()) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.group_by_ttl[ttl_entry.result_column], block, true); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.group_by_ttl[ttl_entry.result_column], block, true); for (const auto & ttl_entry : metadata_snapshot->getRowsWhereTTLs()) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.rows_where_ttl[ttl_entry.result_column], block, true); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, 
new_data_part->ttl_infos.rows_where_ttl[ttl_entry.result_column], block, true); for (const auto & [name, ttl_entry] : metadata_snapshot->getColumnTTLs()) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.columns_ttl[name], block, true); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.columns_ttl[name], block, true); const auto & recompression_ttl_entries = metadata_snapshot->getRecompressionTTLs(); for (const auto & ttl_entry : recompression_ttl_entries) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.recompression_ttl[ttl_entry.result_column], block, false); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.recompression_ttl[ttl_entry.result_column], block, false); new_data_part->ttl_infos.update(move_ttl_infos); diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index af285a953dc..7db5af82e0b 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -193,7 +193,7 @@ TTLDescription StorageInMemoryMetadata::getRowsTTL() const bool StorageInMemoryMetadata::hasRowsTTL() const { - return table_ttl.rows_ttl.expression != nullptr; + return table_ttl.rows_ttl.expression_ast != nullptr; } TTLDescriptions StorageInMemoryMetadata::getRowsWhereTTLs() const @@ -251,9 +251,8 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( NameSet required_ttl_columns; NameSet updated_ttl_columns; - auto add_dependent_columns = [&updated_columns](const auto & expression, auto & to_set) + auto add_dependent_columns = [&updated_columns](const Names & required_columns, auto & to_set) { - auto required_columns = expression->getRequiredColumns(); for (const auto & dependency : required_columns) { if (updated_columns.contains(dependency)) @@ -269,13 +268,13 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( for (const auto & index : getSecondaryIndices()) { if (has_dependency(index.name, ColumnDependency::SKIP_INDEX)) - add_dependent_columns(index.expression, indices_columns); + add_dependent_columns(index.expression->getRequiredColumns(), indices_columns); } for (const auto & projection : getProjections()) { if (has_dependency(projection.name, ColumnDependency::PROJECTION)) - add_dependent_columns(&projection, projections_columns); + add_dependent_columns(projection.getRequiredColumns(), projections_columns); } auto add_for_rows_ttl = [&](const auto & expression, auto & to_set) @@ -289,25 +288,25 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( }; if (hasRowsTTL()) - add_for_rows_ttl(getRowsTTL().expression, required_ttl_columns); + add_for_rows_ttl(getRowsTTL().expression_columns, required_ttl_columns); for (const auto & entry : getRowsWhereTTLs()) - add_for_rows_ttl(entry.expression, required_ttl_columns); + add_for_rows_ttl(entry.expression_columns, required_ttl_columns); for (const auto & entry : getGroupByTTLs()) - add_for_rows_ttl(entry.expression, required_ttl_columns); + add_for_rows_ttl(entry.expression_columns, required_ttl_columns); for (const auto & entry : getRecompressionTTLs()) - add_dependent_columns(entry.expression, required_ttl_columns); + add_dependent_columns(entry.expression_columns, required_ttl_columns); for (const auto & [name, entry] : getColumnTTLs()) { - if (add_dependent_columns(entry.expression, required_ttl_columns) && include_ttl_target) + if (add_dependent_columns(entry.expression_columns, required_ttl_columns) && 
include_ttl_target) updated_ttl_columns.insert(name); } for (const auto & entry : getMoveTTLs()) - add_dependent_columns(entry.expression, required_ttl_columns); + add_dependent_columns(entry.expression_columns, required_ttl_columns); //TODO what about rows_where_ttl and group_by_ttl ?? diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index f601fed06ac..47138f30e4f 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -113,11 +113,11 @@ TTLDescription::TTLDescription(const TTLDescription & other) , if_exists(other.if_exists) , recompression_codec(other.recompression_codec) { - if (other.expression) - expression = other.expression->clone(); + // if (other.expression) + // expression = other.expression->clone(); - if (other.where_expression) - where_expression = other.where_expression->clone(); + // if (other.where_expression) + // where_expression = other.where_expression->clone(); } TTLDescription & TTLDescription::operator=(const TTLDescription & other) @@ -131,16 +131,16 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) else expression_ast.reset(); - if (other.expression) - expression = other.expression->clone(); - else - expression.reset(); + // if (other.expression) + // expression = other.expression->clone(); + // else + // expression.reset(); result_column = other.result_column; - if (other.where_expression) - where_expression = other.where_expression->clone(); - else - where_expression.reset(); + // if (other.where_expression) + // where_expression = other.where_expression->clone(); + // else + // where_expression.reset(); where_result_column = other.where_result_column; group_by_keys = other.group_by_keys; @@ -158,6 +158,17 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) return * this; } +static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndTypesList & columns, const ContextPtr & context) +{ + ExpressionAndSets result; + auto syntax_analyzer_result = TreeRewriter(context).analyze(ast, columns); + ExpressionAnalyzer analyzer(ast, syntax_analyzer_result, context); + result.expression = analyzer.getActions(false); + result.sets = analyzer.getPreparedSets(); + + return result; +} + TTLDescription TTLDescription::getTTLFromAST( const ASTPtr & definition_ast, const ColumnsDescription & columns, @@ -174,10 +185,15 @@ TTLDescription TTLDescription::getTTLFromAST( result.expression_ast = definition_ast->clone(); auto ttl_ast = result.expression_ast->clone(); - auto syntax_analyzer_result = TreeRewriter(context).analyze(ttl_ast, columns.getAllPhysical()); - result.expression = ExpressionAnalyzer(ttl_ast, syntax_analyzer_result, context).getActions(false); + auto expression = buildExpressionAndSets(ttl_ast, columns.getAllPhysical(), context).expression; + result.expression_columns = expression->getRequiredColumns(); + + // auto syntax_analyzer_result = TreeRewriter(context).analyze(ttl_ast, columns.getAllPhysical()); + // result.expression = ExpressionAnalyzer(ttl_ast, syntax_analyzer_result, context).getActions(false); result.result_column = ttl_ast->getColumnName(); + ExpressionActionsPtr where_expression; + if (ttl_element == nullptr) /// columns TTL { result.destination_type = DataDestinationType::DELETE; @@ -194,8 +210,10 @@ TTLDescription TTLDescription::getTTLFromAST( { if (ASTPtr where_expr_ast = ttl_element->where()) { - auto where_syntax_result = TreeRewriter(context).analyze(where_expr_ast, columns.getAllPhysical()); - result.where_expression = 
ExpressionAnalyzer(where_expr_ast, where_syntax_result, context).getActions(false); + result.where_expression_ast = where_expr_ast->clone(); + where_expression = buildExpressionAndSets(where_expr_ast, columns.getAllPhysical(), context).expression; + // auto where_syntax_result = TreeRewriter(context).analyze(where_expr_ast, columns.getAllPhysical()); + // result.where_expression = ExpressionAnalyzer(where_expr_ast, where_syntax_result, context).getActions(false); result.where_result_column = where_expr_ast->getColumnName(); } } @@ -221,17 +239,17 @@ TTLDescription TTLDescription::getTTLFromAST( for (const auto & ast : ttl_element->group_by_assignments) { const auto assignment = ast->as(); - auto expression = assignment.expression(); + auto ass_expression = assignment.expression(); FindAggregateFunctionVisitor::Data data{false}; - FindAggregateFunctionVisitor(data).visit(expression); + FindAggregateFunctionVisitor(data).visit(ass_expression); if (!data.has_aggregate_function) throw Exception(ErrorCodes::BAD_TTL_EXPRESSION, "Invalid expression for assignment of column {}. Should contain an aggregate function", assignment.column_name); - expression = addTypeConversionToAST(std::move(expression), columns.getPhysical(assignment.column_name).type->getName()); - aggregations.emplace_back(assignment.column_name, std::move(expression)); + ass_expression = addTypeConversionToAST(std::move(ass_expression), columns.getPhysical(assignment.column_name).type->getName()); + aggregations.emplace_back(assignment.column_name, std::move(ass_expression)); aggregation_columns_set.insert(assignment.column_name); } @@ -289,7 +307,7 @@ TTLDescription TTLDescription::getTTLFromAST( } } - checkTTLExpression(result.expression, result.result_column); + checkTTLExpression(expression, result.result_column); return result; } @@ -341,7 +359,7 @@ TTLTableDescription TTLTableDescription::getTTLForTableFromAST( auto ttl = TTLDescription::getTTLFromAST(ttl_element_ptr, columns, context, primary_key); if (ttl.mode == TTLMode::DELETE) { - if (!ttl.where_expression) + if (!ttl.where_expression_ast) { if (have_unconditional_delete_ttl) throw Exception(ErrorCodes::BAD_TTL_EXPRESSION, "More than one DELETE TTL expression without WHERE expression is not allowed"); diff --git a/src/Storages/TTLDescription.h b/src/Storages/TTLDescription.h index 8f60eb604b5..5ea243424cb 100644 --- a/src/Storages/TTLDescription.h +++ b/src/Storages/TTLDescription.h @@ -33,6 +33,15 @@ struct TTLAggregateDescription using TTLAggregateDescriptions = std::vector; +class PreparedSets; +using PreparedSetsPtr = std::shared_ptr; + +struct ExpressionAndSets +{ + ExpressionActionsPtr expression; + PreparedSetsPtr sets; +}; + /// Common struct for TTL record in storage struct TTLDescription { @@ -42,9 +51,10 @@ struct TTLDescription /// TTL d + INTERVAL 1 DAY /// ^~~~~~~~~~~~~~~~~~~^ ASTPtr expression_ast; + Names expression_columns; /// Expression actions evaluated from AST - ExpressionActionsPtr expression; + ExpressionAndSets buildExpression() const; /// Result column of this TTL expression String result_column; @@ -52,7 +62,8 @@ struct TTLDescription /// WHERE part in TTL expression /// TTL ... 
WHERE x % 10 == 0 and y > 5 /// ^~~~~~~~~~~~~~~~~~~~~~^ - ExpressionActionsPtr where_expression; + ASTPtr where_expression_ast; + ExpressionAndSets buildWhereExpression() const; /// Name of result column from WHERE expression String where_result_column; From 2b903003b4795eb3768fec3f84ec8321fa5485f6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 4 Dec 2023 13:21:18 +0100 Subject: [PATCH 0035/1081] Update reference --- .../0_stateless/02916_broken_projection.reference | 8 +++----- tests/queries/0_stateless/02916_broken_projection.sh | 9 ++++----- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index 4c4901ae99f..acd1b87eb30 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -332,16 +332,14 @@ all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj -FILE_DOESNT_EXIST select from projection 'proj_2', expect error: proj 12 16 used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table -0 +1 broken projections info -all_2_2_0 proj NO_FILE_IN_DATA_PART 0 broke all data of part 'proj' (parent part: all_2_2_0) system.parts @@ -358,7 +356,7 @@ SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 broken projections info -all_2_2_0 proj NO_FILE_IN_DATA_PART +all_2_2_0 proj FILE_DOESNT_EXIST FILE_DOESNT_EXIST materialize projection proj system.parts @@ -379,7 +377,7 @@ SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 broken projections info -all_2_2_0 proj NO_FILE_IN_DATA_PART +all_2_2_0 proj FILE_DOESNT_EXIST BACKUP_CREATED RESTORED system.parts diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index bd141d1a122..7315cf5ce61 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -29,7 +29,7 @@ function create_table() SELECT d ORDER BY c ) ) - ENGINE = ReplicatedMergeTree('/test_broken_projection_24_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a + ENGINE = ReplicatedMergeTree('/test_broken_projection_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a SETTINGS min_bytes_for_wide_part = 0, max_parts_to_merge_at_once=3, enable_vertical_merge_algorithm=1, @@ -407,7 +407,6 @@ function test2() check test2 } -CLICKHOUSE_TEST_UNIQUE_NAME="gghhhhhhhhhhhhhhhhhhh" function test3() { create_table test3 test 1 @@ -437,7 +436,7 @@ function test3() restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); " | grep -o "RESTORED" - check test proj FILE_DOESNT_EXIST + check test proj broken_projections_info test @@ -479,14 +478,14 @@ function test3() ${CLICKHOUSE_CLIENT} -nm --query " set send_logs_level='fatal'; - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4') settings check_projection_parts=false, allow_backup_broken_projections=true; " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; set send_logs_level='fatal'; - restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2'); + 
restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4'); " | grep -o "RESTORED" check test From b77a6073aea98c7c5f5fcc28492a34e801d11b6b Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 4 Dec 2023 16:57:09 +0100 Subject: [PATCH 0036/1081] Fix test --- .../02916_broken_projection.reference | 90 +++++++++++-------- .../0_stateless/02916_broken_projection.sh | 36 ++++---- 2 files changed, 70 insertions(+), 56 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index acd1b87eb30..b7764a6434e 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -7,12 +7,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -26,12 +26,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -51,7 +51,7 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: proj_2 +select from projection 'proj' 12 16 used projections @@ -68,12 +68,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -102,12 +102,12 @@ all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -139,12 +139,12 @@ all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -167,12 +167,12 @@ all_3_5_1 0 ['proj'] all_3_5_1_6 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -208,12 +208,12 @@ all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] all_8_8_0 0 ['proj','proj_2'] all_9_9_0 1 
['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj_2 -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -227,20 +227,19 @@ insert new part system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 system.parts -all_1_1_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' used projections SELECT c FROM test2_replica WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' used projections SELECT d FROM test2_replica WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table @@ -252,10 +251,10 @@ all_0_0_0 system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table @@ -266,10 +265,10 @@ all_0_0_0 system.parts all_0_0_0 0 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table @@ -277,10 +276,10 @@ check table system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table @@ -294,12 +293,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -315,7 +314,7 @@ all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj FILE_DOESNT_EXIST -select from projection 'proj_2', expect error: proj +select from projection 'proj_2' 12 16 used projections @@ -331,8 +330,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: proj -select from projection 'proj_2', expect error: proj +select from projection 'proj' +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2' 12 16 used projections @@ -348,7 +351,8 @@ all_1_1_0 1 
['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj -select from projection 'proj_2', expect error: proj +STD_EXCEPTION +select from projection 'proj_2' 12 16 used projections @@ -368,8 +372,12 @@ all_2_2_0 0 ['proj','proj_2'] all_2_2_0_4 1 ['proj','proj_2'] all_3_3_0 0 ['proj','proj_2'] all_3_3_0_4 1 ['proj','proj_2'] -select from projection 'proj', expect error: proj -select from projection 'proj_2', expect error: proj +select from projection 'proj' +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2' 12 16 used projections @@ -385,8 +393,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: proj -select from projection 'proj_2', expect error: proj +select from projection 'proj' +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2' 12 16 used projections @@ -401,7 +413,7 @@ all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj -select from projection 'proj_2', expect error: proj +select from projection 'proj_2' 12 16 used projections @@ -417,12 +429,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 7315cf5ce61..eeea512f14a 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -29,7 +29,7 @@ function create_table() SELECT d ORDER BY c ) ) - ENGINE = ReplicatedMergeTree('/test_broken_projection_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a + ENGINE = ReplicatedMergeTree('/test_broken_projection_32_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a SETTINGS min_bytes_for_wide_part = 0, max_parts_to_merge_at_once=3, enable_vertical_merge_algorithm=1, @@ -155,39 +155,41 @@ function check() WHERE table='$table' AND database=currentDatabase() ORDER BY name;" - echo "select from projection 'proj', expect error: $expect_broken_part" query_id=$(random 8) if [ "$expect_broken_part" = "proj" ] then + echo "select from projection 'proj', expect error: $expect_broken_part" $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " -SET send_logs_level='fatal'; -SELECT c FROM $table WHERE d == 12 ORDER BY c; -" 2>&1 | grep -oF "$expected_error" + SET send_logs_level='fatal'; + SELECT c FROM $table WHERE d == 12 ORDER BY c; + " 2>&1 | grep -oF "$expected_error" else + echo "select from projection 'proj'" $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM $table WHERE d == 12 OR d == 16 ORDER BY c;" echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " - SYSTEM FLUSH LOGS; - SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' + SYSTEM FLUSH LOGS; + SELECT query, 
splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi - echo "select from projection 'proj_2', expect error: $expect_broken_part" query_id=$(random 8) if [ "$expect_broken_part" = "proj_2" ] then + echo "select from projection 'proj_2', expect error: $expect_broken_part" $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " -SET send_logs_level='fatal'; -SELECT d FROM $table WHERE c == 12 ORDER BY d; -" 2>&1 | grep -oF "$expected_error" + SET send_logs_level='fatal'; + SELECT d FROM $table WHERE c == 12 ORDER BY d; + " 2>&1 | grep -oF "$expected_error" else + echo "select from projection 'proj_2'" $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM $table WHERE c == 12 OR c == 16 ORDER BY d;" echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " - SYSTEM FLUSH LOGS; - SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' + SYSTEM FLUSH LOGS; + SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi @@ -436,13 +438,13 @@ function test3() restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); " | grep -o "RESTORED" - check test proj + check test broken_projections_info test break_projection test proj all_2_2_0 part - check test proj + check test proj STD_EXCEPTION broken_projections_info test @@ -453,7 +455,7 @@ function test3() materialize_projection test proj - check test proj + check test broken_projections_info test @@ -468,7 +470,7 @@ function test3() restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3'); " | grep -o "RESTORED" - check test proj + check test break_projection test proj all_2_2_0 part From 7ab4af06df0d78e6728e3cc5c727e5c9e4cc33ef Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 4 Dec 2023 18:04:42 +0000 Subject: [PATCH 0037/1081] Attempt to support subqueries in TTL. 
(2) --- src/Processors/QueryPlan/CreatingSetsStep.cpp | 29 +++++++++++ src/Processors/QueryPlan/CreatingSetsStep.h | 2 + src/Processors/TTL/ITTLAlgorithm.cpp | 2 +- src/Processors/TTL/ITTLAlgorithm.h | 6 +-- .../TTL/TTLAggregationAlgorithm.cpp | 2 +- src/Processors/TTL/TTLAggregationAlgorithm.h | 2 +- src/Processors/TTL/TTLColumnAlgorithm.cpp | 2 +- src/Processors/TTL/TTLColumnAlgorithm.h | 2 +- src/Processors/TTL/TTLDeleteAlgorithm.cpp | 2 +- src/Processors/TTL/TTLDeleteAlgorithm.h | 2 +- src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp | 2 +- src/Processors/TTL/TTLUpdateInfoAlgorithm.h | 2 +- .../Transforms/TTLCalcTransform.cpp | 33 ++++++++++--- src/Processors/Transforms/TTLCalcTransform.h | 4 ++ src/Processors/Transforms/TTLTransform.cpp | 33 ++++++++++--- src/Processors/Transforms/TTLTransform.h | 5 ++ src/Storages/MergeTree/MergeTask.cpp | 36 +++++++++----- .../MergeTree/MergeTreeDataWriter.cpp | 2 +- src/Storages/MergeTree/MutateTask.cpp | 49 ++++++++++++++----- src/Storages/StorageInMemoryMetadata.cpp | 8 +-- src/Storages/TTLDescription.cpp | 21 +++++++- src/Storages/TTLDescription.h | 7 +-- 22 files changed, 197 insertions(+), 56 deletions(-) diff --git a/src/Processors/QueryPlan/CreatingSetsStep.cpp b/src/Processors/QueryPlan/CreatingSetsStep.cpp index 3e4dfb0c7d1..11415e8d815 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.cpp +++ b/src/Processors/QueryPlan/CreatingSetsStep.cpp @@ -157,6 +157,35 @@ void addCreatingSetsStep(QueryPlan & query_plan, PreparedSets::Subqueries subque query_plan.unitePlans(std::move(creating_sets), std::move(plans)); } +QueryPipelineBuilderPtr addCreatingSetsTransform(QueryPipelineBuilderPtr pipeline, PreparedSets::Subqueries subqueries, ContextPtr context) +{ + DataStreams input_streams; + input_streams.emplace_back(DataStream{pipeline->getHeader()}); + + QueryPipelineBuilders pipelines; + pipelines.reserve(1 + subqueries.size()); + pipelines.push_back(std::move(pipeline)); + + auto plan_settings = QueryPlanOptimizationSettings::fromContext(context); + auto pipeline_settings = BuildQueryPipelineSettings::fromContext(context); + + for (auto & future_set : subqueries) + { + if (future_set->get()) + continue; + + auto plan = future_set->build(context); + if (!plan) + continue; + + input_streams.emplace_back(plan->getCurrentDataStream()); + pipelines.emplace_back(plan->buildQueryPipeline(plan_settings, pipeline_settings)); + } + + CreatingSetsStep(input_streams).updatePipeline(std::move(pipelines), pipeline_settings); + return std::move(pipelines.front()); +} + std::vector> DelayedCreatingSetsStep::makePlansForSets(DelayedCreatingSetsStep && step) { std::vector> plans; diff --git a/src/Processors/QueryPlan/CreatingSetsStep.h b/src/Processors/QueryPlan/CreatingSetsStep.h index a90b70a2fa4..292ec19914c 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.h +++ b/src/Processors/QueryPlan/CreatingSetsStep.h @@ -72,4 +72,6 @@ void addCreatingSetsStep(QueryPlan & query_plan, PreparedSets::Subqueries subque void addCreatingSetsStep(QueryPlan & query_plan, PreparedSetsPtr prepared_sets, ContextPtr context); +QueryPipelineBuilderPtr addCreatingSetsTransform(QueryPipelineBuilderPtr pipeline, PreparedSets::Subqueries subqueries, ContextPtr context); + } diff --git a/src/Processors/TTL/ITTLAlgorithm.cpp b/src/Processors/TTL/ITTLAlgorithm.cpp index af6c4e4ac35..761f43e2422 100644 --- a/src/Processors/TTL/ITTLAlgorithm.cpp +++ b/src/Processors/TTL/ITTLAlgorithm.cpp @@ -11,7 +11,7 @@ namespace ErrorCodes } ITTLAlgorithm::ITTLAlgorithm( - const TTlExpressions & 
ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) : ttl_expressions(ttl_expressions_) , description(description_) , old_ttl_info(old_ttl_info_) diff --git a/src/Processors/TTL/ITTLAlgorithm.h b/src/Processors/TTL/ITTLAlgorithm.h index 6e73286b564..d79aa8a8dfc 100644 --- a/src/Processors/TTL/ITTLAlgorithm.h +++ b/src/Processors/TTL/ITTLAlgorithm.h @@ -8,7 +8,7 @@ namespace DB { -struct TTlExpressions +struct TTLExpressions { ExpressionActionsPtr expression; ExpressionActionsPtr where_expression; @@ -24,7 +24,7 @@ public: using TTLInfo = IMergeTreeDataPart::TTLInfo; using MutableDataPartPtr = MergeTreeMutableDataPartPtr; - ITTLAlgorithm(const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); + ITTLAlgorithm(const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); virtual ~ITTLAlgorithm() = default; virtual void execute(Block & block) = 0; @@ -45,7 +45,7 @@ protected: bool isTTLExpired(time_t ttl) const; UInt32 getTimestampByIndex(const IColumn * column, size_t index) const; - const TTlExpressions ttl_expressions; + const TTLExpressions ttl_expressions; const TTLDescription description; const TTLInfo old_ttl_info; const time_t current_time; diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.cpp b/src/Processors/TTL/TTLAggregationAlgorithm.cpp index ab2ba5f58fc..0c6184a56e5 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.cpp +++ b/src/Processors/TTL/TTLAggregationAlgorithm.cpp @@ -5,7 +5,7 @@ namespace DB { TTLAggregationAlgorithm::TTLAggregationAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.h b/src/Processors/TTL/TTLAggregationAlgorithm.h index 9fd074efba8..f7bf19a202b 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.h +++ b/src/Processors/TTL/TTLAggregationAlgorithm.h @@ -13,7 +13,7 @@ class TTLAggregationAlgorithm final : public ITTLAlgorithm { public: TTLAggregationAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLColumnAlgorithm.cpp b/src/Processors/TTL/TTLColumnAlgorithm.cpp index cb99dcf99b1..e27050564ce 100644 --- a/src/Processors/TTL/TTLColumnAlgorithm.cpp +++ b/src/Processors/TTL/TTLColumnAlgorithm.cpp @@ -4,7 +4,7 @@ namespace DB { TTLColumnAlgorithm::TTLColumnAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLColumnAlgorithm.h b/src/Processors/TTL/TTLColumnAlgorithm.h index efcd7c74454..f34dae952d1 100644 --- a/src/Processors/TTL/TTLColumnAlgorithm.h +++ b/src/Processors/TTL/TTLColumnAlgorithm.h @@ -11,7 +11,7 @@ class TTLColumnAlgorithm final : public ITTLAlgorithm { public: TTLColumnAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, 
time_t current_time_, diff --git a/src/Processors/TTL/TTLDeleteAlgorithm.cpp b/src/Processors/TTL/TTLDeleteAlgorithm.cpp index 6a172e9c3c3..6f9bc315276 100644 --- a/src/Processors/TTL/TTLDeleteAlgorithm.cpp +++ b/src/Processors/TTL/TTLDeleteAlgorithm.cpp @@ -4,7 +4,7 @@ namespace DB { TTLDeleteAlgorithm::TTLDeleteAlgorithm( - const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) { if (!isMinTTLExpired()) diff --git a/src/Processors/TTL/TTLDeleteAlgorithm.h b/src/Processors/TTL/TTLDeleteAlgorithm.h index 23389070774..622e45acecb 100644 --- a/src/Processors/TTL/TTLDeleteAlgorithm.h +++ b/src/Processors/TTL/TTLDeleteAlgorithm.h @@ -10,7 +10,7 @@ namespace DB class TTLDeleteAlgorithm final : public ITTLAlgorithm { public: - TTLDeleteAlgorithm(const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); + TTLDeleteAlgorithm(const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); void execute(Block & block) override; void finalize(const MutableDataPartPtr & data_part) const override; diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp index 34c0cad70ea..b7cddf3c165 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp @@ -4,7 +4,7 @@ namespace DB { TTLUpdateInfoAlgorithm::TTLUpdateInfoAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLUpdateField ttl_update_field_, const String ttl_update_key_, diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h index e9bcfcdec88..0cf31765aef 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h @@ -20,7 +20,7 @@ class TTLUpdateInfoAlgorithm : public ITTLAlgorithm { public: TTLUpdateInfoAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLUpdateField ttl_update_field_, const String ttl_update_key_, diff --git a/src/Processors/Transforms/TTLCalcTransform.cpp b/src/Processors/Transforms/TTLCalcTransform.cpp index 31fb61239ef..204dfe21733 100644 --- a/src/Processors/Transforms/TTLCalcTransform.cpp +++ b/src/Processors/Transforms/TTLCalcTransform.cpp @@ -4,7 +4,22 @@ namespace DB { +static TTLExpressions getExpressions(const TTLDescription & ttl_descr, PreparedSets::Subqueries & subqueries_for_sets, const ContextPtr & context) +{ + auto expr = ttl_descr.buildExpression(context); + auto where_expr = ttl_descr.buildWhereExpression(context); + + auto expr_queries = expr.sets->getSubqueries(); + auto where_expr_queries = expr.sets->getSubqueries(); + + subqueries_for_sets.insert(subqueries_for_sets.end(), expr_queries.begin(), expr_queries.end()); + subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + + return {expr.expression, where_expr.expression}; +} + TTLCalcTransform::TTLCalcTransform( + const ContextPtr & context, const Block & 
header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -21,33 +36,39 @@ TTLCalcTransform::TTLCalcTransform( { const auto & rows_ttl = metadata_snapshot_->getRowsTTL(); algorithms.emplace_back(std::make_unique( - rows_ttl, TTLUpdateField::TABLE_TTL, rows_ttl.result_column, old_ttl_infos.table_ttl, current_time_, force_)); + getExpressions(rows_ttl, subqueries_for_sets, context), rows_ttl, + TTLUpdateField::TABLE_TTL, rows_ttl.result_column, old_ttl_infos.table_ttl, current_time_, force_)); } for (const auto & where_ttl : metadata_snapshot_->getRowsWhereTTLs()) algorithms.emplace_back(std::make_unique( - where_ttl, TTLUpdateField::ROWS_WHERE_TTL, where_ttl.result_column, old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); + getExpressions(where_ttl, subqueries_for_sets, context), where_ttl, + TTLUpdateField::ROWS_WHERE_TTL, where_ttl.result_column, old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); for (const auto & group_by_ttl : metadata_snapshot_->getGroupByTTLs()) algorithms.emplace_back(std::make_unique( - group_by_ttl, TTLUpdateField::GROUP_BY_TTL, group_by_ttl.result_column, old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_)); + getExpressions(group_by_ttl, subqueries_for_sets, context), group_by_ttl, + TTLUpdateField::GROUP_BY_TTL, group_by_ttl.result_column, old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_)); if (metadata_snapshot_->hasAnyColumnTTL()) { for (const auto & [name, description] : metadata_snapshot_->getColumnTTLs()) { algorithms.emplace_back(std::make_unique( - description, TTLUpdateField::COLUMNS_TTL, name, old_ttl_infos.columns_ttl[name], current_time_, force_)); + getExpressions(description, subqueries_for_sets, context), description, + TTLUpdateField::COLUMNS_TTL, name, old_ttl_infos.columns_ttl[name], current_time_, force_)); } } for (const auto & move_ttl : metadata_snapshot_->getMoveTTLs()) algorithms.emplace_back(std::make_unique( - move_ttl, TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); + getExpressions(move_ttl, subqueries_for_sets, context), move_ttl, + TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); for (const auto & recompression_ttl : metadata_snapshot_->getRecompressionTTLs()) algorithms.emplace_back(std::make_unique( - recompression_ttl, TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); + getExpressions(recompression_ttl, subqueries_for_sets, context), recompression_ttl, + TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); } void TTLCalcTransform::consume(Chunk chunk) diff --git a/src/Processors/Transforms/TTLCalcTransform.h b/src/Processors/Transforms/TTLCalcTransform.h index 495879400dc..960438f5f2b 100644 --- a/src/Processors/Transforms/TTLCalcTransform.h +++ b/src/Processors/Transforms/TTLCalcTransform.h @@ -15,6 +15,7 @@ class TTLCalcTransform : public IAccumulatingTransform { public: TTLCalcTransform( + const ContextPtr & context, const Block & header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -23,6 +24,8 @@ public: bool force_ ); + PreparedSets::Subqueries getSubqueries() { return 
std::move(subqueries_for_sets); } + String getName() const override { return "TTL_CALC"; } Status prepare() override; @@ -35,6 +38,7 @@ protected: private: std::vector algorithms; + PreparedSets::Subqueries subqueries_for_sets; /// ttl_infos and empty_columns are updating while reading const MergeTreeData::MutableDataPartPtr & data_part; diff --git a/src/Processors/Transforms/TTLTransform.cpp b/src/Processors/Transforms/TTLTransform.cpp index d3d45f68d46..69e2e6e5fc0 100644 --- a/src/Processors/Transforms/TTLTransform.cpp +++ b/src/Processors/Transforms/TTLTransform.cpp @@ -16,7 +16,22 @@ namespace DB { +static TTLExpressions getExpressions(const TTLDescription & ttl_descr, PreparedSets::Subqueries & subqueries_for_sets, const ContextPtr & context) +{ + auto expr = ttl_descr.buildExpression(context); + auto where_expr = ttl_descr.buildWhereExpression(context); + + auto expr_queries = expr.sets->getSubqueries(); + auto where_expr_queries = expr.sets->getSubqueries(); + + subqueries_for_sets.insert(subqueries_for_sets.end(), expr_queries.begin(), expr_queries.end()); + subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + + return {expr.expression, where_expr.expression}; +} + TTLTransform::TTLTransform( + const ContextPtr & context, const Block & header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -33,7 +48,8 @@ TTLTransform::TTLTransform( { const auto & rows_ttl = metadata_snapshot_->getRowsTTL(); auto algorithm = std::make_unique( - rows_ttl, old_ttl_infos.table_ttl, current_time_, force_); + getExpressions(rows_ttl, subqueries_for_sets, context), rows_ttl, + old_ttl_infos.table_ttl, current_time_, force_); /// Skip all data if table ttl is expired for part if (algorithm->isMaxTTLExpired() && !rows_ttl.where_expression_ast) @@ -45,11 +61,13 @@ TTLTransform::TTLTransform( for (const auto & where_ttl : metadata_snapshot_->getRowsWhereTTLs()) algorithms.emplace_back(std::make_unique( - where_ttl, old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); + getExpressions(where_ttl, subqueries_for_sets, context), where_ttl, + old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); for (const auto & group_by_ttl : metadata_snapshot_->getGroupByTTLs()) algorithms.emplace_back(std::make_unique( - group_by_ttl, old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_, + getExpressions(group_by_ttl, subqueries_for_sets, context), group_by_ttl, + old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_, getInputPort().getHeader(), storage_)); if (metadata_snapshot_->hasAnyColumnTTL()) @@ -75,18 +93,21 @@ TTLTransform::TTLTransform( } algorithms.emplace_back(std::make_unique( - description, old_ttl_infos.columns_ttl[name], current_time_, + getExpressions(description, subqueries_for_sets, context), description, + old_ttl_infos.columns_ttl[name], current_time_, force_, name, default_expression, default_column_name, isCompactPart(data_part))); } } for (const auto & move_ttl : metadata_snapshot_->getMoveTTLs()) algorithms.emplace_back(std::make_unique( - move_ttl, TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); + getExpressions(move_ttl, subqueries_for_sets, context), move_ttl, + TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); for (const auto & recompression_ttl : 
metadata_snapshot_->getRecompressionTTLs()) algorithms.emplace_back(std::make_unique( - recompression_ttl, TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); + getExpressions(recompression_ttl, subqueries_for_sets, context), recompression_ttl, + TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); } Block reorderColumns(Block block, const Block & header) diff --git a/src/Processors/Transforms/TTLTransform.h b/src/Processors/Transforms/TTLTransform.h index 3f0dffd1998..47da456a2e3 100644 --- a/src/Processors/Transforms/TTLTransform.h +++ b/src/Processors/Transforms/TTLTransform.h @@ -16,6 +16,7 @@ class TTLTransform : public IAccumulatingTransform { public: TTLTransform( + const ContextPtr & context, const Block & header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -28,6 +29,8 @@ public: Status prepare() override; + PreparedSets::Subqueries getSubqueries() { return std::move(subqueries_for_sets); } + protected: void consume(Chunk chunk) override; Chunk generate() override; @@ -40,6 +43,8 @@ private: const TTLDeleteAlgorithm * delete_algorithm = nullptr; bool all_data_dropped = false; + PreparedSets::Subqueries subqueries_for_sets; + /// ttl_infos and empty_columns are updating while reading const MergeTreeData::MutableDataPartPtr & data_part; Poco::Logger * log; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index e8e307bb148..26b290d33d5 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -31,6 +31,9 @@ #include #include #include +#include +#include +#include namespace DB { @@ -1004,8 +1007,9 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() break; } - auto res_pipe = Pipe::unitePipes(std::move(pipes)); - res_pipe.addTransform(std::move(merged_transform)); + auto builder = std::make_unique(); + builder->init(Pipe::unitePipes(std::move(pipes))); + builder->addTransform(std::move(merged_transform)); if (global_ctx->deduplicate) { @@ -1021,26 +1025,34 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() } if (DistinctSortedTransform::isApplicable(header, sort_description, global_ctx->deduplicate_by_columns)) - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); + builder->addTransform(std::make_shared( + builder->getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); else - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); + builder->addTransform(std::make_shared( + builder->getHeader(), SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); } + PreparedSets::Subqueries subqueries; + if (ctx->need_remove_expired_values) - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), *global_ctx->data, global_ctx->metadata_snapshot, global_ctx->new_data_part, global_ctx->time_of_merge, ctx->force_ttl)); + { + auto transform = std::make_shared(global_ctx->context, builder->getHeader(), *global_ctx->data, global_ctx->metadata_snapshot, global_ctx->new_data_part, global_ctx->time_of_merge, ctx->force_ttl); + subqueries = transform->getSubqueries(); + 
builder->addTransform(std::move(transform)); + } if (global_ctx->metadata_snapshot->hasSecondaryIndices()) { const auto & indices = global_ctx->metadata_snapshot->getSecondaryIndices(); - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), indices.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), global_ctx->data->getContext()))); - res_pipe.addTransform(std::make_shared(res_pipe.getHeader())); + builder->addTransform(std::make_shared( + builder->getHeader(), indices.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), global_ctx->data->getContext()))); + builder->addTransform(std::make_shared(builder->getHeader())); } - global_ctx->merged_pipeline = QueryPipeline(std::move(res_pipe)); + if (!subqueries.empty()) + builder = addCreatingSetsTransform(std::move(builder), std::move(subqueries), global_ctx->context); + + global_ctx->merged_pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); /// Dereference unique_ptr and pass horizontal_stage_progress by reference global_ctx->merged_pipeline.setProgressCallback(MergeProgressCallback(global_ctx->merge_list_element_ptr, global_ctx->watch_prev_elapsed, *global_ctx->horizontal_stage_progress)); /// Is calculated inside MergeProgressCallback. diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index d080240b066..ce9e5762cb4 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -132,7 +132,7 @@ void updateTTL( const Block & block, bool update_part_min_max_ttls) { - auto expr_and_set = ttl_entry.buildExpression(); + auto expr_and_set = ttl_entry.buildExpression(context); for (auto & subquery : expr_and_set.sets->getSubqueries()) subquery->buildSetInplace(context); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 6b6b5947581..61849f94e44 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1507,21 +1509,34 @@ private: if (!ctx->mutating_pipeline_builder.initialized()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot mutate part columns with uninitialized mutations stream. 
It's a bug"); - QueryPipelineBuilder builder(std::move(ctx->mutating_pipeline_builder)); + auto builder = std::make_unique(std::move(ctx->mutating_pipeline_builder)); if (ctx->metadata_snapshot->hasPrimaryKey() || ctx->metadata_snapshot->hasSecondaryIndices()) { - builder.addTransform(std::make_shared( - builder.getHeader(), ctx->data->getPrimaryKeyAndSkipIndicesExpression(ctx->metadata_snapshot, skip_indices))); + builder->addTransform(std::make_shared( + builder->getHeader(), ctx->data->getPrimaryKeyAndSkipIndicesExpression(ctx->metadata_snapshot, skip_indices))); - builder.addTransform(std::make_shared(builder.getHeader())); + builder->addTransform(std::make_shared(builder->getHeader())); } + PreparedSets::Subqueries subqueries; + if (ctx->execute_ttl_type == ExecuteTTLType::NORMAL) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } if (ctx->execute_ttl_type == ExecuteTTLType::RECALCULATE) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } + + if (!subqueries.empty()) + builder = addCreatingSetsTransform(std::move(builder), std::move(subqueries), ctx->context); ctx->minmax_idx = std::make_shared(); @@ -1537,7 +1552,7 @@ private: /*blocks_are_granules_size=*/ false, ctx->context->getWriteSettings()); - ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder)); + ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); ctx->mutating_pipeline.setProgressCallback(ctx->progress_callback); /// Is calculated inside MergeProgressCallback. 
ctx->mutating_pipeline.disableProfileEventUpdate(); @@ -1712,13 +1727,25 @@ private: if (ctx->mutating_pipeline_builder.initialized()) { - QueryPipelineBuilder builder(std::move(ctx->mutating_pipeline_builder)); + auto builder = std::make_unique(std::move(ctx->mutating_pipeline_builder)); + PreparedSets::Subqueries subqueries; if (ctx->execute_ttl_type == ExecuteTTLType::NORMAL) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } if (ctx->execute_ttl_type == ExecuteTTLType::RECALCULATE) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } + + if (!subqueries.empty()) + builder = addCreatingSetsTransform(std::move(builder), std::move(subqueries), ctx->context); ctx->out = std::make_shared( ctx->new_data_part, @@ -1732,7 +1759,7 @@ private: &ctx->source_part->index_granularity_info ); - ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder)); + ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); ctx->mutating_pipeline.setProgressCallback(ctx->progress_callback); /// Is calculated inside MergeProgressCallback. ctx->mutating_pipeline.disableProfileEventUpdate(); diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index 7db5af82e0b..158c13b653d 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -279,7 +279,7 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( auto add_for_rows_ttl = [&](const auto & expression, auto & to_set) { - if (add_dependent_columns(expression, to_set) && include_ttl_target) + if (add_dependent_columns(expression.getNames(), to_set) && include_ttl_target) { /// Filter all columns, if rows TTL expression have to be recalculated. for (const auto & column : getColumns().getAllPhysical()) @@ -297,16 +297,16 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( add_for_rows_ttl(entry.expression_columns, required_ttl_columns); for (const auto & entry : getRecompressionTTLs()) - add_dependent_columns(entry.expression_columns, required_ttl_columns); + add_dependent_columns(entry.expression_columns.getNames(), required_ttl_columns); for (const auto & [name, entry] : getColumnTTLs()) { - if (add_dependent_columns(entry.expression_columns, required_ttl_columns) && include_ttl_target) + if (add_dependent_columns(entry.expression_columns.getNames(), required_ttl_columns) && include_ttl_target) updated_ttl_columns.insert(name); } for (const auto & entry : getMoveTTLs()) - add_dependent_columns(entry.expression_columns, required_ttl_columns); + add_dependent_columns(entry.expression_columns.getNames(), required_ttl_columns); //TODO what about rows_where_ttl and group_by_ttl ?? 
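For orientation, the feature these TTL changes appear to target is a TTL `WHERE` clause whose condition needs a set built from a subquery: that is why `TTLTransform`/`TTLCalcTransform` now collect `PreparedSets::Subqueries` and why `MergeTask`/`MutateTask` run them through `addCreatingSetsTransform` before writing the new part. The sketch below mirrors the `02932_set_ttl_where` test touched later in this series; the table and column names are illustrative, not taken verbatim from the patch.

```sql
-- TTL with a WHERE condition that depends on a set built from a subquery;
-- the subquery is materialized (CreatingSetsTransform) before the TTL
-- expression is evaluated during a merge or mutation.
CREATE TABLE t_ttl_where
(
    a UInt32,
    timestamp DateTime
)
ENGINE = MergeTree
ORDER BY a
TTL timestamp + INTERVAL 2 SECOND WHERE a IN (SELECT number FROM system.numbers LIMIT 100_000);

INSERT INTO t_ttl_where SELECT rand(), now() FROM system.numbers LIMIT 1_000_000;
OPTIMIZE TABLE t_ttl_where FINAL; -- TTL is applied on merge; rows matching the WHERE set are dropped
```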
diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index 47138f30e4f..e02ac933028 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -169,6 +169,23 @@ static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndType return result; } +ExpressionAndSets TTLDescription::buildExpression(const ContextPtr & context) const +{ + auto ast = expression_ast->clone(); + return buildExpressionAndSets(ast, expression_columns, context); +} + +ExpressionAndSets TTLDescription::buildWhereExpression(const ContextPtr & context) const +{ + if (where_expression_ast) + { + auto ast = where_expression_ast->clone(); + return buildExpressionAndSets(ast, where_expression_columns, context); + } + + return {}; +} + TTLDescription TTLDescription::getTTLFromAST( const ASTPtr & definition_ast, const ColumnsDescription & columns, @@ -186,7 +203,7 @@ TTLDescription TTLDescription::getTTLFromAST( auto ttl_ast = result.expression_ast->clone(); auto expression = buildExpressionAndSets(ttl_ast, columns.getAllPhysical(), context).expression; - result.expression_columns = expression->getRequiredColumns(); + result.expression_columns = expression->getRequiredColumnsWithTypes(); // auto syntax_analyzer_result = TreeRewriter(context).analyze(ttl_ast, columns.getAllPhysical()); // result.expression = ExpressionAnalyzer(ttl_ast, syntax_analyzer_result, context).getActions(false); @@ -214,6 +231,8 @@ TTLDescription TTLDescription::getTTLFromAST( where_expression = buildExpressionAndSets(where_expr_ast, columns.getAllPhysical(), context).expression; // auto where_syntax_result = TreeRewriter(context).analyze(where_expr_ast, columns.getAllPhysical()); // result.where_expression = ExpressionAnalyzer(where_expr_ast, where_syntax_result, context).getActions(false); + + result.where_expression_columns = where_expression->getRequiredColumnsWithTypes(); result.where_result_column = where_expr_ast->getColumnName(); } } diff --git a/src/Storages/TTLDescription.h b/src/Storages/TTLDescription.h index 5ea243424cb..7dfc736ded2 100644 --- a/src/Storages/TTLDescription.h +++ b/src/Storages/TTLDescription.h @@ -51,10 +51,10 @@ struct TTLDescription /// TTL d + INTERVAL 1 DAY /// ^~~~~~~~~~~~~~~~~~~^ ASTPtr expression_ast; - Names expression_columns; + NamesAndTypesList expression_columns; /// Expression actions evaluated from AST - ExpressionAndSets buildExpression() const; + ExpressionAndSets buildExpression(const ContextPtr & context) const; /// Result column of this TTL expression String result_column; @@ -63,7 +63,8 @@ struct TTLDescription /// TTL ... 
WHERE x % 10 == 0 and y > 5 /// ^~~~~~~~~~~~~~~~~~~~~~^ ASTPtr where_expression_ast; - ExpressionAndSets buildWhereExpression() const; + NamesAndTypesList where_expression_columns; + ExpressionAndSets buildWhereExpression(const ContextPtr & context) const; /// Name of result column from WHERE expression String where_result_column; From 16558ccc840d7a15efb2ab0fe691a79c38dd5086 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 4 Dec 2023 18:13:34 +0000 Subject: [PATCH 0038/1081] Fix some tests --- src/Storages/TTLDescription.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index e02ac933028..e32ff11860b 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -103,7 +103,10 @@ using FindAggregateFunctionVisitor = InDepthNodeVisitorclone() : nullptr) + , expression_columns(other.expression_columns) , result_column(other.result_column) + , where_expression_ast(other.where_expression_ast ? other.where_expression_ast->clone() : nullptr) + , where_expression_columns(other.where_expression_columns) , where_result_column(other.where_result_column) , group_by_keys(other.group_by_keys) , set_parts(other.set_parts) @@ -136,12 +139,20 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) // else // expression.reset(); + expression_columns = other.expression_columns; result_column = other.result_column; + + if (other.where_expression_ast) + where_expression_ast = other.where_expression_ast->clone(); + else + where_expression_ast.reset(); + // if (other.where_expression) // where_expression = other.where_expression->clone(); // else // where_expression.reset(); + where_expression_columns = other.where_expression_columns; where_result_column = other.where_result_column; group_by_keys = other.group_by_keys; set_parts = other.set_parts; From 4de048904a3cbb6ff30e20b5a8defd1564f2e722 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 4 Dec 2023 19:14:06 +0100 Subject: [PATCH 0039/1081] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index eeea512f14a..261342da103 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-random-merge-tree-settings # shellcheck disable=SC2046 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) From d3b80ac60cdb1fa17fb8907a7a6f11afde759bab Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 4 Dec 2023 19:14:55 +0100 Subject: [PATCH 0040/1081] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 261342da103..0910ba177fb 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-random-merge-tree-settings +# Tags: long, no-random-merge-tree-settings # shellcheck disable=SC2046 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) From 6a821f9e737373b28bc98f25e10439dd04e7bdb8 Mon Sep 17 00:00:00 2001 
From: Nikolai Kochetov Date: Mon, 4 Dec 2023 19:24:27 +0000 Subject: [PATCH 0041/1081] Fix some staff --- src/Processors/QueryPlan/CreatingSetsStep.cpp | 3 +-- src/Processors/Transforms/TTLCalcTransform.cpp | 12 +++++++----- src/Processors/Transforms/TTLTransform.cpp | 12 +++++++----- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/Processors/QueryPlan/CreatingSetsStep.cpp b/src/Processors/QueryPlan/CreatingSetsStep.cpp index 11415e8d815..f13a717004f 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.cpp +++ b/src/Processors/QueryPlan/CreatingSetsStep.cpp @@ -182,8 +182,7 @@ QueryPipelineBuilderPtr addCreatingSetsTransform(QueryPipelineBuilderPtr pipelin pipelines.emplace_back(plan->buildQueryPipeline(plan_settings, pipeline_settings)); } - CreatingSetsStep(input_streams).updatePipeline(std::move(pipelines), pipeline_settings); - return std::move(pipelines.front()); + return CreatingSetsStep(input_streams).updatePipeline(std::move(pipelines), pipeline_settings); } std::vector> DelayedCreatingSetsStep::makePlansForSets(DelayedCreatingSetsStep && step) diff --git a/src/Processors/Transforms/TTLCalcTransform.cpp b/src/Processors/Transforms/TTLCalcTransform.cpp index 204dfe21733..0af9f38b20f 100644 --- a/src/Processors/Transforms/TTLCalcTransform.cpp +++ b/src/Processors/Transforms/TTLCalcTransform.cpp @@ -7,13 +7,15 @@ namespace DB static TTLExpressions getExpressions(const TTLDescription & ttl_descr, PreparedSets::Subqueries & subqueries_for_sets, const ContextPtr & context) { auto expr = ttl_descr.buildExpression(context); - auto where_expr = ttl_descr.buildWhereExpression(context); - auto expr_queries = expr.sets->getSubqueries(); - auto where_expr_queries = expr.sets->getSubqueries(); - subqueries_for_sets.insert(subqueries_for_sets.end(), expr_queries.begin(), expr_queries.end()); - subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + + auto where_expr = ttl_descr.buildWhereExpression(context); + if (where_expr.sets) + { + auto where_expr_queries = where_expr.sets->getSubqueries(); + subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + } return {expr.expression, where_expr.expression}; } diff --git a/src/Processors/Transforms/TTLTransform.cpp b/src/Processors/Transforms/TTLTransform.cpp index 69e2e6e5fc0..69b7d80c563 100644 --- a/src/Processors/Transforms/TTLTransform.cpp +++ b/src/Processors/Transforms/TTLTransform.cpp @@ -19,13 +19,15 @@ namespace DB static TTLExpressions getExpressions(const TTLDescription & ttl_descr, PreparedSets::Subqueries & subqueries_for_sets, const ContextPtr & context) { auto expr = ttl_descr.buildExpression(context); - auto where_expr = ttl_descr.buildWhereExpression(context); - auto expr_queries = expr.sets->getSubqueries(); - auto where_expr_queries = expr.sets->getSubqueries(); - subqueries_for_sets.insert(subqueries_for_sets.end(), expr_queries.begin(), expr_queries.end()); - subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + + auto where_expr = ttl_descr.buildWhereExpression(context); + if (where_expr.sets) + { + auto where_expr_queries = where_expr.sets->getSubqueries(); + subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + } return {expr.expression, where_expr.expression}; } From 0015ec28f9f70548c31e220f2dd826e4ac21f007 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 5 Dec 2023 12:45:25 
+0000 Subject: [PATCH 0042/1081] Fixing test. --- src/Storages/TTLDescription.cpp | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index e32ff11860b..bfd3afc30d8 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace DB @@ -172,11 +173,26 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndTypesList & columns, const ContextPtr & context) { ExpressionAndSets result; + auto ttl_string = queryToString(ast); auto syntax_analyzer_result = TreeRewriter(context).analyze(ast, columns); ExpressionAnalyzer analyzer(ast, syntax_analyzer_result, context); - result.expression = analyzer.getActions(false); + auto dag = analyzer.getActionsDAG(false); + + const auto * col = &dag->findInOutputs(ast->getColumnName()); + // std::cerr << "buildExpressionAndSets " << ttl_string << std::endl; + if (col->result_name != ttl_string) + col = &dag->addAlias(*col, ttl_string); + + dag->getOutputs() = {col}; + dag->removeUnusedActions(); + + result.expression = std::make_shared(dag, ExpressionActionsSettings::fromContext(context)); result.sets = analyzer.getPreparedSets(); + // std::cerr << "--------- buildExpressionAndSets\n"; + // std::cerr << result.expression->dumpActions() << std::endl; + // std::cerr << result.sets->getSubqueries().size() << std::endl; + return result; } @@ -218,7 +234,7 @@ TTLDescription TTLDescription::getTTLFromAST( // auto syntax_analyzer_result = TreeRewriter(context).analyze(ttl_ast, columns.getAllPhysical()); // result.expression = ExpressionAnalyzer(ttl_ast, syntax_analyzer_result, context).getActions(false); - result.result_column = ttl_ast->getColumnName(); + result.result_column = expression->getSampleBlock().safeGetByPosition(0).name; ExpressionActionsPtr where_expression; @@ -244,7 +260,7 @@ TTLDescription TTLDescription::getTTLFromAST( // result.where_expression = ExpressionAnalyzer(where_expr_ast, where_syntax_result, context).getActions(false); result.where_expression_columns = where_expression->getRequiredColumnsWithTypes(); - result.where_result_column = where_expr_ast->getColumnName(); + result.where_result_column = where_expression->getSampleBlock().safeGetByPosition(0).name; } } else if (ttl_element->mode == TTLMode::GROUP_BY) From 43a23898e0ddb71fe810dafd850cef911dace902 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 5 Dec 2023 14:20:07 +0000 Subject: [PATCH 0043/1081] Updating the tests. 
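The expected-output changes below appear to follow from the previous commit: `buildExpressionAndSets` now aliases the TTL expression with its formatted text and takes `result_column` from the expression's sample block, so TTL expressions are reported in pretty-printed form (`dt + toIntervalDay(1)`) instead of the internal function spelling (`plus(dt, toIntervalDay(1))`). A sketch of the recompression TTL involved is given here; it is reconstructed from `01465_ttl_recompression.reference` rather than quoted from the test, so the ORDER BY / partitioning and the exact DDL are assumptions.

```sql
-- Reconstructed from 01465_ttl_recompression.reference; not the test's literal DDL.
CREATE TABLE recompression_table
(
    dt DateTime,
    key UInt64
)
ENGINE = MergeTree
ORDER BY key
TTL dt + INTERVAL 1 DAY RECOMPRESS CODEC(ZSTD(12));

-- Expected output strings in the test change accordingly:
--   ['plus(dt, toIntervalDay(1))']  ->  ['dt + toIntervalDay(1)']
```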
--- .../0_stateless/01465_ttl_recompression.reference | 6 +++--- .../queries/0_stateless/02932_set_ttl_where.reference | 3 +++ tests/queries/0_stateless/02932_set_ttl_where.sql | 10 +--------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/tests/queries/0_stateless/01465_ttl_recompression.reference b/tests/queries/0_stateless/01465_ttl_recompression.reference index 108df565669..90661a5dc78 100644 --- a/tests/queries/0_stateless/01465_ttl_recompression.reference +++ b/tests/queries/0_stateless/01465_ttl_recompression.reference @@ -13,9 +13,9 @@ CREATE TABLE default.recompression_table\n(\n `dt` DateTime,\n `key` UInt6 1_1_1 LZ4 2_2_2 ZSTD(12) 3_3_3 ZSTD(12) -1_1_1 ['plus(dt, toIntervalDay(1))'] -2_2_2 ['plus(dt, toIntervalDay(1))'] -3_3_3 ['plus(dt, toIntervalDay(1))'] +1_1_1 ['dt + toIntervalDay(1)'] +2_2_2 ['dt + toIntervalDay(1)'] +3_3_3 ['dt + toIntervalDay(1)'] 1_1_1 LZ4 2_2_2 LZ4 3_3_3 LZ4 diff --git a/tests/queries/0_stateless/02932_set_ttl_where.reference b/tests/queries/0_stateless/02932_set_ttl_where.reference index e69de29bb2d..bb0b1cf658d 100644 --- a/tests/queries/0_stateless/02932_set_ttl_where.reference +++ b/tests/queries/0_stateless/02932_set_ttl_where.reference @@ -0,0 +1,3 @@ +0 +0 +0 diff --git a/tests/queries/0_stateless/02932_set_ttl_where.sql b/tests/queries/0_stateless/02932_set_ttl_where.sql index 85fddf613e8..bf2b317c4bf 100644 --- a/tests/queries/0_stateless/02932_set_ttl_where.sql +++ b/tests/queries/0_stateless/02932_set_ttl_where.sql @@ -1,18 +1,10 @@ -create or replace table temp ( - a UInt32 -) -engine = MergeTree -order by a; - -insert into temp select number from system.numbers limit 100_000; - create or replace table t_temp ( a UInt32, timestamp DateTime ) engine = MergeTree order by a -TTL timestamp + INTERVAL 2 SECOND WHERE a in (select a from temp); +TTL timestamp + INTERVAL 2 SECOND WHERE a in (select number from system.numbers limit 100_000); select sleep(1); insert into t_temp select rand(), now() from system.numbers limit 1_000_000; From 7dc7062dadd5ddf3bed3dea4364cabfa97bcd61a Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 6 Dec 2023 12:53:14 +0000 Subject: [PATCH 0044/1081] Fixing test. 
--- src/Interpreters/PreparedSets.cpp | 3 ++- src/Interpreters/Set.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/PreparedSets.cpp b/src/Interpreters/PreparedSets.cpp index ea8d9a62b8b..9f646825d9f 100644 --- a/src/Interpreters/PreparedSets.cpp +++ b/src/Interpreters/PreparedSets.cpp @@ -189,7 +189,8 @@ SetPtr FutureSetFromSubquery::buildOrderedSetInplace(const ContextPtr & context) } } - set_and_key->set->fillSetElements(); + if (!set_and_key->set->hasSetElements()) + set_and_key->set->fillSetElements(); return buildSetInplace(context); } diff --git a/src/Interpreters/Set.h b/src/Interpreters/Set.h index 7136b090c42..7e8e0f2371b 100644 --- a/src/Interpreters/Set.h +++ b/src/Interpreters/Set.h @@ -77,6 +77,7 @@ public: const DataTypes & getElementsTypes() const { return set_elements_types; } bool hasExplicitSetElements() const { return fill_set_elements || (!set_elements.empty() && set_elements.front()->size() == data.getTotalRowCount()); } + bool hasSetElements() const { return !set_elements.empty(); } Columns getSetElements() const { checkIsCreated(); return { set_elements.begin(), set_elements.end() }; } void checkColumnsNumber(size_t num_key_columns) const; From 23bde28ac4fc18e296daf6b04283ab50ee58d025 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 8 Dec 2023 19:11:47 +0100 Subject: [PATCH 0045/1081] Fxi --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 6 ++---- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- src/Storages/MergeTree/MutateTask.cpp | 6 ++++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 52310802c9d..5418bcd83f3 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -677,7 +677,7 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks if (!parent_part) { loadTTLInfos(); - has_broken_projections = !loadProjections(require_columns_checksums, check_consistency, false /* if_not_loaded */); + loadProjections(require_columns_checksums, check_consistency, has_broken_projections, false /* if_not_loaded */); } if (check_consistency && !has_broken_projections) @@ -742,10 +742,9 @@ void IMergeTreeDataPart::addProjectionPart( projection_parts[projection_name] = std::move(projection_part); } -bool IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded) +void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool & has_broken_projection, bool if_not_loaded) { auto metadata_snapshot = storage.getInMemoryMetadataPtr(); - bool has_broken_projection = false; for (const auto & projection : metadata_snapshot->projections) { auto path = projection.name + ".proj"; @@ -782,7 +781,6 @@ bool IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch } } } - return has_broken_projection; } void IMergeTreeDataPart::loadIndexGranularity() diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 050bd76121c..9812529086b 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -434,7 +434,7 @@ public: bool hasBrokenProjection(const String & projection_name) const; /// Return true, if all projections were loaded successfully and none was marked as broken. 
- bool loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false); + void loadProjections(bool require_columns_checksums, bool check_consistency, bool & has_broken_projection, bool if_not_loaded = false); void setBrokenReason(const String & message, int code) const; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 93b241deae7..2b0cf60a7f1 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -870,7 +870,8 @@ void finalizeMutatedPart( new_data_part->modification_time = time(nullptr); /// Load rest projections which are hardlinked - new_data_part->loadProjections(false, false, true /* if_not_loaded */); + bool noop; + new_data_part->loadProjections(false, false, noop, true /* if_not_loaded */); /// All information about sizes is stored in checksums. /// It doesn't make sense to touch filesystem for sizes. @@ -1570,8 +1571,9 @@ private: void finalize() { + bool noop; ctx->new_data_part->minmax_idx = std::move(ctx->minmax_idx); - ctx->new_data_part->loadProjections(false, false, true /* if_not_loaded */); + ctx->new_data_part->loadProjections(false, false, noop, true /* if_not_loaded */); ctx->mutating_executor.reset(); ctx->mutating_pipeline.reset(); From c8c4db5984bf9101478e0d1f33c3432c257ea7a0 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 11 Dec 2023 13:24:31 +0100 Subject: [PATCH 0046/1081] Fxi test --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 7 +++++++ .../queries/0_stateless/02916_broken_projection.reference | 7 ++++--- tests/queries/0_stateless/02916_broken_projection.sh | 4 +++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 5418bcd83f3..7af49edf788 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -780,6 +780,13 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch addProjectionPart(projection.name, std::move(part)); } } + else if (checksums.has(path)) + { + auto part = getProjectionPartBuilder(projection.name).withPartFormatFromDisk().build(); + part->setBrokenReason("Projection directory " + path + " does not exist while loading projections", ErrorCodes::NO_FILE_IN_DATA_PART); + addProjectionPart(projection.name, std::move(part)); + has_broken_projection = true; + } } } diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index b7764a6434e..358304de74a 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -406,7 +406,7 @@ SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 0 -broke all data of part 'proj' (parent part: all_2_2_0) +broke all data of part 'proj' (parent part: all_1_1_0) system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] @@ -421,13 +421,13 @@ SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST +all_1_1_0 proj FILE_DOESNT_EXIST BACKUP_CREATED RESTORED system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj_2'] +all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 @@ -442,3 +442,4 @@ SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check 
table 0 broken projections info +all_1_1_0 proj NO_FILE_IN_DATA_PART diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 0910ba177fb..eb68f8621a2 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -410,6 +410,8 @@ function test2() check test2 } +CLICKHOUSE_DATABASE="default" +CLICKHOUSE_TEST_UNIQUE_NAME="test123456" function test3() { create_table test3 test 1 @@ -473,7 +475,7 @@ function test3() check test - break_projection test proj all_2_2_0 part + break_projection test proj all_1_1_0 part check test proj FILE_DOESNT_EXIST From cd41802d7e5b056e0114c8ad7523f00828ad5940 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 11 Dec 2023 17:37:44 +0100 Subject: [PATCH 0047/1081] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index eb68f8621a2..a52570f3d52 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -410,8 +410,6 @@ function test2() check test2 } -CLICKHOUSE_DATABASE="default" -CLICKHOUSE_TEST_UNIQUE_NAME="test123456" function test3() { create_table test3 test 1 From 457032d2998a085fb9c10c0b9d536e79dbcc5dab Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 11 Dec 2023 20:40:25 +0100 Subject: [PATCH 0048/1081] Disable fault injection because it breaks .reference --- tests/queries/0_stateless/02916_broken_projection.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index a52570f3d52..2049610e45b 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -52,7 +52,7 @@ function insert() offset=$2 size=$3 echo 'insert new part' - $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size);" + $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size) SETTINGS insert_keeper_fault_injection_probability;" } function break_projection() @@ -431,11 +431,12 @@ function test3() broken_projections_info test ${CLICKHOUSE_CLIENT} -nm --query " - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false; + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false, backup_restore_keeper_fault_injection_probability=0.0; " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; + set backup_restore_keeper_fault_injection_probability=0.0; restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); " | grep -o "RESTORED" @@ -451,6 +452,7 @@ function test3() ${CLICKHOUSE_CLIENT} -nm --query " set send_logs_level='fatal'; + set backup_restore_keeper_fault_injection_probability=0.0; backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') " 2>&1 | grep -o "FILE_DOESNT_EXIST" @@ -462,12 +464,14 @@ function test3() ${CLICKHOUSE_CLIENT} -nm --query " set 
send_logs_level='fatal'; + set backup_restore_keeper_fault_injection_probability=0.0; backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3') " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; set send_logs_level='fatal'; + set backup_restore_keeper_fault_injection_probability=0.0; restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3'); " | grep -o "RESTORED" @@ -481,6 +485,7 @@ function test3() ${CLICKHOUSE_CLIENT} -nm --query " set send_logs_level='fatal'; + set backup_restore_keeper_fault_injection_probability=0.0; backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4') settings check_projection_parts=false, allow_backup_broken_projections=true; " | grep -o "BACKUP_CREATED" @@ -488,6 +493,7 @@ function test3() ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; set send_logs_level='fatal'; + set backup_restore_keeper_fault_injection_probability=0.0; restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4'); " | grep -o "RESTORED" From 8ef2638cfce90031213bbbd595a50d584406a916 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 12 Dec 2023 14:22:14 +0100 Subject: [PATCH 0049/1081] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 2049610e45b..0418759eb26 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -52,7 +52,7 @@ function insert() offset=$2 size=$3 echo 'insert new part' - $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size) SETTINGS insert_keeper_fault_injection_probability;" + $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size) SETTINGS insert_keeper_fault_injection_probability=0.0;" } function break_projection() From d81edb4adf65c8c3724ec27fc83b65d5d1b3ebad Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 13 Dec 2023 12:29:28 +0100 Subject: [PATCH 0050/1081] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 0418759eb26..07495c45214 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -431,7 +431,8 @@ function test3() broken_projections_info test ${CLICKHOUSE_CLIENT} -nm --query " - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false, backup_restore_keeper_fault_injection_probability=0.0; + set backup_restore_keeper_fault_injection_probability=0.0; + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false; " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " From 79432255df02f696962858347c2207dbdbf2b69f Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova 
<54203879+kssenii@users.noreply.github.com> Date: Mon, 18 Dec 2023 12:45:24 +0100 Subject: [PATCH 0051/1081] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 07495c45214..55e613b8f3a 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-random-merge-tree-settings +# Tags: long, no-random-merge-tree-settings, no-random-settings # shellcheck disable=SC2046 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) From 59153e865d4ffeda3c67cbdd945e14fdc860e446 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Tue, 19 Dec 2023 09:53:04 +0000 Subject: [PATCH 0052/1081] materialize column not to override past values Signed-off-by: Duc Canh Le --- src/Storages/MergeTree/MutateTask.cpp | 23 +++++++-- .../0_stateless/02008_materialize_column.sql | 1 + ..._column_not_override_past_values.reference | 29 +++++++++++ ...ialize_column_not_override_past_values.sql | 49 +++++++++++++++++++ 4 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference create mode 100644 tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 827749aa094..a04d9cdb886 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -65,6 +65,7 @@ static void splitAndModifyMutationCommands( Poco::Logger * log) { auto part_columns = part->getColumnsDescription(); + const auto & table_columns = metadata_snapshot->getColumns(); if (!isWidePart(part) || !isFullPartStorage(part->getDataPartStorage())) { @@ -73,9 +74,16 @@ static void splitAndModifyMutationCommands( for (const auto & command : commands) { + if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) + { + /// For ordinary column with default expression, materialize column should not override past values + /// So we only mutated column if `command.column_name` is a materialized column or if the part does not have physical column file + auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); + if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) + mutated_columns.emplace(command.column_name); + } if (command.type == MutationCommand::Type::MATERIALIZE_INDEX || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC - || command.type == MutationCommand::Type::MATERIALIZE_COLUMN || command.type == MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL || command.type == MutationCommand::Type::DELETE @@ -85,8 +93,6 @@ static void splitAndModifyMutationCommands( for (const auto & [column_name, expr] : command.column_to_update_expression) mutated_columns.emplace(column_name); - if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) - mutated_columns.emplace(command.column_name); } else if (command.type == MutationCommand::Type::DROP_INDEX || command.type == MutationCommand::Type::DROP_PROJECTION @@ -196,8 +202,15 @@ static void splitAndModifyMutationCommands( { for (const auto & command : commands) { - if (command.type == 
MutationCommand::Type::MATERIALIZE_INDEX - || command.type == MutationCommand::Type::MATERIALIZE_COLUMN + if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) + { + /// For ordinary column with default expression, materialize column should not override past values + /// So we only mutated column if `command.column_name` is a materialized column or if the part does not have physical column file + auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); + if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) + for_interpreter.push_back(command); + } + else if (command.type == MutationCommand::Type::MATERIALIZE_INDEX || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC || command.type == MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL diff --git a/tests/queries/0_stateless/02008_materialize_column.sql b/tests/queries/0_stateless/02008_materialize_column.sql index a78920d2525..cc7d3096402 100644 --- a/tests/queries/0_stateless/02008_materialize_column.sql +++ b/tests/queries/0_stateless/02008_materialize_column.sql @@ -17,6 +17,7 @@ ALTER TABLE tmp MATERIALIZE COLUMN s; ALTER TABLE tmp MODIFY COLUMN s String DEFAULT toString(x+2); SELECT arraySort(groupArray(x)), groupArray(s) FROM tmp; +ALTER TABLE tmp CLEAR COLUMN s; -- Need to clear because MATERIALIZE COLUMN won't override past values; ALTER TABLE tmp MATERIALIZE COLUMN s; ALTER TABLE tmp MODIFY COLUMN s String DEFAULT toString(x+3); SELECT arraySort(groupArray(x)), groupArray(s) FROM tmp; diff --git a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference new file mode 100644 index 00000000000..6b0d88bd09b --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference @@ -0,0 +1,29 @@ +--Origin-- +1 2 +2 54321 +--After materialize-- +1 2 +2 54321 +--Origin-- +1 2 +2 54321 +--After materialize-- +1 2 +2 54321 +--Origin-- +1 2 +2 \N +3 54321 +--After materialize-- +1 2 +2 \N +3 54321 +--Origin-- +1 2 +2 54321 +--After rename-- +1 2 +2 54321 +--After materialize-- +1 2 +2 54321 diff --git a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql new file mode 100644 index 00000000000..1815661e097 --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql @@ -0,0 +1,49 @@ + +SET mutations_sync = 2; +-- Compact parts +CREATE TABLE test (id Int64, foo Int64 default 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO test ( id, foo ) values ( 1, 2 ); +INSERT INTO test ( id ) values ( 2 ); +SELECT '--Origin--'; +SELECT * FROM test ORDER BY id; +ALTER TABLE test MATERIALIZE COLUMN foo; +SELECT '--After materialize--'; +SELECT * FROM test ORDER BY id; +DROP TABLE test; + +-- Wide parts +CREATE TABLE test (id Int64, foo Nullable(Int64) default 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO test ( id, foo ) values ( 1, 2 ); +INSERT INTO test ( id ) values ( 2 ); +SELECT '--Origin--'; +SELECT * FROM test ORDER BY id; +ALTER TABLE test MATERIALIZE COLUMN foo; +SELECT '--After materialize--'; +SELECT * FROM test ORDER BY id; +DROP TABLE test; + +-- Nullable column != physically absent +CREATE TABLE test (id Int64, foo Nullable(Int64) default 54321) ENGINE 
MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO test ( id, foo ) values ( 1, 2 ); +INSERT INTO test ( id, foo ) values ( 2, NULL ); +INSERT INTO test ( id ) values ( 3 ); +SELECT '--Origin--'; +SELECT * FROM test ORDER BY id; +ALTER TABLE test MATERIALIZE COLUMN foo; +SELECT '--After materialize--'; +SELECT * FROM test ORDER BY id; +DROP TABLE test; + +-- Parts with renamed column +CREATE TABLE test (id Int64, foo Int64 default 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO test ( id, foo ) values ( 1, 2 ); +INSERT INTO test ( id ) values ( 2 ); +SELECT '--Origin--'; +SELECT * FROM test ORDER BY id; +ALTER TABLE test RENAME COLUMN foo TO bar; +SELECT '--After rename--'; +SELECT * FROM test ORDER BY id; +ALTER TABLE test MATERIALIZE COLUMN bar; +SELECT '--After materialize--'; +SELECT * FROM test ORDER BY id; +DROP TABLE test; \ No newline at end of file From a924b01a023512727d6a36fc12052f67438ba199 Mon Sep 17 00:00:00 2001 From: Justin de Guzman Date: Tue, 19 Dec 2023 02:05:32 -0800 Subject: [PATCH 0053/1081] [Docs] Clarify to use query level settings in ClickHouse Cloud --- docs/en/operations/query-cache.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/en/operations/query-cache.md b/docs/en/operations/query-cache.md index def0f48b968..2f05599e666 100644 --- a/docs/en/operations/query-cache.md +++ b/docs/en/operations/query-cache.md @@ -31,6 +31,10 @@ This reduces maintenance effort and avoids redundancy. ## Configuration Settings and Usage +:::note +In ClickHouse Cloud, you must use [query level settings](/en/operations/settings/query-level) to edit query cache settings. Editing [config level settings](/en/operations/configuration-files) is currently not supported. +::: + Setting [use_query_cache](settings/settings.md#use-query-cache) can be used to control whether a specific query or all queries of the current session should utilize the query cache. 
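To make the Cloud note above concrete, here is a hedged illustration of the query-level form it refers to; `query_cache_ttl` is a standard query-cache setting used for illustration and is not part of this patch.

```sql
-- Query-level (per-statement) configuration of the query cache:
SELECT count()
FROM numbers(1000000)
SETTINGS use_query_cache = true, query_cache_ttl = 300;
```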
For example, the first execution of query From 319ae440b6ba09b1dc21b355fab22a99d073592c Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 19 Dec 2023 16:43:30 +0000 Subject: [PATCH 0054/1081] Implement Variant data type --- docs/en/operations/settings/settings.md | 52 + docs/en/sql-reference/data-types/variant.md | 217 ++ .../functions/other-functions.md | 36 + src/Columns/ColumnNullable.cpp | 22 +- src/Columns/ColumnNullable.h | 1 + src/Columns/ColumnVariant.cpp | 1360 +++++++++ src/Columns/ColumnVariant.h | 306 ++ src/Columns/IColumn.cpp | 6 + src/Columns/IColumn.h | 14 + src/Columns/MaskOperations.cpp | 6 +- src/Columns/MaskOperations.h | 2 +- src/Columns/tests/gtest_column_variant.cpp | 688 +++++ src/Core/Settings.h | 2 + src/Core/TypeId.h | 1 + src/DataTypes/DataTypeFactory.cpp | 1 + src/DataTypes/DataTypeFactory.h | 1 + src/DataTypes/DataTypeNullable.cpp | 28 + src/DataTypes/DataTypeNullable.h | 3 + src/DataTypes/DataTypeTuple.cpp | 9 +- src/DataTypes/DataTypeVariant.cpp | 197 ++ src/DataTypes/DataTypeVariant.h | 64 + src/DataTypes/EnumValues.cpp | 21 + src/DataTypes/EnumValues.h | 8 +- src/DataTypes/IDataType.cpp | 23 +- src/DataTypes/IDataType.h | 3 + .../Serializations/ISerialization.cpp | 61 +- src/DataTypes/Serializations/ISerialization.h | 13 + .../Serializations/SerializationArray.cpp | 138 +- .../Serializations/SerializationArray.h | 3 + .../Serializations/SerializationBool.cpp | 179 +- .../Serializations/SerializationBool.h | 8 +- .../SerializationCustomSimpleText.cpp | 56 + .../SerializationCustomSimpleText.h | 6 + .../Serializations/SerializationDate.cpp | 46 + .../Serializations/SerializationDate.h | 5 + .../Serializations/SerializationDate32.cpp | 45 + .../Serializations/SerializationDate32.h | 5 + .../Serializations/SerializationDateTime.cpp | 157 +- .../Serializations/SerializationDateTime.h | 5 + .../SerializationDateTime64.cpp | 112 + .../Serializations/SerializationDateTime64.h | 6 + .../Serializations/SerializationDecimal.cpp | 46 +- .../Serializations/SerializationDecimal.h | 6 +- .../Serializations/SerializationEnum.cpp | 97 + .../Serializations/SerializationEnum.h | 13 + .../SerializationFixedString.cpp | 56 + .../Serializations/SerializationFixedString.h | 6 + .../SerializationIPv4andIPv6.cpp | 188 ++ .../Serializations/SerializationIPv4andIPv6.h | 129 +- .../SerializationLowCardinality.cpp | 47 +- .../SerializationLowCardinality.h | 12 + .../Serializations/SerializationMap.cpp | 108 +- .../Serializations/SerializationMap.h | 7 +- .../Serializations/SerializationNamed.cpp | 1 + .../Serializations/SerializationNothing.h | 1 + .../Serializations/SerializationNullable.cpp | 532 +++- .../Serializations/SerializationNullable.h | 53 +- .../Serializations/SerializationNumber.cpp | 80 +- .../Serializations/SerializationNumber.h | 3 + .../Serializations/SerializationString.cpp | 101 +- .../Serializations/SerializationString.h | 5 + .../Serializations/SerializationTuple.cpp | 318 ++- .../Serializations/SerializationTuple.h | 12 + .../Serializations/SerializationUUID.cpp | 41 +- .../Serializations/SerializationUUID.h | 6 +- .../Serializations/SerializationVariant.cpp | 828 ++++++ .../Serializations/SerializationVariant.h | 116 + .../SerializationVariantElement.cpp | 241 ++ .../SerializationVariantElement.h | 87 + .../Serializations/SerializationWrapper.cpp | 25 + .../Serializations/SerializationWrapper.h | 5 + .../Serializations/SimpleTextSerialization.h | 38 + src/DataTypes/Utils.cpp | 1 + src/Databases/DatabaseReplicated.cpp | 1 + src/Formats/EscapingRuleUtils.cpp | 10 
+- src/Formats/JSONUtils.cpp | 4 +- src/Formats/SchemaInferenceUtils.cpp | 2 +- src/Functions/FunctionsConversion.h | 262 +- src/Functions/if.cpp | 50 +- src/Functions/isNotNull.cpp | 13 + src/Functions/isNull.cpp | 13 + src/Functions/multiIf.cpp | 10 + src/Functions/variantElement.cpp | 238 ++ src/IO/ReadHelpers.cpp | 298 +- src/IO/ReadHelpers.h | 198 +- src/IO/readDecimalText.h | 20 + src/Interpreters/InterpreterCreateQuery.cpp | 14 + src/Interpreters/InterpreterInsertQuery.cpp | 2 +- src/Interpreters/inplaceBlockConversions.cpp | 35 +- .../parseColumnsListForTableFunction.cpp | 11 + .../parseColumnsListForTableFunction.h | 2 + src/Parsers/ExpressionElementParsers.cpp | 2 +- .../Formats/Impl/CSVRowInputFormat.cpp | 2 +- .../Formats/Impl/MySQLDumpRowInputFormat.cpp | 2 +- .../Formats/Impl/TSKVRowInputFormat.cpp | 2 +- .../Impl/TabSeparatedRowInputFormat.cpp | 4 +- .../Formats/Impl/ValuesBlockInputFormat.cpp | 2 +- ...940_variant_text_deserialization.reference | 516 ++++ .../02940_variant_text_deserialization.sql | 266 ++ .../02941_variant_type_1.reference | 2472 +++++++++++++++++ .../0_stateless/02941_variant_type_1.sh | 124 + .../02941_variant_type_2.reference | 51 + .../0_stateless/02941_variant_type_2.sh | 71 + .../02941_variant_type_3.reference | 51 + .../0_stateless/02941_variant_type_3.sh | 71 + .../02941_variant_type_4.reference | 56 + .../0_stateless/02941_variant_type_4.sh | 66 + .../0_stateless/02942_variant_cast.reference | 25 + .../0_stateless/02942_variant_cast.sql | 23 + .../02943_variant_element.reference | 44 + .../0_stateless/02943_variant_element.sql | 16 + ...44_variant_as_if_multi_if_result.reference | 96 + .../02944_variant_as_if_multi_if_result.sql | 64 + 113 files changed, 11750 insertions(+), 584 deletions(-) create mode 100644 docs/en/sql-reference/data-types/variant.md create mode 100644 src/Columns/ColumnVariant.cpp create mode 100644 src/Columns/ColumnVariant.h create mode 100644 src/Columns/tests/gtest_column_variant.cpp create mode 100644 src/DataTypes/DataTypeVariant.cpp create mode 100644 src/DataTypes/DataTypeVariant.h create mode 100644 src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp create mode 100644 src/DataTypes/Serializations/SerializationVariant.cpp create mode 100644 src/DataTypes/Serializations/SerializationVariant.h create mode 100644 src/DataTypes/Serializations/SerializationVariantElement.cpp create mode 100644 src/DataTypes/Serializations/SerializationVariantElement.h create mode 100644 src/Functions/variantElement.cpp create mode 100644 tests/queries/0_stateless/02940_variant_text_deserialization.reference create mode 100644 tests/queries/0_stateless/02940_variant_text_deserialization.sql create mode 100644 tests/queries/0_stateless/02941_variant_type_1.reference create mode 100755 tests/queries/0_stateless/02941_variant_type_1.sh create mode 100644 tests/queries/0_stateless/02941_variant_type_2.reference create mode 100755 tests/queries/0_stateless/02941_variant_type_2.sh create mode 100644 tests/queries/0_stateless/02941_variant_type_3.reference create mode 100755 tests/queries/0_stateless/02941_variant_type_3.sh create mode 100644 tests/queries/0_stateless/02941_variant_type_4.reference create mode 100755 tests/queries/0_stateless/02941_variant_type_4.sh create mode 100644 tests/queries/0_stateless/02942_variant_cast.reference create mode 100644 tests/queries/0_stateless/02942_variant_cast.sql create mode 100644 tests/queries/0_stateless/02943_variant_element.reference create mode 100644 
tests/queries/0_stateless/02943_variant_element.sql create mode 100644 tests/queries/0_stateless/02944_variant_as_if_multi_if_result.reference create mode 100644 tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql
diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index dc46a3f0dcd..dbf5bc341cc 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -5134,3 +5134,55 @@ When set to `true` than for all s3 requests first two attempts are made with low When set to `false` than all attempts are made with identical timeouts. Default value: `true`.
+
+## allow_experimental_variant_type {#allow_experimental_variant_type}
+
+Allows creation of the experimental [Variant](../../sql-reference/data-types/variant.md) data type.
+
+Default value: `false`.
+
+## use_variant_when_no_common_type_in_if {#use_variant_when_no_common_type_in_if}
+
+Allows using the `Variant` type as the result type for [if](../../sql-reference/functions/conditional-functions.md/#if)/[multiIf](../../sql-reference/functions/conditional-functions.md/#multiif) functions when there is no common type for the argument types.
+
+Example:
+
+```sql
+SET use_variant_when_no_common_type_in_if = 1;
+SELECT toTypeName(if(number % 2, number, range(number))) as variant_type FROM numbers(1);
+SELECT if(number % 2, number, range(number)) as variant FROM numbers(5);
+```
+
+```text
+┌─variant_type───────────────────┐
+│ Variant(Array(UInt64), UInt64) │
+└────────────────────────────────┘
+┌─variant───┐
+│ []        │
+│ 1         │
+│ [0,1]     │
+│ 3         │
+│ [0,1,2,3] │
+└───────────┘
+```
+
+```sql
+SET use_variant_when_no_common_type_in_if = 1;
+SELECT toTypeName(multiIf((number % 4) = 0, 42, (number % 4) = 1, [1, 2, 3], (number % 4) = 2, 'Hello, World!', NULL)) AS variant_type FROM numbers(1);
+SELECT multiIf((number % 4) = 0, 42, (number % 4) = 1, [1, 2, 3], (number % 4) = 2, 'Hello, World!', NULL) AS variant FROM numbers(4);
+```
+
+```text
+┌─variant_type─────────────────────────┐
+│ Variant(Array(UInt8), String, UInt8) │
+└──────────────────────────────────────┘
+
+┌─variant───────┐
+│ 42            │
+│ [1,2,3]       │
+│ Hello, World! │
+│ ᴺᵁᴸᴸ          │
+└───────────────┘
+```
+
+Default value: `false`.
diff --git a/docs/en/sql-reference/data-types/variant.md b/docs/en/sql-reference/data-types/variant.md new file mode 100644 index 00000000000..34966d79079 --- /dev/null +++ b/docs/en/sql-reference/data-types/variant.md @@ -0,0 +1,217 @@
+---
+slug: /en/sql-reference/data-types/variant
+sidebar_position: 55
+sidebar_label: Variant
+---
+
+# Variant(T1, T2, T3, ...)
+
+This type represents a union of other data types. Type `Variant(T1, T2, ..., TN)` means that each row of this type
+has a value of either type `T1` or `T2` or ... or `TN` or none of them (`NULL` value).
+
+The order of nested types doesn't matter: Variant(T1, T2) = Variant(T2, T1).
+Nested types can be arbitrary types except Nullable(...), LowCardinality(Nullable(...)) and Variant(...) types.
+
+:::note
+The Variant data type is an experimental feature. To use it, set `allow_experimental_variant_type = 1`.
+:::
+
+## Creating Variant
+
+Using the `Variant` type in a table column definition:
+
+```sql
+CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
+SELECT v FROM test;
+```
+
+```text
+┌─v─────────────┐
+│ ᴺᵁᴸᴸ          │
+│ 42            │
+│ Hello, World! │
+│ [1,2,3]       │
+└───────────────┘
+```
+
+Using CAST from ordinary columns:
+
+```sql
+SELECT toTypeName(variant) as type_name, 'Hello, World!'::Variant(UInt64, String, Array(UInt64)) as variant;
+```
+
+```text
+┌─type_name──────────────────────────────┬─variant───────┐
+│ Variant(Array(UInt64), String, UInt64) │ Hello, World! │
+└────────────────────────────────────────┴───────────────┘
+```
+
+Using the functions `if`/`multiIf` when the arguments don't have a common type (the setting `use_variant_when_no_common_type_in_if` must be enabled for this):
+
+```sql
+SET use_variant_when_no_common_type_in_if = 1;
+SELECT if(number % 2, number, range(number)) as variant FROM numbers(5);
+```
+
+```text
+┌─variant───┐
+│ []        │
+│ 1         │
+│ [0,1]     │
+│ 3         │
+│ [0,1,2,3] │
+└───────────┘
+```
+
+```sql
+SET use_variant_when_no_common_type_in_if = 1;
+SELECT multiIf((number % 4) = 0, 42, (number % 4) = 1, [1, 2, 3], (number % 4) = 2, 'Hello, World!', NULL) AS variant FROM numbers(4);
+```
+
+```text
+┌─variant───────┐
+│ 42            │
+│ [1,2,3]       │
+│ Hello, World! │
+│ ᴺᵁᴸᴸ          │
+└───────────────┘
+```
+
+## Reading Variant nested types as subcolumns
+
+The Variant type supports reading a single nested type from a Variant column using the type name as a subcolumn.
+So, if you have a column `variant Variant(T1, T2, T3)`, you can read a subcolumn of type `T2` using the syntax `variant.T2`;
+this subcolumn will have type `Nullable(T2)` if `T2` can be inside `Nullable`, and `T2` otherwise. This subcolumn will
+be the same size as the original `Variant` column and will contain `NULL` values (or empty values if `T2` cannot be inside `Nullable`)
+in all rows in which the original `Variant` column doesn't have type `T2`.
+
+Variant subcolumns can also be read using the function `variantElement(variant_column, type_name)`.
+
+Examples:
+
+```sql
+CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
+SELECT v, v.String, v.UInt64, v.`Array(UInt64)` FROM test;
+```
+
+```text
+┌─v─────────────┬─v.String──────┬─v.UInt64─┬─v.Array(UInt64)─┐
+│ ᴺᵁᴸᴸ          │ ᴺᵁᴸᴸ          │     ᴺᵁᴸᴸ │ []              │
+│ 42            │ ᴺᵁᴸᴸ          │       42 │ []              │
+│ Hello, World! │ Hello, World! │     ᴺᵁᴸᴸ │ []              │
+│ [1,2,3]       │ ᴺᵁᴸᴸ          │     ᴺᵁᴸᴸ │ [1,2,3]         │
+└───────────────┴───────────────┴──────────┴─────────────────┘
+```
+
+```sql
+SELECT toTypeName(v.String), toTypeName(v.UInt64), toTypeName(v.`Array(UInt64)`) FROM test LIMIT 1;
+```
+
+```text
+┌─toTypeName(v.String)─┬─toTypeName(v.UInt64)─┬─toTypeName(v.Array(UInt64))─┐
+│ Nullable(String)     │ Nullable(UInt64)     │ Array(UInt64)               │
+└──────────────────────┴──────────────────────┴─────────────────────────────┘
+```
+
+```sql
+SELECT v, variantElement(v, 'String'), variantElement(v, 'UInt64'), variantElement(v, 'Array(UInt64)') FROM test;
+```
+
+```text
+┌─v─────────────┬─variantElement(v, 'String')─┬─variantElement(v, 'UInt64')─┬─variantElement(v, 'Array(UInt64)')─┐
+│ ᴺᵁᴸᴸ          │ ᴺᵁᴸᴸ                        │                        ᴺᵁᴸᴸ │ []                                 │
+│ 42            │ ᴺᵁᴸᴸ                        │                          42 │ []                                 │
+│ Hello, World! │ Hello, World!               │                        ᴺᵁᴸᴸ │ []                                 │
+│ [1,2,3]       │ ᴺᵁᴸᴸ                        │                        ᴺᵁᴸᴸ │ [1,2,3]                            │
+└───────────────┴─────────────────────────────┴─────────────────────────────┴────────────────────────────────────┘
+```
+
+## Conversion between a Variant column and other columns
+
+There are 3 possible conversions that can be performed with a `Variant` column.
+
+### Converting an ordinary column to a Variant column
+
+It is possible to convert an ordinary column with type `T` to a `Variant` column containing this type:
+
+```sql
+SELECT toTypeName(variant) as type_name, 'Hello, World!'::Variant(UInt64, String, Array(UInt64)) as variant;
+```
+
+```text
+┌─type_name──────────────────────────────┬─variant───────┐
+│ Variant(Array(UInt64), String, UInt64) │ Hello, World! │
+└────────────────────────────────────────┴───────────────┘
+```
+
+### Converting a Variant column to an ordinary column
+
+It is possible to convert a `Variant` column to an ordinary column. In this case all nested variants will be converted to the destination type:
+
+```sql
+CREATE TABLE test (v Variant(UInt64, String)) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('42.42');
+SELECT v::Nullable(Float64) FROM test;
+```
+
+```text
+┌─CAST(v, 'Nullable(Float64)')─┐
+│                         ᴺᵁᴸᴸ │
+│                           42 │
+│                        42.42 │
+└──────────────────────────────┘
+```
+
+### Converting a Variant to another Variant
+
+It is possible to convert a `Variant` column to another `Variant` column, but only if the destination `Variant` column contains all nested types from the original `Variant`:
+
+```sql
+CREATE TABLE test (v Variant(UInt64, String)) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('String');
+SELECT v::Variant(UInt64, String, Array(UInt64)) FROM test;
+```
+
+```text
+┌─CAST(v, 'Variant(UInt64, String, Array(UInt64))')─┐
+│ ᴺᵁᴸᴸ                                              │
+│ 42                                                │
+│ String                                            │
+└───────────────────────────────────────────────────┘
+```
+
+
+## Reading Variant type from the data
+
+All text formats (TSV, CSV, CustomSeparated, Values, JSONEachRow, etc.) support reading the `Variant` type. During data parsing, ClickHouse tries to insert the value into the most appropriate variant type.
+
+Example:
+
+```sql
+SELECT
+    v,
+    variantElement(v, 'String') AS str,
+    variantElement(v, 'UInt64') AS num,
+    variantElement(v, 'Float64') AS float,
+    variantElement(v, 'DateTime') AS date,
+    variantElement(v, 'Array(UInt64)') AS arr
+FROM format(JSONEachRow, 'v Variant(String, UInt64, Float64, DateTime, Array(UInt64))', $$
+{"v" : "Hello, World!"},
+{"v" : 42},
+{"v" : 42.42},
+{"v" : "2020-01-01 00:00:00"},
+{"v" : [1, 2, 3]}
+$$)
+```
+
+```text
+┌─v───────────────────┬─str───────────┬──num─┬─float─┬────────────────date─┬─arr─────┐
+│ Hello, World!       │ Hello, World! │ ᴺᵁᴸᴸ │  ᴺᵁᴸᴸ │                ᴺᵁᴸᴸ │ []      │
+│ 42                  │ ᴺᵁᴸᴸ          │   42 │  ᴺᵁᴸᴸ │                ᴺᵁᴸᴸ │ []      │
+│ 42.42               │ ᴺᵁᴸᴸ          │ ᴺᵁᴸᴸ │ 42.42 │                ᴺᵁᴸᴸ │ []      │
+│ 2020-01-01 00:00:00 │ ᴺᵁᴸᴸ          │ ᴺᵁᴸᴸ │  ᴺᵁᴸᴸ │ 2020-01-01 00:00:00 │ []      │
+│ [1,2,3]             │ ᴺᵁᴸᴸ          │ ᴺᵁᴸᴸ │  ᴺᵁᴸᴸ │                ᴺᵁᴸᴸ │ [1,2,3] │
+└─────────────────────┴───────────────┴──────┴───────┴─────────────────────┴─────────┘
+```
diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 5b9d01985dd..47b5ac7b724 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2831,3 +2831,39 @@ Result: │ SELECT a, b FROM tab WHERE (a > 3) AND (b < 3) │ └─────────────────────────────────────────────────────────────────────────┘ ```
+
+## variantElement
+
+Extracts a column with the specified type from a `Variant` column.
+
+**Syntax**
+
+``` sql
+variantElement(variant, type_name[, default_value])
+```
+
+- `variant` — Variant column. [Variant](../../sql-reference/data-types/variant.md).
+- `type_name` — The name of the variant type to extract. [String](../../sql-reference/data-types/string.md).
+- `default_value` - The default value that will be used if variant doesn't have variant with specified type. Can be any type. Optional. + +**Returned value** + +- Subcolumn of a `Variant` column with specified type. + +**Example** + +```sql +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT v, variantElement(v, 'String'), variantElement(v, 'UInt64'), variantElement(v, 'Array(UInt64)') FROM test; +``` + +```text +┌─v─────────────┬─variantElement(v, 'String')─┬─variantElement(v, 'UInt64')─┬─variantElement(v, 'Array(UInt64)')─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ ᴺᵁᴸᴸ │ 42 │ [] │ +│ Hello, World! │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ +│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ +└───────────────┴─────────────────────────────┴─────────────────────────────┴────────────────────────────────────┘ +``` + diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index 4ee6bb3d586..d2a579d6800 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -890,10 +890,7 @@ ColumnPtr makeNullable(const ColumnPtr & column) ColumnPtr makeNullableOrLowCardinalityNullable(const ColumnPtr & column) { - if (isColumnNullable(*column)) - return column; - - if (isColumnLowCardinalityNullable(*column)) + if (isColumnNullableOrLowCardinalityNullable(*column)) return column; if (isColumnConst(*column)) @@ -919,4 +916,21 @@ ColumnPtr makeNullableSafe(const ColumnPtr & column) return column; } +ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column) +{ + if (isColumnNullableOrLowCardinalityNullable(*column)) + return column; + + if (isColumnConst(*column)) + return ColumnConst::create(makeNullableOrLowCardinalityNullableSafe(assert_cast(*column).getDataColumnPtr()), column->size()); + + if (column->lowCardinality()) + return assert_cast(*column).cloneNullable(); + + if (column->canBeInsideNullable()) + return makeNullableSafe(column); + + return column; +} + } diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index b57fdf3064d..60c7750f8fc 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -231,5 +231,6 @@ private: ColumnPtr makeNullable(const ColumnPtr & column); ColumnPtr makeNullableSafe(const ColumnPtr & column); ColumnPtr makeNullableOrLowCardinalityNullable(const ColumnPtr & column); +ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column); } diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp new file mode 100644 index 00000000000..67754e77992 --- /dev/null +++ b/src/Columns/ColumnVariant.cpp @@ -0,0 +1,1360 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int ILLEGAL_COLUMN; + extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; + extern const int PARAMETER_OUT_OF_BOUND; + extern const int SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT; + extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; +} + +std::string ColumnVariant::getName() const +{ + WriteBufferFromOwnString res; + res << "Variant("; + bool is_first = true; + for (const auto & local_variant : global_to_local_discriminators) + { + if (!is_first) + res << ", "; + is_first = false; + res << variants[local_variant]->getName(); + } + res << ")"; + return res.str(); +} + + +void 
ColumnVariant::initIdentityGlobalToLocalDiscriminatorsMapping() +{ + local_to_global_discriminators.reserve(variants.size()); + global_to_local_discriminators.reserve(variants.size()); + for (size_t i = 0; i != variants.size(); ++i) + { + local_to_global_discriminators.push_back(i); + global_to_local_discriminators.push_back(i); + } +} + +ColumnVariant::ColumnVariant(MutableColumns && variants_) : ColumnVariant(std::move(variants_), {}) +{ +} + +ColumnVariant::ColumnVariant(MutableColumns && variants_, const std::vector & local_to_global_discriminators_) +{ + /// Empty local_to_global_discriminators mapping means that variants are already in the global order. + if (!local_to_global_discriminators_.empty() && local_to_global_discriminators_.size() != variants_.size()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "The number of values in local_to_global_discriminators mapping ({}) doesn't match the number of variants ({})", + local_to_global_discriminators_.size(), + variants_.size()); + + /// As variants are empty, column with local discriminators will be also empty and we can reorder variants according to global discriminators. + variants.resize(variants_.size()); + for (size_t i = 0; i != variants_.size(); ++i) + { + if (isColumnConst(*variants_[i])) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element"); + + if (!variants_[i]->empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Not empty column passed to ColumnVariant, but no local_discriminators passed"); + + if (!local_to_global_discriminators_.empty() && local_to_global_discriminators_[i] > variants_.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid global discriminator {}. The number of variants: {}", UInt64(local_to_global_discriminators_[i]), variants_.size()); + + if (local_to_global_discriminators_.empty()) + variants[i] = std::move(variants_[i]); + else + variants[local_to_global_discriminators_[i]] = std::move(variants_[i]); + } + + local_discriminators = ColumnDiscriminators::create(); + offsets = ColumnOffsets::create(); + + /// Now global and local discriminators are the same. 
+ initIdentityGlobalToLocalDiscriminatorsMapping(); +} + +ColumnVariant::ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_) : ColumnVariant(std::move(local_discriminators_), nullptr, std::move(variants_), {}) +{ +} + +ColumnVariant::ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_, const std::vector & global_discriminators) : ColumnVariant(std::move(local_discriminators_), nullptr, std::move(variants_), global_discriminators) +{ +} + +ColumnVariant::ColumnVariant(DB::MutableColumnPtr local_discriminators_, DB::MutableColumnPtr offsets_, DB::MutableColumns && variants_) : ColumnVariant(std::move(local_discriminators_), std::move(offsets_), std::move(variants_), {}) +{ +} + +ColumnVariant::ColumnVariant(DB::MutableColumnPtr local_discriminators_, DB::MutableColumnPtr offsets_, DB::MutableColumns && variants_, const std::vector & local_to_global_discriminators_) +{ + if (variants_.size() > MAX_NESTED_COLUMNS) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Variant type with more than {} nested types is not allowed", ColumnVariant::MAX_NESTED_COLUMNS); + + local_discriminators = std::move(local_discriminators_); + const ColumnDiscriminators * discriminators_concrete = typeid_cast(local_discriminators.get()); + if (!discriminators_concrete) + throw Exception(ErrorCodes::LOGICAL_ERROR, "discriminator column must be a ColumnUInt8"); + + variants.reserve(variants_.size()); + size_t total_size = 0; + for (auto & variant : variants_) + { + if (isColumnConst(*variant)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element"); + + total_size += variant->size(); + variants.push_back(std::move(variant)); + } + + /// We can have more discriminators than values in columns + /// (because of NULL discriminators), but not less. + if (total_size > local_discriminators->size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Nested columns sizes are inconsistent with local_discriminators column size. Total column sizes: {}, local_discriminators size: {}", total_size, local_discriminators->size()); + + if (offsets_) + { + if (!typeid_cast(offsets_.get())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "offsets column must be a ColumnUInt64"); + + offsets = std::move(offsets_); + } + else + { + /// If no offsets column was provided, construct offsets based on discriminators. + offsets = ColumnOffsets::create(); + Offsets & offsets_data = typeid_cast(offsets.get())->getData(); + offsets_data.reserve(discriminators_concrete->size()); + /// If we have only NULLs, offsets column will not contain any real offsets. + if (hasOnlyNulls()) + { + offsets_data.resize(discriminators_concrete->size()); + } + /// If we have only one non empty variant and no NULLs, + /// offsets column will contain just sequential offsets 0, 1, 2, ... + else if (getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + for (size_t i = 0; i != discriminators_concrete->size(); ++i) + offsets_data.push_back(i); + } + /// Otherwise we should iterate through discriminators and + /// remember current offset for each variant column. + else + { + std::vector nested_offsets; + nested_offsets.resize(variants.size()); + for (Discriminator discr : discriminators_concrete->getData()) + { + if (discr == NULL_DISCRIMINATOR) + offsets_data.emplace_back(); + else + offsets_data.push_back(nested_offsets[discr]++); + } + } + } + + /// Empty global_discriminators means that variants are already in global order. 
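+ /// The two mappings below are inverse permutations of each other:
+ /// local_to_global_discriminators[local] is the global discriminator of variants[local],
+ /// and global_to_local_discriminators[global] is the index of that variant inside `variants`.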
+ if (local_to_global_discriminators_.empty()) + { + initIdentityGlobalToLocalDiscriminatorsMapping(); + } + else + { + if (local_to_global_discriminators_.size() != variants.size()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "The number of values in local_to_global_discriminators mapping ({}) doesn't match the number of variants ({})", + local_to_global_discriminators_.size(), + variants.size()); + + local_to_global_discriminators = local_to_global_discriminators_; + global_to_local_discriminators.resize(local_to_global_discriminators.size()); + /// Create mapping global discriminator -> local discriminator + for (size_t i = 0; i != local_to_global_discriminators.size(); ++i) + { + if (local_to_global_discriminators[i] > variants.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid global discriminator {}. The number of variants: {}", UInt64(local_to_global_discriminators[i]), variants_.size()); + + global_to_local_discriminators[local_to_global_discriminators[i]] = i; + } + } +} + +ColumnVariant::Ptr ColumnVariant::create(const Columns & variants, const std::vector & local_to_global_discriminators) +{ + MutableColumns mutable_variants; + mutable_variants.reserve(variants.size()); + for (const auto & variant : variants) + { + if (isColumnConst(*variant)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element"); + mutable_variants.emplace_back(variant->assumeMutable()); + } + + return ColumnVariant::create(std::move(mutable_variants), local_to_global_discriminators); +} + +ColumnVariant::Ptr ColumnVariant::create(const DB::ColumnPtr & local_discriminators, const DB::Columns & variants, const std::vector & local_to_global_discriminators) +{ + MutableColumns mutable_variants; + mutable_variants.reserve(variants.size()); + for (const auto & variant : variants) + { + if (isColumnConst(*variant)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element"); + mutable_variants.emplace_back(variant->assumeMutable()); + } + + return ColumnVariant::create(local_discriminators->assumeMutable(), std::move(mutable_variants), local_to_global_discriminators); +} + +ColumnVariant::Ptr ColumnVariant::create(const DB::ColumnPtr & local_discriminators, const DB::ColumnPtr & offsets, const DB::Columns & variants, const std::vector & local_to_global_discriminators) +{ + MutableColumns mutable_variants; + mutable_variants.reserve(variants.size()); + for (const auto & variant : variants) + { + if (isColumnConst(*variant)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element"); + mutable_variants.emplace_back(variant->assumeMutable()); + } + + return ColumnVariant::create(local_discriminators->assumeMutable(), offsets->assumeMutable(), std::move(mutable_variants), local_to_global_discriminators); +} + +MutableColumnPtr ColumnVariant::cloneEmpty() const +{ + MutableColumns new_variants; + new_variants.reserve(variants.size()); + for (const auto & variant : variants) + new_variants.emplace_back(variant->cloneEmpty()); + + return ColumnVariant::create(std::move(new_variants), local_to_global_discriminators); +} + +MutableColumnPtr ColumnVariant::cloneResized(size_t new_size) const +{ + if (new_size == 0) + return cloneEmpty(); + + const size_t num_variants = variants.size(); + size_t size = local_discriminators->size(); + /// If new size is bigger than the old one, just clone column and append default values. 
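+ /// For ColumnVariant a "default" row is a NULL row: insertManyDefaults only appends
+ /// NULL_DISCRIMINATOR to local_discriminators and placeholder offsets, the nested variants stay untouched.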
+ if (new_size >= size) + { + MutableColumns new_variants; + new_variants.reserve(num_variants); + for (const auto & variant : variants) + new_variants.emplace_back(IColumn::mutate(variant)); + + auto res = ColumnVariant::create(IColumn::mutate(local_discriminators), IColumn::mutate(offsets), std::move(new_variants), local_to_global_discriminators); + res->insertManyDefaults(new_size - size); + return res; + } + + /// If new size is less than current size, we should find the new size for all variants. + + /// Optimization for case when we have only NULLs. In this case we should just resize discriminators and offsets. + if (hasOnlyNulls()) + { + MutableColumns new_variants; + new_variants.reserve(num_variants); + for (const auto & variant : variants) + new_variants.emplace_back(IColumn::mutate(variant)); + + return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); + } + + /// Optimization for case when there is only 1 non-empty variant and no NULLs. + /// In this case we can simply call cloneResized on this single variant, discriminators and offsets. + if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + MutableColumns new_variants; + new_variants.reserve(num_variants); + for (size_t i = 0; i != variants.size(); ++i) + { + if (i == *non_empty_local_discr) + new_variants.emplace_back(variants[i]->cloneResized(new_size)); + else + new_variants.emplace_back(variants[i]->cloneEmpty()); + } + + return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); + } + + const auto & local_discriminators_data = getLocalDiscriminators(); + const auto & offsets_data = getOffsets(); + + /// We can find all variants sizes by scanning all new_size local_discriminators and calculating + /// sizes for all new variants. This code is below and commented. + +// std::vector new_nested_sizes(num_variants, 0); +// for (size_t i = 0; i != new_size; ++i) +// { +// Discriminator discr = local_discriminators_data[i]; +// if (discr != NULL_DISCRIMINATOR) +// ++new_nested_sizes[discr]; +// } +// +// MutableColumns new_variants; +// new_variants.reserve(num_variants); +// for (size_t i = 0; i != num_variants; ++i) +// { +// if (new_nested_sizes[i]) +// new_variants.emplace_back(variants[i]->cloneResized(new_nested_sizes[i])); +// else +// new_variants.emplace_back(variants[i]->cloneEmpty()); +// } +// +// return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); + + /// But instead we are trying to optimize it using offsets column: + /// For all non-empty variants we are trying to find last occurrence of its discriminator in local_discriminators[:new_size] or + /// first occurrence in local_discriminators[new_size:]. The same row in offsets column will contain the desired size (or size - 1) of variant. + /// All empty variants will remain empty. + /// Not sure how good this optimization is, feel free to remove it and use simpler version above. + + MutableColumns new_variants(num_variants); + std::unordered_set seen_variants; + /// First, check which variants are empty. They will remain empty. 
+ for (Discriminator i = 0; i != num_variants; ++i) + { + if (variants[i]->empty()) + { + seen_variants.insert(i); + new_variants[i] = variants[i]->cloneEmpty(); + } + } + + /// Now, iterate through local discriminators using two pointers. + /// First will go from new_size - 1 to 0, second from new_size to size. + /// Finish when we find all variants or hit lower or upper bound. + ssize_t i = new_size - 1; + size_t j = new_size; + while (i != -1 && j != size) + { + Discriminator i_discr = local_discriminators_data[i]; + if (i_discr != NULL_DISCRIMINATOR) + { + auto [_, inserted] = seen_variants.insert(i_discr); + /// If this is the first occurrence of this discriminator, + /// we can get new size for this variant. + if (inserted) + { + new_variants[i_discr] = variants[i_discr]->cloneResized(offsets_data[i] + 1); + if (seen_variants.size() == num_variants) + break; + } + } + + Discriminator j_discr = local_discriminators_data[j]; + if (j_discr != NULL_DISCRIMINATOR) + { + auto [_, inserted] = seen_variants.insert(j_discr); + /// If this is the first occurrence of this discriminator, + /// we can get new size for this variant. + if (inserted) + { + new_variants[j_discr] = variants[j_discr]->cloneResized(offsets_data[j]); + if (seen_variants.size() == num_variants) + break; + } + } + + --i; + ++j; + } + + /// We can finish in 3 cases: + /// 1) seen_variants.size() == num_variants - we found local_discriminators of all variants, nothing to do. + /// 2) i == -1 - we scanned all values in local_discriminators[:new_size]. Not found variants doesn't have + /// values in local_discriminators[:new_size], so they should be empty in the resized version. + /// 3) j == size - we scanned all values in local_discriminators[new_size:]. Not found variants doesn't have + /// values in local_discriminators[new_size:], so, we should use the full variant in the resized version. 
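+ /// Worked example (illustration only): with local_discriminators = [0, 1, 0, NULL, 1],
+ /// offsets = [0, 0, 1, 0, 1] and new_size = 3, the backward scan finds discriminator 0 at row 2
+ /// (offset 1, so variant 0 is resized to 2 rows) and discriminator 1 at row 1 (offset 0, so it is
+ /// resized to 1 row); all variants are seen and the loop above exits before hitting either bound.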
+ if (seen_variants.size() != num_variants) + { + for (size_t discr = 0; discr != num_variants; ++discr) + { + if (!seen_variants.contains(discr)) + { + if (i == -1) + new_variants[discr] = variants[discr]->cloneEmpty(); + else + new_variants[discr] = IColumn::mutate(variants[discr]); + } + } + } + + return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); +} + +Field ColumnVariant::operator[](size_t n) const +{ + Discriminator discr = localDiscriminatorAt(n); + if (discr == NULL_DISCRIMINATOR) + return Null(); + return (*variants[discr])[offsetAt(n)]; +} + +void ColumnVariant::get(size_t n, Field & res) const +{ + Discriminator discr = localDiscriminatorAt(n); + if (discr == NULL_DISCRIMINATOR) + res = Null(); + else + variants[discr]->get(offsetAt(n), res); +} + +bool ColumnVariant::isDefaultAt(size_t n) const +{ + return localDiscriminatorAt(n) == NULL_DISCRIMINATOR; +} + +bool ColumnVariant::isNullAt(size_t n) const +{ + return localDiscriminatorAt(n) == NULL_DISCRIMINATOR; +} + +StringRef ColumnVariant::getDataAt(size_t) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDataAt is not supported for {}", getName()); +} + +void ColumnVariant::insertData(const char *, size_t) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertData is not supported for {}", getName()); +} + +void ColumnVariant::insert(const Field & field) +{ + if (field.isNull()) + insertDefault(); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot insert field {} to column {}", toString(field), getName()); +} + +void ColumnVariant::insertFrom(const IColumn & src_, size_t n) +{ + const ColumnVariant & src = assert_cast(src_); + + const size_t num_variants = variants.size(); + if (src.variants.size() != num_variants) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types"); + + /// Remember that src column can have different local variants order. + Discriminator global_discr = src.globalDiscriminatorAt(n); + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); + getLocalDiscriminators().push_back(local_discr); + if (local_discr == NULL_DISCRIMINATOR) + { + getOffsets().emplace_back(); + } + else + { + getOffsets().push_back(variants[local_discr]->size()); + variants[local_discr]->insertFrom(src.getVariantByGlobalDiscriminator(global_discr), src.offsetAt(n)); + } +} + +void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length) +{ + const size_t num_variants = variants.size(); + const auto & src = assert_cast(src_); + if (src.variants.size() != num_variants) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types"); + + if (start + length > src.getLocalDiscriminators().size()) + throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND, "Parameter out of bound in ColumnVariant::insertRangeFrom method. " + "[start({}) + length({}) > local_discriminators.size({})]", start, length, src.getLocalDiscriminators().size()); + + /// If src column contains only NULLs, just insert NULLs. + if (src.hasOnlyNulls()) + { + insertManyDefaults(length); + return; + } + + /// Optimization for case when there is only 1 non-empty variant and no NULLs in src column. + /// In this case we can simply call insertRangeFrom on this single variant. 
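+ /// The check below relies on the fact that in this case the single non-empty variant contains exactly
+ /// as many rows as the whole local_discriminators column (see getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls).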
+ if (auto non_empty_src_local_discr = src.getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + auto local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(*non_empty_src_local_discr)); + size_t offset = variants[local_discr]->size(); + variants[local_discr]->insertRangeFrom(*src.variants[*non_empty_src_local_discr], start, length); + getLocalDiscriminators().resize_fill(local_discriminators->size() + length, local_discr); + auto & offsets_data = getOffsets(); + offsets_data.reserve(offsets_data.size() + length); + for (size_t i = 0; i != length; ++i) + offsets_data.push_back(offset++); + return; + } + + /// Iterate through src local_discriminators in range [start, start + length], + /// collect ranges we need to insert for all variants and update offsets. + /// nested_ranges[i].first - offset in src.variants[i] + /// nested_ranges[i].second - length in src.variants[i] + std::vector> nested_ranges(num_variants, {0, 0}); + auto & offsets_data = getOffsets(); + offsets_data.reserve(offsets_data.size() + length); + auto & local_discriminators_data = getLocalDiscriminators(); + local_discriminators_data.reserve(local_discriminators_data.size() + length); + const auto & src_offsets_data = src.getOffsets(); + const auto & src_local_discriminators_data = src.getLocalDiscriminators(); + for (size_t i = start; i != start + length; ++i) + { + /// We insert from src.variants[src_local_discr] to variants[local_discr] + Discriminator src_local_discr = src_local_discriminators_data[i]; + Discriminator local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr)); + local_discriminators_data.push_back(local_discr); + if (local_discr == NULL_DISCRIMINATOR) + { + offsets_data.emplace_back(); + } + else + { + /// If we see this discriminator for the first time, set its range start. + if (!nested_ranges[src_local_discr].second) + nested_ranges[src_local_discr].first = src_offsets_data[i]; + /// Update offsets column with correct offset. + offsets_data.push_back(variants[local_discr]->size() + nested_ranges[src_local_discr].second); + ++nested_ranges[src_local_discr].second; + } + } + + for (size_t src_local_discr = 0; src_local_discr != nested_ranges.size(); ++src_local_discr) + { + auto [nested_start, nested_length] = nested_ranges[src_local_discr]; + auto local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr)); + if (nested_length) + variants[local_discr]->insertRangeFrom(*src.variants[src_local_discr], nested_start, nested_length); + } +} + +void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length) +{ + const size_t num_variants = variants.size(); + const auto & src = assert_cast(src_); + if (src.variants.size() != num_variants) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types"); + + /// Remember that src column can have different local variants order. 
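+ /// Translate the discriminator in two steps, src local -> global -> our local,
+ /// because only global discriminators are guaranteed to mean the same variant in both columns.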
+ Discriminator src_local_discr = src.localDiscriminatorAt(position); + Discriminator local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr)); + auto & local_discriminators_data = getLocalDiscriminators(); + local_discriminators_data.resize_fill(local_discriminators_data.size() + length, local_discr); + + auto & offsets_data = getOffsets(); + if (local_discr == NULL_DISCRIMINATOR) + { + offsets_data.resize_fill(offsets_data.size() + length); + } + else + { + size_t prev_offset = variants[local_discr]->size(); + offsets_data.reserve(offsets_data.size() + length); + for (size_t i = 0; i != length; ++i) + offsets_data.push_back(prev_offset + i); + + variants[local_discr]->insertManyFrom(*src.variants[src_local_discr], src.offsetAt(position), length); + } +} + +void ColumnVariant::insertDefault() +{ + getLocalDiscriminators().push_back(NULL_DISCRIMINATOR); + getOffsets().emplace_back(); +} + +void ColumnVariant::insertManyDefaults(size_t length) +{ + size_t size = local_discriminators->size(); + getLocalDiscriminators().resize_fill(size + length, NULL_DISCRIMINATOR); + getOffsets().resize_fill(size + length); +} + +void ColumnVariant::popBack(size_t n) +{ + /// If we have only NULLs, just pop back from local_discriminators and offsets. + if (hasOnlyNulls()) + { + local_discriminators->popBack(n); + offsets->popBack(n); + return; + } + + /// Optimization for case when there is only 1 non-empty variant and no NULLs. + /// In this case we can just popBack n elements from this variant. + if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + variants[*non_empty_local_discr]->popBack(n); + local_discriminators->popBack(n); + offsets->popBack(n); + return; + } + + /// Calculate how many rows we need to pop from each variant + auto & local_discriminators_data = getLocalDiscriminators(); + size_t size = local_discriminators_data.size(); + const size_t num_variants = variants.size(); + std::vector nested_n(num_variants, 0); + for (size_t i = 0; i != n; ++i) + { + Discriminator discr = local_discriminators_data[size - i - 1]; + if (discr != NULL_DISCRIMINATOR) + ++nested_n[discr]; + } + + for (size_t i = 0; i != num_variants; ++i) + { + if (nested_n[i]) + variants[i]->popBack(nested_n[i]); + } + + local_discriminators->popBack(n); + offsets->popBack(n); +} + +StringRef ColumnVariant::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin, const UInt8 *) const +{ + /// During any serialization/deserialization we should always use global discriminators. + Discriminator global_discr = globalDiscriminatorAt(n); + char * pos = arena.allocContinue(sizeof(global_discr), begin); + memcpy(pos, &global_discr, sizeof(global_discr)); + StringRef res(pos, sizeof(global_discr)); + + if (global_discr == NULL_DISCRIMINATOR) + return res; + + auto value_ref = variants[localDiscriminatorByGlobal(global_discr)]->serializeValueIntoArena(offsetAt(n), arena, begin); + res.data = value_ref.data - res.size; + res.size += value_ref.size; + + return res; +} + +const char * ColumnVariant::deserializeAndInsertFromArena(const char * pos) +{ + /// During any serialization/deserialization we should always use global discriminators. 
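+ /// The layout written by serializeValueIntoArena above is [global discriminator][value serialized
+ /// by the selected variant], or just [global discriminator] for NULL rows (NULL_DISCRIMINATOR).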
+ Discriminator global_discr = unalignedLoad(pos); + pos += sizeof(global_discr); + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); + getLocalDiscriminators().push_back(local_discr); + if (local_discr == NULL_DISCRIMINATOR) + { + getOffsets().emplace_back(); + return pos; + } + + getOffsets().push_back(variants[local_discr]->size()); + return variants[local_discr]->deserializeAndInsertFromArena(pos); +} + +const char * ColumnVariant::skipSerializedInArena(const char * pos) const +{ + Discriminator global_discr = unalignedLoad(pos); + pos += sizeof(global_discr); + if (global_discr == NULL_DISCRIMINATOR) + return pos; + + return variants[localDiscriminatorByGlobal(global_discr)]->skipSerializedInArena(pos); +} + +void ColumnVariant::updateHashWithValue(size_t n, SipHash & hash) const +{ + Discriminator global_discr = globalDiscriminatorAt(n); + hash.update(global_discr); + if (global_discr != NULL_DISCRIMINATOR) + variants[localDiscriminatorByGlobal(global_discr)]->updateHashWithValue(offsetAt(n), hash); +} + +void ColumnVariant::updateWeakHash32(WeakHash32 & hash) const +{ + auto s = size(); + + if (hash.getData().size() != s) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: " + "column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size())); + + /// If we have only NULLs, keep hash unchanged. + if (hasOnlyNulls()) + return; + + /// Optimization for case when there is only 1 non-empty variant and no NULLs. + /// In this case we can just calculate weak hash for this variant. + if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + variants[*non_empty_local_discr]->updateWeakHash32(hash); + return; + } + + /// Calculate weak hash for all variants. + std::vector nested_hashes; + for (const auto & variant : variants) + { + WeakHash32 nested_hash(variant->size()); + variant->updateWeakHash32(nested_hash); + nested_hashes.emplace_back(std::move(nested_hash)); + } + + /// For each row hash is a hash of corresponding row from corresponding variant. + auto & hash_data = hash.getData(); + const auto & local_discriminators_data = getLocalDiscriminators(); + const auto & offsets_data = getOffsets(); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + Discriminator discr = local_discriminators_data[i]; + /// Update hash only for non-NULL values + if (discr != NULL_DISCRIMINATOR) + { + auto nested_hash = nested_hashes[local_discriminators_data[i]].getData()[offsets_data[i]]; + hash_data[i] = static_cast(hashCRC32(nested_hash, hash_data[i])); + } + } +} + +void ColumnVariant::updateHashFast(SipHash & hash) const +{ + local_discriminators->updateHashFast(hash); + for (const auto & variant : variants) + variant->updateHashFast(hash); +} + +ColumnPtr ColumnVariant::filter(const Filter & filt, ssize_t result_size_hint) const +{ + if (size() != filt.size()) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size()); + + /// If we have only NULLs, just filter local_discriminators column. + if (hasOnlyNulls()) + { + Columns new_variants(variants.begin(), variants.end()); + auto new_discriminators = local_discriminators->filter(filt, result_size_hint); + /// In case of all NULL values offsets doesn't contain any useful values, just resize it. 
+ ColumnPtr new_offsets = offsets->cloneResized(new_discriminators->size()); + return ColumnVariant::create(new_discriminators, new_offsets, new_variants, local_to_global_discriminators); + } + + /// Optimization for case when there is only 1 non-empty variant and no NULLs. + /// In this case we can just filter this variant and resize discriminators/offsets. + if (auto non_empty_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + Columns new_variants(variants.begin(), variants.end()); + new_variants[*non_empty_discr] = variants[*non_empty_discr]->filter(filt, result_size_hint); + size_t new_size = new_variants[*non_empty_discr]->size(); + ColumnPtr new_discriminators = local_discriminators->cloneResized(new_size); + ColumnPtr new_offsets = offsets->cloneResized(new_size); + return ColumnVariant::create(new_discriminators, new_offsets, new_variants, local_to_global_discriminators); + } + + /// We should create filter for each variant + /// according to local_discriminators and given filter. + const size_t num_variants = variants.size(); + std::vector nested_filters(num_variants); + for (size_t i = 0; i != num_variants; ++i) + nested_filters[i].reserve(variants[i]->size()); + + /// As we will iterate through local_discriminators anyway, we can count + /// result size for each variant. + std::vector variant_result_size_hints(num_variants); + + const auto & local_discriminators_data = getLocalDiscriminators(); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + Discriminator discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + { + nested_filters[local_discriminators_data[i]].push_back(filt[i]); + variant_result_size_hints[local_discriminators_data[i]] += !!(filt[i]); + } + } + + Columns new_variants; + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + { + /// It make sense to call filter() on variant only if the result size is not 0. + if (variant_result_size_hints[i]) + new_variants.emplace_back(variants[i]->filter(nested_filters[i], variant_result_size_hints[i])); + else + new_variants.emplace_back(variants[i]->cloneEmpty()); + } + + /// We cannot use filtered offsets column, as it will be incorrect. + /// It will be reconstructed on ColumnVariant creation according to new local_discriminators. + return ColumnVariant::create(local_discriminators->filter(filt, result_size_hint), new_variants, local_to_global_discriminators); +} + +void ColumnVariant::expand(const Filter & mask, bool inverted) +{ + /// Expand local_discriminators using NULL_DISCRIMINATOR for 0-rows. + expandDataByMask(getLocalDiscriminators(), mask, inverted, NULL_DISCRIMINATOR); + expandDataByMask(getOffsets(), mask, inverted); +} + +ColumnPtr ColumnVariant::permute(const Permutation & perm, size_t limit) const +{ + /// If we have only NULLs, permutation will take no effect, just return resized column. + if (hasOnlyNulls()) + return cloneResized(limit); + + /// Optimization when we have only one non empty variant and no NULLs. + /// In this case local_discriminators column is filled with identical values and offsets column + /// filled with sequential numbers. In this case we can just apply permutation to this + /// single non-empty variant and cut local_discriminators and offsets columns to the result size. 
+ if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + MutableColumns new_variants; + const size_t num_variants = variants.size(); + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + { + if (i == *non_empty_local_discr) + new_variants.emplace_back(variants[*non_empty_local_discr]->permute(perm, limit)->assumeMutable()); + else + new_variants.emplace_back(variants[i]->assumeMutable()); + } + + size_t new_size = new_variants[*non_empty_local_discr]->size(); + return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); + } + + return permuteImpl(*this, perm, limit); +} + +ColumnPtr ColumnVariant::index(const IColumn & indexes, size_t limit) const +{ + /// If we have only NULLs, index will take no effect, just return resized column. + if (hasOnlyNulls()) + return cloneResized(limit); + + /// Optimization when we have only one non empty variant and no NULLs. + /// In this case local_discriminators column is filled with identical values and offsets column + /// filled with sequential numbers. So we can just apply indexes to this + /// single non-empty variant and cut local_discriminators and offsets columns to the result size. + if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + MutableColumns new_variants; + const size_t num_variants = variants.size(); + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + { + if (i == *non_empty_local_discr) + new_variants.emplace_back(variants[*non_empty_local_discr]->index(indexes, limit)->assumeMutable()); + else + new_variants.emplace_back(variants[i]->assumeMutable()); + } + + size_t new_size = new_variants[*non_empty_local_discr]->size(); + return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); + } + + return selectIndexImpl(*this, indexes, limit); +} + +template +ColumnPtr ColumnVariant::indexImpl(const PaddedPODArray & indexes, size_t limit) const +{ + /// First, apply indexes for local_discriminators and offsets. + ColumnPtr new_local_discriminators = assert_cast(*local_discriminators).indexImpl(indexes, limit); + ColumnPtr new_offsets = assert_cast(*offsets).indexImpl(indexes, limit); + const auto & new_local_discriminators_data = assert_cast(*new_local_discriminators).getData(); + const auto & new_offsets_data = assert_cast(*new_offsets).getData(); + /// Then, create permutation for each variant. + const size_t num_variants = variants.size(); + std::vector nested_perms(num_variants); + /// If there is no limit, we know the size of each permutation + /// in advance and can use reserve. + if (limit == 0) + { + for (size_t i = 0; i != num_variants; ++i) + nested_perms[i].reserve(variants[i]->size()); + } + + for (size_t i = 0; i != new_local_discriminators_data.size(); ++i) + { + Discriminator discr = new_local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + nested_perms[discr].push_back(new_offsets_data[i]); + } + + Columns new_variants; + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + { + size_t nested_limit = nested_perms[i].size() == variants[i]->size() ? 
0 : nested_perms[i].size(); + new_variants.emplace_back(variants[i]->permute(nested_perms[i], nested_limit)); + } + + /// We cannot use new_offsets column as an offset column, because it became invalid after variants permutation. + /// New offsets column will be created in constructor. + return ColumnVariant::create(new_local_discriminators, new_variants, local_to_global_discriminators); +} + +ColumnPtr ColumnVariant::replicate(const Offsets & replicate_offsets) const +{ + if (size() != replicate_offsets.size()) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of offsets {} doesn't match size of column {}", replicate_offsets.size(), size()); + + if (empty()) + return cloneEmpty(); + + /// If we have only NULLs, just resize column to the new size. + if (hasOnlyNulls()) + return cloneResized(replicate_offsets.back()); + + const size_t num_variants = variants.size(); + + /// Optimization when we have only one non empty variant and no NULLs. + /// In this case local_discriminators column is filled with identical values and offsets column + /// filled with sequential numbers. So we can just replicate this one non empty variant, + /// then resize local_discriminators to the result size and fill offsets column. + if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + MutableColumns new_variants; + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + { + if (i == *non_empty_local_discr) + new_variants.emplace_back(variants[*non_empty_local_discr]->replicate(replicate_offsets)->assumeMutable()); + else + new_variants.emplace_back(variants[i]->cloneEmpty()); + } + + size_t new_size = new_variants[*non_empty_local_discr]->size(); + /// Create and fill new local_discriminators column with non_empty_index discriminator. + auto new_local_discriminators = IColumn::mutate(local_discriminators); + assert_cast(*new_local_discriminators).getData().resize_fill(new_size, *non_empty_local_discr); + /// Create and fill new offsets column with sequential indexes. + auto new_offsets = IColumn::mutate(offsets); + auto & new_offsets_data = assert_cast(*new_offsets).getData(); + size_t old_size = offsets->size(); + if (new_size > old_size) + { + new_offsets_data.reserve(new_size); + for (size_t i = old_size; i < new_size; ++i) + new_offsets_data.push_back(new_offsets_data[i - 1] + 1); + } + else + { + new_offsets_data.resize(new_size); + } + + return ColumnVariant::create(std::move(new_local_discriminators), std::move(new_offsets), std::move(new_variants), local_to_global_discriminators); + } + + /// Create replicate offsets for each variant according to + /// local_discriminators column. 
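+ /// Example (illustration only): replicate_offsets = [1, 3, 3] keeps row 0 once, repeats row 1 twice
+ /// and drops row 2; each row contributes its repeat count only to the replicate offsets of its own variant.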
+ std::vector nested_replicated_offsets(num_variants); + for (size_t i = 0; i != num_variants; ++i) + nested_replicated_offsets[i].reserve(variants[i]->size()); + + const auto & local_discriminators_data = getLocalDiscriminators(); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + Discriminator discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + { + size_t repeat_count = replicate_offsets[i] - replicate_offsets[i - 1]; + nested_replicated_offsets[discr].push_back(nested_replicated_offsets[discr].back() + repeat_count); + } + } + + auto new_local_discriminators = local_discriminators->replicate(replicate_offsets); + Columns new_variants; + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + new_variants.emplace_back(variants[i]->replicate(nested_replicated_offsets[i])); + + /// New offsets column will be created in constructor. + return ColumnVariant::create(new_local_discriminators, new_variants, local_to_global_discriminators); +} + +MutableColumns ColumnVariant::scatter(ColumnIndex num_columns, const Selector & selector) const +{ + const size_t num_variants = variants.size(); + + /// If we have only NULLs, we need to scatter only local_discriminators. + if (hasOnlyNulls()) + { + auto scattered_local_discriminators = local_discriminators->scatter(num_columns, selector); + MutableColumns result; + result.reserve(num_columns); + for (size_t i = 0; i != num_columns; ++i) + { + MutableColumns new_variants; + new_variants.reserve(num_variants); + for (const auto & variant : variants) + new_variants.emplace_back(IColumn::mutate(variant)); + + result.emplace_back(ColumnVariant::create(std::move(scattered_local_discriminators[i]), std::move(new_variants), local_to_global_discriminators)); + } + + return result; + } + + /// Optimization when we have only one non empty variant and no NULLs. + /// In this case we can just scatter local_discriminators and this non empty variant. + if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + auto scattered_local_discriminators = local_discriminators->scatter(num_columns, selector); + auto scattered_non_empty_variant = variants[*non_empty_local_discr]->scatter(num_columns, selector); + MutableColumns result; + result.reserve(num_columns); + for (size_t i = 0; i != num_columns; ++i) + { + MutableColumns scattered_nested_variants(num_variants); + for (size_t j = 0; j != num_variants; ++j) + { + if (j == *non_empty_local_discr) + scattered_nested_variants[j] = std::move(scattered_non_empty_variant[i]); + else + scattered_nested_variants[j] = IColumn::mutate(variants[j]); + } + + result.emplace_back(ColumnVariant::create(std::move(scattered_local_discriminators[i]), std::move(scattered_nested_variants), local_to_global_discriminators)); + } + + return result; + } + + /// Create selector for each variant according to local_discriminators. 
+ std::vector nested_selectors(num_variants); + for (size_t i = 0; i != num_variants; ++i) + nested_selectors[i].reserve(variants[i]->size()); + + const auto & local_discriminators_data = getLocalDiscriminators(); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + Discriminator discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + nested_selectors[discr].push_back(selector[i]); + } + + auto scattered_local_discriminators = local_discriminators->scatter(num_columns, selector); + std::vector nested_scattered_variants; + nested_scattered_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + nested_scattered_variants.emplace_back(variants[i]->scatter(num_columns, nested_selectors[i])); + + MutableColumns result; + result.reserve(num_columns); + for (size_t i = 0; i != num_columns; ++i) + { + MutableColumns new_variants; + new_variants.reserve(num_variants); + for (size_t j = 0; j != num_variants; ++j) + new_variants.emplace_back(std::move(nested_scattered_variants[j][i])); + result.emplace_back(ColumnVariant::create(std::move(scattered_local_discriminators[i]), std::move(new_variants), local_to_global_discriminators)); + } + + return result; +} + +void ColumnVariant::gather(ColumnGathererStream & gatherer) +{ + gatherer.gather(*this); +} + +bool ColumnVariant::hasEqualValues() const +{ + if (local_discriminators->empty() || hasOnlyNulls()) + return true; + + return local_discriminators->hasEqualValues() && variants[localDiscriminatorAt(0)]->hasEqualValues(); +} + +void ColumnVariant::getPermutation(IColumn::PermutationSortDirection, IColumn::PermutationSortStability, size_t, int, IColumn::Permutation & res) const +{ + size_t s = local_discriminators->size(); + res.resize(s); + for (size_t i = 0; i < s; ++i) + res[i] = i; +} + +void ColumnVariant::updatePermutation(IColumn::PermutationSortDirection, IColumn::PermutationSortStability, size_t, int, IColumn::Permutation &, DB::EqualRanges &) const +{ +} + +void ColumnVariant::reserve(size_t n) +{ + local_discriminators->reserve(n); + offsets->reserve(n); +} + +void ColumnVariant::ensureOwnership() +{ + const size_t num_variants = variants.size(); + for (size_t i = 0; i < num_variants; ++i) + getVariantByLocalDiscriminator(i).ensureOwnership(); +} + +size_t ColumnVariant::byteSize() const +{ + size_t res = local_discriminators->byteSize() + offsets->byteSize(); + for (const auto & variant : variants) + res += variant->byteSize(); + return res; +} + +size_t ColumnVariant::byteSizeAt(size_t n) const +{ + size_t res = sizeof(Offset) + sizeof(Discriminator); + Discriminator discr = localDiscriminatorAt(n); + if (discr == NULL_DISCRIMINATOR) + return res; + + return res + variants[discr]->byteSizeAt(offsetAt(n)); +} + +size_t ColumnVariant::allocatedBytes() const +{ + size_t res = local_discriminators->allocatedBytes() + offsets->allocatedBytes(); + for (const auto & variant : variants) + res += variant->allocatedBytes(); + return res; +} + +void ColumnVariant::protect() +{ + local_discriminators->protect(); + offsets->protect(); + for (auto & variant : variants) + variant->protect(); +} + +void ColumnVariant::getExtremes(Field & min, Field & max) const +{ + min = Null(); + max = Null(); +} + +void ColumnVariant::forEachSubcolumn(MutableColumnCallback callback) +{ + callback(local_discriminators); + callback(offsets); + for (auto & variant : variants) + callback(variant); +} + +void ColumnVariant::forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) +{ + 
callback(*local_discriminators); + local_discriminators->forEachSubcolumnRecursively(callback); + callback(*offsets); + offsets->forEachSubcolumnRecursively(callback); + + for (auto & variant : variants) + { + callback(*variant); + variant->forEachSubcolumnRecursively(callback); + } +} + +bool ColumnVariant::structureEquals(const IColumn & rhs) const +{ + const auto * rhs_variant = typeid_cast(&rhs); + if (!rhs_variant) + return false; + + const size_t num_variants = variants.size(); + if (num_variants != rhs_variant->variants.size()) + return false; + + for (size_t i = 0; i < num_variants; ++i) + if (!variants[i]->structureEquals(rhs_variant->getVariantByGlobalDiscriminator(globalDiscriminatorByLocal(i)))) + return false; + + return true; +} + +ColumnPtr ColumnVariant::compress() const +{ + ColumnPtr local_discriminators_compressed = local_discriminators->compress(); + ColumnPtr offsets_compressed = offsets->compress(); + size_t byte_size = local_discriminators_compressed->byteSize() + offsets_compressed->byteSize(); + Columns compressed; + compressed.reserve(variants.size()); + for (const auto & variant : variants) + { + auto compressed_variant = variant->compress(); + byte_size += compressed_variant->byteSize(); + compressed.emplace_back(std::move(compressed_variant)); + } + + return ColumnCompressed::create(size(), byte_size, + [my_local_discriminators_compressed = std::move(local_discriminators_compressed), my_offsets_compressed = std::move(offsets_compressed), my_compressed = std::move(compressed), my_local_to_global_discriminators = this->local_to_global_discriminators]() mutable + { + for (auto & variant : my_compressed) + variant = variant->decompress(); + return ColumnVariant::create(my_local_discriminators_compressed->decompress(), my_offsets_compressed->decompress(), my_compressed, my_local_to_global_discriminators); + }); +} + +double ColumnVariant::getRatioOfDefaultRows(double) const +{ + UInt64 num_defaults = getNumberOfDefaultRows(); + return static_cast(num_defaults) / local_discriminators->size(); +} + +UInt64 ColumnVariant::getNumberOfDefaultRows() const +{ + size_t total_variant_sizes = 0; + for (const auto & variant : variants) + total_variant_sizes += variant->size(); + return local_discriminators->size() - total_variant_sizes; +} + +void ColumnVariant::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const +{ + return getIndicesOfNonDefaultRowsImpl(indices, from, limit); +} + +void ColumnVariant::finalize() +{ + for (auto & variant : variants) + variant->finalize(); +} + +bool ColumnVariant::isFinalized() const +{ + return std::all_of(variants.begin(), variants.end(), [](const auto & variant) { return variant->isFinalized(); }); +} + +std::optional ColumnVariant::getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls() const +{ + for (size_t i = 0; i != variants.size(); ++i) + { + if (variants[i]->size() == local_discriminators->size()) + return i; + } + + return std::nullopt; +} + +void ColumnVariant::applyNullMap(const ColumnVector::Container & null_map) +{ + applyNullMapImpl(null_map); +} + +void ColumnVariant::applyNegatedNullMap(const ColumnVector::Container & null_map) +{ + applyNullMapImpl(null_map); +} + +template +void ColumnVariant::applyNullMapImpl(const ColumnVector::Container & null_map) +{ + if (null_map.size() != local_discriminators->size()) + throw Exception(ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT, + "Logical error: Sizes of discriminators column and null map data are not equal"); + + /// If we have only NULLs, nothing 
to do. + if (hasOnlyNulls()) + { + return; + } + + /// If we have only 1 non empty column and no NULLs, we can just filter that + /// variant according to the null_map. + if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + auto & local_discriminators_data = getLocalDiscriminators(); + auto & offsets_data = getOffsets(); + size_t size_hint = 0; + + if constexpr (inverted) + { + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + if (null_map[i]) + offsets_data[i] = size_hint++; + else + local_discriminators_data[i] = NULL_DISCRIMINATOR; + } + variants[*non_empty_local_discr] = variants[*non_empty_local_discr]->filter(null_map, size_hint); + } + else + { + ColumnVector::Container filter; + filter.reserve(null_map.size()); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + if (null_map[i]) + { + filter.push_back(0); + local_discriminators_data[i] = NULL_DISCRIMINATOR; + } + else + { + filter.push_back(1); + offsets_data[i] = size_hint++; + } + } + variants[*non_empty_local_discr] = variants[*non_empty_local_discr]->filter(filter, size_hint); + } + + return; + } + + /// In general case we should iterate through null_map + discriminators, + /// create filter for each variant and update offsets column. + std::vector variant_filters; + variant_filters.resize(variants.size()); + std::vector variant_new_sizes; + variant_new_sizes.resize(variants.size(), 0); + + auto & local_discriminators_data = getLocalDiscriminators(); + auto & offsets_data = getOffsets(); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + auto & discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + { + if (null_map[i] ^ inverted) + { + auto & variant_filter = variant_filters[discr]; + /// We create filters lazily. + if (variant_filter.empty()) + variant_filter.resize_fill(variants[discr]->size(), 1); + variant_filter[offsets_data[i]] = 0; + discr = NULL_DISCRIMINATOR; + } + else + { + offsets_data[i] = variant_new_sizes[discr]++; + } + } + } + + for (size_t i = 0; i != variants.size(); ++i) + { + if (!variant_filters[i].empty()) + variants[i] = variants[i]->filter(variant_filters[i], variant_new_sizes[i]); + } +} + +} diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h new file mode 100644 index 00000000000..692fdd1709e --- /dev/null +++ b/src/Columns/ColumnVariant.h @@ -0,0 +1,306 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +/** + * Column for storing Variant(...) type values. + * Variant type represents a union of other data types. + * For example, type Variant(T1, T2, ..., TN) means that each row of this type + * has a value of either type T1 or T2 or ... or TN or none of them (NULL value) + * + * ColumnVariant stores: + * - The discriminators column, which determines which variant is stored in each row. + * - The offsets column, which determines the offset in the corresponding variant column in each row. + * - The list of variant columns with only real values (so the sizes of variant columns can be different). + * Discriminator is an index of a variant in the variants list, it also has special value called NULL_DISCRIMINATOR + * that indicates that the value in the row is NULL. 
+ * + * We want to be able to extend Variant column for free without rewriting the data, but as we don't care about the + * order of variants during Variant creation (we want Variant(T1, T2) to be the same as Variant(T2, T1)), we support + * some global order of nested types inside Variant during type creation, so after extension the order of variant types + * (and so their discriminators) can change. For example: Variant(T1, T3) -> Variant(T1, T2, T3). + * To avoid full rewrite of discriminators column on Variant extension, we differentiate local order of variants + * inside a column and global order of variants created during type creation. So, ColumnVariant stores only local + * discriminators and additionally stores the mapping between global and local discriminators. + * So, when we need to extend Variant column with new variant, we can just append it to a list of variant columns + * with new local discriminator and update mapping from global to local orders. + * + * Note that two instances of ColumnVariant can have different local orders, so we should always use global + * discriminators during inter-column interactions. + * + * Let's take an example with type Variant(UInt32, String, Array(UInt32)): + * During type creation we will sort types by their names and get the global order: Array(UInt32), String, UInt32. + * So, type Array(UInt32) will have global discriminator 0, String - 1 and UInt32 - 2. + * Let's say we have a column with local order (String, UInt32, Array(UInt32)) and values: + * 'Hello', 42, NULL, 'World', 43, [1, 2, 3], NULL, 44 + * + * Let's see how these values will be stored in ColumnVariant: + * + * local_to_global_discriminators: {0 : 1, 1 : 2, 2 : 0} + * global_to_local_discriminators: {0 : 2, 1 : 0, 2 : 1} + * local_discriminators offsets String UInt32 Array(UInt32) + * 0 0 'Hello' 42 [1, 2, 3] + * 1 0 'World' 43 + * NULL_DISCRIMINATOR 0 44 + * 0 1 + * 1 1 + * 2 0 + * NULL_DISCRIMINATOR 0 + * 1 2 + * + */ +class ColumnVariant final : public COWHelper +{ +public: + using Discriminator = UInt8; + using Discriminators = PaddedPODArray; + using ColumnDiscriminators = ColumnVector; + using ColumnOffsets = ColumnVector; + + static constexpr UInt8 NULL_DISCRIMINATOR = std::numeric_limits::max(); /// 255 + static constexpr size_t MAX_NESTED_COLUMNS = std::numeric_limits::max(); /// 255 + +private: + friend class COWHelper; + + using NestedColumns = std::vector; + + /// Create an empty column with provided variants. + /// Variants are in global order. + explicit ColumnVariant(MutableColumns && variants_); + /// Variants are in local order according to provided mapping. + explicit ColumnVariant(MutableColumns && variants_, const std::vector & local_to_global_discriminators_); + + /// Create column from discriminators column and list of variant columns. + /// Offsets column should be constructed according to the discriminators. + /// Variants are in global order. + ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_); + /// Variants are in local order according to provided mapping. + ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_, const std::vector & local_to_global_discriminators_); + + /// Create column from discriminators column, offsets column and list of variant columns. + /// Variants are in global order. + ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumnPtr offsets_, MutableColumns && variants_); + /// Variants are in local order according to provided mapping. 
+ ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumnPtr offsets_, MutableColumns && variants_, const std::vector & local_to_global_discriminators_); + + ColumnVariant(const ColumnVariant &) = default; + +public: + /** Create immutable column using immutable arguments. This arguments may be shared with other variants. + * Use IColumn::mutate in order to make mutable column and mutate shared nested variants. + */ + using Base = COWHelper; + static Ptr create(const Columns & variants_) { return create(variants_, {}); } + static Ptr create(const Columns & variants_, const std::vector & local_to_global_discriminators_); + static Ptr create(const ColumnPtr & local_discriminators_, const Columns & variants_) { return create(local_discriminators_, variants_, {}); } + static Ptr create(const ColumnPtr & local_discriminators_, const Columns & variants_, const std::vector & local_to_global_discriminators_); + static Ptr create(const ColumnPtr & local_discriminators_, const DB::ColumnPtr & offsets_, const Columns & variants_) { return create(local_discriminators_, offsets_, variants_, {}); } + static Ptr create(const ColumnPtr & local_discriminators_, const DB::ColumnPtr & offsets_, const Columns & variants_, const std::vector & local_to_global_discriminators_); + + static MutablePtr create(MutableColumns && variants_) + { + return Base::create(std::move(variants_)); + } + + static MutablePtr create(MutableColumns && variants_, const std::vector & local_to_global_discriminators_) + { + return Base::create(std::move(variants_), local_to_global_discriminators_); + } + + static MutablePtr create(MutableColumnPtr local_discriminators_, MutableColumns && variants_) + { + return Base::create(std::move(local_discriminators_), std::move(variants_)); + } + + static MutablePtr create(MutableColumnPtr local_discriminators_, MutableColumns && variants_, const std::vector & local_to_global_discriminators_) + { + return Base::create(std::move(local_discriminators_), std::move(variants_), local_to_global_discriminators_); + } + + static MutablePtr create(MutableColumnPtr local_discriminators_, MutableColumnPtr offsets_, MutableColumns && variants_) + { + return Base::create(std::move(local_discriminators_), std::move(offsets_), std::move(variants_)); + } + + static MutablePtr create(MutableColumnPtr local_discriminators_, MutableColumnPtr offsets_, MutableColumns && variants_, const std::vector & local_to_global_discriminators_) + { + return Base::create(std::move(local_discriminators_), std::move(offsets_), std::move(variants_), local_to_global_discriminators_); + } + + std::string getName() const override; + const char * getFamilyName() const override { return "Variant"; } + TypeIndex getDataType() const override { return TypeIndex::Variant; } + + MutableColumnPtr cloneEmpty() const override; + MutableColumnPtr cloneResized(size_t size) const override; + + size_t ALWAYS_INLINE offsetAt(size_t i) const { return getOffsets()[i]; } + Discriminator ALWAYS_INLINE localDiscriminatorAt(size_t i) const { return getLocalDiscriminators()[i]; } + Discriminator ALWAYS_INLINE globalDiscriminatorAt(size_t i) const { return globalDiscriminatorByLocal(getLocalDiscriminators()[i]); } + + Discriminator ALWAYS_INLINE globalDiscriminatorByLocal(Discriminator local_discr) const + { + /// NULL_DISCRIMINATOR is always the same in local and global orders. + return local_discr == NULL_DISCRIMINATOR ? 
NULL_DISCRIMINATOR : local_to_global_discriminators[local_discr]; + } + + Discriminator ALWAYS_INLINE localDiscriminatorByGlobal(Discriminator global_discr) const + { + /// NULL_DISCRIMINATOR is always the same in local and global orders. + return global_discr == NULL_DISCRIMINATOR ? NULL_DISCRIMINATOR : global_to_local_discriminators[global_discr]; + } + + size_t size() const override + { + return local_discriminators->size(); + } + + Field operator[](size_t n) const override; + void get(size_t n, Field & res) const override; + + bool isDefaultAt(size_t n) const override; + bool isNullAt(size_t n) const override; + StringRef getDataAt(size_t n) const override; + void insertData(const char * pos, size_t length) override; + void insert(const Field & x) override; + void insertFrom(const IColumn & src_, size_t n) override; + void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; + void insertManyFrom(const IColumn & src, size_t position, size_t length) override; + void insertDefault() override; + void insertManyDefaults(size_t length) override; + void popBack(size_t n) override; + StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin, const UInt8 *) const override; + const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; + void updateHashWithValue(size_t n, SipHash & hash) const override; + void updateWeakHash32(WeakHash32 & hash) const override; + void updateHashFast(SipHash & hash) const override; + ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override; + void expand(const Filter & mask, bool inverted) override; + ColumnPtr permute(const Permutation & perm, size_t limit) const override; + ColumnPtr index(const IColumn & indexes, size_t limit) const override; + template + ColumnPtr indexImpl(const PaddedPODArray & indexes, size_t limit) const; + ColumnPtr replicate(const Offsets & replicate_offsets) const override; + MutableColumns scatter(ColumnIndex num_variants, const Selector & selector) const override; + void gather(ColumnGathererStream & gatherer_stream) override; + + /// Variant type is not comparable. 
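+    /// As a consequence, compareAt() below reports every pair of rows as equal and getPermutation()
+    /// returns the identity permutation, so ordering operations keep Variant rows in their original order.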
+    int compareAt(size_t, size_t, const IColumn &, int) const override
+    {
+        return 0;
+    }
+
+    void compareColumn(const IColumn &, size_t, PaddedPODArray *, PaddedPODArray &, int, int) const override
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method compareColumn is not supported for ColumnVariant");
+    }
+
+    bool hasEqualValues() const override;
+    void getExtremes(Field & min, Field & max) const override;
+    void getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
+                    size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override;
+    void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
+                    size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;
+
+    void reserve(size_t n) override;
+    void ensureOwnership() override;
+    size_t byteSize() const override;
+    size_t byteSizeAt(size_t n) const override;
+    size_t allocatedBytes() const override;
+    void protect() override;
+    void forEachSubcolumn(MutableColumnCallback callback) override;
+    void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
+    bool structureEquals(const IColumn & rhs) const override;
+    ColumnPtr compress() const override;
+    double getRatioOfDefaultRows(double sample_ratio) const override;
+    UInt64 getNumberOfDefaultRows() const override;
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
+    void finalize() override;
+    bool isFinalized() const override;
+
+    const IColumn & getVariantByLocalDiscriminator(size_t discr) const { return *variants[discr]; }
+    const IColumn & getVariantByGlobalDiscriminator(size_t discr) const { return *variants[global_to_local_discriminators.at(discr)]; }
+    IColumn & getVariantByLocalDiscriminator(size_t discr) { return *variants[discr]; }
+    IColumn & getVariantByGlobalDiscriminator(size_t discr) { return *variants[global_to_local_discriminators.at(discr)]; }
+
+    const ColumnPtr & getVariantPtrByLocalDiscriminator(size_t discr) const { return variants[discr]; }
+    const ColumnPtr & getVariantPtrByGlobalDiscriminator(size_t discr) const { return variants[global_to_local_discriminators.at(discr)]; }
+    ColumnPtr & getVariantPtrByLocalDiscriminator(size_t discr) { return variants[discr]; }
+    ColumnPtr & getVariantPtrByGlobalDiscriminator(size_t discr) { return variants[global_to_local_discriminators.at(discr)]; }
+
+    const IColumn & getLocalDiscriminatorsColumn() const { return *local_discriminators; }
+    IColumn & getLocalDiscriminatorsColumn() { return *local_discriminators; }
+
+    const ColumnPtr & getLocalDiscriminatorsPtr() const { return local_discriminators; }
+    ColumnPtr & getLocalDiscriminatorsPtr() { return local_discriminators; }
+
+    const Discriminators & ALWAYS_INLINE getLocalDiscriminators() const { return assert_cast(*local_discriminators).getData(); }
+    Discriminators & ALWAYS_INLINE getLocalDiscriminators() { return assert_cast(*local_discriminators).getData(); }
+
+    const IColumn & getOffsetsColumn() const { return *offsets; }
+    IColumn & getOffsetsColumn() { return *offsets; }
+
+    const ColumnPtr & getOffsetsPtr() const { return offsets; }
+    ColumnPtr & getOffsetsPtr() { return offsets; }
+
+    const Offsets & ALWAYS_INLINE getOffsets() const { return assert_cast(*offsets).getData(); }
+    Offsets & ALWAYS_INLINE getOffsets() { return assert_cast(*offsets).getData(); }
+
+    size_t getNumVariants() const { return variants.size(); }
+
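+    /// Minimal usage sketch for the accessors above (illustrative only, not part of the original header):
+    ///     Field value;
+    ///     auto local_discr = column.localDiscriminatorAt(n);
+    ///     if (local_discr == ColumnVariant::NULL_DISCRIMINATOR)
+    ///         value = Null();
+    ///     else
+    ///         value = column.getVariantByLocalDiscriminator(local_discr)[column.offsetAt(n)];
+    /// This mirrors what operator[](n) returns: the value of the active variant, or Null for NULL rows.
+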
+ bool hasOnlyNulls() const + { + /// If all variants are empty, we have only NULL values. + return std::all_of(variants.begin(), variants.end(), [](const auto & v){ return v->empty(); } ); + } + + /// Check if local and global order is the same. + bool hasGlobalVariantsOrder() const + { + for (size_t i = 0; i != local_to_global_discriminators.size(); ++i) + { + if (local_to_global_discriminators[i] != i) + return false; + } + + return true; + } + + /// Check if we have only 1 non-empty variant and no NULL values, + /// and if so, return the discriminator of this non-empty column. + std::optional getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls() const; + + /// Apply null map to a Variant column. + /// Replace corresponding discriminators with NULL_DISCRIMINATOR + /// and filter out rows in variants if needed. + void applyNullMap(const ColumnVector::Container & null_map); + void applyNegatedNullMap(const ColumnVector::Container & null_map); + +private: + void initIdentityGlobalToLocalDiscriminatorsMapping(); + + template + void applyNullMapImpl(const ColumnVector::Container & null_map); + + WrappedPtr local_discriminators; + WrappedPtr offsets; + NestedColumns variants; + + std::vector global_to_local_discriminators; + std::vector local_to_global_discriminators; +}; + + +} diff --git a/src/Columns/IColumn.cpp b/src/Columns/IColumn.cpp index 7923bca6354..82dc82e0bd9 100644 --- a/src/Columns/IColumn.cpp +++ b/src/Columns/IColumn.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,11 @@ bool isColumnNullable(const IColumn & column) return checkColumn(column); } +bool isColumnNullableOrLowCardinalityNullable(const IColumn & column) +{ + return isColumnNullable(column) || isColumnLowCardinalityNullable(column); +} + bool isColumnConst(const IColumn & column) { return checkColumn(column); diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index f012eeca61f..0dcba5b310c 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -631,6 +631,17 @@ struct IsMutableColumns template <> struct IsMutableColumns<> { static const bool value = true; }; +template +struct IsMutableColumnsOrRvalueReferences; + +template +struct IsMutableColumnsOrRvalueReferences +{ + static const bool value = (std::is_assignable::value || std::is_rvalue_reference_v) && IsMutableColumnsOrRvalueReferences::value; +}; + +template <> +struct IsMutableColumnsOrRvalueReferences<> { static const bool value = true; }; template const Type * checkAndGetColumn(const IColumn & column) @@ -662,4 +673,7 @@ bool isColumnConst(const IColumn & column); /// True if column's an ColumnNullable instance. It's just a syntax sugar for type check. bool isColumnNullable(const IColumn & column); +/// True if column's is ColumnNullable or ColumnLowCardinality with nullable nested column. 
+bool isColumnNullableOrLowCardinalityNullable(const IColumn & column); + } diff --git a/src/Columns/MaskOperations.cpp b/src/Columns/MaskOperations.cpp index b84268356a7..518269e1728 100644 --- a/src/Columns/MaskOperations.cpp +++ b/src/Columns/MaskOperations.cpp @@ -17,7 +17,7 @@ namespace ErrorCodes } template -void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & mask, bool inverted) +void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & mask, bool inverted, T default_value) { if (mask.size() < data.size()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Mask size should be no less than data size."); @@ -38,7 +38,7 @@ void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & ma --from; } else - data[index] = T(); + data[index] = default_value; --index; } @@ -49,7 +49,7 @@ void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & ma /// Explicit instantiations - not to place the implementation of the function above in the header file. #define INSTANTIATE(TYPE) \ -template void expandDataByMask(PaddedPODArray &, const PaddedPODArray &, bool); +template void expandDataByMask(PaddedPODArray &, const PaddedPODArray &, bool, TYPE); INSTANTIATE(UInt8) INSTANTIATE(UInt16) diff --git a/src/Columns/MaskOperations.h b/src/Columns/MaskOperations.h index e43b4588258..cc5226bf0c1 100644 --- a/src/Columns/MaskOperations.h +++ b/src/Columns/MaskOperations.h @@ -13,7 +13,7 @@ namespace DB /// If inverted is true, we will work with inverted mask. This function is used in implementations of /// expand() method in IColumn interface. template -void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & mask, bool inverted); +void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & mask, bool inverted, T default_value = T()); struct MaskInfo { diff --git a/src/Columns/tests/gtest_column_variant.cpp b/src/Columns/tests/gtest_column_variant.cpp new file mode 100644 index 00000000000..b701e2d3183 --- /dev/null +++ b/src/Columns/tests/gtest_column_variant.cpp @@ -0,0 +1,688 @@ +#include +#include +#include +#include +#include + +using namespace DB; + +TEST(ColumnVariant, CreateFromEmptyColumns) +{ + MutableColumns columns; + columns.push_back(ColumnUInt32::create()); + columns.push_back(ColumnString::create()); + auto column = ColumnVariant::create(std::move(columns)); + ASSERT_TRUE(column->empty() && column->getLocalDiscriminators().empty() && column->getOffsets().empty()); +} + +TEST(ColumnVariant, CreateFromEmptyColumnsWithLocalOrder) +{ + MutableColumns columns; + columns.push_back(ColumnUInt32::create()); + columns.push_back(ColumnString::create()); + std::vector local_to_global_discriminators; + local_to_global_discriminators.push_back(1); + local_to_global_discriminators.push_back(0); + auto column = ColumnVariant::create(std::move(columns), local_to_global_discriminators); + ASSERT_TRUE(column->empty() && column->getLocalDiscriminators().empty() && column->getOffsets().empty()); + ASSERT_EQ(column->localDiscriminatorByGlobal(0), 0); + ASSERT_EQ(column->localDiscriminatorByGlobal(1), 1); + ASSERT_EQ(column->globalDiscriminatorByLocal(0), 0); + ASSERT_EQ(column->globalDiscriminatorByLocal(1), 1); +} + +MutableColumns createColumns1() +{ + MutableColumns columns; + auto column1 = ColumnUInt64::create(); + column1->insertValue(42); + columns.push_back(std::move(column1)); + auto column2 = ColumnString::create(); + column2->insertData("Hello", 5); + column2->insertData("World", 5); + columns.push_back(std::move(column2)); + auto column3 = 
ColumnUInt32::create(); + columns.push_back(std::move(column3)); + return columns; +} + +MutableColumnPtr createDiscriminators1() +{ + auto discriminators_column = ColumnVariant::ColumnDiscriminators::create(); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(1); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + return discriminators_column; +} + +void reorderColumns(const std::vector & local_to_global_order, MutableColumns & columns) +{ + MutableColumns res; + for (auto global_discr : local_to_global_order) + res.push_back(std::move(columns[global_discr])); + columns = std::move(res); +} + +template +void reorderDiscriminators(const std::vector & local_to_global_order, Ptr & discriminators) +{ + std::vector global_to_local_order(local_to_global_order.size()); + for (size_t i = 0; i != local_to_global_order.size(); ++i) + global_to_local_order[local_to_global_order[i]] = i; + + auto & discriminators_data = assert_cast(discriminators.get())->getData(); + for (auto & discr : discriminators_data) + { + if (discr != ColumnVariant::NULL_DISCRIMINATOR) + discr = global_to_local_order[discr]; + } +} + +MutableColumnPtr createOffsets1() +{ + auto offsets = ColumnVariant::ColumnOffsets::create(); + offsets->insertValue(0); + offsets->insertValue(0); + offsets->insertValue(0); + offsets->insertValue(1); + offsets->insertValue(0); + return offsets; +} + +std::vector createLocalToGlobalOrder1() +{ + std::vector local_to_global_discriminators; + local_to_global_discriminators.push_back(1); + local_to_global_discriminators.push_back(2); + local_to_global_discriminators.push_back(0); + return local_to_global_discriminators; +} + +void checkColumnVariant1(ColumnVariant * column) +{ + const auto & offsets = column->getOffsets(); + ASSERT_EQ(column->size(), 5); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ(offsets[1], 0); + ASSERT_EQ(offsets[3], 1); + ASSERT_TRUE(column->isDefaultAt(2) && column->isDefaultAt(4)); + ASSERT_EQ((*column)[0].get(), 42); + ASSERT_EQ((*column)[1].get(), "Hello"); + ASSERT_TRUE((*column)[2].isNull()); + ASSERT_EQ((*column)[3].get(), "World"); + ASSERT_TRUE((*column)[4].isNull()); +} + +void checkColumnVariant1Order(ColumnVariant * column) +{ + ASSERT_EQ(column->localDiscriminatorByGlobal(0), 2); + ASSERT_EQ(column->localDiscriminatorByGlobal(1), 0); + ASSERT_EQ(column->localDiscriminatorByGlobal(2), 1); + ASSERT_EQ(column->globalDiscriminatorByLocal(0), 1); + ASSERT_EQ(column->globalDiscriminatorByLocal(1), 2); + ASSERT_EQ(column->globalDiscriminatorByLocal(2), 0); + ASSERT_EQ(column->localDiscriminatorAt(0), 2); + ASSERT_EQ(column->localDiscriminatorAt(1), 0); + ASSERT_EQ(column->localDiscriminatorAt(2), ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(column->localDiscriminatorAt(3), 0); + ASSERT_EQ(column->localDiscriminatorAt(4), ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(column->globalDiscriminatorAt(0), 0); + ASSERT_EQ(column->globalDiscriminatorAt(1), 1); + ASSERT_EQ(column->globalDiscriminatorAt(2), ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(column->globalDiscriminatorAt(3), 1); + ASSERT_EQ(column->globalDiscriminatorAt(4), ColumnVariant::NULL_DISCRIMINATOR); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsAndColumns) +{ + auto columns = createColumns1(); + auto discriminators = createDiscriminators1(); + auto column = ColumnVariant::create(std::move(discriminators), std::move(columns)); + 
checkColumnVariant1(column.get()); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsAndColumnsWithLocalOrder) +{ + auto local_to_global_order = createLocalToGlobalOrder1(); + auto columns = createColumns1(); + reorderColumns(local_to_global_order, columns); + auto discriminators = createDiscriminators1(); + reorderDiscriminators(local_to_global_order, discriminators); + auto column = ColumnVariant::create(std::move(discriminators), std::move(columns), createLocalToGlobalOrder1()); + checkColumnVariant1(column.get()); + checkColumnVariant1Order(column.get()); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsOffsetsAndColumns) +{ + auto columns = createColumns1(); + auto discriminators = createDiscriminators1(); + auto offsets = createOffsets1(); + auto column = ColumnVariant::create(std::move(discriminators), std::move(offsets), std::move(columns)); + checkColumnVariant1(column.get()); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsOffsetsAndColumnsWithLocalOrder) +{ + auto local_to_global_order = createLocalToGlobalOrder1(); + auto columns = createColumns1(); + reorderColumns(local_to_global_order, columns); + auto discriminators = createDiscriminators1(); + reorderDiscriminators(local_to_global_order, discriminators); + auto offsets = createOffsets1(); + auto column = ColumnVariant::create(std::move(discriminators), std::move(offsets), std::move(columns), createLocalToGlobalOrder1()); + checkColumnVariant1(column.get()); + checkColumnVariant1Order(column.get()); +} + +ColumnVariant::MutablePtr createVariantWithOneFullColumNoNulls(size_t size, bool change_order) +{ + MutableColumns columns; + auto column1 = ColumnUInt64::create(); + for (size_t i = 0; i != size; ++i) + column1->insertValue(i); + columns.push_back(std::move(column1)); + auto column2 = ColumnString::create(); + columns.push_back(std::move(column2)); + auto column3 = ColumnUInt32::create(); + columns.push_back(std::move(column3)); + auto discriminators_column = ColumnVariant::ColumnDiscriminators::create(); + for (size_t i = 0; i != size; ++i) + discriminators_column->insertValue(0); + if (change_order) + { + auto local_to_global_order = createLocalToGlobalOrder1(); + reorderColumns(local_to_global_order, columns); + reorderDiscriminators(local_to_global_order, discriminators_column); + return ColumnVariant::create(std::move(discriminators_column), std::move(columns), createLocalToGlobalOrder1()); + } + return ColumnVariant::create(std::move(discriminators_column), std::move(columns)); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsAndOneFullColumnNoNulls) +{ + auto column = createVariantWithOneFullColumNoNulls(3, false); + const auto & offsets = column->getOffsets(); + ASSERT_EQ(column->size(), 3); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ(offsets[1], 1); + ASSERT_EQ(offsets[2], 2); + ASSERT_EQ((*column)[0].get(), 0); + ASSERT_EQ((*column)[1].get(), 1); + ASSERT_EQ((*column)[2].get(), 2); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsAndOneFullColumnNoNullsWithLocalOrder) +{ + auto column = createVariantWithOneFullColumNoNulls(3, true); + const auto & offsets = column->getOffsets(); + ASSERT_EQ(column->size(), 3); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ(offsets[1], 1); + ASSERT_EQ(offsets[2], 2); + ASSERT_EQ((*column)[0].get(), 0); + ASSERT_EQ((*column)[1].get(), 1); + ASSERT_EQ((*column)[2].get(), 2); + ASSERT_EQ(column->localDiscriminatorAt(0), 2); + ASSERT_EQ(column->localDiscriminatorAt(1), 2); + ASSERT_EQ(column->localDiscriminatorAt(2), 2); + ASSERT_EQ(column->globalDiscriminatorAt(0), 0); + 
ASSERT_EQ(column->globalDiscriminatorAt(0), 0); + ASSERT_EQ(column->globalDiscriminatorAt(0), 0); +} + +TEST(ColumnVariant, CloneResizedToEmpty) +{ + auto column = ColumnVariant::create(createDiscriminators1(), createOffsets1(), createColumns1()); + auto resized_column = column->cloneResized(0); + ASSERT_TRUE(resized_column->empty()); +} + +TEST(ColumnVariant, CloneResizedToLarge) +{ + auto column = ColumnVariant::create(createDiscriminators1(), createOffsets1(), createColumns1()); + auto resized_column = column->cloneResized(7); + const auto * resized_column_variant = assert_cast(resized_column.get()); + ASSERT_EQ(resized_column_variant->size(), 7); + const auto & offsets = resized_column_variant->getOffsets(); + for (size_t i = 0; i != 7; ++i) + { + if (i == 3) + ASSERT_EQ(offsets[i], 1); + else + ASSERT_EQ(offsets[i], 0); + } + + const auto & discriminators = resized_column_variant->getLocalDiscriminators(); + std::vector null_indexes = {2, 4, 5, 6}; + for (size_t i : null_indexes) + ASSERT_EQ(discriminators[i], ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(0).size(), 1); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(1).size(), 2); +} + +TEST(ColumnVariant, CloneResizedWithOneFullColumnNoNulls) +{ + auto column = createVariantWithOneFullColumNoNulls(5, false); + auto resized_column = column->cloneResized(3); + const auto * resized_column_variant = assert_cast(resized_column.get()); + ASSERT_EQ(resized_column_variant->size(), 3); + const auto & offsets = resized_column_variant->getOffsets(); + for (size_t i = 0; i != 3; ++i) + ASSERT_EQ(offsets[i], i); + const auto & discriminators = resized_column_variant->getLocalDiscriminators(); + for (size_t i = 0; i != 3; ++i) + ASSERT_EQ(discriminators[i], 0); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(0).size(), 3); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(1).size(), 0); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(2).size(), 0); +} + +MutableColumns createColumns2() +{ + MutableColumns columns; + auto column1 = ColumnUInt64::create(); + column1->insertValue(42); + column1->insertValue(43); + column1->insertValue(44); + columns.push_back(std::move(column1)); + auto column2 = ColumnString::create(); + column2->insertData("Hello", 5); + column2->insertData("World", 5); + columns.push_back(std::move(column2)); + auto column3 = ColumnUInt8::create(); + columns.push_back(std::move(column3)); + return columns; +} + +TEST(ColumnVariant, CloneResizedGeneral1) +{ + /// D c1 c2 c3 + /// 0 42 Hello + /// 1 43 World + /// NULL 44 + /// 0 + /// 1 + /// NULL + /// 0 + auto discriminators_column = ColumnVariant::ColumnDiscriminators::create(); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(0); + auto column = ColumnVariant::create(std::move(discriminators_column), createColumns2()); + auto resized_column = column->cloneResized(4); + const auto * resized_column_variant = assert_cast(resized_column.get()); + ASSERT_EQ(resized_column_variant->size(), 4); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(0).size(), 2); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(1).size(), 1); + 
ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(2).size(), 0); + const auto & discriminators = resized_column_variant->getLocalDiscriminators(); + ASSERT_EQ(discriminators[0], 0); + ASSERT_EQ(discriminators[1], 1); + ASSERT_EQ(discriminators[2], ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(discriminators[3], 0); + const auto & offsets = resized_column_variant->getOffsets(); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ(offsets[1], 0); + ASSERT_EQ(offsets[3], 1); + ASSERT_EQ((*resized_column_variant)[0].get(), 42); + ASSERT_EQ((*resized_column_variant)[1].get(), "Hello"); + ASSERT_EQ((*resized_column_variant)[3].get(), 43); +} + +TEST(ColumnVariant, CloneResizedGeneral2) +{ + /// D c1 c2 c3 + /// 0 42 Hello + /// NULL 43 World + /// NULL 44 + /// 0 + /// 1 + /// 1 + /// 0 + auto discriminators_column = ColumnVariant::ColumnDiscriminators::create(); + discriminators_column->insertValue(0); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(1); + discriminators_column->insertValue(0); + auto column = ColumnVariant::create(std::move(discriminators_column), createColumns2()); + auto resized_column = column->cloneResized(3); + const auto * resized_column_variant = assert_cast(resized_column.get()); + ASSERT_EQ(resized_column_variant->size(), 3); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(0).size(), 1); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(1).size(), 0); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(2).size(), 0); + const auto & discriminators = resized_column_variant->getLocalDiscriminators(); + ASSERT_EQ(discriminators[0], 0); + ASSERT_EQ(discriminators[1], ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(discriminators[2], ColumnVariant::NULL_DISCRIMINATOR); + const auto & offsets = resized_column_variant->getOffsets(); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ((*resized_column_variant)[0].get(), 42); +} + +TEST(ColumnVariant, CloneResizedGeneral3) +{ + /// D c1 c2 c3 + /// 0 42 Hello + /// 1 43 World + /// 1 44 + /// 0 + /// NULL + /// NULL + /// 0 + auto discriminators_column = ColumnVariant::ColumnDiscriminators::create(); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(1); + discriminators_column->insertValue(0); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(0); + auto column = ColumnVariant::create(std::move(discriminators_column), createColumns2()); + auto resized_column = column->cloneResized(5); + const auto * resized_column_variant = assert_cast(resized_column.get()); + ASSERT_EQ(resized_column_variant->size(), 5); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(0).size(), 2); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(1).size(), 2); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(2).size(), 0); + const auto & discriminators = resized_column_variant->getLocalDiscriminators(); + ASSERT_EQ(discriminators[0], 0); + ASSERT_EQ(discriminators[1], 1); + ASSERT_EQ(discriminators[2], 1); + ASSERT_EQ(discriminators[3], 0); + const auto & offsets = resized_column_variant->getOffsets(); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ(offsets[1], 
0); + ASSERT_EQ(offsets[2], 1); + ASSERT_EQ(offsets[3], 1); + ASSERT_EQ((*resized_column_variant)[0].get(), 42); + ASSERT_EQ((*resized_column_variant)[1].get(), "Hello"); + ASSERT_EQ((*resized_column_variant)[2].get(), "World"); + ASSERT_EQ((*resized_column_variant)[3].get(), 43); +} + +MutableColumnPtr createDiscriminators2() +{ + auto discriminators_column = ColumnVariant::ColumnDiscriminators::create(); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(0); + return discriminators_column; +} + +std::vector createLocalToGlobalOrder2() +{ + std::vector local_to_global_discriminators; + local_to_global_discriminators.push_back(2); + local_to_global_discriminators.push_back(0); + local_to_global_discriminators.push_back(1); + return local_to_global_discriminators; +} + +ColumnVariant::MutablePtr createVariantColumn1(bool reorder) +{ + auto columns = createColumns1(); + auto discriminators = createDiscriminators1(); + if (!reorder) + return ColumnVariant::create(std::move(discriminators), std::move(columns)); + auto local_to_global_order = createLocalToGlobalOrder1(); + reorderColumns(local_to_global_order, columns); + reorderDiscriminators(local_to_global_order, discriminators); + return ColumnVariant::create(std::move(discriminators), std::move(columns), local_to_global_order); +} + +ColumnVariant::MutablePtr createVariantColumn2(bool reorder) +{ + auto columns = createColumns2(); + auto discriminators = createDiscriminators2(); + if (!reorder) + return ColumnVariant::create(std::move(discriminators), std::move(columns)); + auto local_to_global_order = createLocalToGlobalOrder2(); + reorderColumns(local_to_global_order, columns); + reorderDiscriminators(local_to_global_order, discriminators); + return ColumnVariant::create(std::move(discriminators), std::move(columns), local_to_global_order); +} + +TEST(ColumnVariant, InsertFrom) +{ + for (bool change_order : {false, true}) + { + auto column_to = createVariantColumn1(change_order); + auto column_from = createVariantColumn2(change_order); + column_to->insertFrom(*column_from, 3); + ASSERT_EQ(column_to->globalDiscriminatorAt(5), 0); + ASSERT_EQ((*column_to)[5].get(), 43); + } +} + +TEST(ColumnVariant, InsertRangeFromOneColumnNoNulls) +{ + for (bool change_order : {false, true}) + { + auto column_to = createVariantColumn2(change_order); + auto column_from = createVariantWithOneFullColumNoNulls(5, change_order); + column_to->insertRangeFrom(*column_from, 2, 2); + ASSERT_EQ(column_to->globalDiscriminatorAt(7), 0); + ASSERT_EQ(column_to->globalDiscriminatorAt(8), 0); + ASSERT_EQ((*column_to)[7].get(), 2); + ASSERT_EQ((*column_to)[8].get(), 3); + } +} + +TEST(ColumnVariant, InsertRangeFromGeneral) +{ + for (bool change_order : {false, true}) + { + auto column_to = createVariantColumn1(change_order); + auto column_from = createVariantColumn2(change_order); + column_to->insertRangeFrom(*column_from, 1, 4); + ASSERT_EQ(column_to->globalDiscriminatorAt(5), 1); + ASSERT_EQ(column_to->globalDiscriminatorAt(6), ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(column_to->globalDiscriminatorAt(7), 0); + ASSERT_EQ(column_to->globalDiscriminatorAt(8), 1); + ASSERT_EQ((*column_to)[5].get(), "Hello"); + ASSERT_EQ((*column_to)[7].get(), 43); + 
ASSERT_EQ((*column_to)[8].get(), "World"); + } +} + +TEST(ColumnVariant, InsertManyFrom) +{ + for (bool change_order : {false, true}) + { + auto column_to = createVariantColumn1(change_order); + auto column_from = createVariantColumn2(change_order); + column_to->insertManyFrom(*column_from, 3, 2); + ASSERT_EQ(column_to->globalDiscriminatorAt(5), 0); + ASSERT_EQ(column_to->globalDiscriminatorAt(6), 0); + ASSERT_EQ((*column_to)[5].get(), 43); + ASSERT_EQ((*column_to)[6].get(), 43); + } +} + +TEST(ColumnVariant, PopBackOneColumnNoNulls) +{ + auto column = createVariantWithOneFullColumNoNulls(5, false); + column->popBack(3); + ASSERT_EQ(column->size(), 2); + ASSERT_EQ(column->getVariantByLocalDiscriminator(0).size(), 2); + ASSERT_EQ((*column)[0].get(), 0); + ASSERT_EQ((*column)[1].get(), 1); +} + +TEST(ColumnVariant, PopBackGeneral) +{ + auto column = ColumnVariant::create(createDiscriminators2(), createColumns2()); + column->popBack(4); + ASSERT_EQ(column->size(), 3); + ASSERT_EQ(column->getVariantByLocalDiscriminator(0).size(), 1); + ASSERT_EQ(column->getVariantByLocalDiscriminator(1).size(), 1); + ASSERT_EQ((*column)[0].get(), 42); + ASSERT_EQ((*column)[1].get(), "Hello"); + ASSERT_TRUE((*column)[2].isNull()); +} + +TEST(ColumnVariant, FilterOneColumnNoNulls) +{ + auto column = createVariantWithOneFullColumNoNulls(3, false); + IColumn::Filter filter; + filter.push_back(1); + filter.push_back(0); + filter.push_back(1); + auto filtered_column = column->filter(filter, -1); + ASSERT_EQ(filtered_column->size(), 2); + ASSERT_EQ((*filtered_column)[0].get(), 0); + ASSERT_EQ((*filtered_column)[1].get(), 2); +} + +TEST(ColumnVariant, FilterGeneral) +{ + auto column = ColumnVariant::create(createDiscriminators2(), createColumns2()); + IColumn::Filter filter; + filter.push_back(0); + filter.push_back(1); + filter.push_back(1); + filter.push_back(0); + filter.push_back(0); + filter.push_back(1); + filter.push_back(0); + auto filtered_column = column->filter(filter, -1); + ASSERT_EQ(filtered_column->size(), 3); + ASSERT_EQ((*filtered_column)[0].get(), "Hello"); + ASSERT_TRUE((*filtered_column)[1].isNull()); + ASSERT_TRUE((*filtered_column)[2].isNull()); +} + +TEST(ColumnVariant, PermuteAndIndexOneColumnNoNulls) +{ + auto column = createVariantWithOneFullColumNoNulls(4, false); + IColumn::Permutation permutation; + permutation.push_back(1); + permutation.push_back(3); + permutation.push_back(2); + permutation.push_back(0); + auto permuted_column = column->permute(permutation, 3); + ASSERT_EQ(permuted_column->size(), 3); + ASSERT_EQ((*permuted_column)[0].get(), 1); + ASSERT_EQ((*permuted_column)[1].get(), 3); + ASSERT_EQ((*permuted_column)[2].get(), 2); + + auto index = ColumnUInt64::create(); + index->getData() = std::move(permutation); + auto indexed_column = column->index(*index, 3); + ASSERT_EQ(indexed_column->size(), 3); + ASSERT_EQ((*indexed_column)[0].get(), 1); + ASSERT_EQ((*indexed_column)[1].get(), 3); + ASSERT_EQ((*indexed_column)[2].get(), 2); +} + +TEST(ColumnVariant, PermuteGeneral) +{ + auto column = ColumnVariant::create(createDiscriminators2(), createColumns2()); + IColumn::Permutation permutation; + permutation.push_back(3); + permutation.push_back(4); + permutation.push_back(1); + permutation.push_back(5); + auto permuted_column = column->permute(permutation, 4); + ASSERT_EQ(permuted_column->size(), 4); + ASSERT_EQ((*permuted_column)[0].get(), 43); + ASSERT_EQ((*permuted_column)[1].get(), "World"); + ASSERT_EQ((*permuted_column)[2].get(), "Hello"); + 
ASSERT_TRUE((*permuted_column)[3].isNull()); +} + +TEST(ColumnVariant, ReplicateOneColumnNoNull) +{ + auto column = createVariantWithOneFullColumNoNulls(3, false); + IColumn::Offsets offsets; + offsets.push_back(0); + offsets.push_back(3); + offsets.push_back(6); + auto replicated_column = column->replicate(offsets); + ASSERT_EQ(replicated_column->size(), 6); + ASSERT_EQ((*replicated_column)[0].get(), 1); + ASSERT_EQ((*replicated_column)[1].get(), 1); + ASSERT_EQ((*replicated_column)[2].get(), 1); + ASSERT_EQ((*replicated_column)[3].get(), 2); + ASSERT_EQ((*replicated_column)[4].get(), 2); + ASSERT_EQ((*replicated_column)[5].get(), 2); +} + +TEST(ColumnVariant, ReplicateGeneral) +{ + auto column = ColumnVariant::create(createDiscriminators1(), createColumns1()); + IColumn::Offsets offsets; + offsets.push_back(1); + offsets.push_back(3); + offsets.push_back(5); + offsets.push_back(5); + offsets.push_back(7); + auto replicated_column = column->replicate(offsets); + ASSERT_EQ(replicated_column->size(), 7); + ASSERT_EQ((*replicated_column)[0].get(), 42); + ASSERT_EQ((*replicated_column)[1].get(), "Hello"); + ASSERT_EQ((*replicated_column)[2].get(), "Hello"); + ASSERT_TRUE((*replicated_column)[3].isNull()); + ASSERT_TRUE((*replicated_column)[4].isNull()); + ASSERT_TRUE((*replicated_column)[5].isNull()); + ASSERT_TRUE((*replicated_column)[6].isNull()); +} + +TEST(ColumnVariant, ScatterOneColumnNoNulls) +{ + auto column = createVariantWithOneFullColumNoNulls(5, false); + IColumn::Selector selector; + selector.push_back(0); + selector.push_back(1); + selector.push_back(2); + selector.push_back(0); + selector.push_back(1); + auto columns = column->scatter(3, selector); + ASSERT_EQ(columns[0]->size(), 2); + ASSERT_EQ((*columns[0])[0].get(), 0); + ASSERT_EQ((*columns[0])[1].get(), 3); + ASSERT_EQ(columns[1]->size(), 2); + ASSERT_EQ((*columns[1])[0].get(), 1); + ASSERT_EQ((*columns[1])[1].get(), 4); + ASSERT_EQ(columns[2]->size(), 1); + ASSERT_EQ((*columns[2])[0].get(), 2); +} + +TEST(ColumnVariant, ScatterGeneral) +{ + auto column = ColumnVariant::create(createDiscriminators2(), createColumns2()); + IColumn::Selector selector; + selector.push_back(0); + selector.push_back(0); + selector.push_back(2); + selector.push_back(0); + selector.push_back(1); + selector.push_back(2); + selector.push_back(1); + auto columns = column->scatter(3, selector); + ASSERT_EQ(columns[0]->size(), 3); + ASSERT_EQ((*columns[0])[0].get(), 42); + ASSERT_EQ((*columns[0])[1].get(), "Hello"); + ASSERT_EQ((*columns[0])[2].get(), 43); + ASSERT_EQ(columns[1]->size(), 2); + ASSERT_EQ((*columns[1])[0].get(), "World"); + ASSERT_EQ((*columns[1])[1].get(), 44); + ASSERT_EQ(columns[2]->size(), 2); + ASSERT_TRUE((*columns[2])[0].isNull()); + ASSERT_TRUE((*columns[2])[1].isNull()); +} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 7e50a81ada8..0151dcb982d 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -816,6 +816,7 @@ class IColumn; M(Bool, function_json_value_return_type_allow_complex, false, "Allow function JSON_VALUE to return complex type, such as: struct, array, map.", 0) \ M(Bool, use_with_fill_by_sorting_prefix, true, "Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. 
Rows with different values in sorting prefix are filled independently", 0) \ M(Bool, optimize_uniq_to_count, true, "Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause.", 0) \ + M(Bool, use_variant_when_no_common_type_in_if, false, "Use Variant as a result type for if/multiIf in case when there is no common type for arguments", 0) \ \ /** Experimental functions */ \ M(Bool, allow_experimental_materialized_postgresql_table, false, "Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental", 0) \ @@ -823,6 +824,7 @@ class IColumn; M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \ M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ + M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \ M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \ M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \ diff --git a/src/Core/TypeId.h b/src/Core/TypeId.h index 9c634d2321c..7003e880cd5 100644 --- a/src/Core/TypeId.h +++ b/src/Core/TypeId.h @@ -49,6 +49,7 @@ enum class TypeIndex IPv4, IPv6, JSONPaths, + Variant, }; /** diff --git a/src/DataTypes/DataTypeFactory.cpp b/src/DataTypes/DataTypeFactory.cpp index 415f24d8151..d154b386ace 100644 --- a/src/DataTypes/DataTypeFactory.cpp +++ b/src/DataTypes/DataTypeFactory.cpp @@ -290,6 +290,7 @@ DataTypeFactory::DataTypeFactory() registerDataTypeDomainGeo(*this); registerDataTypeMap(*this); registerDataTypeObject(*this); + registerDataTypeVariant(*this); } DataTypeFactory & DataTypeFactory::instance() diff --git a/src/DataTypes/DataTypeFactory.h b/src/DataTypes/DataTypeFactory.h index ba7c1a3d7fe..a2aeb6f3646 100644 --- a/src/DataTypes/DataTypeFactory.h +++ b/src/DataTypes/DataTypeFactory.h @@ -100,5 +100,6 @@ void registerDataTypeDomainBool(DataTypeFactory & factory); void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory); void registerDataTypeDomainGeo(DataTypeFactory & factory); void registerDataTypeObject(DataTypeFactory & factory); +void registerDataTypeVariant(DataTypeFactory & factory); } diff --git a/src/DataTypes/DataTypeNullable.cpp b/src/DataTypes/DataTypeNullable.cpp index 41a9a1de543..484d779551f 100644 --- a/src/DataTypes/DataTypeNullable.cpp +++ b/src/DataTypes/DataTypeNullable.cpp @@ -114,5 +114,33 @@ DataTypePtr makeNullableOrLowCardinalityNullable(const DataTypePtr & type) return std::make_shared(type); } +DataTypePtr makeNullableOrLowCardinalityNullableSafe(const DataTypePtr & type) +{ + if (isNullableOrLowCardinalityNullable(type)) + return type; + + if (type->lowCardinality()) + { + const auto & dictionary_type = assert_cast(*type).getDictionaryType(); + return std::make_shared(makeNullable(dictionary_type)); + } + + return makeNullableSafe(type); +} + +DataTypePtr removeNullableOrLowCardinalityNullable(const DataTypePtr & type) +{ + if (type->isNullable()) + return static_cast(*type).getNestedType(); + + if (type->isLowCardinalityNullable()) + { 
+ auto dict_type = removeNullable(static_cast(*type).getDictionaryType()); + return std::make_shared(dict_type); + } + + return type; + +} } diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index 06d46fb15ed..7ad0e1ba5f1 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -54,5 +54,8 @@ DataTypePtr makeNullable(const DataTypePtr & type); DataTypePtr makeNullableSafe(const DataTypePtr & type); DataTypePtr removeNullable(const DataTypePtr & type); DataTypePtr makeNullableOrLowCardinalityNullable(const DataTypePtr & type); +DataTypePtr makeNullableOrLowCardinalityNullableSafe(const DataTypePtr & type); +/// Nullable(T) -> T, LowCardinality(Nullable(T)) -> T +DataTypePtr removeNullableOrLowCardinalityNullable(const DataTypePtr & type); } diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index fd2e5e6a784..df9af203618 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -172,11 +173,15 @@ MutableColumnPtr DataTypeTuple::createColumn() const MutableColumnPtr DataTypeTuple::createColumn(const ISerialization & serialization) const { + /// If we read Tuple as Variant subcolumn, it may be wrapped to SerializationVariantElement. + /// Here we don't need it, so we drop this wrapper. + const auto * current_serialization = &serialization; + while (const auto * serialization_variant_element = typeid_cast(current_serialization)) + current_serialization = serialization_variant_element->getNested().get(); + /// If we read subcolumn of nested Tuple, it may be wrapped to SerializationNamed /// several times to allow to reconstruct the substream path name. /// Here we don't need substream path name, so we drop first several wrapper serializations. - - const auto * current_serialization = &serialization; while (const auto * serialization_named = typeid_cast(current_serialization)) current_serialization = serialization_named->getNested().get(); diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp new file mode 100644 index 00000000000..77e1c504cf8 --- /dev/null +++ b/src/DataTypes/DataTypeVariant.cpp @@ -0,0 +1,197 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int EMPTY_DATA_PASSED; +} + + +DataTypeVariant::DataTypeVariant(const DataTypes & variants_) +{ + /// Sort nested types by their full names and squash identical types. + std::map name_to_type; + for (const auto & type : variants_) + { + /// Nullable(...), LowCardinality(Nullable(...)) and Variant(...) types are not allowed inside Variant type. + if (isNullableOrLowCardinalityNullable(type)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nullable/LowCardinality(Nullable) types are not allowed inside Variant type"); + if (type->getTypeId() == TypeIndex::Variant) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nested Variant types are not allowed"); + /// Don't use Nothing type as a variant. 
+ if (!isNothing(type)) + name_to_type[type->getName()] = type; + } + + variants.reserve(name_to_type.size()); + for (const auto & [_, type] : name_to_type) + variants.push_back(type); + + if (variants.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty"); + + if (variants.size() > ColumnVariant::MAX_NESTED_COLUMNS) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Variant type with more than {} nested types is not allowed", ColumnVariant::MAX_NESTED_COLUMNS); +} + +std::string DataTypeVariant::doGetName() const +{ + size_t size = variants.size(); + WriteBufferFromOwnString s; + + s << "Variant("; + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + s << ", "; + + s << variants[i]->getName(); + } + s << ")"; + + return s.str(); +} + +std::string DataTypeVariant::doGetPrettyName(size_t indent) const +{ + size_t size = variants.size(); + WriteBufferFromOwnString s; + s << "Variant(\n"; + + for (size_t i = 0; i != size; ++i) + { + if (i != 0) + s << ",\n"; + + s << fourSpaceIndent(indent + 1) << variants[i]->getPrettyName(indent + 1); + } + + s << '\n' << fourSpaceIndent(indent) << ')'; + return s.str(); +} + +MutableColumnPtr DataTypeVariant::createColumn() const +{ + size_t size = variants.size(); + MutableColumns nested_columns; + nested_columns.reserve(size); + for (size_t i = 0; i < size; ++i) + nested_columns.push_back(variants[i]->createColumn()); + + return ColumnVariant::create(std::move(nested_columns)); +} + + +Field DataTypeVariant::getDefault() const +{ + return Null(); +} + +bool DataTypeVariant::equals(const IDataType & rhs) const +{ + if (typeid(rhs) != typeid(*this)) + return false; + + const DataTypeVariant & rhs_variant = static_cast(rhs); + + size_t size = variants.size(); + if (size != rhs_variant.variants.size()) + return false; + + for (size_t i = 0; i < size; ++i) + if (!variants[i]->equals(*rhs_variant.variants[i])) + return false; + + return true; +} + +bool DataTypeVariant::textCanContainOnlyValidUTF8() const +{ + return std::all_of(variants.begin(), variants.end(), [](auto && elem) { return elem->textCanContainOnlyValidUTF8(); }); +} + +bool DataTypeVariant::haveMaximumSizeOfValue() const +{ + return std::all_of(variants.begin(), variants.end(), [](auto && elem) { return elem->haveMaximumSizeOfValue(); }); +} + +bool DataTypeVariant::hasDynamicSubcolumns() const +{ + return std::any_of(variants.begin(), variants.end(), [](auto && elem) { return elem->hasDynamicSubcolumns(); }); +} + +std::optional DataTypeVariant::tryGetVariantDiscriminator(const DataTypePtr & type) const +{ + String type_name = type->getName(); + for (size_t i = 0; i != variants.size(); ++i) + { + /// We don't use equals here, because it doesn't respect custom type names. 
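Editor's note on the hunk above: the DataTypeVariant constructor normalizes its input (duplicates are squashed, Nothing is dropped, the remaining variants are ordered by full type name), and the position of a variant in that sorted list is its global discriminator. tryGetVariantDiscriminator deliberately compares full type names rather than using equals(), so types that differ only by a custom name (for example Bool, which is implemented on top of UInt8) are still told apart. A minimal sketch of the resulting behaviour, not part of the patch, using only the interfaces added here plus DataTypeFactory:

    #include <DataTypes/DataTypeVariant.h>
    #include <DataTypes/DataTypeFactory.h>

    void variantDiscriminatorSketch()
    {
        using namespace DB;
        auto & factory = DataTypeFactory::instance();

        /// Duplicates are squashed and variants are sorted by their full type names.
        DataTypeVariant variant({factory.get("UInt64"), factory.get("String"), factory.get("UInt64")});
        /// variant.getName() == "Variant(String, UInt64)"

        /// Global discriminators follow the sorted order; lookup is by full type name.
        auto string_discr = variant.tryGetVariantDiscriminator(factory.get("String")); /// 0
        auto uint64_discr = variant.tryGetVariantDiscriminator(factory.get("UInt64")); /// 1
        auto missing      = variant.tryGetVariantDiscriminator(factory.get("IPv4"));   /// std::nullopt
    }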
+ if (variants[i]->getName() == type_name) + return i; + } + + return std::nullopt; +} + +size_t DataTypeVariant::getMaximumSizeOfValueInMemory() const +{ + size_t max_size = 0; + for (const auto & elem : variants) + { + size_t elem_max_size = elem->getMaximumSizeOfValueInMemory(); + if (elem_max_size > max_size) + max_size = elem_max_size; + } + return max_size; +} + +SerializationPtr DataTypeVariant::doGetDefaultSerialization() const +{ + SerializationVariant::VariantSerializations serializations; + serializations.reserve(variants.size()); + Names variant_names; + variant_names.reserve(variants.size()); + + for (const auto & variant : variants) + { + serializations.push_back(variant->getDefaultSerialization()); + variant_names.push_back(variant->getName()); + } + + return std::make_shared(std::move(serializations), std::move(variant_names), SerializationVariant::getVariantsDeserializeTextOrder(variants), getName()); +} + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty"); + + DataTypes nested_types; + nested_types.reserve(arguments->children.size()); + + for (const ASTPtr & child : arguments->children) + nested_types.emplace_back(DataTypeFactory::instance().get(child)); + + return std::make_shared(nested_types); +} + + +void registerDataTypeVariant(DataTypeFactory & factory) +{ + factory.registerDataType("Variant", create); +} + +} diff --git a/src/DataTypes/DataTypeVariant.h b/src/DataTypes/DataTypeVariant.h new file mode 100644 index 00000000000..60113a188b0 --- /dev/null +++ b/src/DataTypes/DataTypeVariant.h @@ -0,0 +1,64 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +/** Variant data type. + * This type represents a union of other data types. + * For example, type Variant(T1, T2, ..., TN) means that each row of this type + * has a value of either type T1 or T2 or ... or TN or none of them (NULL value). + * Nullable(...), LowCardinality(Nullable(...)) and Variant(...) types are not allowed + * inside Variant type. + * The order of nested types doesn't matter: Variant(T1, T2) = Variant(T2, T1). + * To have global order of nested types we sort variants by type names on Variant creation. + * The index of a variant in a sorted list is called global variant discriminator. 
+ */ +class DataTypeVariant final : public IDataType +{ +private: + DataTypes variants; + +public: + static constexpr bool is_parametric = true; + + explicit DataTypeVariant(const DataTypes & variants_); + + TypeIndex getTypeId() const override { return TypeIndex::Variant; } + const char * getFamilyName() const override { return "Variant"; } + + bool canBeInsideNullable() const override { return false; } + bool supportsSparseSerialization() const override { return false; } + bool canBeInsideSparseColumns() const override { return false; } + + MutableColumnPtr createColumn() const override; + + Field getDefault() const override; + + bool equals(const IDataType & rhs) const override; + + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return true; } + bool textCanContainOnlyValidUTF8() const override; + bool haveMaximumSizeOfValue() const override; + bool hasDynamicSubcolumns() const override; + size_t getMaximumSizeOfValueInMemory() const override; + + const DataTypePtr & getVariant(size_t i) const { return variants[i]; } + const DataTypes & getVariants() const { return variants; } + + /// Check if Variant has provided type in the list of variants and return its discriminator. + std::optional tryGetVariantDiscriminator(const DataTypePtr & type) const; + +private: + std::string doGetName() const override; + std::string doGetPrettyName(size_t indent) const override; + SerializationPtr doGetDefaultSerialization() const override; +}; + +} + diff --git a/src/DataTypes/EnumValues.cpp b/src/DataTypes/EnumValues.cpp index 9df49e765a7..8a4b1304d5e 100644 --- a/src/DataTypes/EnumValues.cpp +++ b/src/DataTypes/EnumValues.cpp @@ -74,6 +74,27 @@ T EnumValues::getValue(StringRef field_name, bool try_treat_as_id) const return it->getMapped(); } +template +bool EnumValues::tryGetValue(T & x, StringRef field_name, bool try_treat_as_id) const +{ + const auto it = name_to_value_map.find(field_name); + if (!it) + { + /// It is used in CSV and TSV input formats. If we fail to find given string in + /// enum names, we will try to treat it as enum id. 
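Editor's note on EnumValues::tryGetValue above: it mirrors getValue but reports failure instead of throwing, and with try_treat_as_id it falls back to parsing the string as a numeric enum id (used by the CSV and TSV input formats), succeeding only if that id is actually registered. A minimal usage sketch, not part of the patch, assuming EnumValues is constructed from (name, value) pairs as in DataTypeEnum:

    #include <DataTypes/EnumValues.h>

    void enumTryGetValueSketch()
    {
        using namespace DB;
        EnumValues<Int8> values({{"hello", 1}, {"world", 2}});

        Int8 x;
        bool a = values.tryGetValue(x, "world");                         /// true,  x == 2
        bool b = values.tryGetValue(x, "unknown");                       /// false, nothing thrown
        bool c = values.tryGetValue(x, "1", /*try_treat_as_id=*/ true);  /// true,  "1" parsed as the id of "hello"
        bool d = values.tryGetValue(x, "42", /*try_treat_as_id=*/ true); /// false, 42 is not a registered value
    }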
+ if (try_treat_as_id) + { + ReadBufferFromMemory tmp_buf(field_name.data, field_name.size); + if (!tryReadText(x, tmp_buf) || !tmp_buf.eof() || !value_to_name_map.contains(x)) + return false; + return true; + } + return false; + } + x = it->getMapped(); + return true; +} + template Names EnumValues::getAllRegisteredNames() const { diff --git a/src/DataTypes/EnumValues.h b/src/DataTypes/EnumValues.h index 5189f7a56f5..889878bc60f 100644 --- a/src/DataTypes/EnumValues.h +++ b/src/DataTypes/EnumValues.h @@ -7,7 +7,7 @@ namespace DB { -namespace ErrorCodes +namespace ErrorCodesEnumValues { extern const int BAD_ARGUMENTS; } @@ -42,6 +42,11 @@ public: return it; } + bool hasValue(const T & value) const + { + return value_to_name_map.contains(value); + } + /// throws exception if value is not valid const StringRef & getNameForValue(const T & value) const { @@ -60,6 +65,7 @@ public: } T getValue(StringRef field_name, bool try_treat_as_id = false) const; + bool tryGetValue(T & x, StringRef field_name, bool try_treat_as_id = false) const; template bool containsAll(const TValues & rhs_values) const diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index 2a7e0f246de..392c56343e3 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -109,11 +109,26 @@ Ptr IDataType::getForSubcolumn( bool throw_if_null) const { Ptr res; - forEachSubcolumn([&](const auto &, const auto & name, const auto & subdata) + + ISerialization::StreamCallback callback_with_data = [&](const auto & subpath) { - if (name == subcolumn_name) - res = subdata.*member; - }, data); + for (size_t i = 0; i < subpath.size(); ++i) + { + size_t prefix_len = i + 1; + if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, prefix_len)) + { + auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len); + /// Create data from path only if it's requested subcolumn. 
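Editor's note on the IDataType::getForSubcolumn rework above: instead of going through forEachSubcolumn, it now enumerates the serialization's substream paths directly and materializes SubstreamData (via createFromPath) only for the path whose subcolumn name matches the request. From the outside this is what backs the public subcolumn accessors; a minimal sketch, not part of the patch:

    #include <DataTypes/DataTypeFactory.h>

    void subcolumnLookupSketch()
    {
        using namespace DB;
        auto type = DataTypeFactory::instance().get("Nullable(UInt64)");

        /// Both calls are resolved through getForSubcolumn.
        auto null_map_type = type->getSubcolumnType("null");     /// the UInt8 null mask subcolumn
        auto missing       = type->tryGetSubcolumnType("nope");  /// nullptr, no exception
    }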
+ if (name == subcolumn_name) + res = ISerialization::createFromPath(subpath, prefix_len).*member; + } + subpath[i].visited = true; + } + }; + + ISerialization::EnumerateStreamsSettings settings; + settings.position_independent_encoding = false; + data.serialization->enumerateStreams(settings, callback_with_data, data); if (!res && throw_if_null) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index eabf066bc3d..ccdf54f57c3 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -412,6 +412,8 @@ struct WhichDataType constexpr bool isSimple() const { return isInt() || isUInt() || isFloat() || isString(); } constexpr bool isLowCardinality() const { return idx == TypeIndex::LowCardinality; } + + constexpr bool isVariant() const { return idx == TypeIndex::Variant; } }; /// IDataType helpers (alternative for IDataType virtual methods with single point of truth) @@ -464,6 +466,7 @@ template inline bool isTuple(const T & data_type) { return WhichDat template inline bool isMap(const T & data_type) {return WhichDataType(data_type).isMap(); } template inline bool isInterval(const T & data_type) {return WhichDataType(data_type).isInterval(); } template inline bool isObject(const T & data_type) { return WhichDataType(data_type).isObject(); } +template inline bool isVariant(const T & data_type) { return WhichDataType(data_type).isVariant(); } template inline bool isNothing(const T & data_type) { return WhichDataType(data_type).isNothing(); } diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index e70dc6a2380..86a37949dc8 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -55,6 +55,9 @@ String ISerialization::Substream::toString() const return fmt::format("TupleElement({}, escape_tuple_delimiter = {})", tuple_element_name, escape_tuple_delimiter ? "true" : "false"); + if (type == VariantElement) + return fmt::format("VariantElement({})", variant_element_name); + return String(magic_enum::enum_name(type)); } @@ -172,6 +175,10 @@ String getNameForSubstreamPath( else stream_name += "." + it->tuple_element_name; } + else if (it->type == Substream::VariantDiscriminators) + stream_name += ".discr"; + else if (it->type == Substream::VariantElement) + stream_name += "." + it->variant_element_name; } return stream_name; @@ -252,6 +259,45 @@ bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path) return true; } +#define TRY_DESERIALIZE_TEXT(deserialize) \ + size_t prev_size = column.size(); \ + try \ + { \ + deserialize(column, istr, settings); \ + return true; \ + } \ + catch (...) 
\ + { \ + if (column.size() > prev_size) \ + column.popBack(column.size() - prev_size); \ + return false; \ + } \ + +bool ISerialization::tryDeserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + TRY_DESERIALIZE_TEXT(deserializeTextCSV) +} + +bool ISerialization::tryDeserializeTextEscaped(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + TRY_DESERIALIZE_TEXT(deserializeTextEscaped) +} + +bool ISerialization::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + TRY_DESERIALIZE_TEXT(deserializeTextJSON) +} + +bool ISerialization::tryDeserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + TRY_DESERIALIZE_TEXT(deserializeTextQuoted) +} + +bool ISerialization::tryDeserializeWholeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + TRY_DESERIALIZE_TEXT(deserializeWholeText) +} + void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String field; @@ -261,6 +307,15 @@ void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, con deserializeWholeText(column, buf, settings); } +bool ISerialization::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + /// Read until \t or \n. + readString(field, istr); + ReadBufferFromString buf(field); + return tryDeserializeWholeText(column, buf, settings); +} + void ISerialization::serializeTextMarkdown( const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const { @@ -288,7 +343,9 @@ bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t pref size_t last_elem = prefix_len - 1; return path[last_elem].type == Substream::NullMap || path[last_elem].type == Substream::TupleElement - || path[last_elem].type == Substream::ArraySizes; + || path[last_elem].type == Substream::ArraySizes + || path[last_elem].type == Substream::VariantDiscriminators + || path[last_elem].type == Substream::VariantElement; } ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len) @@ -317,6 +374,8 @@ void ISerialization::throwUnexpectedDataAfterParsedValue(IColumn & column, ReadB { WriteBufferFromOwnString ostr; serializeText(column, column.size() - 1, ostr, settings); + /// Restore correct column size. + column.popBack(1); throw Exception( ErrorCodes::UNEXPECTED_DATA_AFTER_PARSED_VALUE, "Unexpected data '{}' after parsed {} value '{}'", diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 030c3c6d81e..f0273f59d1f 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -152,6 +152,10 @@ public: ObjectStructure, ObjectData, + VariantDiscriminators, + VariantElements, + VariantElement, + Regular, }; @@ -160,6 +164,9 @@ public: /// Index of tuple element, starting at 1 or name. String tuple_element_name; + /// The name of a variant element type. + String variant_element_name; + /// Do we need to escape a dot in filenames for tuple elements. 
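Editor's note on the TRY_DESERIALIZE_TEXT wrappers above: they give every serialization a default non-throwing variant of its text parsers by running the throwing parser and, on failure, rolling the column back to its previous size and returning false. That lets callers probe several candidate types against the same text without exception overhead, which is the kind of loop the Variant text parsing can build on (SerializationVariant::getVariantsDeserializeTextOrder earlier in this patch establishes such an order). A minimal caller-side sketch, not part of the patch; serializations, columns, field and format_settings are placeholders here:

    /// Try candidate serializations in some preferred order until one accepts the text.
    /// `serializations`, `columns`, `field`, `format_settings` are hypothetical
    /// (e.g. Serializations, MutableColumns, String, FormatSettings).
    bool parsed = false;
    for (size_t i = 0; i < serializations.size() && !parsed; ++i)
    {
        DB::ReadBufferFromString buf(field);  /// re-read the same text for each candidate
        parsed = serializations[i]->tryDeserializeWholeText(*columns[i], buf, format_settings);
    }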
bool escape_tuple_delimiter = true; @@ -320,17 +327,20 @@ public: virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization as a literal that may be inserted into a query. */ virtual void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization for the CSV format. */ virtual void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization for displaying on a terminal or saving into a text file, and the like. * Without escaping or quoting. @@ -340,11 +350,13 @@ public: /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters. */ virtual void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization intended for using in JSON format. */ virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; virtual void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t /*indent*/) const { serializeTextJSON(column, row_num, ostr, settings); @@ -364,6 +376,7 @@ public: * additional code in data types serialization and ReadHelpers. 
*/ virtual void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + virtual bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; virtual void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; virtual void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index c804f58c567..be23278ef25 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -417,9 +417,11 @@ static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffe } -template -static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed) +template +static ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed) { + static constexpr bool throw_exception = std::is_same_v; + ColumnArray & column_array = assert_cast(column); ColumnArray::Offsets & offsets = column_array.getOffsets(); @@ -431,7 +433,18 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r if (checkChar('[', istr)) has_braces = true; else if (!allow_unenclosed) - throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character"); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character"); + return ReturnType(false); + } + + auto on_error_no_throw = [&]() + { + if (size) + nested_column.popBack(size); + return ReturnType(false); + }; try { @@ -441,11 +454,17 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r if (!first) { if (*istr.position() == ',') + { ++istr.position(); + } else - throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, - "Cannot read array from text, expected comma or end of array, found '{}'", - *istr.position()); + { + if constexpr (throw_exception) + throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, + "Cannot read array from text, expected comma or end of array, found '{}'", + *istr.position()); + return on_error_no_throw(); + } } first = false; @@ -455,25 +474,42 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r if (*istr.position() == ']') break; - read_nested(nested_column); + if constexpr (throw_exception) + read_nested(nested_column); + else if (!read_nested(nested_column)) + return on_error_no_throw(); + ++size; skipWhitespaceIfAny(istr); } if (has_braces) - assertChar(']', istr); + { + if constexpr (throw_exception) + assertChar(']', istr); + else if (!checkChar(']', istr)) + return on_error_no_throw(); + } else /// If array is not enclosed in braces, we read until EOF. - assertEOF(istr); + { + if constexpr (throw_exception) + assertEOF(istr); + else if (!istr.eof()) + return on_error_no_throw(); + } } catch (...) 
{ if (size) nested_column.popBack(size); - throw; + if constexpr (throw_exception) + throw; + return ReturnType(false); } offsets.push_back(offsets.back() + size); + return ReturnType(true); } @@ -492,8 +528,8 @@ void SerializationArray::deserializeText(IColumn & column, ReadBuffer & istr, co deserializeTextImpl(column, istr, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextQuotedImpl(nested_column, istr, settings, nested); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(nested_column, istr, settings, nested); else nested->deserializeTextQuoted(nested_column, istr, settings); }, false); @@ -502,6 +538,29 @@ void SerializationArray::deserializeText(IColumn & column, ReadBuffer & istr, co throwUnexpectedDataAfterParsedValue(column, istr, settings, "Array"); } +bool SerializationArray::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(nested_column, istr, settings, nested); + return nested->tryDeserializeTextQuoted(nested_column, istr, settings); + }; + + bool ok = deserializeTextImpl(column, istr, std::move(read_nested), false); + + if (!ok) + return false; + + if (whole && !istr.eof()) + { + column.popBack(1); + return false; + } + + return true; +} + void SerializationArray::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { const ColumnArray & column_array = assert_cast(column); @@ -557,13 +616,25 @@ void SerializationArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr deserializeTextImpl(column, istr, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextJSONImpl(nested_column, istr, settings, nested); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); else nested->deserializeTextJSON(nested_column, istr, settings); }, false); } +bool SerializationArray::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); + return nested->tryDeserializeTextJSON(nested_column, istr, settings); + }; + + return deserializeTextImpl(column, istr, std::move(read_nested), false); +} + void SerializationArray::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -606,8 +677,8 @@ void SerializationArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, deserializeTextImpl(column, rb, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextCSVImpl(nested_column, rb, settings, nested); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(nested_column, rb, settings, nested); 
else nested->deserializeTextCSV(nested_column, rb, settings); }, true); @@ -617,12 +688,43 @@ void SerializationArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, deserializeTextImpl(column, rb, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextQuotedImpl(nested_column, rb, settings, nested); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(nested_column, rb, settings, nested); else nested->deserializeTextQuoted(nested_column, rb, settings); }, true); } } +bool SerializationArray::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + if (!tryReadCSV(s, istr, settings.csv)) + return false; + ReadBufferFromString rb(s); + + if (settings.csv.arrays_as_nested_csv) + { + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(nested_column, rb, settings, nested); + return nested->tryDeserializeTextCSV(nested_column, rb, settings); + }; + + return deserializeTextImpl(column, rb, read_nested, true); + } + else + { + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(nested_column, rb, settings, nested); + return nested->tryDeserializeTextQuoted(nested_column, rb, settings); + }; + + return deserializeTextImpl(column, rb, read_nested, true); + } +} + } diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index de331169db5..82f5e8bce45 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -20,15 +20,18 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Streaming serialization of arrays is arranged in a special way: * - elements placed in a row are written/read without array sizes; diff --git a/src/DataTypes/Serializations/SerializationBool.cpp 
b/src/DataTypes/Serializations/SerializationBool.cpp index 41b5bf806e5..f745fac4d30 100644 --- a/src/DataTypes/Serializations/SerializationBool.cpp +++ b/src/DataTypes/Serializations/SerializationBool.cpp @@ -150,30 +150,42 @@ bool tryDeserializeAllVariants(ColumnUInt8 * column, ReadBuffer & istr) return true; } -void deserializeImpl( +template +ReturnType deserializeImpl( IColumn & column, ReadBuffer & istr, const FormatSettings & settings, std::function check_end_of_value) { + static constexpr bool throw_exception = std::is_same_v; + ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); + auto restore_column_if_needed = [&, prev_size = col->size()]() + { + if (col->size() > prev_size) + col->popBack(1); + }; PeekableReadBuffer buf(istr); buf.setCheckpoint(); if (checkString(settings.bool_true_representation, buf) && check_end_of_value(buf)) { col->insert(true); - return; + return ReturnType(true); } buf.rollbackToCheckpoint(); if (checkString(settings.bool_false_representation, buf) && check_end_of_value(buf)) { - col->insert(false); buf.dropCheckpoint(); if (buf.hasUnreadData()) - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " - "bool_true_representation or bool_false_representation contains some delimiters of input format"); - return; + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + return ReturnType(false); + } + col->insert(false); + return ReturnType(true); } buf.rollbackToCheckpoint(); @@ -181,22 +193,31 @@ void deserializeImpl( { buf.dropCheckpoint(); if (buf.hasUnreadData()) - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " - "bool_true_representation or bool_false_representation contains some delimiters of input format"); - return; + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data. 
It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + restore_column_if_needed(); + return ReturnType(false); + } + return ReturnType(true); } buf.makeContinuousMemoryFromCheckpointToPos(); buf.rollbackToCheckpoint(); - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot parse boolean value here: '{}', should be '{}' or '{}' controlled by setting bool_true_representation and " - "bool_false_representation or one of " - "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0", - String(buf.position(), std::min(10lu, buf.available())), - settings.bool_true_representation, settings.bool_false_representation); + restore_column_if_needed(); + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot parse boolean value here: '{}', should be '{}' or '{}' controlled by setting bool_true_representation and " + "bool_false_representation or one of " + "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0", + String(buf.position(), std::min(10lu, buf.available())), + settings.bool_true_representation, settings.bool_false_representation); + + return ReturnType(false); } } @@ -225,6 +246,14 @@ void SerializationBool::deserializeTextEscaped(IColumn & column, ReadBuffer & is deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); } +bool SerializationBool::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); +} + void SerializationBool::serializeTextJSON(const IColumn &column, size_t row_num, WriteBuffer &ostr, const FormatSettings &settings) const { serializeSimple(column, row_num, ostr, settings); @@ -250,6 +279,33 @@ void SerializationBool::deserializeTextJSON(IColumn &column, ReadBuffer &istr, c col->insert(value); } +bool SerializationBool::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + if (istr.eof()) + return false; + + ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); + bool value = false; + char first_char = *istr.position(); + if (first_char == 't' || first_char == 'f') + { + if (!readBoolTextWord(value, istr)) + return false; + } + else if (first_char == '1' || first_char == '0') + { + /// Doesn't throw. 
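Editor's note on the templated deserializeImpl above: the bool parsers now work in both modes, with the throwing overloads keeping their error messages while the try variants restore the column and return false for any input that is not one of the accepted spellings (the configured bool_true_representation / bool_false_representation, true/false, 1/0, or the T/F/Y/N/Yes/No/On/Off/Enable(d)/Disable(d) words). A minimal sketch, not part of the patch:

    #include <DataTypes/DataTypeFactory.h>
    #include <IO/ReadBufferFromString.h>
    #include <Formats/FormatSettings.h>

    void boolTryParseSketch()
    {
        using namespace DB;
        auto bool_type = DataTypeFactory::instance().get("Bool");
        auto column = bool_type->createColumn();
        auto serialization = bool_type->getDefaultSerialization();
        FormatSettings settings;

        ReadBufferFromString ok("yes");
        bool parsed = serialization->tryDeserializeWholeText(*column, ok, settings);   /// true, inserts 1

        ReadBufferFromString bad("maybe");
        bool failed = !serialization->tryDeserializeWholeText(*column, bad, settings); /// true, column left unchanged
    }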
+ readBoolText(value, istr); + } + else + { + return false; + } + + col->insert(value); + return true; +} + void SerializationBool::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeCustom(column, row_num, ostr, settings); @@ -263,6 +319,14 @@ void SerializationBool::deserializeTextCSV(IColumn & column, ReadBuffer & istr, deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r'; }); } +bool SerializationBool::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r'; }); +} + void SerializationBool::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeCustom(column, row_num, ostr, settings); @@ -276,15 +340,30 @@ void SerializationBool::deserializeTextRaw(IColumn & column, ReadBuffer & istr, deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); } +bool SerializationBool::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); +} + void SerializationBool::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeSimple(column, row_num, ostr, settings); } -void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +template +ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) { + static constexpr bool throw_exception = std::is_same_v; + if (istr.eof()) - throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); + return ReturnType(false); + } auto * col = checkAndGetDeserializeColumnType(column); @@ -292,11 +371,17 @@ void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & ist switch (symbol) { case 't': - assertStringCaseInsensitive("true", istr); + if constexpr (throw_exception) + assertStringCaseInsensitive("true", istr); + else if (!checkStringCaseInsensitive("true", istr)) + return ReturnType(false); col->insert(true); break; case 'f': - assertStringCaseInsensitive("false", istr); + if constexpr (throw_exception) + assertStringCaseInsensitive("false", istr); + else if (!checkStringCaseInsensitive("false", istr)) + return ReturnType(false); col->insert(false); break; case '1': @@ -307,16 +392,40 @@ void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & ist break; case '\'': ++istr.position(); - deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return !buf.eof() && *buf.position() == '\''; }); - assertChar('\'', istr); + if constexpr (throw_exception) + { + deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return !buf.eof() && *buf.position() == 
'\''; }); + assertChar('\'', istr); + } + else + { + if (!deserializeImpl(column, istr, settings, [](ReadBuffer & buf) { return !buf.eof() && *buf.position() == '\''; }) || !checkChar('\'', istr)) + return ReturnType(false); + } break; default: - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot parse boolean value here: '{}', should be true/false, 1/0 or on of " - "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0 in quotes", - String(istr.position(), std::min(10ul, istr.available()))); + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot parse boolean value here: '{}', should be true/false, 1/0 or on of " + "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0 in quotes", + String(istr.position(), std::min(10ul, istr.available()))); + return ReturnType(false); + } } + + return ReturnType(true); +} + +void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextQuotedImpl(column, istr, settings); +} + +bool SerializationBool::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return deserializeTextQuotedImpl(column, istr, settings); } void SerializationBool::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -327,6 +436,14 @@ void SerializationBool::deserializeWholeText(IColumn & column, ReadBuffer & istr deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof(); }); } +bool SerializationBool::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof(); }); +} + void SerializationBool::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeSimple(column, row_num, ostr, settings); diff --git a/src/DataTypes/Serializations/SerializationBool.h b/src/DataTypes/Serializations/SerializationBool.h index a5aa0ca80a2..3e511b7249e 100644 --- a/src/DataTypes/Serializations/SerializationBool.h +++ b/src/DataTypes/Serializations/SerializationBool.h @@ -15,21 +15,27 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & 
column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; }; diff --git a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp index 03564bac64b..abe443cab1b 100644 --- a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp @@ -24,6 +24,12 @@ void deserializeFromString(const SerializationCustomSimpleText & domain, IColumn domain.deserializeText(column, istr, settings, true); } +bool tryDeserializeFromString(const SerializationCustomSimpleText & domain, IColumn & column, const String & s, const FormatSettings & settings) +{ + ReadBufferFromString istr(s); + return domain.tryDeserializeText(column, istr, settings, true); +} + } namespace DB @@ -34,6 +40,19 @@ SerializationCustomSimpleText::SerializationCustomSimpleText(const Serialization { } +bool SerializationCustomSimpleText::tryDeserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const +{ + try + { + deserializeText(column, istr, settings, whole); + return true; + } + catch (...) 
+ { + return false; + } +} + void SerializationCustomSimpleText::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String str; @@ -41,6 +60,13 @@ void SerializationCustomSimpleText::deserializeWholeText(IColumn & column, ReadB deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readStringUntilEOF(str, istr); + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeEscapedString(serializeToString(*this, column, row_num, settings), ostr); @@ -53,6 +79,13 @@ void SerializationCustomSimpleText::deserializeTextEscaped(IColumn & column, Rea deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readEscapedString(str, istr); + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeQuotedString(serializeToString(*this, column, row_num, settings), ostr); @@ -65,6 +98,14 @@ void SerializationCustomSimpleText::deserializeTextQuoted(IColumn & column, Read deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + if (!tryReadQuotedString(str, istr)) + return false; + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeCSVString(serializeToString(*this, column, row_num, settings), ostr); @@ -77,6 +118,13 @@ void SerializationCustomSimpleText::deserializeTextCSV(IColumn & column, ReadBuf deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readCSVStringInto(str, istr, settings.csv); + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeJSONString(serializeToString(*this, column, row_num, settings), ostr, settings); @@ -89,6 +137,14 @@ void SerializationCustomSimpleText::deserializeTextJSON(IColumn & column, ReadBu deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + if (!tryReadJSONStringInto(str, istr)) + return false; + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeXMLStringForTextElement(serializeToString(*this, column, row_num, settings), ostr); diff --git a/src/DataTypes/Serializations/SerializationCustomSimpleText.h 
b/src/DataTypes/Serializations/SerializationCustomSimpleText.h index 0c909350002..c80a57e234c 100644 --- a/src/DataTypes/Serializations/SerializationCustomSimpleText.h +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.h @@ -22,20 +22,24 @@ public: /// whole = true means that buffer contains only one value, so we should read until EOF. /// It's needed to check if there is garbage after parsed field. virtual void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const = 0; + virtual bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const; /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters. */ void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization with escaping but without quoting. */ void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization as a literal that may be inserted into a query. */ void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization for the CSV format. */ @@ -44,12 +48,14 @@ public: * (the delimiter is not consumed). */ void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization intended for using in JSON format. * force_quoting_64bit_integers parameter forces to brace UInt64 and Int64 types into quotes. */ void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization for putting into the XML format. 
*/ diff --git a/src/DataTypes/Serializations/SerializationDate.cpp b/src/DataTypes/Serializations/SerializationDate.cpp index 534f599a072..38e1bb87b6d 100644 --- a/src/DataTypes/Serializations/SerializationDate.cpp +++ b/src/DataTypes/Serializations/SerializationDate.cpp @@ -22,6 +22,15 @@ void SerializationDate::deserializeWholeText(IColumn & column, ReadBuffer & istr throwUnexpectedDataAfterParsedValue(column, istr, settings, "Date"); } +bool SerializationDate::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!tryReadDateText(x, istr, time_zone) || !istr.eof()) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { DayNum x; @@ -29,6 +38,15 @@ void SerializationDate::deserializeTextEscaped(IColumn & column, ReadBuffer & is assert_cast(column).getData().push_back(x); } +bool SerializationDate::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!tryReadDateText(x, istr, time_zone)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeText(column, row_num, ostr, settings); @@ -50,6 +68,16 @@ void SerializationDate::deserializeTextQuoted(IColumn & column, ReadBuffer & ist assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. } +bool SerializationDate::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!checkChar('\'', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('\'', istr)) + return false; + + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -66,6 +94,15 @@ void SerializationDate::deserializeTextJSON(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(x); } +bool SerializationDate::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!checkChar('"', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('"', istr)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -80,6 +117,15 @@ void SerializationDate::deserializeTextCSV(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(value); } +bool SerializationDate::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum value; + if (!tryReadCSV(value, istr, time_zone)) + return false; + assert_cast(column).getData().push_back(value); + return true; +} + SerializationDate::SerializationDate(const DateLUTImpl & time_zone_) : time_zone(time_zone_) { } diff --git a/src/DataTypes/Serializations/SerializationDate.h b/src/DataTypes/Serializations/SerializationDate.h index f751b06fba6..dcf79eb49da 100644 --- a/src/DataTypes/Serializations/SerializationDate.h +++ b/src/DataTypes/Serializations/SerializationDate.h @@ -13,14 +13,19 @@ public: void 
serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; protected: const DateLUTImpl & time_zone; diff --git a/src/DataTypes/Serializations/SerializationDate32.cpp b/src/DataTypes/Serializations/SerializationDate32.cpp index 851710de839..70a22d59e42 100644 --- a/src/DataTypes/Serializations/SerializationDate32.cpp +++ b/src/DataTypes/Serializations/SerializationDate32.cpp @@ -21,6 +21,15 @@ void SerializationDate32::deserializeWholeText(IColumn & column, ReadBuffer & is throwUnexpectedDataAfterParsedValue(column, istr, settings, "Date32"); } +bool SerializationDate32::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!tryReadDateText(x, istr, time_zone) || !istr.eof()) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate32::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { ExtendedDayNum x; @@ -28,6 +37,15 @@ void SerializationDate32::deserializeTextEscaped(IColumn & column, ReadBuffer & assert_cast(column).getData().push_back(x); } +bool SerializationDate32::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!tryReadDateText(x, istr, time_zone)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate32::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeText(column, row_num, ostr, settings); @@ -49,6 +67,15 @@ void SerializationDate32::deserializeTextQuoted(IColumn & column, ReadBuffer & i assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
} +bool SerializationDate32::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!checkChar('\'', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('\'', istr)) + return false; + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. + return true; +} + void SerializationDate32::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -65,6 +92,15 @@ void SerializationDate32::deserializeTextJSON(IColumn & column, ReadBuffer & ist assert_cast(column).getData().push_back(x); } +bool SerializationDate32::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!checkChar('"', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('"', istr)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate32::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -79,6 +115,15 @@ void SerializationDate32::deserializeTextCSV(IColumn & column, ReadBuffer & istr assert_cast(column).getData().push_back(value.getExtenedDayNum()); } +bool SerializationDate32::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + LocalDate value; + if (!tryReadCSV(value, istr)) + return false; + assert_cast(column).getData().push_back(value.getExtenedDayNum()); + return true; +} + SerializationDate32::SerializationDate32(const DateLUTImpl & time_zone_) : time_zone(time_zone_) { } diff --git a/src/DataTypes/Serializations/SerializationDate32.h b/src/DataTypes/Serializations/SerializationDate32.h index 49560fb6c7d..be2e2b76c1d 100644 --- a/src/DataTypes/Serializations/SerializationDate32.h +++ b/src/DataTypes/Serializations/SerializationDate32.h @@ -12,14 +12,19 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const 
FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; protected: const DateLUTImpl & time_zone; diff --git a/src/DataTypes/Serializations/SerializationDateTime.cpp b/src/DataTypes/Serializations/SerializationDateTime.cpp index 77beb0d9b75..17465d85e9d 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime.cpp @@ -21,15 +21,56 @@ inline void readText(time_t & x, ReadBuffer & istr, const FormatSettings & setti switch (settings.date_time_input_format) { case FormatSettings::DateTimeInputFormat::Basic: - readDateTimeText(x, istr, time_zone); - return; + readDateTimeTextImpl<>(x, istr, time_zone); + break; case FormatSettings::DateTimeInputFormat::BestEffort: parseDateTimeBestEffort(x, istr, time_zone, utc_time_zone); - return; + break; case FormatSettings::DateTimeInputFormat::BestEffortUS: parseDateTimeBestEffortUS(x, istr, time_zone, utc_time_zone); - return; + break; } + + if (x < 0) + x = 0; +} + +inline void readAsIntText(time_t & x, ReadBuffer & istr) +{ + readIntText(x, istr); + if (x < 0) + x = 0; +} + +inline bool tryReadText(time_t & x, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) +{ + bool res; + switch (settings.date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + res = tryReadDateTimeText(x, istr, time_zone); + break; + case FormatSettings::DateTimeInputFormat::BestEffort: + res = tryParseDateTimeBestEffort(x, istr, time_zone, utc_time_zone); + break; + case FormatSettings::DateTimeInputFormat::BestEffortUS: + res = tryParseDateTimeBestEffortUS(x, istr, time_zone, utc_time_zone); + break; + } + + if (x < 0) + x = 0; + + return res; +} + +inline bool tryReadAsIntText(time_t & x, ReadBuffer & istr) +{ + if (!tryReadIntText(x, istr)) + return false; + if (x < 0) + x = 0; + return true; } } @@ -68,15 +109,32 @@ void SerializationDateTime::deserializeWholeText(IColumn & column, ReadBuffer & throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime"); } +bool SerializationDateTime::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !istr.eof()) + return false; + + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { time_t x = 0; readText(x, istr, settings, time_zone, utc_time_zone); - if (x < 0) - x = 0; assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone)) + return false; + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('\'', ostr); @@ -94,15 +152,32 @@ void SerializationDateTime::deserializeTextQuoted(IColumn & column, ReadBuffer & } else /// Just 1504193808 or 01504193808 { - readIntText(x, istr); + readAsIntText(x, istr); } - if (x < 0) - x = 0; /// It's important to do this at the end - for exception safety. 
assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' + { + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !checkChar('\'', istr)) + return false; + } + else /// Just 1504193808 or 01504193808 + { + if (!tryReadAsIntText(x, istr)) + return false; + } + + /// It's important to do this at the end - for exception safety. + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -120,13 +195,30 @@ void SerializationDateTime::deserializeTextJSON(IColumn & column, ReadBuffer & i } else { - readIntText(x, istr); + readAsIntText(x, istr); } - if (x < 0) - x = 0; + assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('"', istr)) + { + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !checkChar('"', istr)) + return false; + } + else + { + if (!tryReadIntText(x, istr)) + return false; + } + + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -165,13 +257,48 @@ void SerializationDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & is readCSVString(datetime_str, istr, settings.csv); ReadBufferFromString buf(datetime_str); readText(x, buf, settings, time_zone, utc_time_zone); + if (!buf.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime"); } } - if (x < 0) - x = 0; - assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + + if (istr.eof()) + return false; + + char maybe_quote = *istr.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + { + ++istr.position(); + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !checkChar(maybe_quote, istr)) + return false; + } + else + { + if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic) + { + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone)) + return false; + } + else + { + String datetime_str; + readCSVString(datetime_str, istr, settings.csv); + ReadBufferFromString buf(datetime_str); + if (!tryReadText(x, buf, settings, time_zone, utc_time_zone) || !buf.eof()) + return false; + } + } + + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + } diff --git a/src/DataTypes/Serializations/SerializationDateTime.h b/src/DataTypes/Serializations/SerializationDateTime.h index f4a142483e5..584b0c4116b 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.h +++ b/src/DataTypes/Serializations/SerializationDateTime.h @@ -15,14 +15,19 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & 
settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; }; } diff --git a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp index 93891886000..a19619bf8d3 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -47,6 +47,16 @@ void SerializationDateTime64::deserializeText(IColumn & column, ReadBuffer & ist throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime64"); } +bool SerializationDateTime64::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const +{ + DateTime64 result = 0; + /// Fail if the value cannot be parsed, or if trailing data remains when the whole buffer must be consumed. + if (!tryReadDateTime64Text(result, scale, istr, time_zone) || (whole && !istr.eof())) + return false; + + assert_cast(column).getData().push_back(result); + return true; +} + void SerializationDateTime64::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { deserializeTextEscaped(column, istr, settings); @@ -75,6 +85,29 @@ static inline void readText(DateTime64 & x, UInt32 scale, ReadBuffer & istr, con } } +static inline bool tryReadText(DateTime64 & x, UInt32 scale, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) +{ + switch (settings.date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + return tryReadDateTime64Text(x, scale, istr, time_zone); + case FormatSettings::DateTimeInputFormat::BestEffort: + return tryParseDateTime64BestEffort(x, scale, istr, time_zone, utc_time_zone); + case FormatSettings::DateTimeInputFormat::BestEffortUS: + return tryParseDateTime64BestEffortUS(x, scale, istr, time_zone, utc_time_zone); + } +} + + +bool SerializationDateTime64::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !istr.eof()) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void
SerializationDateTime64::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { DateTime64 x = 0; @@ -82,6 +115,15 @@ void SerializationDateTime64::deserializeTextEscaped(IColumn & column, ReadBuffe assert_cast(column).getData().push_back(x); } +bool SerializationDateTime64::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDateTime64::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('\'', ostr); @@ -104,6 +146,23 @@ void SerializationDateTime64::deserializeTextQuoted(IColumn & column, ReadBuffer assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. } +bool SerializationDateTime64::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' + { + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar('\'', istr)) + return false; + } + else /// Just 1504193808 or 01504193808 + { + if (!tryReadIntText(x, istr)) + return false; + } + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. + return true; +} + void SerializationDateTime64::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -126,6 +185,23 @@ void SerializationDateTime64::deserializeTextJSON(IColumn & column, ReadBuffer & assert_cast(column).getData().push_back(x); } +bool SerializationDateTime64::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (checkChar('"', istr)) + { + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar('"', istr)) + return false; + } + else + { + if (!tryReadIntText(x, istr)) + return false; + } + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDateTime64::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -170,4 +246,40 @@ void SerializationDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer & assert_cast(column).getData().push_back(x); } +bool SerializationDateTime64::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + + if (istr.eof()) + return false; + + char maybe_quote = *istr.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + { + ++istr.position(); + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar(maybe_quote, istr)) + return false; + } + else + { + if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic) + { + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone)) + return false; + } + else + { + String datetime_str; + readCSVString(datetime_str, istr, settings.csv); + ReadBufferFromString buf(datetime_str); + if (!tryReadText(x, scale, buf, settings, time_zone, utc_time_zone) || !buf.eof()) + return false; + } + } + +
assert_cast(column).getData().push_back(x); + return true; +} + } diff --git a/src/DataTypes/Serializations/SerializationDateTime64.h b/src/DataTypes/Serializations/SerializationDateTime64.h index f817edbf0dd..b49bd1e9098 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.h +++ b/src/DataTypes/Serializations/SerializationDateTime64.h @@ -15,15 +15,21 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; }; } diff --git a/src/DataTypes/Serializations/SerializationDecimal.cpp b/src/DataTypes/Serializations/SerializationDecimal.cpp index b576b7a048c..d632c224783 100644 --- a/src/DataTypes/Serializations/SerializationDecimal.cpp +++ b/src/DataTypes/Serializations/SerializationDecimal.cpp @@ -16,11 +16,19 @@ namespace ErrorCodes } template -bool SerializationDecimal::tryReadText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale) +bool SerializationDecimal::tryReadText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale, bool csv) { UInt32 unread_scale = scale; - if (!tryReadDecimalText(istr, x, precision, unread_scale)) - return false; + if (csv) + { + if (!tryReadCSVDecimalText(istr, x, precision, unread_scale)) + return false; + } + else + { + if (!tryReadDecimalText(istr, x, precision, unread_scale)) + return false; + } if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier(unread_scale), x.value)) return false; @@ -59,6 +67,16 @@ void SerializationDecimal::deserializeText(IColumn & column, ReadBuffer & ist ISerialization::throwUnexpectedDataAfterParsedValue(column, istr, settings, "Decimal"); } +template +bool SerializationDecimal::tryDeserializeText(IColumn & column, ReadBuffer & istr, 
const FormatSettings &, bool whole) const +{ + T x; + if (!tryReadText(x, istr) || (whole && !istr.eof())) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationDecimal::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { @@ -67,6 +85,16 @@ void SerializationDecimal::deserializeTextCSV(IColumn & column, ReadBuffer & assert_cast(column).getData().push_back(x); } +template +bool SerializationDecimal::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + T x; + if (!tryReadText(x, istr, true)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationDecimal::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -88,6 +116,18 @@ void SerializationDecimal::deserializeTextJSON(IColumn & column, ReadBuffer & assertChar('"', istr); } +template +bool SerializationDecimal::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + bool have_quotes = checkChar('"', istr); + T x; + if (!tryReadText(x, istr) || (have_quotes && !checkChar('"', istr))) + return false; + + assert_cast(column).getData().push_back(x); + return true; +} + template class SerializationDecimal; template class SerializationDecimal; diff --git a/src/DataTypes/Serializations/SerializationDecimal.h b/src/DataTypes/Serializations/SerializationDecimal.h index 57decdd0973..22a8eb1a47c 100644 --- a/src/DataTypes/Serializations/SerializationDecimal.h +++ b/src/DataTypes/Serializations/SerializationDecimal.h @@ -16,15 +16,19 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void readText(T & x, ReadBuffer & istr, bool csv = false) const { readText(x, istr, this->precision, this->scale, csv); } + bool tryReadText(T & x, ReadBuffer & istr, bool csv = false) const { return tryReadText(x, istr, this->precision, this->scale, csv); } static void readText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_, bool csv = false); - static bool tryReadText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_); + static bool tryReadText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_, bool csv = false); }; } diff --git a/src/DataTypes/Serializations/SerializationEnum.cpp b/src/DataTypes/Serializations/SerializationEnum.cpp index 9b3a437e9cf..6ad55913738 100644 --- a/src/DataTypes/Serializations/SerializationEnum.cpp +++ b/src/DataTypes/Serializations/SerializationEnum.cpp @@ -34,6 +34,27 @@ void SerializationEnum::deserializeTextEscaped(IColumn & column, ReadBuffe } } +template +bool 
SerializationEnum::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + FieldType x; + if (settings.tsv.enum_as_number) + { + if (!tryReadValue(istr, x)) + return false; + } + else + { + std::string field_name; + readEscapedString(field_name, istr); + if (!this->tryGetValue(x, StringRef(field_name), true)) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -48,6 +69,18 @@ void SerializationEnum::deserializeTextQuoted(IColumn & column, ReadBuffer assert_cast(column).getData().push_back(ref_enum_values.getValue(StringRef(field_name))); } +template +bool SerializationEnum::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + std::string field_name; + readQuotedStringWithSQLStyle(field_name, istr); + FieldType x; + if (!this->tryGetValue(x, StringRef(field_name))) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { @@ -65,6 +98,27 @@ void SerializationEnum::deserializeWholeText(IColumn & column, ReadBuffer } } +template +bool SerializationEnum::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + FieldType x; + if (settings.tsv.enum_as_number) + { + if (!tryReadValue(istr, x) || !istr.eof()) + return false; + } + else + { + std::string field_name; + readStringUntilEOF(field_name, istr); + if (!this->tryGetValue(x, StringRef(field_name), true)) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -90,6 +144,27 @@ void SerializationEnum::deserializeTextJSON(IColumn & column, ReadBuffer & } } +template +bool SerializationEnum::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + FieldType x; + if (!istr.eof() && *istr.position() != '"') + { + if (!tryReadValue(istr, x)) + return false; + } + else + { + std::string field_name; + readJSONString(field_name, istr); + if (!this->tryGetValue(x, StringRef(field_name))) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -109,6 +184,28 @@ void SerializationEnum::deserializeTextCSV(IColumn & column, ReadBuffer & } } +template +bool SerializationEnum::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + FieldType x; + + if (settings.csv.enum_as_number) + { + if (!tryReadValue(istr, x)) + return false; + } + else + { + std::string field_name; + readCSVString(field_name, istr, settings.csv); + if (!this->tryGetValue(x, StringRef(field_name), true)) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextMarkdown( const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationEnum.h b/src/DataTypes/Serializations/SerializationEnum.h index 
03b134e59a6..708161dc5fd 100644 --- a/src/DataTypes/Serializations/SerializationEnum.h +++ b/src/DataTypes/Serializations/SerializationEnum.h @@ -34,15 +34,20 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -53,6 +58,14 @@ public: return ref_enum_values.findByValue(x)->first; } + bool tryReadValue(ReadBuffer & istr, FieldType & x) const + { + if (!tryReadText(x, istr) || !this->hasValue(x)) + return false; + + return true; + } + std::optional> own_enum_values; std::shared_ptr> own_enum_type; const EnumValues & ref_enum_values; diff --git a/src/DataTypes/Serializations/SerializationFixedString.cpp b/src/DataTypes/Serializations/SerializationFixedString.cpp index fa50af52f2f..23e959d80c9 100644 --- a/src/DataTypes/Serializations/SerializationFixedString.cpp +++ b/src/DataTypes/Serializations/SerializationFixedString.cpp @@ -150,12 +150,49 @@ static inline void read(const SerializationFixedString & self, IColumn & column, } } +bool SerializationFixedString::tryAlignStringLength(size_t n, PaddedPODArray & data, size_t string_start) +{ + size_t length = data.size() - string_start; + if (length < n) + { + data.resize_fill(string_start + n); + } + else if (length > n) + { + data.resize_assume_reserved(string_start); + return false; + } + + return true; +} + +template +static inline bool tryRead(const SerializationFixedString & self, IColumn & column, Reader && reader) +{ + ColumnFixedString::Chars & data = typeid_cast(column).getChars(); + size_t prev_size = data.size(); + try + { + return reader(data) && SerializationFixedString::tryAlignStringLength(self.getN(), data, prev_size); + } + catch (...) 
+ { + data.resize_assume_reserved(prev_size); + return false; + } +} + void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); return true; }); +} + void SerializationFixedString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -169,12 +206,22 @@ void SerializationFixedString::deserializeTextQuoted(IColumn & column, ReadBuffe read(*this, column, [&istr](ColumnFixedString::Chars & data) { readQuotedStringInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { return tryReadQuotedStringInto(data, istr); }); +} + void SerializationFixedString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringUntilEOFInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringUntilEOFInto(data, istr); return true; }); +} + void SerializationFixedString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -188,6 +235,10 @@ void SerializationFixedString::deserializeTextJSON(IColumn & column, ReadBuffer read(*this, column, [&istr](ColumnFixedString::Chars & data) { readJSONStringInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { return tryReadJSONStringInto(data, istr); }); +} void SerializationFixedString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -208,6 +259,11 @@ void SerializationFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & read(*this, column, [&istr, &csv = settings.csv](ColumnFixedString::Chars & data) { readCSVStringInto(data, istr, csv); }); } +bool SerializationFixedString::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryRead(*this, column, [&istr, &csv = settings.csv](ColumnFixedString::Chars & data) { readCSVStringInto(data, istr, csv); return true; }); +} + void SerializationFixedString::serializeTextMarkdown( const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const { diff --git a/src/DataTypes/Serializations/SerializationFixedString.h b/src/DataTypes/Serializations/SerializationFixedString.h index c27b10ad158..8eb4eacdbff 100644 --- a/src/DataTypes/Serializations/SerializationFixedString.h +++ b/src/DataTypes/Serializations/SerializationFixedString.h @@ -26,20 +26,25 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, 
ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -47,6 +52,7 @@ public: /// If the length is less than getN() the function will add zero characters up to getN(). /// If the length is greater than getN() the function will throw an exception. 
static void alignStringLength(size_t n, PaddedPODArray & data, size_t string_start); + static bool tryAlignStringLength(size_t n, PaddedPODArray & data, size_t string_start); }; } diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp new file mode 100644 index 00000000000..81c4af97401 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp @@ -0,0 +1,188 @@ +#include + +namespace DB +{ + +template +void SerializationIP::serializeText(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings &) const +{ + writeText(assert_cast &>(column).getData()[row_num], ostr); +} + +template +void SerializationIP::deserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const +{ + IPv x; + readText(x, istr); + + if (whole && !istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); + + assert_cast &>(column).getData().push_back(x); +} + +template +bool SerializationIP::tryDeserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &, bool whole) const +{ + IPv x; + if (!tryReadText(x, istr) || (whole && !istr.eof())) + return false; + + assert_cast &>(column).getData().push_back(x); + return true; +} + +template +void SerializationIP::serializeTextQuoted(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +template +void SerializationIP::deserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + assertChar('\'', istr); + readText(x, istr); + assertChar('\'', istr); + assert_cast &>(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. +} + +template +bool SerializationIP::tryDeserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + if (!checkChar('\'', istr) || !tryReadText(x, istr) || !checkChar('\'', istr)) + return false; + assert_cast &>(column).getData().push_back(x); + return true; +} + +template +void SerializationIP::serializeTextJSON(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +template +void SerializationIP::deserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + IPv x; + assertChar('"', istr); + readText(x, istr); + /// this code looks weird, but we want to throw specific exception to match original behavior... 
+ if (istr.eof()) + assertChar('"', istr); + if (*istr.position() != '"') + throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); + istr.ignore(); + + assert_cast &>(column).getData().push_back(x); +} + +template +bool SerializationIP::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + if (!checkChar('"', istr) || !tryReadText(x, istr) || !checkChar('"', istr)) + return false; + + assert_cast &>(column).getData().push_back(x); + return true; +} + +template +void SerializationIP::serializeTextCSV(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +template +void SerializationIP::deserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv value; + readCSV(value, istr); + + assert_cast &>(column).getData().push_back(value); +} + +template +bool SerializationIP::tryDeserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv value; + if (!tryReadCSV(value, istr)) + return false; + + assert_cast &>(column).getData().push_back(value); + return true; +} + +template +void SerializationIP::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const +{ + IPv x = field.get(); + if constexpr (std::is_same_v) + writeBinary(x, ostr); + else + writeBinaryLittleEndian(x, ostr); +} + +template +void SerializationIP::deserializeBinary(DB::Field & field, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + if constexpr (std::is_same_v) + readBinary(x, istr); + else + readBinaryLittleEndian(x, istr); + field = NearestFieldType(x); +} + +template +void SerializationIP::serializeBinary(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings &) const +{ + writeBinary(assert_cast &>(column).getData()[row_num], ostr); +} + +template +void SerializationIP::deserializeBinary(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + readBinary(x.toUnderType(), istr); + assert_cast &>(column).getData().push_back(x); +} + +template +void SerializationIP::serializeBinaryBulk(const DB::IColumn & column, DB::WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const typename ColumnVector::Container & x = typeid_cast &>(column).getData(); + + size_t size = x.size(); + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + if (limit) + ostr.write(reinterpret_cast(&x[offset]), sizeof(IPv) * limit); +} + +template +void SerializationIP::deserializeBinaryBulk(DB::IColumn & column, DB::ReadBuffer & istr, size_t limit, double) const +{ + typename ColumnVector::Container & x = typeid_cast &>(column).getData(); + size_t initial_size = x.size(); + x.resize(initial_size + limit); + size_t size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(IPv) * limit); + x.resize(initial_size + size / sizeof(IPv)); +} + +template class SerializationIP; +template class SerializationIP; + +} diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h index 7d8669fd444..a53f257646b 100644 --- a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h @@ -13,123 +13,30 @@ template class SerializationIP : public SimpleTextSerialization 
{ public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override - { - writeText(assert_cast &>(column).getData()[row_num], ostr); - } - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override - { - IPv x; - readText(x, istr); + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; - if (whole && !istr.eof()) - throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - assert_cast &>(column).getData().push_back(x); - } - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - serializeText(column, row_num, ostr, settings); - } - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - deserializeText(column, istr, settings, false); - } - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - writeChar('\'', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('\'', ostr); - } - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override - { - IPv x; - assertChar('\'', istr); - readText(x, istr); - assertChar('\'', istr); - assert_cast &>(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. - } - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); - } - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - IPv x; - assertChar('"', istr); - readText(x, istr); - /// this code looks weird, but we want to throw specific exception to match original behavior... 
- if (istr.eof()) - assertChar('"', istr); - if (*istr.position() != '"') - throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); - istr.ignore(); + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - assert_cast &>(column).getData().push_back(x); - } - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); - } - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override - { - IPv value; - readCSV(value, istr); + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override; - assert_cast &>(column).getData().push_back(value); - } + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override; - void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override - { - IPv x = field.get(); - if constexpr (std::is_same_v) - writeBinary(x, ostr); - else - writeBinaryLittleEndian(x, ostr); - } - void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override - { - IPv x; - if constexpr (std::is_same_v) - readBinary(x, istr); - else - readBinaryLittleEndian(x, istr); - field = NearestFieldType(x); - } - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override - { - writeBinary(assert_cast &>(column).getData()[row_num], ostr); - } - void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override - { - IPv x; - readBinary(x.toUnderType(), istr); - assert_cast &>(column).getData().push_back(x); - } - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override - { - const typename ColumnVector::Container & x = typeid_cast &>(column).getData(); + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - size_t size = x.size(); - - if (limit == 0 || offset + limit > size) - limit = size - offset; - - if (limit) - ostr.write(reinterpret_cast(&x[offset]), sizeof(IPv) * limit); - } - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const override - { - typename ColumnVector::Container & x = typeid_cast &>(column).getData(); - size_t initial_size = x.size(); - x.resize(initial_size + limit); - size_t size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(IPv) * limit); - x.resize(initial_size + size / sizeof(IPv)); - } + void serializeBinaryBulk(const IColumn & column, 
WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const override; }; using SerializationIPv4 = SerializationIP; diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index 3e1cbdb00f5..9efe05042ed 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -700,6 +700,11 @@ void SerializationLowCardinality::deserializeTextEscaped(IColumn & column, ReadB deserializeImpl(column, &ISerialization::deserializeTextEscaped, istr, settings); } +bool SerializationLowCardinality::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextEscaped, istr, settings); +} + void SerializationLowCardinality::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeImpl(column, row_num, &ISerialization::serializeTextQuoted, ostr, settings); @@ -710,11 +715,21 @@ void SerializationLowCardinality::deserializeTextQuoted(IColumn & column, ReadBu deserializeImpl(column, &ISerialization::deserializeTextQuoted, istr, settings); } +bool SerializationLowCardinality::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextQuoted, istr, settings); +} + void SerializationLowCardinality::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { deserializeImpl(column, &ISerialization::deserializeWholeText, istr, settings); } +bool SerializationLowCardinality::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeWholeText, istr, settings); +} + void SerializationLowCardinality::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeImpl(column, row_num, &ISerialization::serializeTextCSV, ostr, settings); @@ -725,6 +740,11 @@ void SerializationLowCardinality::deserializeTextCSV(IColumn & column, ReadBuffe deserializeImpl(column, &ISerialization::deserializeTextCSV, istr, settings); } +bool SerializationLowCardinality::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextCSV, istr, settings); +} + void SerializationLowCardinality::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeImpl(column, row_num, &ISerialization::serializeText, ostr, settings); @@ -740,6 +760,11 @@ void SerializationLowCardinality::deserializeTextJSON(IColumn & column, ReadBuff deserializeImpl(column, &ISerialization::deserializeTextJSON, istr, settings); } +bool SerializationLowCardinality::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextJSON, istr, settings); +} + void SerializationLowCardinality::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { 
serializeImpl(column, row_num, &ISerialization::serializeTextXML, ostr, settings); @@ -750,6 +775,11 @@ void SerializationLowCardinality::deserializeTextRaw(IColumn & column, ReadBuffe deserializeImpl(column, &ISerialization::deserializeTextRaw, istr, settings); } +bool SerializationLowCardinality::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextRaw, istr, settings); +} + void SerializationLowCardinality::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeImpl(column, row_num, &ISerialization::serializeTextRaw, ostr, settings); @@ -769,7 +799,7 @@ template void SerializationLowCardinality::deserializeImpl( IColumn & column, SerializationLowCardinality::DeserializeFunctionPtr func, Args &&... args) const { - auto & low_cardinality_column= getColumnLowCardinality(column); + auto & low_cardinality_column = getColumnLowCardinality(column); auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); auto serialization = dictionary_type->getDefaultSerialization(); @@ -778,4 +808,19 @@ void SerializationLowCardinality::deserializeImpl( low_cardinality_column.insertFromFullColumn(*temp_column, 0); } +template +bool SerializationLowCardinality::tryDeserializeImpl( + IColumn & column, SerializationLowCardinality::TryDeserializeFunctionPtr func, Args &&... args) const +{ + auto & low_cardinality_column = getColumnLowCardinality(column); + auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); + + auto serialization = dictionary_type->getDefaultSerialization(); + if (!(serialization.get()->*func)(*temp_column, std::forward(args)...)) + return false; + + low_cardinality_column.insertFromFullColumn(*temp_column, 0); + return true; +} + } diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h index 5f56bcf8108..d2c3a95c702 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.h +++ b/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -55,16 +55,22 @@ public: void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const 
override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; private: @@ -79,6 +85,12 @@ private: template void deserializeImpl(IColumn & column, DeserializeFunctionPtr func, Args &&... args) const; + + template + using TryDeserializeFunctionPtr = bool (ISerialization::*)(IColumn &, Params ...) const; + + template + bool tryDeserializeImpl(IColumn & column, TryDeserializeFunctionPtr func, Args &&... args) const; }; } diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index 7588e630689..7b6f87baf2e 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -115,9 +115,11 @@ void SerializationMap::serializeTextImpl( writeChar('}', ostr); } -template -void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const +template +ReturnType SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const { + static constexpr bool throw_exception = std::is_same_v; + auto & column_map = assert_cast(column); auto & nested_array = column_map.getNestedColumn(); @@ -128,7 +130,21 @@ void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, auto & value_column = nested_tuple.getColumn(1); size_t size = 0; - assertChar('{', istr); + if constexpr (throw_exception) + assertChar('{', istr); + else if (!checkChar('{', istr)) + return ReturnType(false); + + auto on_error_no_throw = [&]() + { + if (size) + { + nested_tuple.getColumnPtr(0) = key_column.cut(0, offsets.back()); + nested_tuple.getColumnPtr(1) = value_column.cut(0, offsets.back()); + } + + return ReturnType(false); + }; try { @@ -138,9 +154,15 @@ void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, if (!first) { if (*istr.position() == ',') + { ++istr.position(); + } else - throw Exception(ErrorCodes::CANNOT_READ_MAP_FROM_TEXT, "Cannot read Map from text"); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_READ_MAP_FROM_TEXT, "Cannot read Map from text"); + return on_error_no_throw(); + } } first = false; @@ -150,19 +172,32 @@ void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, if (*istr.position() == '}') break; - reader(istr, key, key_column); + if constexpr (throw_exception) + reader(istr, key, key_column); + else if (!reader(istr, key, key_column)) + return on_error_no_throw(); + ++size; skipWhitespaceIfAny(istr); - assertChar(':', 
istr); + if constexpr (throw_exception) + assertChar(':', istr); + else if (!checkChar(':', istr)) + return on_error_no_throw(); skipWhitespaceIfAny(istr); - reader(istr, value, value_column); + if constexpr (throw_exception) + reader(istr, value, value_column); + else if (!reader(istr, value, value_column)) + return on_error_no_throw(); skipWhitespaceIfAny(istr); } - assertChar('}', istr); + if constexpr (throw_exception) + assertChar('}', istr); + else if (!checkChar('}', istr)) + return on_error_no_throw(); } catch (...) { @@ -171,10 +206,14 @@ void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, nested_tuple.getColumnPtr(0) = key_column.cut(0, offsets.back()); nested_tuple.getColumnPtr(1) = value_column.cut(0, offsets.back()); } - throw; + + if constexpr (throw_exception) + throw; + return ReturnType(false); } offsets.push_back(offsets.back() + size); + return ReturnType(true); } void SerializationMap::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -192,8 +231,8 @@ void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, cons deserializeTextImpl(column, istr, [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) { - if (settings.null_as_default) - SerializationNullable::deserializeTextQuotedImpl(subcolumn, buf, settings, subcolumn_serialization); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(subcolumn, buf, settings, subcolumn_serialization); else subcolumn_serialization->deserializeTextQuoted(subcolumn, buf, settings); }); @@ -202,6 +241,28 @@ void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, cons throwUnexpectedDataAfterParsedValue(column, istr, settings, "Map"); } +bool SerializationMap::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + auto reader = [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(subcolumn, buf, settings, subcolumn_serialization); + return subcolumn_serialization->tryDeserializeTextQuoted(subcolumn, buf, settings); + }; + + auto ok = deserializeTextImpl(column, istr, reader); + if (!ok) + return false; + + if (whole && !istr.eof()) + { + column.popBack(1); + return false; + } + + return true; +} + void SerializationMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeTextImpl(column, row_num, ostr, @@ -260,13 +321,25 @@ void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, deserializeTextImpl(column, istr, [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) { - if (settings.null_as_default) - SerializationNullable::deserializeTextJSONImpl(subcolumn, buf, settings, subcolumn_serialization); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); else subcolumn_serialization->deserializeTextJSON(subcolumn, buf, settings); }); } +bool SerializationMap::tryDeserializeTextJSON(IColumn & column, 
ReadBuffer & istr, const FormatSettings & settings) const +{ + auto reader = [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); + return subcolumn_serialization->tryDeserializeTextJSON(subcolumn, buf, settings); + }; + + return deserializeTextImpl(column, istr, reader); +} + void SerializationMap::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { const auto & column_map = assert_cast(column); @@ -308,6 +381,15 @@ void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c deserializeText(column, rb, settings, true); } +bool SerializationMap::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + if (!tryReadCSV(s, istr, settings.csv)) + return false; + ReadBufferFromString rb(s); + return tryDeserializeText(column, rb, settings, true); +} + void SerializationMap::enumerateStreams( EnumerateStreamsSettings & settings, const StreamCallback & callback, diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h index f32c656757d..3e27ef1b04a 100644 --- a/src/DataTypes/Serializations/SerializationMap.h +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -24,13 +24,16 @@ public: void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void enumerateStreams( EnumerateStreamsSettings & settings, @@ -68,8 +71,8 @@ private: template void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, KeyWriter && key_writer, ValueWriter && value_writer) const; - template - void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const; + template + ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const; }; } diff --git a/src/DataTypes/Serializations/SerializationNamed.cpp b/src/DataTypes/Serializations/SerializationNamed.cpp index ca60948ce68..1a9cbe9a37d 100644 
--- a/src/DataTypes/Serializations/SerializationNamed.cpp +++ b/src/DataTypes/Serializations/SerializationNamed.cpp @@ -1,4 +1,5 @@ #include +#include namespace DB { diff --git a/src/DataTypes/Serializations/SerializationNothing.h b/src/DataTypes/Serializations/SerializationNothing.h index 02974d1ca76..7d1fff55b01 100644 --- a/src/DataTypes/Serializations/SerializationNothing.h +++ b/src/DataTypes/Serializations/SerializationNothing.h @@ -25,6 +25,7 @@ public: void deserializeBinary(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); } void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); } void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); } + bool tryDeserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); } /// These methods read and write zero bytes just to allow to figure out size of column. void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 15203bdc9fa..e7f0e61f2a5 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -187,55 +187,59 @@ void SerializationNullable::serializeBinary(const IColumn & column, size_t row_n nested->serializeBinary(col.getNestedColumn(), row_num, ostr, settings); } -/// Deserialize value into ColumnNullable. -/// We need to insert both to nested column and to null byte map, or, in case of exception, to not insert at all. -template -requires std::same_as -static ReturnType -safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested) +template +ReturnType safeAppendToNullMap(ColumnNullable & column, bool is_null) { - ColumnNullable & col = assert_cast(column); - - if (check_for_null()) + try { - col.insertDefault(); + column.getNullMapData().push_back(is_null); } - else + catch (...) { - deserialize_nested(col.getNestedColumn()); - - try - { - col.getNullMapData().push_back(0); - } - catch (...) - { - col.getNestedColumn().popBack(1); + column.getNestedColumn().popBack(1); + if constexpr (std::is_same_v) throw; - } + return ReturnType(false); } + + return ReturnType(true); } -/// Deserialize value into non-nullable column. In case of NULL, insert default value and return false. +/// Deserialize value into non-nullable column. In case of NULL, insert default and set is_null to true. +/// If ReturnType is bool, return true if parsing was successful and false in case of any error.
template -requires std::same_as -static ReturnType -safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested) +static ReturnType deserializeImpl(IColumn & column, ReadBuffer & buf, CheckForNull && check_for_null, DeserializeNested && deserialize_nested, bool & is_null) { - bool insert_default = check_for_null(); - if (insert_default) + is_null = check_for_null(buf); + if (is_null) + { column.insertDefault(); + } else - deserialize_nested(column); - return !insert_default; + { + if constexpr (std::is_same_v) + deserialize_nested(column, buf); + else if (!deserialize_nested(column, buf)) + return ReturnType(false); + } + + return ReturnType(true); } void SerializationNullable::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - safeDeserialize(column, *nested, - [&istr] { bool is_null = false; readBinary(is_null, istr); return is_null; }, - [this, &istr, settings] (IColumn & nested_column) { nested->deserializeBinary(nested_column, istr, settings); }); + ColumnNullable & col = assert_cast(column); + bool is_null; + auto check_for_null = [](ReadBuffer & buf) + { + bool is_null_ = false; + readBinary(is_null_, buf); + return is_null_; + }; + auto deserialize_nested = [this, &settings] (IColumn & nested_column, ReadBuffer & buf) { nested->deserializeBinary(nested_column, buf, settings); }; + deserializeImpl(col.getNestedColumn(), istr, check_for_null, deserialize_nested, is_null); + safeAppendToNullMap(col, is_null); } @@ -244,20 +248,19 @@ void SerializationNullable::serializeTextEscaped(const IColumn & column, size_t const ColumnNullable & col = assert_cast(column); if (col.isNullAt(row_num)) - writeString(settings.tsv.null_representation, ostr); + serializeNullEscaped(ostr, settings); else nested->serializeTextEscaped(col.getNestedColumn(), row_num, ostr, settings); } - -void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationNullable::serializeNullEscaped(DB::WriteBuffer & ostr, const DB::FormatSettings & settings) { - deserializeTextEscapedImpl(column, istr, settings, nested); + writeString(settings.tsv.null_representation, ostr); } -void SerializationNullable::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +bool SerializationNullable::tryDeserializeNullEscaped(DB::ReadBuffer & istr, const DB::FormatSettings & settings) { - deserializeTextRawImpl(column, istr, settings, nested); + return checkString(settings.tsv.null_representation, istr); } void SerializationNullable::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -265,72 +268,73 @@ void SerializationNullable::serializeTextRaw(const IColumn & column, size_t row_ const ColumnNullable & col = assert_cast(column); if (col.isNullAt(row_num)) - writeString(settings.tsv.null_representation, ostr); + serializeNullRaw(ostr, settings); else nested->serializeTextRaw(col.getNestedColumn(), row_num, ostr, settings); } -template -ReturnType SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested) +void SerializationNullable::serializeNullRaw(DB::WriteBuffer & ostr, const DB::FormatSettings & settings) { - return deserializeTextEscapedAndRawImpl(column, istr, settings, nested); + writeString(settings.tsv.null_representation, ostr); } 
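Aside: every deserialize/tryDeserialize pair introduced here shares a single implementation templated on ReturnType. With ReturnType = void the function reports malformed input by throwing; with ReturnType = bool it returns false instead, and `return ReturnType(true)` compiles in both cases because a functional cast to void is well-formed. A minimal, self-contained sketch of that scheme follows (illustrative only; the names are hypothetical and this is not ClickHouse code):

#include <stdexcept>
#include <string>
#include <type_traits>

// Shared implementation: ReturnType = void throws on error, ReturnType = bool returns false.
template <typename ReturnType = void>
ReturnType parseDigitImpl(const std::string & s, int & out)
{
    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;

    if (s.size() != 1 || s[0] < '0' || s[0] > '9')
    {
        if constexpr (throw_exception)
            throw std::runtime_error("not a digit: " + s);
        else
            return false; // this branch is discarded when ReturnType is void
    }

    out = s[0] - '0';
    return ReturnType(true); // void(true) for the throwing variant, true for the try-variant
}

// Throwing entry point, analogous to deserialize*().
void parseDigit(const std::string & s, int & out)
{
    parseDigitImpl(s, out);
}

// Non-throwing entry point, analogous to tryDeserialize*().
bool tryParseDigit(const std::string & s, int & out)
{
    return parseDigitImpl<bool>(s, out);
}

The patch applies the same idea with an extra is_null out-parameter, so that the ColumnNullable null map is appended separately through safeAppendToNullMap.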
-template -ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) +bool SerializationNullable::tryDeserializeNullRaw(DB::ReadBuffer & istr, const DB::FormatSettings & settings) { - return deserializeTextEscapedAndRawImpl(column, istr, settings, nested); + return checkString(settings.tsv.null_representation, istr); } template -ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested_serialization) +ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization, bool & is_null) { + static constexpr bool throw_exception = std::is_same_v; + const String & null_representation = settings.tsv.null_representation; + auto deserialize_nested = [&nested_serialization, &settings] (IColumn & nested_column, ReadBuffer & buf_) + { + if constexpr (throw_exception) + { + if constexpr (escaped) + nested_serialization->deserializeTextEscaped(nested_column, buf_, settings); + else + nested_serialization->deserializeTextRaw(nested_column, buf_, settings); + } + else + { + if constexpr (escaped) + return nested_serialization->tryDeserializeTextEscaped(nested_column, buf_, settings); + else + return nested_serialization->tryDeserializeTextRaw(nested_column, buf_, settings); + } + }; /// Some data types can deserialize absence of data (e.g. empty string), so eof is ok. if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) { /// This is not null, surely. - return safeDeserialize(column, *nested_serialization, - [] { return false; }, - [&nested_serialization, &istr, &settings] (IColumn & nested_column) - { - if constexpr (escaped) - nested_serialization->deserializeTextEscaped(nested_column, istr, settings); - else - nested_serialization->deserializeTextRaw(nested_column, istr, settings); - }); + return deserializeImpl(column, istr, [](ReadBuffer &){ return false; }, deserialize_nested, is_null); } /// Check if we have enough data in buffer to check if it's a null. if (istr.available() > null_representation.size()) { - auto check_for_null = [&istr, &null_representation]() + auto check_for_null = [&null_representation](ReadBuffer & buf) { - auto * pos = istr.position(); - if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n')) + auto * pos = buf.position(); + if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n')) return true; - istr.position() = pos; + buf.position() = pos; return false; }; - auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column) - { - if constexpr (escaped) - nested_serialization->deserializeTextEscaped(nested_column, istr, settings); - else - nested_serialization->deserializeTextRaw(nested_column, istr, settings); - }; - return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); + return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); } /// We don't have enough data in buffer to check if it's a null. /// Use PeekableReadBuffer to make a checkpoint before checking null /// representation and rollback if check was failed. 
- PeekableReadBuffer buf(istr, true); - auto check_for_null = [&buf, &null_representation]() + PeekableReadBuffer peekable_buf(istr, true); + auto check_for_null = [&null_representation](ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n')) @@ -340,16 +344,18 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col return false; }; - auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) + auto deserialize_nested_with_check = [&deserialize_nested, &nested_serialization, &settings, &null_representation, &istr] (IColumn & nested_column, ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); auto * pos = buf.position(); - if constexpr (escaped) - nested_serialization->deserializeTextEscaped(nested_column, buf, settings); - else - nested_serialization->deserializeTextRaw(nested_column, buf, settings); + if constexpr (throw_exception) + deserialize_nested(nested_column, buf); + else if (!deserialize_nested(nested_column, buf)) + return ReturnType(false); + /// Check that we don't have any unread data in PeekableReadBuffer own memory. if (likely(!buf.hasUnreadData())) - return; + return ReturnType(true); /// We have some unread data in PeekableReadBuffer own memory. /// It can happen only if there is a string instead of a number @@ -358,6 +364,9 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col /// We also should delete incorrectly deserialized value from nested column. nested_column.popBack(1); + if constexpr (!throw_exception) + return ReturnType(false); + if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos) throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " "containing '\\t' or '\\n' may not work correctly for large input."); @@ -375,7 +384,63 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col istr.count(), std::string(pos, buf.position() - pos), parsed_value.str()); }; - return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); + return deserializeImpl(column, peekable_buf, check_for_null, deserialize_nested_with_check, is_null); +} + +void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + deserializeTextEscapedAndRawImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeTextEscapedAndRawImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeTextEscapedAndRawImpl(nested_column, istr, settings, nested_serialization, is_null); + return !is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, 
const FormatSettings & settings, const SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeTextEscapedAndRawImpl(nested_column, istr, settings, nested_serialization, is_null); +} + +void SerializationNullable::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + deserializeTextEscapedAndRawImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeTextEscapedAndRawImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeTextEscapedAndRawImpl(nested_column, istr, settings, nested_serialization, is_null); + return !is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeTextEscapedAndRawImpl(nested_column, istr, settings, nested_serialization, is_null); } void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -383,45 +448,51 @@ void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t r const ColumnNullable & col = assert_cast(column); if (col.isNullAt(row_num)) - writeCString("NULL", ostr); + serializeNullQuoted(ostr); else nested->serializeTextQuoted(col.getNestedColumn(), row_num, ostr, settings); } - -void SerializationNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationNullable::serializeNullQuoted(DB::WriteBuffer & ostr) { - deserializeTextQuotedImpl(column, istr, settings, nested); + writeCString("NULL", ostr); +} + +bool SerializationNullable::tryDeserializeNullQuoted(DB::ReadBuffer & istr) +{ + return checkStringCaseInsensitive("NULL", istr); } template -ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) +ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null) { + static constexpr bool throw_exception = std::is_same_v; + + auto deserialize_nested = [&nested, &settings] (IColumn & nested_column, ReadBuffer & buf) + { + if constexpr (!throw_exception) + return nested->tryDeserializeTextQuoted(nested_column, buf, settings); + nested->deserializeTextQuoted(nested_column, buf, settings); + }; + if (istr.eof() || (*istr.position() != 'N' && *istr.position() != 'n')) { /// This is not null, surely. 
- return safeDeserialize(column, *nested, - [] { return false; }, - [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextQuoted(nested_column, istr, settings); }); + return deserializeImpl(column, istr, [](ReadBuffer &){ return false; }, deserialize_nested, is_null); } /// Check if we have enough data in buffer to check if it's a null. if (istr.available() >= 4) { - auto check_for_null = [&istr]() + auto check_for_null = [](ReadBuffer & buf) { - auto * pos = istr.position(); - if (checkStringCaseInsensitive("NULL", istr)) + auto * pos = buf.position(); + if (checkStringCaseInsensitive("NULL", buf)) return true; - istr.position() = pos; + buf.position() = pos; return false; }; - auto deserialize_nested = [&nested, &settings, &istr] (IColumn & nested_column) - { - nested->deserializeTextQuoted(nested_column, istr, settings); - }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); } /// We don't have enough data in buffer to check if it's a NULL @@ -429,9 +500,10 @@ ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, Re /// to differentiate for example NULL and NaN for float) /// Use PeekableReadBuffer to make a checkpoint before checking /// null and rollback if the check was failed. - PeekableReadBuffer buf(istr, true); - auto check_for_null = [&buf]() + PeekableReadBuffer peekable_buf(istr, true); + auto check_for_null = [](ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); if (checkStringCaseInsensitive("NULL", buf)) @@ -441,39 +513,74 @@ ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, Re return false; }; - auto deserialize_nested = [&nested, &settings, &buf] (IColumn & nested_column) + auto deserialize_nested_with_check = [&deserialize_nested] (IColumn & nested_column, ReadBuffer & buf_) { - nested->deserializeTextQuoted(nested_column, buf, settings); + auto & buf = assert_cast(buf_); + + if constexpr (throw_exception) + deserialize_nested(nested_column, buf); + else if (!deserialize_nested(nested_column, buf)) + return false; + /// Check that we don't have any unread data in PeekableReadBuffer own memory. if (likely(!buf.hasUnreadData())) - return; + return ReturnType(true); /// We have some unread data in PeekableReadBuffer own memory. /// It can happen only if there is an unquoted string instead of a number. /// We also should delete incorrectly deserialized value from nested column. 
nested_column.popBack(1); + + if constexpr (!throw_exception) + return ReturnType(false); + throw DB::ParsingException( ErrorCodes::CANNOT_READ_ALL_DATA, "Error while parsing Nullable: got an unquoted string {} instead of a number", String(buf.position(), std::min(10ul, buf.available()))); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return deserializeImpl(column, peekable_buf, check_for_null, deserialize_nested_with_check, is_null); } -void SerializationNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - deserializeWholeTextImpl(column, istr, settings, nested); + ColumnNullable & col = assert_cast(column); + bool is_null; + deserializeTextQuotedImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeTextQuotedImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeTextQuotedImpl(nested_column, istr, settings, nested_serialization, is_null); + return !is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeTextQuotedImpl(nested_column, istr, settings, nested_serialization, is_null); } template -ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) +ReturnType deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null) { - PeekableReadBuffer buf(istr, true); - auto check_for_null = [&buf]() + static constexpr bool throw_exception = std::is_same_v; + + PeekableReadBuffer peekable_buf(istr, true); + auto check_for_null = [](ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); @@ -488,15 +595,46 @@ ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, Rea return false; }; - auto deserialize_nested = [&nested, &settings, &buf] (IColumn & nested_column) + auto deserialize_nested = [&nested, &settings] (IColumn & nested_column, ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); + if constexpr (!throw_exception) + return nested->tryDeserializeWholeText(nested_column, buf, settings); + nested->deserializeWholeText(nested_column, buf, settings); assert(!buf.hasUnreadData()); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return deserializeImpl(column, peekable_buf, check_for_null, deserialize_nested, is_null); } +void SerializationNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + 
deserializeWholeTextImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeWholeTextImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedWholeText(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeWholeTextImpl(nested_column, istr, settings, nested_serialization, is_null); + return !is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedWholeText(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeWholeTextImpl(nested_column, istr, settings, nested_serialization, is_null); +} void SerializationNullable::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -508,48 +646,56 @@ void SerializationNullable::serializeTextCSV(const IColumn & column, size_t row_ nested->serializeTextCSV(col.getNestedColumn(), row_num, ostr, settings); } -void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationNullable::serializeNullCSV(DB::WriteBuffer & ostr, const DB::FormatSettings & settings) { - deserializeTextCSVImpl(column, istr, settings, nested); + writeString(settings.csv.null_representation, ostr); +} + +bool SerializationNullable::tryDeserializeNullCSV(DB::ReadBuffer & istr, const DB::FormatSettings & settings) +{ + return checkString(settings.csv.null_representation, istr); } template -ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested_serialization) +ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization, bool & is_null) { + static constexpr bool throw_exception = std::is_same_v; + + auto deserialize_nested = [&nested_serialization, &settings] (IColumn & nested_column, ReadBuffer & buf) + { + if constexpr (!throw_exception) + return nested_serialization->tryDeserializeTextCSV(nested_column, buf, settings); + nested_serialization->deserializeTextCSV(nested_column, buf, settings); + }; + const String & null_representation = settings.csv.null_representation; if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) { /// This is not null, surely. - return safeDeserialize(column, *nested_serialization, - [] { return false; }, - [&nested_serialization, &istr, &settings] (IColumn & nested_column) { nested_serialization->deserializeTextCSV(nested_column, istr, settings); }); + return deserializeImpl(column, istr, [](ReadBuffer &){ return false; }, deserialize_nested, is_null); } /// Check if we have enough data in buffer to check if it's a null. 
if (settings.csv.custom_delimiter.empty() && istr.available() > null_representation.size()) { - auto check_for_null = [&istr, &null_representation, &settings]() + auto check_for_null = [&null_representation, &settings](ReadBuffer & buf) { - auto * pos = istr.position(); - if (checkString(null_representation, istr) && (*istr.position() == settings.csv.delimiter || *istr.position() == '\r' || *istr.position() == '\n')) + auto * pos = buf.position(); + if (checkString(null_representation, buf) && (*buf.position() == settings.csv.delimiter || *buf.position() == '\r' || *buf.position() == '\n')) return true; - istr.position() = pos; + buf.position() = pos; return false; }; - auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column) - { - nested_serialization->deserializeTextCSV(nested_column, istr, settings); - }; - return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); + return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); } /// We don't have enough data in buffer to check if it's a null. /// Use PeekableReadBuffer to make a checkpoint before checking null /// representation and rollback if the check was failed. - PeekableReadBuffer buf(istr, true); - auto check_for_null = [&buf, &null_representation, &settings]() + PeekableReadBuffer peekable_buf(istr, true); + auto check_for_null = [&null_representation, &settings](ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); if (checkString(null_representation, buf)) @@ -572,13 +718,18 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB return false; }; - auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) + auto deserialize_nested_with_check = [&deserialize_nested, &nested_serialization, &settings, &null_representation, &istr] (IColumn & nested_column, ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); auto * pos = buf.position(); - nested_serialization->deserializeTextCSV(nested_column, buf, settings); + if constexpr (throw_exception) + deserialize_nested(nested_column, buf); + else if (!deserialize_nested(nested_column, buf)) + return ReturnType(false); + /// Check that we don't have any unread data in PeekableReadBuffer own memory. if (likely(!buf.hasUnreadData())) - return; + return ReturnType(true); /// We have some unread data in PeekableReadBuffer own memory. /// It can happen only if there is an unquoted string instead of a number @@ -587,6 +738,9 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB /// We also should delete incorrectly deserialized value from nested column. 
nested_column.popBack(1); + if constexpr (!throw_exception) + return ReturnType(false); + if (null_representation.find(settings.csv.delimiter) != std::string::npos || null_representation.find('\r') != std::string::npos || null_representation.find('\n') != std::string::npos) throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "CSV custom null representation containing " @@ -602,7 +756,35 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB istr.count(), std::string(pos, buf.position() - pos), parsed_value.str()); }; - return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); + return deserializeImpl(column, peekable_buf, check_for_null, deserialize_nested_with_check, is_null); +} + +void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + deserializeTextCSVImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeTextCSVImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeTextCSVImpl(nested_column, istr, settings, nested_serialization, is_null); + return !is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeTextCSVImpl(nested_column, istr, settings, nested_serialization, is_null); } void SerializationNullable::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -616,38 +798,86 @@ void SerializationNullable::serializeText(const IColumn & column, size_t row_num /// This assumes UTF-8 and proper font support. This is Ok, because Pretty formats are "presentational", not for data exchange. 
if (col.isNullAt(row_num)) - { - if (settings.pretty.charset == FormatSettings::Pretty::Charset::UTF8) - writeCString("ᴺᵁᴸᴸ", ostr); - else - writeCString("NULL", ostr); - } + serializeNullText(ostr, settings); else nested->serializeText(col.getNestedColumn(), row_num, ostr, settings); } +void SerializationNullable::serializeNullText(DB::WriteBuffer & ostr, const DB::FormatSettings & settings) +{ + if (settings.pretty.charset == FormatSettings::Pretty::Charset::UTF8) + writeCString("ᴺᵁᴸᴸ", ostr); + else + writeCString("NULL", ostr); +} + +bool SerializationNullable::tryDeserializeNullText(DB::ReadBuffer & istr) +{ + if (checkCharCaseInsensitive('N', istr)) + return checkStringCaseInsensitive("ULL", istr); + return checkStringCaseInsensitive("ᴺᵁᴸᴸ", istr); +} + void SerializationNullable::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { const ColumnNullable & col = assert_cast(column); if (col.isNullAt(row_num)) - writeCString("null", ostr); + serializeNullJSON(ostr); else nested->serializeTextJSON(col.getNestedColumn(), row_num, ostr, settings); } -void SerializationNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationNullable::serializeNullJSON(DB::WriteBuffer & ostr) { - deserializeTextJSONImpl(column, istr, settings, nested); + writeCString("null", ostr); +} + +bool SerializationNullable::tryDeserializeNullJSON(DB::ReadBuffer & istr) +{ + return checkString("null", istr); } template -ReturnType SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) +ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null) { - return safeDeserialize(column, *nested, - [&istr] { return checkStringByFirstCharacterAndAssertTheRest("null", istr); }, - [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextJSON(nested_column, istr, settings); }); + auto check_for_null = [](ReadBuffer & buf){ return checkStringByFirstCharacterAndAssertTheRest("null", buf); }; + auto deserialize_nested = [&nested, &settings](IColumn & nested_column, ReadBuffer & buf) + { + if constexpr (std::is_same_v) + return nested->tryDeserializeTextJSON(nested_column, buf, settings); + nested->deserializeTextJSON(nested_column, buf, settings); + }; + + return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); +} + +void SerializationNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); + return 
!is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); } void SerializationNullable::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -660,11 +890,9 @@ void SerializationNullable::serializeTextXML(const IColumn & column, size_t row_ nested->serializeTextXML(col.getNestedColumn(), row_num, ostr, settings); } -template bool SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); +void SerializationNullable::serializeNullXML(DB::WriteBuffer & ostr) +{ + writeCString("\\N", ostr); +} } diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index 3ec01b46de5..37858ccdefd 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -51,9 +51,12 @@ public: void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -66,31 +69,49 @@ public: * In CSV, non-NULL string value, starting with \N characters, must be placed in quotes, to avoid ambiguity. 
*/ void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - /// If ReturnType is bool, check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false) - /// If ReturnType is void, deserialize Nullable(T) - template - static ReturnType deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); - template - static ReturnType deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); - template - static ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); - template - static ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); - template - static ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); - template - static ReturnType deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); - template - static ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + /// Check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false) + static bool deserializeNullAsDefaultOrNestedWholeText(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool deserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool deserializeNullAsDefaultOrNestedTextQuoted(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); + static bool deserializeNullAsDefaultOrNestedTextCSV(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool deserializeNullAsDefaultOrNestedTextJSON(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); + static bool
deserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + + /// Check for NULL and deserialize value into non-nullable column or insert default value of nested type. + /// Return true if parsing was successful and false in case of any error. + static bool tryDeserializeNullAsDefaultOrNestedWholeText(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool tryDeserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool tryDeserializeNullAsDefaultOrNestedTextQuoted(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); + static bool tryDeserializeNullAsDefaultOrNestedTextCSV(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool tryDeserializeNullAsDefaultOrNestedTextJSON(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); + static bool tryDeserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + + + static void serializeNullEscaped(WriteBuffer & ostr, const FormatSettings & settings); + static bool tryDeserializeNullEscaped(ReadBuffer & istr, const FormatSettings & settings); + static void serializeNullQuoted(WriteBuffer & ostr); + static bool tryDeserializeNullQuoted(ReadBuffer & istr); + static void serializeNullCSV(WriteBuffer & ostr, const FormatSettings & settings); + static bool tryDeserializeNullCSV(ReadBuffer & istr, const FormatSettings & settings); + static void serializeNullJSON(WriteBuffer & ostr); + static bool tryDeserializeNullJSON(ReadBuffer & istr); + static void serializeNullRaw(WriteBuffer & ostr, const FormatSettings & settings); + static bool tryDeserializeNullRaw(ReadBuffer & istr, const FormatSettings & settings); + static void serializeNullText(WriteBuffer & ostr, const FormatSettings & settings); + static bool tryDeserializeNullText(ReadBuffer & istr); + static void serializeNullXML(WriteBuffer & ostr); private: struct SubcolumnCreator : public ISubcolumnCreator diff --git a/src/DataTypes/Serializations/SerializationNumber.cpp b/src/DataTypes/Serializations/SerializationNumber.cpp index b6c7e4618b8..bdb4dfc6735 100644 --- a/src/DataTypes/Serializations/SerializationNumber.cpp +++ b/src/DataTypes/Serializations/SerializationNumber.cpp @@ -37,6 +37,18 @@ void SerializationNumber::deserializeText(IColumn & column, ReadBuffer & istr throwUnexpectedDataAfterParsedValue(column, istr, settings, "Number"); } +template +bool SerializationNumber::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const +{ + T x; + + if (!tryReadText(x, istr) || (whole && !istr.eof())) + return false; + + assert_cast &>(column).getData().push_back(x); + return true; +} + template void SerializationNumber::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -44,9 +56,10 @@ void SerializationNumber::serializeTextJSON(const IColumn & column, size_t ro writeJSONNumber(x, ostr, settings); } -template -void SerializationNumber::deserializeTextJSON(IColumn & column,
ReadBuffer & istr, const FormatSettings & settings) const +template +ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) { + static constexpr bool throw_exception = std::is_same_v; bool has_quote = false; if (!istr.eof() && *istr.position() == '"') /// We understand the number both in quotes and without. { @@ -54,13 +67,16 @@ void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & ++istr.position(); } - FieldType x; + T x; /// null if (!has_quote && !istr.eof() && *istr.position() == 'n') { ++istr.position(); - assertString("ull", istr); + if constexpr (throw_exception) + assertString("ull", istr); + else if (!checkString("ull", istr)) + return ReturnType(false); x = NaNOrZero(); } @@ -73,26 +89,62 @@ void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & { // extra conditions to parse true/false strings into 1/0 if (istr.eof()) - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + else + return false; + } + if (*istr.position() == 't' || *istr.position() == 'f') { bool tmp = false; - readBoolTextWord(tmp, istr); + if constexpr (throw_exception) + readBoolTextWord(tmp, istr); + else if (!readBoolTextWord(tmp, istr)) + return ReturnType(false); + x = tmp; } else - readText(x, istr); + { + if constexpr (throw_exception) + readText(x, istr); + else if (!tryReadText(x, istr)) + return ReturnType(false); + } } else { - readText(x, istr); + if constexpr (throw_exception) + readText(x, istr); + else if (!tryReadText(x, istr)) + return ReturnType(false); } if (has_quote) - assertChar('"', istr); + { + if constexpr (throw_exception) + assertChar('"', istr); + else if (!checkChar('"', istr)) + return ReturnType(false); + } } assert_cast &>(column).getData().push_back(x); + return ReturnType(true); +} + +template +void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextJSONImpl(column, istr, settings); +} + +template +bool SerializationNumber::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return deserializeTextJSONImpl(column, istr, settings); } template @@ -103,6 +155,16 @@ void SerializationNumber::deserializeTextCSV(IColumn & column, ReadBuffer & i assert_cast &>(column).getData().push_back(x); } +template +bool SerializationNumber::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & /*settings*/) const +{ + FieldType x; + if (!tryReadCSV(x, istr)) + return false; + assert_cast &>(column).getData().push_back(x); + return true; +} + template void SerializationNumber::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const { diff --git a/src/DataTypes/Serializations/SerializationNumber.h b/src/DataTypes/Serializations/SerializationNumber.h index 972c6c9a30f..9d53dc9c494 100644 --- a/src/DataTypes/Serializations/SerializationNumber.h +++ b/src/DataTypes/Serializations/SerializationNumber.h @@ -20,9 +20,12 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & 
settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; /** Format is platform-dependent. */ void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index 788ff429088..1680ec8a333 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -272,40 +272,67 @@ void SerializationString::serializeTextEscaped(const IColumn & column, size_t ro } -template -static inline void read(IColumn & column, Reader && reader) +template +static inline ReturnType read(IColumn & column, Reader && reader) { + static constexpr bool throw_exception = std::is_same_v; ColumnString & column_string = assert_cast(column); ColumnString::Chars & data = column_string.getChars(); ColumnString::Offsets & offsets = column_string.getOffsets(); size_t old_chars_size = data.size(); size_t old_offsets_size = offsets.size(); - try - { - reader(data); - data.push_back(0); - offsets.push_back(data.size()); - } - catch (...) + auto restore_column = [&]() { offsets.resize_assume_reserved(old_offsets_size); data.resize_assume_reserved(old_chars_size); - throw; + }; + + try + { + if constexpr (throw_exception) + { + reader(data); + } + else if (!reader(data)) + { + restore_column(); + return false; + } + + data.push_back(0); + offsets.push_back(data.size()); + return ReturnType(true); + } + catch (...) 
+ { + restore_column(); + if constexpr (throw_exception) + throw; + else + return false; } } void SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); }); } +bool SerializationString::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); return true; }); +} void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); } +bool SerializationString::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); return true; }); +} void SerializationString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -315,7 +342,12 @@ void SerializationString::serializeTextQuoted(const IColumn & column, size_t row void SerializationString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(column, [&](ColumnString::Chars & data) { readQuotedStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readQuotedStringInto(data, istr); }); +} + +bool SerializationString::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return read(column, [&](ColumnString::Chars & data) { return tryReadQuotedStringInto(data, istr); }); } @@ -329,11 +361,11 @@ void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & ist { if (settings.json.read_objects_as_strings && !istr.eof() && *istr.position() == '{') { - read(column, [&](ColumnString::Chars & data) { readJSONObjectPossiblyInvalid(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readJSONObjectPossiblyInvalid(data, istr); }); } else if (settings.json.read_arrays_as_strings && !istr.eof() && *istr.position() == '[') { - read(column, [&](ColumnString::Chars & data) { readJSONArrayInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readJSONArrayInto(data, istr); }); } else if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') { @@ -342,12 +374,40 @@ void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & ist Float64 tmp; ReadBufferFromString buf(field); if (tryReadFloatText(tmp, buf) && buf.eof()) - read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); + read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); else throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON String value here: {}", field); } else - read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); }); +} + +bool SerializationString::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.json.read_objects_as_strings && !istr.eof() && *istr.position() == '{') + return 
read(column, [&](ColumnString::Chars & data) { return readJSONObjectPossiblyInvalid(data, istr); }); + + if (settings.json.read_arrays_as_strings && !istr.eof() && *istr.position() == '[') + return read(column, [&](ColumnString::Chars & data) { return readJSONArrayInto(data, istr); }); + + if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') + { + String field; + if (!tryReadJSONField(field, istr)) + return false; + + Float64 tmp; + ReadBufferFromString buf(field); + if (tryReadFloatText(tmp, buf) && buf.eof()) + { + read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); + return true; + } + + return false; + } + + return read(column, [&](ColumnString::Chars & data) { return tryReadJSONStringInto(data, istr); }); } @@ -365,7 +425,12 @@ void SerializationString::serializeTextCSV(const IColumn & column, size_t row_nu void SerializationString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); }); + read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); }); +} + +bool SerializationString::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); return true; }); } void SerializationString::serializeTextMarkdown( diff --git a/src/DataTypes/Serializations/SerializationString.h b/src/DataTypes/Serializations/SerializationString.h index cd4cdf79c11..89ab84f0d22 100644 --- a/src/DataTypes/Serializations/SerializationString.h +++ b/src/DataTypes/Serializations/SerializationString.h @@ -18,20 +18,25 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, 
ReadBuffer & istr, const FormatSettings &) const override; void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; }; diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index cbbe97eb05c..c0b0658e6b4 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -62,15 +62,35 @@ void SerializationTuple::serializeBinary(const IColumn & column, size_t row_num, } -template -static void addElementSafe(size_t num_elems, IColumn & column, F && impl) +template +static ReturnType addElementSafe(size_t num_elems, IColumn & column, F && impl) { + static constexpr bool throw_exception = std::is_same_v; + /// We use the assumption that tuples of zero size do not exist. size_t old_size = column.size(); + auto restore_elements = [&]() + { + for (size_t i = 0; i < num_elems; ++i) + { + auto & element_column = extractElementColumn(column, i); + if (element_column.size() > old_size) + element_column.popBack(1); + } + }; + try { - impl(); + if constexpr (throw_exception) + { + impl(); + } + else if (!impl()) + { + restore_elements(); + return ReturnType(false); + } // Check that all columns now have the same size. size_t new_size = column.size(); @@ -81,22 +101,23 @@ static void addElementSafe(size_t num_elems, IColumn & column, F && impl) { // This is not a logical error because it may work with // user-supplied data. - throw Exception(ErrorCodes::SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH, - "Cannot read a tuple because not all elements are present"); + if constexpr (throw_exception) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH, + "Cannot read a tuple because not all elements are present"); + restore_elements(); + return ReturnType(false); } } } catch (...) 
{ - for (size_t i = 0; i < num_elems; ++i) - { - auto & element_column = extractElementColumn(column, i); - if (element_column.size() > old_size) - element_column.popBack(1); - } - - throw; + restore_elements(); + if constexpr (throw_exception) + throw; + return ReturnType(false); } + + return ReturnType(true); } void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -120,25 +141,51 @@ void SerializationTuple::serializeText(const IColumn & column, size_t row_num, W writeChar(')', ostr); } -void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +template +ReturnType SerializationTuple::deserializeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const { - const size_t size = elems.size(); - assertChar('(', istr); + static constexpr bool throw_exception = std::is_same_v; - addElementSafe(elems.size(), column, [&] + const size_t size = elems.size(); + if constexpr (throw_exception) + assertChar('(', istr); + else if (!checkChar('(', istr)) + return ReturnType(false); + + auto impl = [&]() { for (size_t i = 0; i < size; ++i) { skipWhitespaceIfAny(istr); if (i != 0) { - assertChar(',', istr); + if constexpr (throw_exception) + assertChar(',', istr); + else if (!checkChar(',', istr)) + return ReturnType(false); + skipWhitespaceIfAny(istr); } - if (settings.null_as_default) - SerializationNullable::deserializeTextQuotedImpl(extractElementColumn(column, i), istr, settings, elems[i]); + + auto & element_column = extractElementColumn(column, i); + if constexpr (throw_exception) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(element_column, istr, settings, elems[i]); + else + elems[i]->deserializeTextQuoted(element_column, istr, settings); + } else - elems[i]->deserializeTextQuoted(extractElementColumn(column, i), istr, settings); + { + bool ok; + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + ok = SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(element_column, istr, settings, elems[i]); + else + ok = elems[i]->tryDeserializeTextQuoted(element_column, istr, settings); + + if (!ok) + return false; + } } // Special format for one element tuple (1,) @@ -150,11 +197,35 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co } skipWhitespaceIfAny(istr); - assertChar(')', istr); + if constexpr (throw_exception) + assertChar(')', istr); + else if (!checkChar(')', istr)) + return ReturnType(false); if (whole && !istr.eof()) - throwUnexpectedDataAfterParsedValue(column, istr, settings, "Tuple"); - }); + { + if constexpr (throw_exception) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "Tuple"); + return ReturnType(false); + } + + return ReturnType(true); + }; + + if constexpr (throw_exception) + addElementSafe(elems.size(), column, impl); + else + return addElementSafe(elems.size(), column, impl); +} + +void SerializationTuple::deserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const +{ + deserializeTextImpl(column, istr, settings, whole); +} + +bool SerializationTuple::tryDeserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const +{ + return deserializeTextImpl(column, istr, settings, whole); 
} void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -239,16 +310,39 @@ void SerializationTuple::serializeTextJSONPretty(const IColumn & column, size_t } } -void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +template +ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { + static constexpr bool throw_exception = std::is_same_v; + + auto deserialize_element = [&](IColumn & element_column, size_t element_pos) + { + if constexpr (throw_exception) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(element_column, istr, settings, elems[element_pos]); + else + elems[element_pos]->deserializeTextJSON(element_column, istr, settings); + } + else + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(element_column, istr, settings, elems[element_pos]); + return elems[element_pos]->tryDeserializeTextJSON(element_column, istr, settings); + } + }; + if (settings.json.read_named_tuples_as_objects && have_explicit_names) { skipWhitespaceIfAny(istr); - assertChar('{', istr); + if constexpr (throw_exception) + assertChar('{', istr); + else if (!checkChar('{', istr)) + return ReturnType(false); skipWhitespaceIfAny(istr); - addElementSafe(elems.size(), column, [&] + auto impl = [&]() { std::vector seen_elements(elems.size(), 0); size_t processed = 0; @@ -256,18 +350,32 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr while (!istr.eof() && *istr.position() != '}') { if (!settings.json.ignore_unknown_keys_in_named_tuple && processed == elems.size()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. Expected no more than {} (consider enabling input_format_json_ignore_unknown_keys_in_named_tuple setting)", elems.size()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. 
Expected no more than {} (consider enabling input_format_json_ignore_unknown_keys_in_named_tuple setting)", elems.size()); + return ReturnType(false); + } if (processed + skipped > 0) { - assertChar(',', istr); + if constexpr (throw_exception) + assertChar(',', istr); + else if (!checkChar(',', istr)) + return ReturnType(false); skipWhitespaceIfAny(istr); } std::string name; - readDoubleQuotedString(name, istr); + if constexpr (throw_exception) + readDoubleQuotedString(name, istr); + else if (!tryReadDoubleQuotedString(name, istr)) + return ReturnType(false); + skipWhitespaceIfAny(istr); - assertChar(':', istr); + if constexpr (throw_exception) + assertChar(':', istr); + else if (!checkChar(':', istr)) + return ReturnType(false); skipWhitespaceIfAny(istr); const size_t element_pos = getPositionByName(name); @@ -275,36 +383,52 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr { if (settings.json.ignore_unknown_keys_in_named_tuple) { - skipJSONField(istr, name); + if constexpr (throw_exception) + skipJSONField(istr, name); + else if (!trySkipJSONField(istr, name)) + return ReturnType(false); + skipWhitespaceIfAny(istr); ++skipped; continue; } else - throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with name '{}', enable setting input_format_json_ignore_unknown_keys_in_named_tuple", name); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with name '{}', enable setting input_format_json_ignore_unknown_keys_in_named_tuple", name); + return ReturnType(false); + } } seen_elements[element_pos] = 1; auto & element_column = extractElementColumn(column, element_pos); - try + if constexpr (throw_exception) { - if (settings.null_as_default) - SerializationNullable::deserializeTextJSONImpl(element_column, istr, settings, elems[element_pos]); - else - elems[element_pos]->deserializeTextJSON(element_column, istr, settings); + try + { + deserialize_element(element_column, element_pos); + } + catch (Exception & e) + { + e.addMessage("(while reading the value of nested key " + name + ")"); + throw; + } } - catch (Exception & e) + else { - e.addMessage("(while reading the value of nested key " + name + ")"); - throw; + if (!deserialize_element(element_column, element_pos)) + return ReturnType(false); } skipWhitespaceIfAny(istr); ++processed; } - assertChar('}', istr); + if constexpr (throw_exception) + assertChar('}', istr); + else if (!checkChar('}', istr)) + return ReturnType(false); /// Check if we have missing elements. if (processed != elems.size()) @@ -315,41 +439,87 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr continue; if (!settings.json.defaults_for_missing_elements_in_named_tuple) - throw Exception( - ErrorCodes::INCORRECT_DATA, - "JSON object doesn't contain tuple element {}. If you want to insert defaults in case of missing elements, " - "enable setting input_format_json_defaults_for_missing_elements_in_named_tuple", - elems[element_pos]->getElementName()); + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "JSON object doesn't contain tuple element {}. 
If you want to insert defaults in case of missing elements, " + "enable setting input_format_json_defaults_for_missing_elements_in_named_tuple", + elems[element_pos]->getElementName()); + return ReturnType(false); + } auto & element_column = extractElementColumn(column, element_pos); element_column.insertDefault(); } } - }); + + return ReturnType(true); + }; + + if constexpr (throw_exception) + addElementSafe(elems.size(), column, impl); + else + return addElementSafe(elems.size(), column, impl); } else { - assertChar('[', istr); + skipWhitespaceIfAny(istr); + if constexpr (throw_exception) + assertChar('[', istr); + else if (!checkChar('[', istr)) + return ReturnType(false); + skipWhitespaceIfAny(istr); - addElementSafe(elems.size(), column, [&] + auto impl = [&]() { for (size_t i = 0; i < elems.size(); ++i) { skipWhitespaceIfAny(istr); if (i != 0) { - assertChar(',', istr); + if constexpr (throw_exception) + assertChar(',', istr); + else if (!checkChar(',', istr)) + return ReturnType(false); skipWhitespaceIfAny(istr); } - elems[i]->deserializeTextJSON(extractElementColumn(column, i), istr, settings); + + auto & element_column = extractElementColumn(column, i); + + if constexpr (throw_exception) + deserialize_element(element_column, i); + else if (!deserialize_element(element_column, i)) + return ReturnType(false); } skipWhitespaceIfAny(istr); - assertChar(']', istr); - }); + if constexpr (throw_exception) + assertChar(']', istr); + else if (!checkChar(']', istr)) + return ReturnType(false); + + return ReturnType(true); + }; + + if constexpr (throw_exception) + addElementSafe(elems.size(), column, impl); + else + return addElementSafe(elems.size(), column, impl); } } +void SerializationTuple::deserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextJSONImpl(column, istr, settings); +} + +bool SerializationTuple::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + return deserializeTextJSONImpl(column, istr, settings); +} + + void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeCString("", ostr); @@ -385,14 +555,48 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, assertChar(settings.csv.tuple_delimiter, istr); skipWhitespaceIfAny(istr); } - if (settings.null_as_default) - SerializationNullable::deserializeTextCSVImpl(extractElementColumn(column, i), istr, settings, elems[i]); + + auto & element_column = extractElementColumn(column, i); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i]); else - elems[i]->deserializeTextCSV(extractElementColumn(column, i), istr, settings); + elems[i]->deserializeTextCSV(element_column, istr, settings); } }); } +bool SerializationTuple::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return addElementSafe(elems.size(), column, [&] + { + const size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + { + skipWhitespaceIfAny(istr); + if (!checkChar(settings.csv.tuple_delimiter, istr)) + return false; + skipWhitespaceIfAny(istr); + } + + auto & element_column = extractElementColumn(column, i); + if (settings.null_as_default && 
!isColumnNullableOrLowCardinalityNullable(element_column)) + { + if (!SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i])) + return false; + } + else + { + if (!elems[i]->tryDeserializeTextCSV(element_column, istr, settings)) + return false; + } + } + + return true; + }); +} + void SerializationTuple::enumerateStreams( EnumerateStreamsSettings & settings, const StreamCallback & callback, diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index 7325259f440..d9c63a05217 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -23,14 +23,17 @@ public: void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; /// Tuples in CSV format will be serialized as separate columns (that is, losing their nesting in the tuple). void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Each sub-column in a tuple is serialized in separate stream. 
*/ @@ -73,6 +76,15 @@ private: bool have_explicit_names; size_t getPositionByName(const String & name) const; + + template + ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const; + + template + ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + + template + ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; }; } diff --git a/src/DataTypes/Serializations/SerializationUUID.cpp b/src/DataTypes/Serializations/SerializationUUID.cpp index 5cf17b4c0c8..5a7aeca67a0 100644 --- a/src/DataTypes/Serializations/SerializationUUID.cpp +++ b/src/DataTypes/Serializations/SerializationUUID.cpp @@ -25,15 +25,16 @@ void SerializationUUID::deserializeText(IColumn & column, ReadBuffer & istr, con throwUnexpectedDataAfterParsedValue(column, istr, settings, "UUID"); } -void SerializationUUID::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +bool SerializationUUID::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const { - deserializeText(column, istr, settings, false); + UUID x; + if (!tryReadText(x, istr) || (whole && !istr.eof())) + return false; + + assert_cast(column).getData().push_back(x); + return true; } -void SerializationUUID::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeText(column, row_num, ostr, settings); -} void SerializationUUID::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -76,6 +77,17 @@ void SerializationUUID::deserializeTextQuoted(IColumn & column, ReadBuffer & ist assert_cast(column).getData().push_back(std::move(uuid)); /// It's important to do this at the end - for exception safety. 
} +bool SerializationUUID::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID uuid; + String field; + if (!checkChar('\'', istr) || !tryReadText(uuid, istr) || !checkChar('\'', istr)) + return false; + + assert_cast(column).getData().push_back(std::move(uuid)); + return true; +} + void SerializationUUID::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -92,6 +104,15 @@ void SerializationUUID::deserializeTextJSON(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(x); } +bool SerializationUUID::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID x; + if (!checkChar('"', istr) || !tryReadText(x, istr) || !checkChar('"', istr)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationUUID::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -106,6 +127,14 @@ void SerializationUUID::deserializeTextCSV(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(value); } +bool SerializationUUID::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID value; + if (!tryReadCSV(value, istr)) + return false; + assert_cast(column).getData().push_back(value); + return true; +} void SerializationUUID::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const { diff --git a/src/DataTypes/Serializations/SerializationUUID.h b/src/DataTypes/Serializations/SerializationUUID.h index da8c15f7279..458504f8f42 100644 --- a/src/DataTypes/Serializations/SerializationUUID.h +++ b/src/DataTypes/Serializations/SerializationUUID.h @@ -10,14 +10,16 @@ class SerializationUUID : public SimpleTextSerialization public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) 
const override; void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override; diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp new file mode 100644 index 00000000000..ebd44fd6955 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -0,0 +1,828 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; + extern const int INCORRECT_DATA; +} + +void SerializationVariant::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * type_variant = data.type ? &assert_cast(*data.type) : nullptr; + const auto * column_variant = data.column ? &assert_cast(*data.column) : nullptr; + + auto discriminators_serialization = std::make_shared(std::make_shared>(), "discr", false); + auto local_discriminators = column_variant ? column_variant->getLocalDiscriminatorsPtr() : nullptr; + + settings.path.push_back(Substream::VariantDiscriminators); + auto discriminators_data = SubstreamData(discriminators_serialization) + .withType(type_variant ? std::make_shared>() : nullptr) + .withColumn(column_variant ? column_variant->getLocalDiscriminatorsPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + settings.path.back().data = discriminators_data; + callback(settings.path); + settings.path.pop_back(); + + settings.path.push_back(Substream::VariantElements); + settings.path.back().data = data; + + for (size_t i = 0; i < variants.size(); ++i) + { + settings.path.back().creator = std::make_shared(local_discriminators, variant_names[i], i, column_variant ? column_variant->localDiscriminatorByGlobal(i) : i); + + auto variant_data = SubstreamData(variants[i]) + .withType(type_variant ? type_variant->getVariant(i) : nullptr) + .withColumn(column_variant ? 
column_variant->getVariantPtrByGlobalDiscriminator(i) : nullptr) + .withSerializationInfo(data.serialization_info); + + addVariantElementToPath(settings.path, i); + settings.path.back().data = variant_data; + variants[i]->enumerateStreams(settings, callback, variant_data); + settings.path.pop_back(); + } + + settings.path.pop_back(); +} + +struct SerializeBinaryBulkStateVariant : public ISerialization::SerializeBinaryBulkState +{ + std::vector states; +}; + +struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBinaryBulkState +{ + std::vector states; +}; + +void SerializationVariant::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const ColumnVariant & col = assert_cast(column); + + auto variant_state = std::make_shared(); + variant_state->states.resize(variants.size()); + + settings.path.push_back(Substream::VariantElements); + + for (size_t i = 0; i < variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkStatePrefix(col.getVariantByGlobalDiscriminator(i), settings, variant_state->states[i]); + settings.path.pop_back(); + } + + settings.path.pop_back(); + state = std::move(variant_state); +} + + +void SerializationVariant::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * variant_state = checkAndGetState(state); + + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i < variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkStateSuffix(settings, variant_state->states[i]); + settings.path.pop_back(); + } + settings.path.pop_back(); +} + + +void SerializationVariant::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + auto variant_state = std::make_shared(); + variant_state->states.resize(variants.size()); + + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i < variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->states[i]); + settings.path.pop_back(); + } + + settings.path.pop_back(); + state = std::move(variant_state); +} + + +void SerializationVariant::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const ColumnVariant & col = assert_cast(column); + if (const size_t size = col.size(); limit == 0 || offset + limit > size) + limit = size - offset; + + settings.path.push_back(Substream::VariantDiscriminators); + auto * discriminators_stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!discriminators_stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariant::serializeBinaryBulkWithMultipleStreams"); + + auto * variant_state = checkAndGetState(state); + + /// If offset = 0 and limit == col.size() or we have only NULLs, we don't need to calculate + /// offsets and limits for variants and need to just serialize whole columns. + if ((offset == 0 && limit == col.size()) || col.hasOnlyNulls()) + { + /// First, serialize discriminators. + /// If we have only NULLs or local and global discriminators are the same, just serialize the column as is. 
+ if (col.hasOnlyNulls() || col.hasGlobalVariantsOrder()) + { + SerializationNumber().serializeBinaryBulk(col.getLocalDiscriminatorsColumn(), *discriminators_stream, offset, limit); + } + /// If local and global discriminators are different, we should convert local to global before serializing (because we don't serialize the mapping). + else + { + const auto & local_discriminators = col.getLocalDiscriminators(); + for (size_t i = offset; i != offset + limit; ++i) + writeBinaryLittleEndian(col.globalDiscriminatorByLocal(local_discriminators[i]), *discriminators_stream); + } + + /// Second, serialize variants in global order. + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i != variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->states[i]); + settings.path.pop_back(); + } + settings.path.pop_back(); + return; + } + + /// If we have only one non empty variant and no NULLs, we can use the same limit offset for this variant. + if (auto non_empty_local_discr = col.getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + /// First, serialize discriminators. + /// We know that all discriminators are the same, so we just need to serialize this discriminator limit times. + auto non_empty_global_discr = col.globalDiscriminatorByLocal(*non_empty_local_discr); + for (size_t i = 0; i != limit; ++i) + writeBinaryLittleEndian(non_empty_global_discr, *discriminators_stream); + + /// Second, serialize variants in global order. + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i != variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + /// For non empty variant use the same offset/limit as for whole Variant column + if (i == non_empty_global_discr) + variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), offset, limit, settings, variant_state->states[i]); + /// For empty variants, use just 0/0, they won't serialize anything. + else + variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->states[i]); + settings.path.pop_back(); + } + settings.path.pop_back(); + return; + } + + /// In general case we should iterate through local discriminators in range [offset, offset + limit] to serialize global discriminators and calculate offset/limit pair for each variant. + const auto & local_discriminators = col.getLocalDiscriminators(); + const auto & offsets = col.getOffsets(); + std::vector> variant_offsets_and_limits(variants.size(), {0, 0}); + size_t end = offset + limit; + for (size_t i = offset; i < end; ++i) + { + auto global_discr = col.globalDiscriminatorByLocal(local_discriminators[i]); + writeBinaryLittleEndian(global_discr, *discriminators_stream); + + if (global_discr != ColumnVariant::NULL_DISCRIMINATOR) + { + /// If we see this discriminator for the first time, update offset + if (!variant_offsets_and_limits[global_discr].second) + variant_offsets_and_limits[global_discr].first = offsets[i]; + /// Update limit for this discriminator. + ++variant_offsets_and_limits[global_discr].second; + } + } + + /// If limit for some variant is 0, it means that we don't have its discriminator in the range. + /// Set offset to the size of column for such variants, so we won't serialize values from them. 
+ for (size_t i = 0; i != variant_offsets_and_limits.size(); ++i) + { + if (!variant_offsets_and_limits[i].second) + variant_offsets_and_limits[i].first = col.getVariantByGlobalDiscriminator(i).size(); + } + + /// Serialize variants in global order. + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i != variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkWithMultipleStreams( + col.getVariantByGlobalDiscriminator(i), + variant_offsets_and_limits[i].first, + variant_offsets_and_limits[i].second, + settings, + variant_state->states[i]); + settings.path.pop_back(); + } + settings.path.pop_back(); +} + + +void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto mutable_column = column->assumeMutable(); + ColumnVariant & col = assert_cast(*mutable_column); + /// We always serialize Variant column with global variants order, + /// so while deserialization column should be always with global variants order. + if (!col.hasGlobalVariantsOrder()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to deserialize data into Variant column with not global variants order"); + + /// First, deserialize new discriminators. + /// We deserialize them into a separate column to be able to use substream cache, + /// so if we also need to deserialize some of sub columns, we will read discriminators only once. + settings.path.push_back(Substream::VariantDiscriminators); + ColumnPtr discriminators; + if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) + { + discriminators = cached_discriminators; + } + else + { + auto * discriminators_stream = settings.getter(settings.path); + if (!discriminators_stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariant::deserializeBinaryBulkWithMultipleStreams"); + + discriminators = ColumnVariant::ColumnDiscriminators::create(); + SerializationNumber().deserializeBinaryBulk(*discriminators->assumeMutable(), *discriminators_stream, limit, 0); + addToSubstreamsCache(cache, settings.path, discriminators); + } + + settings.path.pop_back(); + + /// Iterate through new discriminators, append them to column and calculate the limit for each variant. + /// While calculating limits we can also fill offsets column (we store offsets only in memory). + const auto & discriminators_data = assert_cast(*discriminators).getData(); + auto & local_discriminators = col.getLocalDiscriminators(); + local_discriminators.reserve(local_discriminators.size() + limit); + auto & offsets = col.getOffsets(); + offsets.reserve(offsets.size() + limit); + std::vector variant_limits(variants.size(), 0); + for (size_t i = 0; i != limit; ++i) + { + ColumnVariant::Discriminator discr = discriminators_data[i]; + local_discriminators.push_back(discr); + if (discr == ColumnVariant::NULL_DISCRIMINATOR) + offsets.emplace_back(); + else + offsets.push_back(col.getVariantByLocalDiscriminator(discr).size() + variant_limits[discr]++); + } + + /// Now we can deserialize variants according to their limits. 
+ auto * variant_state = checkAndGetState(state); + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i != variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->deserializeBinaryBulkWithMultipleStreams(col.getVariantPtrByLocalDiscriminator(i), variant_limits[i], settings, variant_state->states[i], cache); + settings.path.pop_back(); + } + settings.path.pop_back(); +} + +void SerializationVariant::addVariantElementToPath(DB::ISerialization::SubstreamPath & path, size_t i) const +{ + path.push_back(Substream::VariantElement); + path.back().variant_element_name = variant_names[i]; +} + +void SerializationVariant::serializeBinary(const Field & /*field*/, WriteBuffer & /*ostr*/, const FormatSettings & /*settings*/) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinary from a field is not implemented for SerializationVariant"); +} + +void SerializationVariant::deserializeBinary(Field & /*field*/, ReadBuffer & /*istr*/, const FormatSettings & /*settings*/) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method deserializeBinary to a field is not implemented for SerializationVariant"); +} + +void SerializationVariant::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + writeBinaryLittleEndian(global_discr, ostr); + if (global_discr != ColumnVariant::NULL_DISCRIMINATOR) + variants[global_discr]->serializeBinary(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +void SerializationVariant::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnVariant & col = assert_cast(column); + ColumnVariant::Discriminator global_discr; + readBinaryLittleEndian(global_discr, istr); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + { + col.insertDefault(); + } + else + { + auto & variant_column = col.getVariantByGlobalDiscriminator(global_discr); + variants[global_discr]->deserializeBinary(variant_column, istr, settings); + col.getLocalDiscriminators().push_back(col.localDiscriminatorByGlobal(global_discr)); + col.getOffsets().push_back(variant_column.size() - 1); + } +} + +namespace +{ + +std::unordered_map getTypesTextDeserializePriorityMap() +{ + static const std::vector priorities = { + /// Complex types have highest priority. + TypeIndex::Array, + TypeIndex::Tuple, + TypeIndex::Map, + TypeIndex::AggregateFunction, + + /// Enums can be parsed both from strings and numbers. + /// So they have high enough priority. + TypeIndex::Enum8, + TypeIndex::Enum16, + + /// Types that can be parsed from strings. + TypeIndex::UUID, + TypeIndex::IPv4, + TypeIndex::IPv6, + + /// Types that can be parsed from numbers. + /// The order: + /// 1) Integers + /// 2) Big Integers + /// 3) Decimals + /// 4) Floats + /// In each group small types have higher priority. + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Decimal32, + TypeIndex::Decimal64, + TypeIndex::Decimal128, + TypeIndex::Decimal256, + TypeIndex::Float32, + TypeIndex::Float64, + + /// Dates and DateTimes. More simple Date types have higher priority. 
+ /// They have lower priority as numbers as some DateTimes sometimes can + /// be also parsed from numbers, but we don't want it usually. + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::DateTime64, + + /// String types have almost the lowest priority, + /// as in text formats almost all data can + /// be deserialized into String type. + TypeIndex::FixedString, + TypeIndex::String, + }; + + std::unordered_map priority_map; + priority_map.reserve(priorities.size()); + for (size_t i = 0; i != priorities.size(); ++i) + priority_map[priorities[i]] = priorities.size() - i; + return priority_map; +} + +/// We want to create more or less optimal order of types in which we will try text deserializations. +/// To do it, for each type we calculate a priority and then sort them by this priority. +/// Above we defined priority of each data type, but types can be nested and also we can have LowCardinality and Nullable. +/// To sort any nested types we create a priority that is a tuple of 3 elements: +/// 1) The maximum depth of nested types like Array/Map/Tuple. +/// 2) The combination of simple and complex types priorities. +/// 3) The depth of nested types LowCardinality/Nullable. +/// So, when we will sort types, first we will sort by the maximum depth of nested types, so more nested types are deserialized first, +/// then for types with the same depth we sort by the types priority, and last we sort by the depth of LowCardinality/Nullable types, +/// so if we have types with the same level of nesting and the same priority, we will first try to deserialize LowCardinality/Nullable types +/// (for example if we have types Array(Array(String)) and Array(Array(Nullable(String))). +/// This is just a batch of heuristics, +std::tuple getTypeTextDeserializePriority(const DataTypePtr & type, size_t nested_depth, size_t simple_nested_depth, std::unordered_map & priority_map) +{ + if (const auto * nullable_type = typeid_cast(type.get())) + return getTypeTextDeserializePriority(nullable_type->getNestedType(), nested_depth, simple_nested_depth + 1, priority_map); + + if (const auto * lc_type = typeid_cast(type.get())) + return getTypeTextDeserializePriority(lc_type->getDictionaryType(), nested_depth, simple_nested_depth + 1, priority_map); + + if (const auto * array_type = typeid_cast(type.get())) + { + auto [elements_nested_depth, elements_priority, elements_simple_nested_depth] = getTypeTextDeserializePriority(array_type->getNestedType(), nested_depth + 1, simple_nested_depth, priority_map); + return {elements_nested_depth, elements_priority + priority_map[TypeIndex::Array], elements_simple_nested_depth}; + } + + if (const auto * tuple_type = typeid_cast(type.get())) + { + size_t max_nested_depth = 0; + size_t sum_priority = 0; + size_t max_simple_nested_depth = 0; + for (const auto & elem : tuple_type->getElements()) + { + auto [elem_nested_depth, elem_priority, elem_simple_nested_depth] = getTypeTextDeserializePriority(elem, nested_depth + 1, simple_nested_depth, priority_map); + sum_priority += elem_priority; + if (elem_nested_depth > max_nested_depth) + max_nested_depth = elem_nested_depth; + if (elem_simple_nested_depth > max_simple_nested_depth) + max_simple_nested_depth = elem_simple_nested_depth; + } + + return {max_nested_depth, sum_priority + priority_map[TypeIndex::Tuple], max_simple_nested_depth}; + } + + if (const auto * map_type = typeid_cast(type.get())) + { + auto [key_max_depth, key_priority, key_simple_nested_depth] = 
getTypeTextDeserializePriority(map_type->getKeyType(), nested_depth + 1, simple_nested_depth, priority_map); + auto [value_max_depth, value_priority, value_simple_nested_depth] = getTypeTextDeserializePriority(map_type->getValueType(), nested_depth + 1, simple_nested_depth, priority_map); + return {std::max(key_max_depth, value_max_depth), key_priority + value_priority + priority_map[TypeIndex::Map], std::max(key_simple_nested_depth, value_simple_nested_depth)}; + } + + if (const auto * variant_type = typeid_cast(type.get())) + { + size_t max_priority = 0; + size_t max_depth = 0; + size_t max_simple_nested_depth = 0; + for (const auto & variant : variant_type->getVariants()) + { + auto [variant_max_depth, variant_priority, variant_simple_nested_depth] = getTypeTextDeserializePriority(variant, nested_depth, simple_nested_depth, priority_map); + if (variant_priority > max_priority) + max_priority = variant_priority; + if (variant_max_depth > max_depth) + max_depth = variant_max_depth; + if (variant_simple_nested_depth > max_simple_nested_depth) + max_simple_nested_depth = variant_simple_nested_depth; + } + + return {max_depth, max_priority, max_simple_nested_depth}; + } + + return {nested_depth, priority_map[type->getTypeId()], simple_nested_depth}; +} + +} + +std::vector SerializationVariant::getVariantsDeserializeTextOrder(const DB::DataTypes & variant_types) +{ + std::vector> priorities; + priorities.reserve(variant_types.size()); + std::vector order; + order.reserve(variant_types.size()); + auto priority_map = getTypesTextDeserializePriorityMap(); + for (size_t i = 0; i != variant_types.size(); ++i) + { + priorities.push_back(getTypeTextDeserializePriority(variant_types[i], 0, 0, priority_map)); + order.push_back(i); + } + + std::sort(order.begin(), order.end(), [&](size_t left, size_t right) { return priorities[left] > priorities[right]; }); + String types_order; + for (auto i : order) + types_order += " " + variant_types[i]->getName(); + return order; +} + + +bool SerializationVariant::tryDeserializeImpl( + IColumn & column, + const String & field, + std::function check_for_null, + std::function try_deserialize_variant) const +{ + auto & column_variant = assert_cast(column); + ReadBufferFromString null_buf(field); + if (check_for_null(null_buf) && null_buf.eof()) + { + column_variant.insertDefault(); + return true; + } + + for (size_t global_discr : deserialize_text_order) + { + ReadBufferFromString variant_buf(field); + /// Usually try_deserialize_variant should not throw an exception, but let's use try/catch just in case. + try + { + auto & variant_column = column_variant.getVariantByGlobalDiscriminator(global_discr); + size_t prev_size = variant_column.size(); + if (try_deserialize_variant(variant_column, variants[global_discr], variant_buf) && variant_buf.eof()) + { + column_variant.getLocalDiscriminators().push_back(column_variant.localDiscriminatorByGlobal(global_discr)); + column_variant.getOffsets().push_back(prev_size); + return true; + } + else if (variant_column.size() > prev_size) + { + variant_column.popBack(1); + } + } + catch (...) + { + /// Try next variant. 
+ } + } + + return false; +} + +void SerializationVariant::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullEscaped(ostr, settings); + else + variants[global_discr]->serializeTextEscaped(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readEscapedString(field, istr); + return tryDeserializeTextEscapedImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readEscapedString(field, istr); + if (!tryDeserializeTextEscapedImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse escaped value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextEscapedImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullEscaped(buf, settings); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextEscaped(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullRaw(ostr, settings); + else + variants[global_discr]->serializeTextRaw(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readString(field, istr); + return tryDeserializeTextRawImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readString(field, istr); + if (!tryDeserializeTextRawImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse raw value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextRawImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullRaw(buf, settings); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextRaw(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextQuoted(const IColumn & column, size_t row_num, 
WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullQuoted(ostr); + else + variants[global_discr]->serializeTextQuoted(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + if (!tryReadQuotedField(field, istr)) + return false; + return tryDeserializeTextQuotedImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readQuotedField(field, istr); + if (!tryDeserializeTextQuotedImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse quoted value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextQuotedImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullQuoted(buf); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextQuoted(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullCSV(ostr, settings); + else + variants[global_discr]->serializeTextCSV(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readCSVStringInto(field, istr, settings.csv); + return tryDeserializeTextCSVImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readCSVField(field, istr, settings.csv); + if (!tryDeserializeTextCSVImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse CSV value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextCSVImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullCSV(buf, settings); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextCSV(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = 
assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullText(ostr, settings); + else + variants[global_discr]->serializeText(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readStringUntilEOF(field, istr); + return tryDeserializeWholeTextImpl(column, field, settings); +} + +void SerializationVariant::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readStringUntilEOF(field, istr); + if (!tryDeserializeWholeTextImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse text value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeWholeTextImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullText(buf); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeWholeText(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullJSON(ostr); + else + variants[global_discr]->serializeTextJSON(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + if (!tryReadJSONField(field, istr)) + return false; + return tryDeserializeTextJSONImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readJSONField(field, istr); + if (!tryDeserializeTextJSONImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextJSONImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullJSON(buf); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextJSON(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + 
SerializationNullable::serializeNullXML(ostr); + else + variants[global_discr]->serializeTextXML(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +} diff --git a/src/DataTypes/Serializations/SerializationVariant.h b/src/DataTypes/Serializations/SerializationVariant.h new file mode 100644 index 00000000000..b6bee94c65f --- /dev/null +++ b/src/DataTypes/Serializations/SerializationVariant.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class SerializationVariant : public ISerialization +{ +public: + using VariantSerializations = std::vector; + + explicit SerializationVariant( + const VariantSerializations & variants_, + const std::vector & variant_names_, + const std::vector & deserialize_text_order_, + const String & variant_name_) + : variants(variants_), variant_names(variant_names_), deserialize_text_order(deserialize_text_order_), variant_name(variant_name_) + { + } + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + 
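Note: all per-format serializers above (escaped, raw, quoted, CSV, plain text, JSON, XML) share one dispatch pattern: look up the row's global discriminator, emit the format's NULL representation for NULL_DISCRIMINATOR, and otherwise delegate to the active variant's serialization at the row's offset inside that variant. A condensed, self-contained model of that dispatch; VariantRowRef and the writer callables are placeholders, not real ClickHouse types:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <ostream>
#include <vector>

constexpr uint8_t NULL_DISCRIMINATOR = 255;

struct VariantRowRef
{
    uint8_t global_discriminator;  // which variant holds this row, or NULL_DISCRIMINATOR
    size_t offset;                 // row index inside that variant's own column
};

using VariantWriter = std::function<void(std::ostream &, size_t /*offset*/)>;

void serializeRow(
    std::ostream & out,
    const VariantRowRef & row,
    const std::vector<VariantWriter> & writers_by_global_discriminator,
    const std::function<void(std::ostream &)> & write_null)
{
    if (row.global_discriminator == NULL_DISCRIMINATOR)
        write_null(out);  // e.g. "\N" for escaped/raw, "NULL" for quoted, "null" for JSON
    else
        writers_by_global_discriminator[row.global_discriminator](out, row.offset);
}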
void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + + /// Determine the order in which we should try to deserialize variants. + /// In some cases the text representation of a value can be deserialized + /// into several types (for example, almost all text values can be deserialized + /// into String type), so we uses some heuristics to determine the more optimal order. + static std::vector getVariantsDeserializeTextOrder(const DataTypes & variant_types); + +private: + void addVariantElementToPath(SubstreamPath & path, size_t i) const; + + bool tryDeserializeTextEscapedImpl(IColumn & column, const String & field, const FormatSettings & settings) const; + bool tryDeserializeTextQuotedImpl(IColumn & column, const String & field, const FormatSettings & settings) const; + bool tryDeserializeWholeTextImpl(IColumn & column, const String & field, const FormatSettings & settings) const; + bool tryDeserializeTextCSVImpl(IColumn & column, const String & field, const FormatSettings & settings) const; + bool tryDeserializeTextJSONImpl(IColumn & column, const String & field, const FormatSettings & settings) const; + bool tryDeserializeTextRawImpl(IColumn & column, const String & field, const FormatSettings & settings) const; + + bool tryDeserializeImpl( + IColumn & column, + const String & field, + std::function check_for_null, + std::function try_deserialize_nested) const; + + VariantSerializations variants; + std::vector variant_names; + std::vector deserialize_text_order; + /// Name of Variant data type for better exception messages. + String variant_name; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp new file mode 100644 index 00000000000..4b24ee5754e --- /dev/null +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -0,0 +1,241 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; +} + +void SerializationVariantElement::enumerateStreams( + DB::ISerialization::EnumerateStreamsSettings & settings, + const DB::ISerialization::StreamCallback & callback, + const DB::ISerialization::SubstreamData & data) const +{ + /// We will need stream for discriminators during deserialization. 
+ settings.path.push_back(Substream::VariantDiscriminators); + callback(settings.path); + settings.path.pop_back(); + + addVariantToPath(settings.path); + settings.path.back().data = data; + nested_serialization->enumerateStreams(settings, callback, data); + removeVariantFromPath(settings.path); +} + +void SerializationVariantElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationVariantElement"); +} + +void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElement"); +} + +void SerializationVariantElement::deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const +{ + addVariantToPath(settings.path); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); + removeVariantFromPath(settings.path); +} + +void SerializationVariantElement::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationVariantElement"); +} + +void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto mutable_column = column->assumeMutable(); + ColumnNullable * nullable_col = typeid_cast(mutable_column.get()); + NullMap * null_map = nullable_col ? &nullable_col->getNullMapData() : nullptr; + + /// First, deserialize discriminators from Variant column. + settings.path.push_back(Substream::VariantDiscriminators); + ColumnPtr discriminators; + if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) + { + discriminators = cached_discriminators; + } + else + { + auto * discriminators_stream = settings.getter(settings.path); + if (!discriminators_stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams"); + + discriminators = ColumnVariant::ColumnDiscriminators::create(); + SerializationNumber().deserializeBinaryBulk(*discriminators->assumeMutable(), *discriminators_stream, limit, 0); + addToSubstreamsCache(cache, settings.path, discriminators); + } + settings.path.pop_back(); + + /// Iterate through discriminators to calculate the size of the variant. + const auto & discriminators_data = assert_cast(*discriminators).getData(); + size_t variant_size = 0; + for (auto discr : discriminators_data) + variant_size += discr == variant_discriminator; + + /// Now we know the size of the variant and can deserialize it. + + /// If the size of variant column is the same as the size of discriminators, + /// we can deserialize new values directly into our column. + if (variant_size == discriminators_data.size()) + { + addVariantToPath(settings.path); + /// Special case when our result column is LowCardinality(Nullable(T)). 
+ /// In this case the variant type is LowCardinality(T), and we cannot just + /// deserialize its values directly into LowCardinality(Nullable(T)) column. + /// We create a separate column with type LowCardinality(T), deserialize + /// values into it and then insert into result column using insertRangeFrom. + if (isColumnLowCardinalityNullable(*column)) + { + ColumnPtr variant_col = mutable_column->cloneEmpty(); + /// LowCardinality(Nullable(T)) -> LowCardinality(T) + assert_cast(*variant_col->assumeMutable()).nestedRemoveNullable(); + nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_col, limit, settings, state, cache); + mutable_column->insertRangeFrom(*variant_col, 0, variant_col->size()); + } + else + { + nested_serialization->deserializeBinaryBulkWithMultipleStreams(nullable_col ? nullable_col->getNestedColumnPtr() : column, limit, settings, state, cache); + } + if (nullable_col) + null_map->resize_fill(null_map->size() + limit, 0); + removeVariantFromPath(settings.path); + return; + } + + /// If variant size is 0, just fill column with default values. + if (variant_size == 0) + { + mutable_column->insertManyDefaults(limit); + return; + } + + /// In general case we should deserialize variant into a separate column, + /// iterate through discriminators and insert values from variant only when + /// row contains its discriminator and default value otherwise. + mutable_column->reserve(mutable_column->size() + limit); + mutable_column = nullable_col ? nullable_col->getNestedColumnPtr()->assumeMutable() : std::move(mutable_column); + ColumnPtr variant_col = mutable_column->cloneEmpty(); + + /// Special case when our result column is LowCardinality(Nullable(T)). + /// We should remove Nullable from variant column before deserialization. 
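Note: the bulk-deserialization path above distinguishes three cases by comparing how many discriminators reference this variant with the total number of rows: all rows (deserialize straight into the result column), none (insert defaults), or a mix (deserialize the compact variant into a temporary column and scatter it while walking the discriminators). A standalone sketch of the general scatter step, using plain std::vector in place of IColumn:

#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

// Expand a compact variant column (values exist only for rows of this variant)
// into a full-size column, guided by the per-row discriminators.
template <typename T>
std::vector<std::optional<T>> scatterVariant(
    const std::vector<uint8_t> & discriminators,
    const std::vector<T> & compact_values,
    uint8_t this_variant)
{
    std::vector<std::optional<T>> result;
    result.reserve(discriminators.size());

    size_t next = 0;  // index of the next unread value in the compact column
    for (uint8_t discr : discriminators)
    {
        if (discr == this_variant)
            result.emplace_back(compact_values[next++]);  // row belongs to this variant
        else
            result.emplace_back(std::nullopt);            // other variant or NULL -> default/NULL
    }
    return result;
}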
+ if (isColumnLowCardinalityNullable(*column)) + assert_cast(*variant_col->assumeMutable()).nestedRemoveNullable(); + + addVariantToPath(settings.path); + nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_col, variant_size, settings, state, cache); + removeVariantFromPath(settings.path); + + size_t variant_index = 0; + for (auto discr : discriminators_data) + { + if (discr == variant_discriminator) + { + if (null_map) + null_map->push_back(0); + mutable_column->insertFrom(*variant_col, variant_index++); + } + else + { + if (null_map) + null_map->push_back(1); + mutable_column->insertDefault(); + } + } +} + +void SerializationVariantElement::addVariantToPath(DB::ISerialization::SubstreamPath & path) const +{ + path.push_back(Substream::VariantElements); + path.push_back(Substream::VariantElement); + path.back().variant_element_name = variant_element_name; +} + +void SerializationVariantElement::removeVariantFromPath(DB::ISerialization::SubstreamPath & path) const +{ + path.pop_back(); + path.pop_back(); +} + +SerializationVariantElement::VariantSubcolumnCreator::VariantSubcolumnCreator( + const DB::ColumnPtr & local_discriminators_, + const DB::String & variant_element_name_, + const ColumnVariant::Discriminator global_variant_discriminator_, + const ColumnVariant::Discriminator local_variant_discriminator_) + : local_discriminators(local_discriminators_) + , variant_element_name(variant_element_name_) + , global_variant_discriminator(global_variant_discriminator_) + , local_variant_discriminator(local_variant_discriminator_) +{ +} + +DataTypePtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::DataTypePtr & prev) const +{ + return makeNullableOrLowCardinalityNullableSafe(prev); +} + +SerializationPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::SerializationPtr & prev) const +{ + return std::make_shared(prev, variant_element_name, global_variant_discriminator); +} + +ColumnPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::ColumnPtr & prev) const +{ + /// Case when original Variant column contained only one non-empty variant and no NULLs. + /// In this case just use this variant. + if (prev->size() == local_discriminators->size()) + return makeNullableOrLowCardinalityNullableSafe(prev); + + /// If this variant is empty, fill result column with default values. + if (prev->empty()) + { + auto res = IColumn::mutate(makeNullableOrLowCardinalityNullableSafe(prev)); + res->insertManyDefaults(local_discriminators->size()); + return res; + } + + /// In general case we should iterate through discriminators and create null-map for our variant. + NullMap null_map; + null_map.reserve(local_discriminators->size()); + const auto & local_discriminators_data = assert_cast(*local_discriminators).getData(); + for (auto local_discr : local_discriminators_data) + null_map.push_back(local_discr != local_variant_discriminator); + + /// Now we can create new column from null-map and variant column using IColumn::expand. + auto res_column = IColumn::mutate(prev); + + /// Special case for LowCardinality. We want the result to be LowCardinality(Nullable), + /// but we don't have a good way to apply null-mask for LowCardinality(), so, we first + /// convert our column to LowCardinality(Nullable()) and then use expand which will + /// fill rows with 0 in mask with default value (that is NULL). 
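Note: the subcolumn creator turns the compact variant column into a full-size Nullable column. It derives a null map from the local discriminators (1 means "row does not belong to this variant") and then, via the expand call just below, spreads the stored values so that every foreign row becomes a default value masked as NULL. A small model of that expand step, with keep_mask as the inverse of the null map (1 means "this row has a real value"):

#include <cstddef>
#include <cstdint>
#include <vector>

// Rows where keep_mask[i] == 0 get a default value; rows where keep_mask[i] == 1
// consume the next value from the compact column, in order.
std::vector<int64_t> expandByMask(const std::vector<int64_t> & compact, const std::vector<uint8_t> & keep_mask)
{
    std::vector<int64_t> result(keep_mask.size(), 0);  // 0 plays the role of the default value
    size_t next = 0;
    for (size_t i = 0; i != keep_mask.size(); ++i)
        if (keep_mask[i])
            result[i] = compact[next++];
    return result;
}

The null map itself is then reused as the Nullable mask, so the defaulted rows read back as NULL.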
+ if (prev->lowCardinality()) + res_column = assert_cast(*res_column).cloneNullable(); + + res_column->expand(null_map, /*inverted = */ true); + + if (res_column->canBeInsideNullable()) + { + auto null_map_col = ColumnUInt8::create(); + null_map_col->getData() = std::move(null_map); + return ColumnNullable::create(std::move(res_column), std::move(null_map_col)); + } + + return res_column; +} + +} diff --git a/src/DataTypes/Serializations/SerializationVariantElement.h b/src/DataTypes/Serializations/SerializationVariantElement.h new file mode 100644 index 00000000000..c343c219cf3 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationVariantElement.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +class SerializationVariant; + +/// Serialization for Variant element when we read it as a subcolumn. +class SerializationVariantElement final : public SerializationWrapper +{ +private: + /// To be able to deserialize Variant element as a subcolumn + /// we need its type name and global discriminator. + String variant_element_name; + ColumnVariant::Discriminator variant_discriminator; + +public: + SerializationVariantElement(const SerializationPtr & nested_, const String & variant_element_name_, ColumnVariant::Discriminator variant_discriminator_) + : SerializationWrapper(nested_) + , variant_element_name(variant_element_name_) + , variant_discriminator(variant_discriminator_) + { + } + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + +private: + friend SerializationVariant; + + void addVariantToPath(SubstreamPath & path) const; + void removeVariantFromPath(SubstreamPath & path) const; + + struct VariantSubcolumnCreator : public ISubcolumnCreator + { + const ColumnPtr local_discriminators; + const String variant_element_name; + const ColumnVariant::Discriminator global_variant_discriminator; + const ColumnVariant::Discriminator local_variant_discriminator; + + VariantSubcolumnCreator( + const ColumnPtr & local_discriminators_, + const String & variant_element_name_, + const ColumnVariant::Discriminator global_variant_discriminator_, + const ColumnVariant::Discriminator local_variant_discriminator_); + + DataTypePtr create(const DataTypePtr & prev) const override; + ColumnPtr create(const ColumnPtr & prev) const override; + SerializationPtr create(const SerializationPtr & prev) const override; + }; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationWrapper.cpp b/src/DataTypes/Serializations/SerializationWrapper.cpp index 18e4891ee65..bde52bb8096 100644 --- 
a/src/DataTypes/Serializations/SerializationWrapper.cpp +++ b/src/DataTypes/Serializations/SerializationWrapper.cpp @@ -96,6 +96,11 @@ void SerializationWrapper::deserializeTextEscaped(IColumn & column, ReadBuffer & nested_serialization->deserializeTextEscaped(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextEscaped(column, istr, settings); +} + void SerializationWrapper::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeTextQuoted(column, row_num, ostr, settings); @@ -106,6 +111,11 @@ void SerializationWrapper::deserializeTextQuoted(IColumn & column, ReadBuffer & nested_serialization->deserializeTextQuoted(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextQuoted(column, istr, settings); +} + void SerializationWrapper::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeTextCSV(column, row_num, ostr, settings); @@ -116,6 +126,11 @@ void SerializationWrapper::deserializeTextCSV(IColumn & column, ReadBuffer & ist nested_serialization->deserializeTextCSV(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextCSV(column, istr, settings); +} + void SerializationWrapper::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeText(column, row_num, ostr, settings); @@ -126,6 +141,11 @@ void SerializationWrapper::deserializeWholeText(IColumn & column, ReadBuffer & i nested_serialization->deserializeWholeText(column, istr, settings); } +bool SerializationWrapper::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeWholeText(column, istr, settings); +} + void SerializationWrapper::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeTextJSON(column, row_num, ostr, settings); @@ -136,6 +156,11 @@ void SerializationWrapper::deserializeTextJSON(IColumn & column, ReadBuffer & is nested_serialization->deserializeTextJSON(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextJSON(column, istr, settings); +} + void SerializationWrapper::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const { nested_serialization->serializeTextJSONPretty(column, row_num, ostr, settings, indent); diff --git a/src/DataTypes/Serializations/SerializationWrapper.h b/src/DataTypes/Serializations/SerializationWrapper.h index 31900f93148..6c5e2046062 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.h +++ b/src/DataTypes/Serializations/SerializationWrapper.h @@ -63,18 +63,23 @@ public: void serializeTextEscaped(const IColumn & column, size_t row_num, 
WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; diff --git a/src/DataTypes/Serializations/SimpleTextSerialization.h b/src/DataTypes/Serializations/SimpleTextSerialization.h index 0247f30b30a..11f56de73d1 100644 --- a/src/DataTypes/Serializations/SimpleTextSerialization.h +++ b/src/DataTypes/Serializations/SimpleTextSerialization.h @@ -36,29 +36,67 @@ protected: deserializeText(column, istr, settings, true); } + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, true); + } + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, 
const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + /// whole = true means that buffer contains only one value, so we should read until EOF. /// It's needed to check if there is garbage after parsed field. virtual void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const = 0; + + virtual bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const + { + try + { + deserializeText(column, istr, settings, whole); + return true; + } + catch (...) + { + return false; + } + } }; } diff --git a/src/DataTypes/Utils.cpp b/src/DataTypes/Utils.cpp index e58331a8bcb..2f29d57d454 100644 --- a/src/DataTypes/Utils.cpp +++ b/src/DataTypes/Utils.cpp @@ -223,6 +223,7 @@ bool canBeSafelyCasted(const DataTypePtr & from_type, const DataTypePtr & to_typ case TypeIndex::AggregateFunction: case TypeIndex::Nothing: case TypeIndex::JSONPaths: + case TypeIndex::Variant: return false; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 36dd858dcf7..76f6dc25aae 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -907,6 +907,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep query_context->setSetting("allow_experimental_nlp_functions", 1); query_context->setSetting("allow_experimental_hash_functions", 1); query_context->setSetting("allow_experimental_object_type", 1); + query_context->setSetting("allow_experimental_variant_type", 1); query_context->setSetting("allow_experimental_annoy_index", 1); query_context->setSetting("allow_experimental_usearch_index", 1); query_context->setSetting("allow_experimental_bigint_types", 1); diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 9cc7cb3b89e..a2528f9f948 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -109,31 +109,31 @@ bool deserializeFieldByEscapingRule( { case FormatSettings::EscapingRule::Escaped: if (parse_as_nullable) - read = SerializationNullable::deserializeTextEscapedImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(column, buf, format_settings, serialization); else serialization->deserializeTextEscaped(column, buf, format_settings); break; case FormatSettings::EscapingRule::Quoted: if (parse_as_nullable) - read = SerializationNullable::deserializeTextQuotedImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(column, buf, format_settings, serialization); else serialization->deserializeTextQuoted(column, buf, format_settings); break; case FormatSettings::EscapingRule::CSV: if (parse_as_nullable) - read = SerializationNullable::deserializeTextCSVImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(column, buf, format_settings, serialization); else serialization->deserializeTextCSV(column, buf, format_settings); break; case FormatSettings::EscapingRule::JSON: if (parse_as_nullable) - read = SerializationNullable::deserializeTextJSONImpl(column, buf, format_settings, serialization); + read = 
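Note: the default tryDeserializeText above derives a non-throwing "try" entry point from the existing throwing deserializeText by catching all exceptions; serializations that can do better override it. The same wrapper pattern in isolation (parse function and types are illustrative):

#include <cstddef>
#include <stdexcept>
#include <string>

// Throwing API: the "old" entry point that reports failure via exceptions.
int parseOrThrow(const std::string & s)
{
    size_t pos = 0;
    int value = std::stoi(s, &pos);          // throws std::invalid_argument / std::out_of_range
    if (pos != s.size())
        throw std::runtime_error("garbage after the number: " + s);
    return value;
}

// Non-throwing wrapper: default implementation of the "try" API on top of the throwing one.
bool tryParse(const std::string & s, int & out)
{
    try
    {
        out = parseOrThrow(s);
        return true;
    }
    catch (...)   // any parsing error just means "this value does not fit this type"
    {
        return false;
    }
}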
SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column, buf, format_settings, serialization); else serialization->deserializeTextJSON(column, buf, format_settings); break; case FormatSettings::EscapingRule::Raw: if (parse_as_nullable) - read = SerializationNullable::deserializeTextRawImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextRaw(column, buf, format_settings, serialization); else serialization->deserializeTextRaw(column, buf, format_settings); break; diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 7ddfdb6b572..4e7795f61bd 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -288,14 +288,14 @@ namespace JSONUtils ReadBufferFromString buf(str); if (as_nullable) - return SerializationNullable::deserializeWholeTextImpl(column, buf, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedWholeText(column, buf, format_settings, serialization); serialization->deserializeWholeText(column, buf, format_settings); return true; } if (as_nullable) - return SerializationNullable::deserializeTextJSONImpl(column, in, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column, in, format_settings, serialization); serialization->deserializeTextJSON(column, in, format_settings); return true; diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index e2ba188d015..6890e412f75 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -946,7 +946,7 @@ namespace if constexpr (is_json) ok = tryReadJSONStringInto(field, buf); else - ok = tryReadQuotedStringInto(field, buf); + ok = tryReadQuotedString(field, buf); if (!ok) return nullptr; diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index bef1e7b420a..f9f61ceed0d 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -4067,6 +4069,259 @@ arguments, result_type, input_rows_count); \ "Cast to Object can be performed only from flatten named Tuple, Map or String. Got: {}", from_type->getName()); } + WrapperType createVariantToVariantWrapper(const DataTypeVariant & from_variant, const DataTypeVariant & to_variant) const + { + /// We support only extension of variant type, so, only new types can be added. + /// For example: Variant(T1, T2) -> Variant(T1, T2, T3) is supported, but Variant(T1, T2) -> Variant(T1, T3) is not supported. + /// We want to extend Variant type for free without rewriting the data, but we sort data types inside Variant during type creation + /// (we do it because we want Variant(T1, T2) to be the same as Variant(T2, T1)), but after extension the order of variant types + /// (and so their discriminators) can be different. For example: Variant(T1, T3) -> Variant(T1, T2, T3). + /// To avoid full rewrite of discriminators column, ColumnVariant supports it's local order of variant columns (and so local + /// discriminators) and stores mapping global order -> local order. + /// So, to extend Variant with new types for free, we should keep old local order for old variants, append new variants and change + /// mapping global order -> local order according to the new global order. 
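Note: the comment above describes how a Variant can be extended without rewriting data: keep the existing local order of variant columns, append empty columns for the new types, and only remap the "local -> global" discriminator table to the new global order. A self-contained sketch of that remapping, with type names standing in for DataTypePtr:

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

// Given old and new variant type lists (each in its own global order), compute the new
// local->global mapping for a column that keeps its old local layout.
std::vector<uint8_t> remapLocalToGlobal(
    const std::vector<std::string> & old_types,          // old global order
    const std::vector<std::string> & new_types,          // new (extended) global order
    const std::vector<uint8_t> & old_local_to_global)    // column's current mapping
{
    std::unordered_map<std::string, uint8_t> new_global;
    for (size_t i = 0; i != new_types.size(); ++i)
        new_global[new_types[i]] = static_cast<uint8_t>(i);

    std::vector<uint8_t> result;
    result.reserve(old_local_to_global.size());
    for (uint8_t old_global : old_local_to_global)
    {
        auto it = new_global.find(old_types[old_global]);
        if (it == new_global.end())
            throw std::runtime_error("new Variant must be an extension of the old one");
        result.push_back(it->second);   // same local position, new global discriminator
    }
    return result;
}

The new variant types are then appended as empty columns after the remap; the discriminators and offsets columns themselves stay untouched.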
+ + /// Create map (new variant type) -> (it's global discriminator in new order). + const auto & new_variants = to_variant.getVariants(); + std::unordered_map new_variant_types_to_new_global_discriminator; + new_variant_types_to_new_global_discriminator.reserve(new_variants.size()); + for (size_t i = 0; i != new_variants.size(); ++i) + new_variant_types_to_new_global_discriminator[new_variants[i]->getName()] = i; + + /// Create set of old variant types. + const auto & old_variants = from_variant.getVariants(); + std::unordered_map old_variant_types_to_old_global_discriminator; + old_variant_types_to_old_global_discriminator.reserve(old_variants.size()); + for (size_t i = 0; i != old_variants.size(); ++i) + old_variant_types_to_old_global_discriminator[old_variants[i]->getName()] = i; + + /// Check that the set of old variants types is a subset of new variant types and collect new global discriminator for each old global discriminator. + std::unordered_map old_global_discriminator_to_new; + old_global_discriminator_to_new.reserve(old_variants.size()); + for (const auto & [old_variant_type, old_discriminator] : old_variant_types_to_old_global_discriminator) + { + auto it = new_variant_types_to_new_global_discriminator.find(old_variant_type); + if (it == new_variant_types_to_new_global_discriminator.end()) + throw Exception( + ErrorCodes::CANNOT_CONVERT_TYPE, + "Cannot convert type {} to {}. Conversion between Variant types is allowed only when new Variant type is an extension " + "of an initial one", from_variant.getName(), to_variant.getName()); + old_global_discriminator_to_new[old_discriminator] = it->second; + } + + /// Collect variant types and their global discriminators that should be added to the old Variant to get the new Variant. + std::vector> variant_types_and_discriminators_to_add; + variant_types_and_discriminators_to_add.reserve(new_variants.size() - old_variants.size()); + for (size_t i = 0; i != new_variants.size(); ++i) + { + if (!old_variant_types_to_old_global_discriminator.contains(new_variants[i]->getName())) + variant_types_and_discriminators_to_add.emplace_back(new_variants[i], i); + } + + return [old_global_discriminator_to_new, variant_types_and_discriminators_to_add] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & column_variant = assert_cast(*arguments.front().column.get()); + size_t num_old_variants = column_variant.getNumVariants(); + Columns new_variant_columns; + new_variant_columns.reserve(num_old_variants + variant_types_and_discriminators_to_add.size()); + std::vector new_local_to_global_discriminators; + new_local_to_global_discriminators.reserve(num_old_variants + variant_types_and_discriminators_to_add.size()); + for (size_t i = 0; i != num_old_variants; ++i) + { + new_variant_columns.push_back(column_variant.getVariantPtrByLocalDiscriminator(i)); + new_local_to_global_discriminators.push_back(old_global_discriminator_to_new.at(column_variant.globalDiscriminatorByLocal(i))); + } + + for (const auto & [new_variant_type, new_global_discriminator] : variant_types_and_discriminators_to_add) + { + new_variant_columns.push_back(new_variant_type->createColumn()); + new_local_to_global_discriminators.push_back(new_global_discriminator); + } + + return ColumnVariant::create(column_variant.getLocalDiscriminatorsPtr(), column_variant.getOffsetsPtr(), new_variant_columns, new_local_to_global_discriminators); + }; + } + + WrapperType createVariantToColumnWrapper(const DataTypeVariant & 
from_variant, const DataTypePtr & to_type) const + { + const auto & variant_types = from_variant.getVariants(); + std::vector variant_wrappers; + variant_wrappers.reserve(variant_types.size()); + + /// Create conversion wrapper for each variant. + for (const auto & variant_type : variant_types) + variant_wrappers.push_back(prepareUnpackDictionaries(variant_type, to_type)); + + return [variant_wrappers, variant_types, to_type] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + const auto & column_variant = assert_cast(*arguments.front().column.get()); + + /// First, cast each variant to the result type. + std::vector casted_variant_columns; + casted_variant_columns.reserve(variant_types.size()); + for (size_t i = 0; i != variant_types.size(); ++i) + { + auto variant_col = column_variant.getVariantPtrByLocalDiscriminator(i); + ColumnsWithTypeAndName variant = {{variant_col, variant_types[i], "" }}; + const auto & variant_wrapper = variant_wrappers[column_variant.globalDiscriminatorByLocal(i)]; + casted_variant_columns.push_back(variant_wrapper(variant, result_type, nullptr, variant_col->size())); + } + + /// Second, construct resulting column from casted variant columns according to discriminators. + const auto & local_discriminators = column_variant.getLocalDiscriminators(); + auto res = result_type->createColumn(); + res->reserve(input_rows_count); + for (size_t i = 0; i != input_rows_count; ++i) + { + auto local_discr = local_discriminators[i]; + if (local_discr == ColumnVariant::NULL_DISCRIMINATOR) + res->insertDefault(); + else + res->insertFrom(*casted_variant_columns[local_discr], column_variant.offsetAt(i)); + } + + return res; + }; + } + + static ColumnPtr createVariantFromDescriptorsAndOneNonEmptyVariant(const DataTypes & variant_types, const ColumnPtr & discriminators, const ColumnPtr & variant, ColumnVariant::Discriminator variant_discr) + { + Columns variants; + variants.reserve(variant_types.size()); + for (size_t i = 0; i != variant_types.size(); ++i) + { + if (i == variant_discr) + variants.emplace_back(variant); + else + variants.push_back(variant_types[i]->createColumn()); + } + + return ColumnVariant::create(discriminators, variants); + } + + WrapperType createColumnToVariantWrapper(const DataTypePtr & from_type, const DataTypeVariant & to_variant) const + { + /// We allow converting NULL to Variant(...) as Variant can store NULLs. + if (from_type->onlyNull()) + { + return [](ColumnsWithTypeAndName &, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + auto result_column = result_type->createColumn(); + result_column->insertManyDefaults(input_rows_count); + return result_column; + }; + } + + auto variant_discr_opt = to_variant.tryGetVariantDiscriminator(removeNullableOrLowCardinalityNullable(from_type)); + if (!variant_discr_opt) + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert type {} to {}. 
Conversion to Variant allowed only for types from this Variant", from_type->getName(), to_variant.getName()); + + return [variant_discr = *variant_discr_opt] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & result_variant_type = assert_cast(*result_type); + const auto & variant_types = result_variant_type.getVariants(); + if (const ColumnNullable * col_nullable = typeid_cast(arguments.front().column.get())) + { + const auto & column = col_nullable->getNestedColumnPtr(); + const auto & null_map = col_nullable->getNullMapData(); + IColumn::Filter filter; + filter.reserve(column->size()); + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + auto & discriminators_data = discriminators->getData(); + discriminators_data.reserve(column->size()); + size_t variant_size_hint = 0; + for (size_t i = 0; i != column->size(); ++i) + { + if (null_map[i]) + { + discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); + filter.push_back(0); + } + else + { + discriminators_data.push_back(variant_discr); + filter.push_back(1); + ++variant_size_hint; + } + } + + ColumnPtr variant_column; + /// If there were no NULLs, just use the column. + if (variant_size_hint == column->size()) + variant_column = column; + /// Otherwise we should use filtered column. + else + variant_column = column->filter(filter, variant_size_hint); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), variant_column, variant_discr); + } + else if (isColumnLowCardinalityNullable(*arguments.front().column)) + { + const auto & column = arguments.front().column; + + /// Variant column cannot have LowCardinality(Nullable(...)) variant, as Variant column stores NULLs itself. + /// We should create a null-map, insert NULL_DISCRIMINATOR on NULL values and filter initial column. + const auto & col_lc = assert_cast(*column); + const auto & indexes = col_lc.getIndexes(); + auto null_index = col_lc.getDictionary().getNullValueIndex(); + IColumn::Filter filter; + filter.reserve(col_lc.size()); + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + auto & discriminators_data = discriminators->getData(); + discriminators_data.reserve(col_lc.size()); + size_t variant_size_hint = 0; + for (size_t i = 0; i != col_lc.size(); ++i) + { + if (indexes.getUInt(i) == null_index) + { + discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); + filter.push_back(0); + } + else + { + discriminators_data.push_back(variant_discr); + filter.push_back(1); + ++variant_size_hint; + } + } + + MutableColumnPtr variant_column; + /// If there were no NULLs, we can just clone the column. + if (variant_size_hint == col_lc.size()) + variant_column = IColumn::mutate(column); + /// Otherwise we should filter column. 
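Note: converting a Nullable column into a Variant, as above, means splitting it into two parallel structures: a discriminators column (NULL_DISCRIMINATOR for NULL rows, the target variant's discriminator otherwise) and a compacted variant column that keeps only the non-NULL values. A simplified model over std::optional values:

#include <cstdint>
#include <optional>
#include <vector>

constexpr uint8_t NULL_DISCRIMINATOR = 255;

struct SplitResult
{
    std::vector<uint8_t> discriminators;  // one entry per input row
    std::vector<int64_t> variant_values;  // only the non-NULL rows, in order
};

SplitResult splitNullableIntoVariant(const std::vector<std::optional<int64_t>> & input, uint8_t variant_discr)
{
    SplitResult res;
    res.discriminators.reserve(input.size());
    for (const auto & value : input)
    {
        if (!value)
        {
            res.discriminators.push_back(NULL_DISCRIMINATOR);   // row is NULL, nothing stored in the variant
        }
        else
        {
            res.discriminators.push_back(variant_discr);
            res.variant_values.push_back(*value);               // the "filter": keep only real values
        }
    }
    return res;
}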
+ else + variant_column = column->filter(filter, variant_size_hint)->assumeMutable(); + + assert_cast(*variant_column).nestedRemoveNullable(); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), std::move(variant_column), variant_discr); + } + else + { + const auto & column = arguments.front().column; + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + discriminators->getData().resize_fill(column->size(), variant_discr); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), column, variant_discr); + } + }; + } + + /// Wrapper for conversion to/from Variant type + WrapperType createVariantWrapper(const DataTypePtr & from_type, const DataTypePtr & to_type) const + { + if (const auto * from_variant = checkAndGetDataType(from_type.get())) + { + if (const auto * to_variant = checkAndGetDataType(to_type.get())) + return createVariantToVariantWrapper(*from_variant, *to_variant); + + return createVariantToColumnWrapper(*from_variant, to_type); + } + + return createColumnToVariantWrapper(from_type, assert_cast(*to_type)); + } + template WrapperType createEnumWrapper(const DataTypePtr & from_type, const DataTypeEnum * to_type) const { @@ -4246,6 +4501,11 @@ arguments, result_type, input_rows_count); \ WrapperType prepareUnpackDictionaries(const DataTypePtr & from_type, const DataTypePtr & to_type) const { + /// Conversion from/to Variant data type is processed in a special way. + /// We don't need to remove LowCardinality/Nullable. + if (isVariant(to_type) || isVariant(from_type)) + return createVariantWrapper(from_type, to_type); + const auto * from_low_cardinality = typeid_cast(from_type.get()); const auto * to_low_cardinality = typeid_cast(to_type.get()); const auto & from_nested = from_low_cardinality ? 
from_low_cardinality->getDictionaryType() : from_type; @@ -4253,7 +4513,7 @@ arguments, result_type, input_rows_count); \ if (from_type->onlyNull()) { - if (!to_nested->isNullable()) + if (!to_nested->isNullable() && !isVariant(to_type)) { if (cast_type == CastType::accurateOrNull) { diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index eba1733c683..b15bc5938be 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +24,8 @@ #include #include #include +#include + #include @@ -215,9 +219,16 @@ class FunctionIf : public FunctionIfBase { public: static constexpr auto name = "if"; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static FunctionPtr create(ContextPtr context) + { + return std::make_shared(context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_when_no_common_type_in_if); + } + + FunctionIf(bool use_variant_when_no_common_type_ = false) : FunctionIfBase(), use_variant_when_no_common_type(use_variant_when_no_common_type_) {} private: + bool use_variant_when_no_common_type = false; + template static UInt32 decimalScale(const ColumnsWithTypeAndName & arguments [[maybe_unused]]) { @@ -626,13 +637,23 @@ private: } static ColumnPtr executeGeneric( - const ColumnUInt8 * cond_col, const ColumnsWithTypeAndName & arguments, size_t input_rows_count) + const ColumnUInt8 * cond_col, const ColumnsWithTypeAndName & arguments, size_t input_rows_count, bool use_variant_when_no_common_type) { /// Convert both columns to the common type (if needed). const ColumnWithTypeAndName & arg1 = arguments[1]; const ColumnWithTypeAndName & arg2 = arguments[2]; - DataTypePtr common_type = getLeastSupertype(DataTypes{arg1.type, arg2.type}); + DataTypePtr common_type; + if (use_variant_when_no_common_type) + { + common_type = tryGetLeastSupertype(DataTypes{arg1.type, arg2.type}); + if (!common_type) + common_type = std::make_shared(DataTypes{removeNullableOrLowCardinalityNullable(arg1.type), removeNullableOrLowCardinalityNullable(arg2.type)}); + } + else + { + common_type = getLeastSupertype(DataTypes{arg1.type, arg2.type}); + } ColumnPtr col_then = castColumn(arg1, common_type); ColumnPtr col_else = castColumn(arg2, common_type); @@ -807,6 +828,10 @@ private: ColumnPtr executeForNullableThenElse(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const { + /// If result type is Variant, we don't need to remove Nullable. 
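Note: with allow_experimental_variant_type and use_variant_when_no_common_type_in_if enabled, if() above no longer fails when the branches have no common type: it first tries the least supertype and otherwise builds a Variant of the (de-Nullable-ized) branch types. A toy model of that decision, not the real IDataType machinery:

#include <memory>
#include <stdexcept>
#include <string>

// Toy type model: a "supertype" exists only when both branches have the same type name.
struct Type { std::string name; };
using TypePtr = std::shared_ptr<Type>;

TypePtr tryCommonSupertype(const TypePtr & a, const TypePtr & b)
{
    return a->name == b->name ? a : nullptr;   // the real logic lives in tryGetLeastSupertype()
}

// Prefer a common supertype; otherwise wrap both branch types into a Variant instead of throwing.
TypePtr ifReturnType(const TypePtr & then_type, const TypePtr & else_type, bool use_variant_when_no_common_type)
{
    if (auto common = tryCommonSupertype(then_type, else_type))
        return common;
    if (use_variant_when_no_common_type)
        return std::make_shared<Type>(Type{"Variant(" + then_type->name + ", " + else_type->name + ")"});
    throw std::runtime_error("no common type for if() branches");
}

Under these settings a query such as if(cond, 1, 'str') would therefore return a Variant(String, UInt8) value rather than an error.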
+ if (isVariant(result_type)) + return nullptr; + const ColumnWithTypeAndName & arg_cond = arguments[0]; const ColumnWithTypeAndName & arg_then = arguments[1]; const ColumnWithTypeAndName & arg_else = arguments[2]; @@ -912,6 +937,11 @@ private: assert_cast(*result_column).applyNullMap(assert_cast(*arg_cond.column)); return result_column; } + else if (auto * variant_column = typeid_cast(result_column.get())) + { + variant_column->applyNullMap(assert_cast(*arg_cond.column).getData()); + return result_column; + } else return ColumnNullable::create(materializeColumnIfConst(result_column), arg_cond.column); } @@ -950,6 +980,11 @@ private: assert_cast(*result_column).applyNegatedNullMap(assert_cast(*arg_cond.column)); return result_column; } + else if (auto * variant_column = typeid_cast(result_column.get())) + { + variant_column->applyNegatedNullMap(assert_cast(*arg_cond.column).getData()); + return result_column; + } else { size_t size = input_rows_count; @@ -1039,6 +1074,13 @@ public: throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument (condition) of function if. " "Must be UInt8.", arguments[0]->getName()); + if (use_variant_when_no_common_type) + { + if (auto res = tryGetLeastSupertype(DataTypes{arguments[1], arguments[2]})) + return res; + return std::make_shared(DataTypes{removeNullableOrLowCardinalityNullable(arguments[1]), removeNullableOrLowCardinalityNullable(arguments[2])}); + } + return getLeastSupertype(DataTypes{arguments[1], arguments[2]}); } @@ -1122,7 +1164,7 @@ public: || (res = executeGenericArray(cond_col, arguments, result_type)) || (res = executeTuple(arguments, result_type, input_rows_count)))) { - return executeGeneric(cond_col, arguments, input_rows_count); + return executeGeneric(cond_col, arguments, input_rows_count, use_variant_when_no_common_type); } return res; diff --git a/src/Functions/isNotNull.cpp b/src/Functions/isNotNull.cpp index cbdc08c2fab..360c2fc7f9f 100644 --- a/src/Functions/isNotNull.cpp +++ b/src/Functions/isNotNull.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -45,6 +46,18 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { const ColumnWithTypeAndName & elem = arguments[0]; + + if (isVariant(elem.type)) + { + const auto & discriminators = checkAndGetColumn(*elem.column)->getLocalDiscriminators(); + auto res = DataTypeUInt8().createColumn(); + auto & data = typeid_cast(*res).getData(); + data.reserve(discriminators.size()); + for (auto discr : discriminators) + data.push_back(discr != ColumnVariant::NULL_DISCRIMINATOR); + return res; + } + if (elem.type->isLowCardinalityNullable()) { const auto * low_cardinality_column = checkAndGetColumn(*elem.column); diff --git a/src/Functions/isNull.cpp b/src/Functions/isNull.cpp index cdce037088d..4bf4e44f866 100644 --- a/src/Functions/isNull.cpp +++ b/src/Functions/isNull.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -44,6 +45,18 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override { const ColumnWithTypeAndName & elem = arguments[0]; + + if (isVariant(elem.type)) + { + const auto & discriminators = checkAndGetColumn(*elem.column)->getLocalDiscriminators(); + auto res = DataTypeUInt8().createColumn(); + auto & data = typeid_cast(*res).getData(); + data.reserve(discriminators.size()); + for (auto discr : discriminators) + data.push_back(discr == 
ColumnVariant::NULL_DISCRIMINATOR); + return res; + } + if (elem.type->isLowCardinalityNullable()) { const auto * low_cardinality_column = checkAndGetColumn(*elem.column); diff --git a/src/Functions/multiIf.cpp b/src/Functions/multiIf.cpp index cdb9ca061c3..7a2e9444b2c 100644 --- a/src/Functions/multiIf.cpp +++ b/src/Functions/multiIf.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -117,6 +118,15 @@ public: types_of_branches.emplace_back(arg); }); + if (context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_when_no_common_type_in_if) + { + if (auto res = tryGetLeastSupertype(types_of_branches)) + return res; + for (auto & type : types_of_branches) + type = removeNullableOrLowCardinalityNullable(type); + return std::make_shared(types_of_branches); + } + return getLeastSupertype(types_of_branches); } diff --git a/src/Functions/variantElement.cpp b/src/Functions/variantElement.cpp new file mode 100644 index 00000000000..7c63e1266e6 --- /dev/null +++ b/src/Functions/variantElement.cpp @@ -0,0 +1,238 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +namespace +{ + +/** Extract element of Variant by variant type name. + * Also the function looks through Arrays: you can get Array of Variant elements from Array of Variants. + */ +class FunctionVariantElement : public IFunction +{ +public: + static constexpr auto name = "variantElement"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + String getName() const override { return name; } + bool isVariadic() const override { return true; } + size_t getNumberOfArguments() const override { return 0; } + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + const size_t number_of_arguments = arguments.size(); + + if (number_of_arguments < 2 || number_of_arguments > 3) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 2 or 3", + getName(), number_of_arguments); + + size_t count_arrays = 0; + const IDataType * input_type = arguments[0].type.get(); + while (const DataTypeArray * array = checkAndGetDataType(input_type)) + { + input_type = array->getNestedType().get(); + ++count_arrays; + } + + const DataTypeVariant * variant_type = checkAndGetDataType(input_type); + if (!variant_type) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Variant or Array of Variant. 
Actual {}", + getName(), + arguments[0].type->getName()); + + std::optional variant_global_discr = getVariantGlobalDiscriminator(arguments[1].column, *variant_type, number_of_arguments); + if (variant_global_discr.has_value()) + { + DataTypePtr return_type = makeNullableOrLowCardinalityNullableSafe(variant_type->getVariant(variant_global_discr.value())); + + for (; count_arrays; --count_arrays) + return_type = std::make_shared(return_type); + + return return_type; + } + else + return arguments[2].type; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto & input_arg = arguments[0]; + const IDataType * input_type = input_arg.type.get(); + const IColumn * input_col = input_arg.column.get(); + + bool input_arg_is_const = false; + if (typeid_cast(input_col)) + { + input_col = assert_cast(input_col)->getDataColumnPtr().get(); + input_arg_is_const = true; + } + + Columns array_offsets; + while (const DataTypeArray * array_type = checkAndGetDataType(input_type)) + { + const ColumnArray * array_col = assert_cast(input_col); + + input_type = array_type->getNestedType().get(); + input_col = &array_col->getData(); + array_offsets.push_back(array_col->getOffsetsPtr()); + } + + const DataTypeVariant * input_type_as_variant = checkAndGetDataType(input_type); + const ColumnVariant * input_col_as_variant = checkAndGetColumn(input_col); + if (!input_type_as_variant || !input_col_as_variant) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Variant or array of Variants. Actual {}", getName(), input_arg.type->getName()); + + std::optional variant_global_discr = getVariantGlobalDiscriminator(arguments[1].column, *input_type_as_variant, arguments.size()); + + if (!variant_global_discr.has_value()) + return arguments[2].column; + + const auto & variant_type = input_type_as_variant->getVariant(*variant_global_discr); + const auto & variant_column = input_col_as_variant->getVariantPtrByGlobalDiscriminator(*variant_global_discr); + + /// If Variant has only NULLs or our variant doesn't have any real values, + /// just create column with default values and create null mask with 1. + if (input_col_as_variant->hasOnlyNulls() || variant_column->empty()) + { + auto res = variant_type->createColumn(); + + if (variant_type->lowCardinality()) + assert_cast(*res).nestedToNullable(); + + res->insertManyDefaults(input_col_as_variant->size()); + if (!variant_type->canBeInsideNullable()) + return wrapInArraysAndConstIfNeeded(std::move(res), array_offsets, input_arg_is_const, input_rows_count); + + auto null_map = ColumnUInt8::create(); + auto & null_map_data = null_map->getData(); + null_map_data.resize_fill(input_col_as_variant->size(), 1); + return wrapInArraysAndConstIfNeeded(ColumnNullable::create(std::move(res), std::move(null_map)), array_offsets, input_arg_is_const, input_rows_count); + } + + /// If we extract single non-empty column and have no NULLs, then just return this variant. + if (auto non_empty_local_discr = input_col_as_variant->getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + /// If we were trying to extract some other variant, + /// it would be empty and we would already processed this case above. 
+            chassert(input_col_as_variant->globalDiscriminatorByLocal(*non_empty_local_discr) == variant_global_discr);
+            return wrapInArraysAndConstIfNeeded(makeNullableOrLowCardinalityNullableSafe(variant_column), array_offsets, input_arg_is_const, input_rows_count);
+        }
+
+        /// In the general case we should calculate a null-mask for the variant
+        /// according to the discriminators column and expand
+        /// the variant column by this mask to get a full column (with default values on NULLs).
+        const auto & local_discriminators = input_col_as_variant->getLocalDiscriminators();
+        auto null_map = ColumnUInt8::create();
+        auto & null_map_data = null_map->getData();
+        null_map_data.reserve(local_discriminators.size());
+        auto variant_local_discr = input_col_as_variant->localDiscriminatorByGlobal(*variant_global_discr);
+        for (auto local_discr : local_discriminators)
+            null_map_data.push_back(local_discr != variant_local_discr);
+
+        auto expanded_variant_column = IColumn::mutate(variant_column);
+        if (variant_type->lowCardinality())
+            expanded_variant_column = assert_cast(*expanded_variant_column).cloneNullable();
+        expanded_variant_column->expand(null_map_data, /*inverted = */ true);
+        if (variant_type->canBeInsideNullable())
+            return wrapInArraysAndConstIfNeeded(ColumnNullable::create(std::move(expanded_variant_column), std::move(null_map)), array_offsets, input_arg_is_const, input_rows_count);
+        return wrapInArraysAndConstIfNeeded(std::move(expanded_variant_column), array_offsets, input_arg_is_const, input_rows_count);
+    }
+private:
+    std::optional getVariantGlobalDiscriminator(const ColumnPtr & index_column, const DataTypeVariant & variant_type, size_t argument_size) const
+    {
+        const auto * name_col = checkAndGetColumnConst(index_column.get());
+        if (!name_col)
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "Second argument to {} with Variant argument must be a constant String",
+                getName());
+
+        String variant_element_name = name_col->getValue();
+        auto variant_element_type = DataTypeFactory::instance().tryGet(variant_element_name);
+        if (variant_element_type)
+        {
+            const auto & variants = variant_type.getVariants();
+            for (size_t i = 0; i != variants.size(); ++i)
+            {
+                if (variants[i]->getName() == variant_element_type->getName())
+                    return i;
+            }
+        }
+
+        if (argument_size == 2)
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} doesn't contain a variant with type {}", variant_type.getName(), variant_element_name);
+        return std::nullopt;
+    }
+
+    ColumnPtr wrapInArraysAndConstIfNeeded(ColumnPtr res, const Columns & array_offsets, bool input_arg_is_const, size_t input_rows_count) const
+    {
+        for (auto it = array_offsets.rbegin(); it != array_offsets.rend(); ++it)
+            res = ColumnArray::create(res, *it);
+
+        if (input_arg_is_const)
+            res = ColumnConst::create(res, input_rows_count);
+
+        return res;
+    }
+};
+
+}
+
+REGISTER_FUNCTION(VariantElement)
+{
+    factory.registerFunction(FunctionDocumentation{
+        .description = R"(
+Extracts a column with the specified type from a `Variant` column.
+)",
+        .syntax{"variantElement(variant, type_name[, default_value])"},
+        .arguments{{
+            {"variant", "Variant column"},
+            {"type_name", "The name of the variant type to extract"},
+            {"default_value", "The default value that will be used if the Variant column doesn't contain a variant with the specified type. Can be any type.
Optional"}}}, + .examples{{{ + "Example", + R"( +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT v, variantElement(v, 'String'), variantElement(v, 'UInt64'), variantElement(v, 'Array(UInt64)') FROM test;)", + R"( +┌─v─────────────┬─variantElement(v, 'String')─┬─variantElement(v, 'UInt64')─┬─variantElement(v, 'Array(UInt64)')─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ ᴺᵁᴸᴸ │ 42 │ [] │ +│ Hello, World! │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ +│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ +└───────────────┴─────────────────────────────┴─────────────────────────────┴────────────────────────────────────┘ +)"}}}, + .categories{"Variant"}, + }); +} + +} diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index ff5743a63af..2534f248d83 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -619,13 +619,16 @@ void readQuotedStringInto(Vector & s, ReadBuffer & buf) readAnyQuotedStringInto<'\'', enable_sql_style_quoting>(s, buf); } -template +template bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf) { - return readAnyQuotedStringInto<'\'', false, Vector, bool>(s, buf); + return readAnyQuotedStringInto<'\'', enable_sql_style_quoting, Vector, bool>(s, buf); } -template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto>(PaddedPODArray & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto>(PaddedPODArray & s, ReadBuffer & buf); template void readDoubleQuotedStringInto(Vector & s, ReadBuffer & buf) @@ -633,6 +636,16 @@ void readDoubleQuotedStringInto(Vector & s, ReadBuffer & buf) readAnyQuotedStringInto<'"', enable_sql_style_quoting>(s, buf); } +template +bool tryReadDoubleQuotedStringInto(Vector & s, ReadBuffer & buf) +{ + return readAnyQuotedStringInto<'"', enable_sql_style_quoting, Vector, bool>(s, buf); +} + +template bool tryReadDoubleQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadDoubleQuotedStringInto(String & s, ReadBuffer & buf); + + template void readBackQuotedStringInto(Vector & s, ReadBuffer & buf) { @@ -652,6 +665,18 @@ void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) readQuotedStringInto(s, buf); } +bool tryReadQuotedString(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadQuotedStringInto(s, buf); +} + +bool tryReadQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadQuotedStringInto(s, buf); +} + template void readQuotedStringInto(PaddedPODArray & s, ReadBuffer & buf); template void readQuotedStringInto(String & s, ReadBuffer & buf); @@ -672,6 +697,18 @@ void readDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) readDoubleQuotedStringInto(s, buf); } +bool tryReadDoubleQuotedString(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadDoubleQuotedStringInto(s, buf); +} + +bool tryReadDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadDoubleQuotedStringInto(s, buf); +} + void readBackQuotedString(String & s, ReadBuffer & buf) { s.clear(); @@ -691,7 +728,7 @@ concept WithResize = requires (T value) { value.size() } -> std::integral<>; }; -template +template void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings) { /// Empty string @@ -754,12 +791,20 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, 
const FormatSettings::CSV & { PeekableReadBuffer * peekable_buf = dynamic_cast(&buf); if (!peekable_buf) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Reading CSV string with custom delimiter is allowed only when using PeekableReadBuffer"); + { + if constexpr (allow_throw) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Reading CSV string with custom delimiter is allowed only when using PeekableReadBuffer"); + return; + } while (true) { if (peekable_buf->eof()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading CSV string, expected custom delimiter \"{}\"", custom_delimiter); + { + if constexpr (allow_throw) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading CSV string, expected custom delimiter \"{}\"", custom_delimiter); + return; + } char * next_pos = reinterpret_cast(memchr(peekable_buf->position(), custom_delimiter[0], peekable_buf->available())); if (!next_pos) @@ -948,6 +993,9 @@ String readCSVFieldWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const For template void readCSVStringInto>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); template void readCSVStringInto(NullOutput & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +template void readCSVStringInto(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +template void readCSVStringInto(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +template void readCSVStringInto, false, false>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); template @@ -1069,15 +1117,18 @@ ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf) } template void readJSONObjectPossiblyInvalid(String & s, ReadBuffer & buf); +template bool readJSONObjectPossiblyInvalid(String & s, ReadBuffer & buf); template void readJSONObjectPossiblyInvalid>(PaddedPODArray & s, ReadBuffer & buf); +template bool readJSONObjectPossiblyInvalid, bool>(PaddedPODArray & s, ReadBuffer & buf); -template -void readJSONArrayInto(Vector & s, ReadBuffer & buf) +template +ReturnType readJSONArrayInto(Vector & s, ReadBuffer & buf) { - readJSONObjectOrArrayPossiblyInvalid(s, buf); + return readJSONObjectOrArrayPossiblyInvalid(s, buf); } -template void readJSONArrayInto>(PaddedPODArray & s, ReadBuffer & buf); +template void readJSONArrayInto, void>(PaddedPODArray & s, ReadBuffer & buf); +template bool readJSONArrayInto, bool>(PaddedPODArray & s, ReadBuffer & buf); template ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf) @@ -1217,6 +1268,13 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D return false; } + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[2]) || !isNumericASCII(s[3]) + || !isNumericASCII(s[5]) || !isNumericASCII(s[6]) || !isNumericASCII(s[8]) || !isNumericASCII(s[9])) + return false; + } + UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); @@ -1240,6 +1298,13 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D return false; } + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[3]) || !isNumericASCII(s[4]) + || !isNumericASCII(s[6]) || !isNumericASCII(s[7])) + return false; + } + hour = (s[0] - '0') * 10 + (s[1] - '0'); minute = (s[3] - '0') * 10 + (s[4] - '0'); second = 
(s[6] - '0') * 10 + (s[7] - '0'); @@ -1259,7 +1324,14 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D { /// Not very efficient. for (const char * digit_pos = s; digit_pos < s_pos; ++digit_pos) + { + if constexpr (!throw_exception) + { + if (!isNumericASCII(*digit_pos)) + return false; + } datetime = datetime * 10 + *digit_pos - '0'; + } } datetime *= negative_multiplier; @@ -1282,14 +1354,24 @@ template bool readDateTimeTextFallback(time_t &, ReadBuffer &, cons template bool readDateTimeTextFallback(time_t &, ReadBuffer &, const DateLUTImpl &); -void skipJSONField(ReadBuffer & buf, StringRef name_of_field) +template +ReturnType skipJSONFieldImpl(ReadBuffer & buf, StringRef name_of_field) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + return ReturnType(false); + } else if (*buf.position() == '"') /// skip double-quoted string { NullOutput sink; - readJSONStringInto(sink, buf); + if constexpr (throw_exception) + readJSONStringInto(sink, buf); + else if (!tryReadJSONStringInto(sink, buf)) + return ReturnType(false); } else if (isNumericASCII(*buf.position()) || *buf.position() == '-' || *buf.position() == '+' || *buf.position() == '.') /// skip number { @@ -1298,19 +1380,32 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) double v; if (!tryReadFloatText(v, buf)) - throw Exception(ErrorCodes::INCORRECT_DATA, "Expected a number field for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Expected a number field for key '{}'", name_of_field.toString()); + return ReturnType(false); + } } else if (*buf.position() == 'n') /// skip null { - assertString("null", buf); + if constexpr (throw_exception) + assertString("null", buf); + else if (!checkString("null", buf)) + return ReturnType(false); } else if (*buf.position() == 't') /// skip true { - assertString("true", buf); + if constexpr (throw_exception) + assertString("true", buf); + else if (!checkString("true", buf)) + return ReturnType(false); } else if (*buf.position() == 'f') /// skip false { - assertString("false", buf); + if constexpr (throw_exception) + assertString("false", buf); + else if (!checkString("false", buf)) + return ReturnType(false); } else if (*buf.position() == '[') { @@ -1320,12 +1415,16 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) if (!buf.eof() && *buf.position() == ']') /// skip empty array { ++buf.position(); - return; + return ReturnType(true); } while (true) { - skipJSONField(buf, name_of_field); + if constexpr (throw_exception) + skipJSONFieldImpl(buf, name_of_field); + else if (!skipJSONFieldImpl(buf, name_of_field)) + return ReturnType(false); + skipWhitespaceIfAny(buf); if (!buf.eof() && *buf.position() == ',') @@ -1339,7 +1438,11 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) break; } else - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + return ReturnType(false); + } } } else if (*buf.position() == '{') /// skip whole object @@ -1353,19 +1456,34 @@ void skipJSONField(ReadBuffer & buf, 
StringRef name_of_field) if (*buf.position() == '"') { NullOutput sink; - readJSONStringInto(sink, buf); + if constexpr (throw_exception) + readJSONStringInto(sink, buf); + else if (!tryReadJSONStringInto(sink, buf)) + return ReturnType(false); } else - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + return ReturnType(false); + } // ':' skipWhitespaceIfAny(buf); if (buf.eof() || !(*buf.position() == ':')) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + return ReturnType(false); + } ++buf.position(); skipWhitespaceIfAny(buf); - skipJSONField(buf, name_of_field); + if constexpr (throw_exception) + skipJSONFieldImpl(buf, name_of_field); + else if (!skipJSONFieldImpl(buf, name_of_field)) + return ReturnType(false); + skipWhitespaceIfAny(buf); // optional ',' @@ -1377,14 +1495,32 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) } if (buf.eof()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + return ReturnType(false); + } ++buf.position(); } else { - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol '{}' for key '{}'", - std::string(*buf.position(), 1), name_of_field.toString()); + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol '{}' for key '{}'", + std::string(*buf.position(), 1), name_of_field.toString()); + return ReturnType(false); } + + return ReturnType(true); +} + +void skipJSONField(ReadBuffer & buf, StringRef name_of_field) +{ + skipJSONFieldImpl(buf, name_of_field); +} + +bool trySkipJSONField(ReadBuffer & buf, StringRef name_of_field) +{ + return skipJSONFieldImpl(buf, name_of_field); } @@ -1597,23 +1733,31 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim } // Use PeekableReadBuffer to copy field to string after parsing. 
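+// In the helpers below, ReturnType is void for the throwing variants and bool for the try-variants,
+// following the same convention as the other read*/tryRead* functions in this file.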
-template -static void readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_func) +template +static ReturnType readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_func) { PeekableReadBuffer peekable_buf(buf); peekable_buf.setCheckpoint(); - parse_func(peekable_buf); + if constexpr (std::is_same_v) + parse_func(peekable_buf); + else if (!parse_func(peekable_buf)) + return ReturnType(false); peekable_buf.makeContinuousMemoryFromCheckpointToPos(); auto * end = peekable_buf.position(); peekable_buf.rollbackToCheckpoint(); s.append(peekable_buf.position(), end); peekable_buf.position() = end; + return ReturnType(true); } -template -static void readQuotedStringFieldInto(Vector & s, ReadBuffer & buf) +template +static ReturnType readQuotedStringFieldInto(Vector & s, ReadBuffer & buf) { - assertChar('\'', buf); + if constexpr (std::is_same_v) + assertChar('\'', buf); + else if (!checkChar('\'', buf)) + return ReturnType(false); + s.push_back('\''); while (!buf.eof()) { @@ -1641,16 +1785,23 @@ static void readQuotedStringFieldInto(Vector & s, ReadBuffer & buf) } if (buf.eof()) - return; + return ReturnType(false); ++buf.position(); s.push_back('\''); + return ReturnType(true); } -template -static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) +template +static ReturnType readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) { - assertChar(opening_bracket, buf); + static constexpr bool throw_exception = std::is_same_v; + + if constexpr (throw_exception) + assertChar(opening_bracket, buf); + else if (!checkChar(opening_bracket, buf)) + return ReturnType(false); + s.push_back(opening_bracket); size_t balance = 1; @@ -1666,7 +1817,10 @@ static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) if (*buf.position() == '\'') { - readQuotedStringFieldInto(s, buf); + if constexpr (throw_exception) + readQuotedStringFieldInto(s, buf); + else if (!readQuotedStringFieldInto(s, buf)) + return ReturnType(false); } else if (*buf.position() == opening_bracket) { @@ -1681,13 +1835,20 @@ static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) ++buf.position(); } } + + if (balance) + return ReturnType(false); + + return ReturnType(true); } -template -void readQuotedFieldInto(Vector & s, ReadBuffer & buf) +template +ReturnType readQuotedFieldInto(Vector & s, ReadBuffer & buf) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) - return; + return ReturnType(false); /// Possible values in 'Quoted' field: /// - Strings: '...' @@ -1699,35 +1860,47 @@ void readQuotedFieldInto(Vector & s, ReadBuffer & buf) /// - Number: integer, float, decimal. 
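+    /// In the try-variant (ReturnType == bool) any mismatch below returns false instead of throwing.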
if (*buf.position() == '\'') - readQuotedStringFieldInto(s, buf); + return readQuotedStringFieldInto(s, buf); else if (*buf.position() == '[') - readQuotedFieldInBracketsInto<'[', ']'>(s, buf); + return readQuotedFieldInBracketsInto(s, buf); else if (*buf.position() == '(') - readQuotedFieldInBracketsInto<'(', ')'>(s, buf); + return readQuotedFieldInBracketsInto(s, buf); else if (*buf.position() == '{') - readQuotedFieldInBracketsInto<'{', '}'>(s, buf); + return readQuotedFieldInBracketsInto(s, buf); else if (checkCharCaseInsensitive('n', buf)) { /// NULL or NaN if (checkCharCaseInsensitive('u', buf)) { - assertStringCaseInsensitive("ll", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("ll", buf); + else if (!checkStringCaseInsensitive("ll", buf)) + return ReturnType(false); s.append("NULL"); } else { - assertStringCaseInsensitive("an", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("an", buf); + else if (!checkStringCaseInsensitive("an", buf)) + return ReturnType(false); s.append("NaN"); } } else if (checkCharCaseInsensitive('t', buf)) { - assertStringCaseInsensitive("rue", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("rue", buf); + else if (!checkStringCaseInsensitive("rue", buf)) + return ReturnType(false); s.append("true"); } else if (checkCharCaseInsensitive('f', buf)) { - assertStringCaseInsensitive("alse", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("alse", buf); + else if (!checkStringCaseInsensitive("alse", buf)) + return ReturnType(false); s.append("false"); } else @@ -1736,13 +1909,19 @@ void readQuotedFieldInto(Vector & s, ReadBuffer & buf) auto parse_func = [](ReadBuffer & in) { Float64 tmp; - readFloatText(tmp, in); + if constexpr (throw_exception) + readFloatText(tmp, in); + else + return tryReadFloatText(tmp, in); }; - readParsedValueInto(s, buf, parse_func); + + return readParsedValueInto(s, buf, parse_func); } + + return ReturnType(true); } -template void readQuotedFieldInto(NullOutput & s, ReadBuffer & buf); +template void readQuotedFieldInto(NullOutput & s, ReadBuffer & buf); void readQuotedField(String & s, ReadBuffer & buf) { @@ -1750,11 +1929,24 @@ void readQuotedField(String & s, ReadBuffer & buf) readQuotedFieldInto(s, buf); } +bool tryReadQuotedField(String & s, ReadBuffer & buf) +{ + s.clear(); + return readQuotedFieldInto(s, buf); +} + void readJSONField(String & s, ReadBuffer & buf) { s.clear(); auto parse_func = [](ReadBuffer & in) { skipJSONField(in, "json_field"); }; - readParsedValueInto(s, buf, parse_func); + readParsedValueInto(s, buf, parse_func); +} + +bool tryReadJSONField(String & s, ReadBuffer & buf) +{ + s.clear(); + auto parse_func = [](ReadBuffer & in) { return trySkipJSONField(in, "json_field"); }; + return readParsedValueInto(s, buf, parse_func); } void readTSVField(String & s, ReadBuffer & buf) diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 17f3d3d4151..ad62a3deaca 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -257,26 +257,43 @@ inline void readBoolText(bool & x, ReadBuffer & buf) x = tmp != '0'; } -inline void readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case = false) +template +inline ReturnType readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case = false) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) [[unlikely]] - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + else + return ReturnType(false); + } switch 
(*buf.position()) { case 't': - assertString("true", buf); + if constexpr (throw_exception) + assertString("true", buf); + else if (!checkString("true", buf)) + return ReturnType(false); x = true; break; case 'f': - assertString("false", buf); + if constexpr (throw_exception) + assertString("false", buf); + else if (!checkString("false", buf)) + return ReturnType(false); x = false; break; case 'T': { if (support_upper_case) { - assertString("TRUE", buf); + if constexpr (throw_exception) + assertString("TRUE", buf); + else if (!checkString("TRUE", buf)) + return ReturnType(false); x = true; break; } @@ -287,7 +304,10 @@ inline void readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case { if (support_upper_case) { - assertString("FALSE", buf); + if constexpr (throw_exception) + assertString("FALSE", buf); + else if (!checkString("FALSE", buf)) + return ReturnType(false); x = false; break; } @@ -295,8 +315,15 @@ inline void readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case [[fallthrough]]; } default: - throw ParsingException(ErrorCodes::CANNOT_PARSE_BOOL, "Unexpected Bool value"); + { + if constexpr (throw_exception) + throw ParsingException(ErrorCodes::CANNOT_PARSE_BOOL, "Unexpected Bool value"); + else + return ReturnType(false); + } } + + return ReturnType(true); } enum class ReadIntTextCheckOverflow @@ -468,7 +495,10 @@ void readIntText(T & x, ReadBuffer & buf) template bool tryReadIntText(T & x, ReadBuffer & buf) { - return readIntTextImpl(x, buf); + if constexpr (is_decimal) + return tryReadIntText(x.value, buf); + else + return readIntTextImpl(x, buf); } @@ -477,16 +507,18 @@ bool tryReadIntText(T & x, ReadBuffer & buf) * - for numbers starting with zero, parsed only zero; * - symbol '+' before number is not supported; */ -template -void readIntTextUnsafe(T & x, ReadBuffer & buf) +template +ReturnType readIntTextUnsafe(T & x, ReadBuffer & buf) { + static constexpr bool throw_exception = std::is_same_v; bool negative = false; make_unsigned_t res = 0; auto on_error = [] { - if (throw_on_error) + if constexpr (throw_exception) throwReadAfterEOF(); + return ReturnType(false); }; if (buf.eof()) [[unlikely]] @@ -504,7 +536,7 @@ void readIntTextUnsafe(T & x, ReadBuffer & buf) { ++buf.position(); x = 0; - return; + return ReturnType(true); } while (!buf.eof()) @@ -523,12 +555,13 @@ void readIntTextUnsafe(T & x, ReadBuffer & buf) /// See note about undefined behaviour above. x = is_signed_v && negative ? 
-res : res; + return ReturnType(true); } template -void tryReadIntTextUnsafe(T & x, ReadBuffer & buf) +bool tryReadIntTextUnsafe(T & x, ReadBuffer & buf) { - return readIntTextUnsafe(x, buf); + return readIntTextUnsafe(x, buf); } @@ -550,9 +583,15 @@ void readEscapedString(String & s, ReadBuffer & buf); void readQuotedString(String & s, ReadBuffer & buf); void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); +bool tryReadQuotedString(String & s, ReadBuffer & buf); +bool tryReadQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); + void readDoubleQuotedString(String & s, ReadBuffer & buf); void readDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); +bool tryReadDoubleQuotedString(String & s, ReadBuffer & buf); +bool tryReadDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); + void readJSONString(String & s, ReadBuffer & buf); void readBackQuotedString(String & s, ReadBuffer & buf); @@ -615,7 +654,7 @@ void readBackQuotedStringInto(Vector & s, ReadBuffer & buf); template void readStringUntilEOFInto(Vector & s, ReadBuffer & buf); -template +template void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings); /// ReturnType is either bool or void. If bool, the function will return false instead of throwing an exception. @@ -628,7 +667,7 @@ bool tryReadJSONStringInto(Vector & s, ReadBuffer & buf) return readJSONStringInto(s, buf); } -template +template bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf); /// Reads chunk of data between {} in that way, @@ -637,8 +676,8 @@ bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf); template ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf); -template -void readJSONArrayInto(Vector & s, ReadBuffer & buf); +template +ReturnType readJSONArrayInto(Vector & s, ReadBuffer & buf); template void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf); @@ -962,6 +1001,13 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons { if (s[4] < '0' || s[4] > '9') { + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[2]) || !isNumericASCII(s[3]) + || !isNumericASCII(s[5]) || !isNumericASCII(s[6]) || !isNumericASCII(s[8]) || !isNumericASCII(s[9])) + return ReturnType(false); + } + UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); @@ -974,6 +1020,13 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons bool dt_long = (s[10] == ' ' || s[10] == 'T'); if (dt_long) { + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[11]) || !isNumericASCII(s[12]) || !isNumericASCII(s[14]) || !isNumericASCII(s[15]) + || !isNumericASCII(s[17]) || !isNumericASCII(s[18])) + return ReturnType(false); + } + hour = (s[11] - '0') * 10 + (s[12] - '0'); minute = (s[14] - '0') * 10 + (s[15] - '0'); second = (s[17] - '0') * 10 + (s[18] - '0'); @@ -1311,6 +1364,11 @@ inline bool tryReadText(is_integer auto & x, ReadBuffer & buf) return tryReadIntText(x, buf); } +inline bool tryReadText(is_floating_point auto & x, ReadBuffer & buf) +{ + return tryReadFloatText(x, buf); +} + inline bool tryReadText(UUID & x, ReadBuffer & buf) { return tryReadUUIDText(x, buf); } inline bool tryReadText(IPv4 & x, ReadBuffer & buf) { return tryReadIPv4Text(x, buf); } inline bool tryReadText(IPv6 & x, ReadBuffer & buf) { return tryReadIPv6Text(x, buf); } @@ 
-1320,9 +1378,20 @@ inline void readText(is_floating_point auto & x, ReadBuffer & buf) { readFloatTe inline void readText(String & x, ReadBuffer & buf) { readEscapedString(x, buf); } inline void readText(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance()) { readDateText(x, buf, time_zone); } +inline bool tryReadText(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance()) { return tryReadDateText(x, buf, time_zone); } inline void readText(LocalDate & x, ReadBuffer & buf) { readDateText(x, buf); } +inline bool tryReadText(LocalDate & x, ReadBuffer & buf) { return tryReadDateText(x, buf); } inline void readText(LocalDateTime & x, ReadBuffer & buf) { readDateTimeText(x, buf); } +inline bool tryReadText(LocalDateTime & x, ReadBuffer & buf) +{ + time_t time; + if (!tryReadDateTimeText(time, buf)) + return false; + x = LocalDateTime(time, DateLUT::instance()); + return true; +} + inline void readText(UUID & x, ReadBuffer & buf) { readUUIDText(x, buf); } inline void readText(IPv4 & x, ReadBuffer & buf) { readIPv4Text(x, buf); } inline void readText(IPv6 & x, ReadBuffer & buf) { readIPv6Text(x, buf); } @@ -1400,39 +1469,71 @@ inline void readDoubleQuoted(LocalDateTime & x, ReadBuffer & buf) } /// CSV for numbers: quotes are optional, no special escaping rules. -template -inline void readCSVSimple(T & x, ReadBuffer & buf) +template +inline ReturnType readCSVSimple(T & x, ReadBuffer & buf) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) [[unlikely]] - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + return ReturnType(false); + } char maybe_quote = *buf.position(); if (maybe_quote == '\'' || maybe_quote == '\"') ++buf.position(); - readText(x, buf); + if constexpr (throw_exception) + readText(x, buf); + else if (!tryReadText(x, buf)) + return ReturnType(false); if (maybe_quote == '\'' || maybe_quote == '\"') - assertChar(maybe_quote, buf); + { + if constexpr (throw_exception) + assertChar(maybe_quote, buf); + else if (!checkChar(maybe_quote, buf)) + return ReturnType(false); + } + + return ReturnType(true); } // standalone overload for dates: to avoid instantiating DateLUTs while parsing other types -template -inline void readCSVSimple(T & x, ReadBuffer & buf, const DateLUTImpl & time_zone) +template +inline ReturnType readCSVSimple(T & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) [[unlikely]] - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + return ReturnType(false); + } char maybe_quote = *buf.position(); if (maybe_quote == '\'' || maybe_quote == '\"') ++buf.position(); - readText(x, buf, time_zone); + if constexpr (throw_exception) + readText(x, buf, time_zone); + else if (!tryReadText(x, buf, time_zone)) + return ReturnType(false); if (maybe_quote == '\'' || maybe_quote == '\"') - assertChar(maybe_quote, buf); + { + if constexpr (throw_exception) + assertChar(maybe_quote, buf); + else if (!checkChar(maybe_quote, buf)) + return ReturnType(false); + } + + return ReturnType(true); } template @@ -1442,18 +1543,52 @@ inline void readCSV(T & x, ReadBuffer & buf) readCSVSimple(x, buf); } +template +requires is_arithmetic_v +inline bool tryReadCSV(T & x, ReadBuffer & buf) +{ + return readCSVSimple(x, buf); +} + inline void readCSV(String & x, ReadBuffer & buf, const FormatSettings::CSV & settings) { readCSVString(x, buf, settings); } +inline bool tryReadCSV(String & x, 
ReadBuffer & buf, const FormatSettings::CSV & settings) +{ + x.clear(); + readCSVStringInto(x, buf, settings); + return true; +} + inline void readCSV(LocalDate & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(LocalDate & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(DayNum & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(DayNum & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } inline void readCSV(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { readCSVSimple(x, buf, time_zone); } +inline bool tryReadCSV(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { return readCSVSimple(x, buf, time_zone); } + inline void readCSV(LocalDateTime & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(LocalDateTime & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(UUID & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(UUID & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(IPv4 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(IPv4 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(IPv6 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(IPv6 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(UInt128 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(UInt128 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(Int128 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(Int128 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(UInt256 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(UInt256 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(Int256 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(Int256 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } template void readBinary(std::vector & x, ReadBuffer & buf) @@ -1535,6 +1670,7 @@ inline void skipWhitespaceIfAny(ReadBuffer & buf, bool one_line = false) /// Skips json value. void skipJSONField(ReadBuffer & buf, StringRef name_of_field); +bool trySkipJSONField(ReadBuffer & buf, StringRef name_of_field); /** Read serialized exception. 
@@ -1749,12 +1885,14 @@ struct PcgDeserializer
 }
 };
 
-template
-void readQuotedFieldInto(Vector & s, ReadBuffer & buf);
+template
+ReturnType readQuotedFieldInto(Vector & s, ReadBuffer & buf);
 
 void readQuotedField(String & s, ReadBuffer & buf);
+bool tryReadQuotedField(String & s, ReadBuffer & buf);
 
 void readJSONField(String & s, ReadBuffer & buf);
+bool tryReadJSONField(String & s, ReadBuffer & buf);
 
 void readTSVField(String & s, ReadBuffer & buf);
diff --git a/src/IO/readDecimalText.h b/src/IO/readDecimalText.h
index 9fd9c439b87..81bde87f1f1 100644
--- a/src/IO/readDecimalText.h
+++ b/src/IO/readDecimalText.h
@@ -224,4 +224,24 @@ inline void readCSVDecimalText(ReadBuffer & buf, T & x, uint32_t precision, uint
     assertChar(maybe_quote, buf);
 }
 
+template
+inline bool tryReadCSVDecimalText(ReadBuffer & buf, T & x, uint32_t precision, uint32_t & scale)
+{
+    if (buf.eof())
+        return false;
+
+    char maybe_quote = *buf.position();
+
+    if (maybe_quote == '\'' || maybe_quote == '\"')
+        ++buf.position();
+
+    if (!tryReadDecimalText(buf, x, precision, scale))
+        return false;
+
+    if ((maybe_quote == '\'' || maybe_quote == '\"') && !checkChar(maybe_quote, buf))
+        return false;
+
+    return true;
+}
+
 }
diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp
index bf07f4ed3ee..51f767afc04 100644
--- a/src/Interpreters/InterpreterCreateQuery.cpp
+++ b/src/Interpreters/InterpreterCreateQuery.cpp
@@ -939,6 +939,20 @@ void InterpreterCreateQuery::validateTableStructure(const ASTCreateQuery & creat
             }
         }
     }
+    if (!create.attach && !settings.allow_experimental_variant_type)
+    {
+        for (const auto & [name, type] : properties.columns.getAllPhysical())
+        {
+            if (isVariant(type))
+            {
+                throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                    "Cannot create table with column '{}' whose type is '{}' "
+                    "because experimental Variant type is not allowed. "
+                    "Set setting allow_experimental_variant_type = 1 in order to allow it",
+                    name, type->getName());
+            }
+        }
+    }
 }
 
 namespace
diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp
index 283289f0dfc..32b24cba940 100644
--- a/src/Interpreters/InterpreterInsertQuery.cpp
+++ b/src/Interpreters/InterpreterInsertQuery.cpp
@@ -532,7 +532,7 @@ BlockIO InterpreterInsertQuery::execute()
 {
     /// Change query sample block columns to Nullable to allow inserting nullable columns, where NULL values will be substituted with
     /// default column values (in AddingDefaultsTransform), so all values will be cast correctly.
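+    /// Variant target columns are skipped here: a Variant column can store NULL rows natively, so it does not need a Nullable wrapper for default substitution.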
- if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) && output_columns.has(query_columns[col_idx].name)) + if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) && !isVariant(query_columns[col_idx].type) && output_columns.has(query_columns[col_idx].name)) query_sample_block.setColumn(col_idx, ColumnWithTypeAndName(makeNullableOrLowCardinalityNullable(query_columns[col_idx].column), makeNullableOrLowCardinalityNullable(query_columns[col_idx].type), query_columns[col_idx].name)); } } diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index c7a1cab8bac..fd8f5b154c4 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -237,17 +237,36 @@ static std::unordered_map collectOffsetsColumns( { auto & offsets_column = offsets_columns[stream_name]; if (!offsets_column) + { offsets_column = current_offsets_column; + } + else + { + /// If we are inside Variant element, it may happen that + /// offsets are different, because when we read Variant + /// element as a subcolumn, we expand this column according + /// to the discriminators, so, offsets column can be changed. + /// In this case we should select the original offsets column + /// of this stream, which is the smallest one. + bool inside_variant_element = false; + for (const auto & elem : subpath) + inside_variant_element |= elem.type == ISerialization::Substream::VariantElement; - #ifndef NDEBUG - const auto & offsets_data = assert_cast(*offsets_column).getData(); - const auto & current_offsets_data = assert_cast(*current_offsets_column).getData(); + if (offsets_column->size() != current_offsets_column->size() && inside_variant_element) + offsets_column = offsets_column->size() < current_offsets_column->size() ? offsets_column : current_offsets_column; +#ifndef NDEBUG + else + { + const auto & offsets_data = assert_cast(*offsets_column).getData(); + const auto & current_offsets_data = assert_cast(*current_offsets_column).getData(); - if (offsets_data != current_offsets_data) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Found non-equal columns with offsets (sizes: {} and {}) for stream {}", - offsets_data.size(), current_offsets_data.size(), stream_name); - #endif + if (offsets_data != current_offsets_data) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Found non-equal columns with offsets (sizes: {} and {}) for stream {}", + offsets_data.size(), current_offsets_data.size(), stream_name); + } +#endif + } } }, available_column->type, res_columns[i]); } diff --git a/src/Interpreters/parseColumnsListForTableFunction.cpp b/src/Interpreters/parseColumnsListForTableFunction.cpp index 87f76f7f824..551a883d093 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.cpp +++ b/src/Interpreters/parseColumnsListForTableFunction.cpp @@ -60,6 +60,17 @@ void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS); } } + + if (!settings.allow_experimental_variant_type) + { + if (isVariant(type)) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Cannot create column with type '{}' because experimental Variant type is not allowed. 
" + "Set setting allow_experimental_variant_type = 1 in order to allow it", type->getName()); + } + } } ColumnsDescription parseColumnsListFromString(const std::string & structure, const ContextPtr & context) diff --git a/src/Interpreters/parseColumnsListForTableFunction.h b/src/Interpreters/parseColumnsListForTableFunction.h index ef1bbe5498e..1fbbfa4b12f 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.h +++ b/src/Interpreters/parseColumnsListForTableFunction.h @@ -18,12 +18,14 @@ struct DataTypeValidationSettings : allow_suspicious_low_cardinality_types(settings.allow_suspicious_low_cardinality_types) , allow_experimental_object_type(settings.allow_experimental_object_type) , allow_suspicious_fixed_string_types(settings.allow_suspicious_fixed_string_types) + , allow_experimental_variant_type(settings.allow_experimental_variant_type) { } bool allow_suspicious_low_cardinality_types = true; bool allow_experimental_object_type = true; bool allow_suspicious_fixed_string_types = true; + bool allow_experimental_variant_type = true; }; void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings & settings); diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 594221fe050..5dc791f7003 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -250,7 +250,7 @@ bool ParserTableAsStringLiteralIdentifier::parseImpl(Pos & pos, ASTPtr & node, E ReadBufferFromMemory in(pos->begin, pos->size()); String s; - if (!tryReadQuotedStringInto(s, in)) + if (!tryReadQuotedString(s, in)) { expected.add(pos, "string literal"); return false; diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 5e12ec18d27..cab0f7523f1 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -391,7 +391,7 @@ bool CSVFormatReader::readFieldImpl(ReadBuffer & istr, DB::IColumn & column, con if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) { /// If value is null but type is not nullable then use default value instead. - return SerializationNullable::deserializeTextCSVImpl(column, istr, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(column, istr, format_settings, serialization); } /// Read the column normally. 
diff --git a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp index 7e8b4accf4d..9c7f095e661 100644 --- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp @@ -409,7 +409,7 @@ bool MySQLDumpRowInputFormat::readField(IColumn & column, size_t column_idx) const auto & type = types[column_idx]; const auto & serialization = serializations[column_idx]; if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) - return SerializationNullable::deserializeTextQuotedImpl(column, *in, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(column, *in, format_settings, serialization); serialization->deserializeTextQuoted(column, *in, format_settings); return true; diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index f4f92583473..0f68c28ab1f 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -147,7 +147,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex const auto & type = getPort().getHeader().getByPosition(index).type; const auto & serialization = serializations[index]; if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) - read_columns[index] = SerializationNullable::deserializeTextEscapedImpl(*columns[index], *in, format_settings, serialization); + read_columns[index] = SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(*columns[index], *in, format_settings, serialization); else serialization->deserializeTextEscaped(*columns[index], *in, format_settings); } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 3205adc2a48..88eb11d130d 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -167,7 +167,7 @@ bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & t if (is_raw) { if (as_nullable) - return SerializationNullable::deserializeTextRawImpl(column, *buf, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextRaw(column, *buf, format_settings, serialization); serialization->deserializeTextRaw(column, *buf, format_settings); return true; @@ -175,7 +175,7 @@ bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & t if (as_nullable) - return SerializationNullable::deserializeTextEscapedImpl(column, *buf, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(column, *buf, format_settings, serialization); serialization->deserializeTextEscaped(column, *buf, format_settings); return true; diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index d55ccce8879..a7b5795b89e 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -293,7 +293,7 @@ bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx) const auto & type = types[column_idx]; const auto & serialization = serializations[column_idx]; if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) - read = 
SerializationNullable::deserializeTextQuotedImpl(column, *buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(column, *buf, format_settings, serialization); else serialization->deserializeTextQuoted(column, *buf, format_settings); } diff --git a/tests/queries/0_stateless/02940_variant_text_deserialization.reference b/tests/queries/0_stateless/02940_variant_text_deserialization.reference new file mode 100644 index 00000000000..98725917567 --- /dev/null +++ b/tests/queries/0_stateless/02940_variant_text_deserialization.reference @@ -0,0 +1,516 @@ +JSON +String +{"v":null,"variantElement(v, 'String')":null} +{"v":"string","variantElement(v, 'String')":"string"} +{"v":"42","variantElement(v, 'String')":null} +FixedString +{"v":null,"variantElement(v, 'FixedString(4)')":null} +{"v":"string","variantElement(v, 'FixedString(4)')":null} +{"v":"abcd","variantElement(v, 'FixedString(4)')":"abcd"} +Bool +{"v":null,"variantElement(v, 'Bool')":null} +{"v":"string","variantElement(v, 'Bool')":null} +{"v":true,"variantElement(v, 'Bool')":true} +Integers +{"v":null,"variantElement(v, 'Int8')":null} +{"v":"string","variantElement(v, 'Int8')":null} +{"v":-1,"variantElement(v, 'Int8')":-1} +{"v":0,"variantElement(v, 'Int8')":0} +{"v":"10000000000","variantElement(v, 'Int8')":null} +{"v":null,"variantElement(v, 'UInt8')":null} +{"v":"string","variantElement(v, 'UInt8')":null} +{"v":"-1","variantElement(v, 'UInt8')":null} +{"v":0,"variantElement(v, 'UInt8')":0} +{"v":"10000000000","variantElement(v, 'UInt8')":null} +{"v":null,"variantElement(v, 'Int16')":null} +{"v":"string","variantElement(v, 'Int16')":null} +{"v":-1,"variantElement(v, 'Int16')":-1} +{"v":0,"variantElement(v, 'Int16')":0} +{"v":"10000000000","variantElement(v, 'Int16')":null} +{"v":null,"variantElement(v, 'UInt16')":null} +{"v":"string","variantElement(v, 'UInt16')":null} +{"v":"-1","variantElement(v, 'UInt16')":null} +{"v":0,"variantElement(v, 'UInt16')":0} +{"v":"10000000000","variantElement(v, 'UInt16')":null} +{"v":null,"variantElement(v, 'Int32')":null} +{"v":"string","variantElement(v, 'Int32')":null} +{"v":-1,"variantElement(v, 'Int32')":-1} +{"v":0,"variantElement(v, 'Int32')":0} +{"v":"10000000000","variantElement(v, 'Int32')":null} +{"v":null,"variantElement(v, 'UInt32')":null} +{"v":"string","variantElement(v, 'UInt32')":null} +{"v":"-1","variantElement(v, 'UInt32')":null} +{"v":0,"variantElement(v, 'UInt32')":0} +{"v":"10000000000","variantElement(v, 'UInt32')":null} +{"v":null,"variantElement(v, 'Int64')":null} +{"v":"string","variantElement(v, 'Int64')":null} +{"v":"-1","variantElement(v, 'Int64')":"-1"} +{"v":"0","variantElement(v, 'Int64')":"0"} +{"v":"10000000000000000000000","variantElement(v, 'Int64')":null} +{"v":null,"variantElement(v, 'UInt64')":null} +{"v":"string","variantElement(v, 'UInt64')":null} +{"v":"-1","variantElement(v, 'UInt64')":null} +{"v":"0","variantElement(v, 'UInt64')":"0"} +{"v":"10000000000000000000000","variantElement(v, 'UInt64')":null} +{"v":null,"variantElement(v, 'Int128')":null} +{"v":"string","variantElement(v, 'Int128')":null} +{"v":"-1","variantElement(v, 'Int128')":"-1"} +{"v":"0","variantElement(v, 'Int128')":"0"} +{"v":null,"variantElement(v, 'UInt128')":null} +{"v":"string","variantElement(v, 'UInt128')":null} +{"v":"-1","variantElement(v, 'UInt128')":null} +{"v":"0","variantElement(v, 'UInt128')":"0"} +Floats +{"v":null,"variantElement(v, 'Float32')":null} +{"v":"string","variantElement(v, 'Float32')":null} 
+{"v":42.42,"variantElement(v, 'Float32')":42.42} +{"v":null,"variantElement(v, 'Float64')":null} +{"v":"string","variantElement(v, 'Float64')":null} +{"v":42.42,"variantElement(v, 'Float64')":42.42} +Decimals +{"v":null,"variantElement(v, 'Decimal32(6)')":null} +{"v":"string","variantElement(v, 'Decimal32(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal32(6)')":42.42} +{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal32(6)')":null} +{"v":null,"variantElement(v, 'Decimal64(6)')":null} +{"v":"string","variantElement(v, 'Decimal64(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal64(6)')":42.42} +{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal64(6)')":null} +{"v":null,"variantElement(v, 'Decimal128(6)')":null} +{"v":"string","variantElement(v, 'Decimal128(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal128(6)')":42.42} +{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal128(6)')":null} +{"v":null,"variantElement(v, 'Decimal256(6)')":null} +{"v":"string","variantElement(v, 'Decimal256(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal256(6)')":42.42} +{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal256(6)')":null} +Dates and DateTimes +{"v":null,"variantElement(v, 'Date')":null} +{"v":"string","variantElement(v, 'Date')":null} +{"v":"2020-01-01","variantElement(v, 'Date')":"2020-01-01"} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'Date')":null} +{"v":null,"variantElement(v, 'Date32')":null} +{"v":"string","variantElement(v, 'Date32')":null} +{"v":"1900-01-01","variantElement(v, 'Date32')":"1900-01-01"} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'Date32')":null} +{"v":null,"variantElement(v, 'DateTime')":null} +{"v":"string","variantElement(v, 'DateTime')":null} +{"v":"2020-01-01 00:00:00","variantElement(v, 'DateTime')":"2020-01-01 00:00:00"} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'DateTime')":null} +{"v":null,"variantElement(v, 'DateTime64')":null} +{"v":"string","variantElement(v, 'DateTime64')":null} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'DateTime64')":"2020-01-01 00:00:00.999"} +{"v":"2020-01-01 00:00:00.999999999 ABC","variantElement(v, 'DateTime64')":null} +UUID +{"v":null,"variantElement(v, 'UUID')":null} +{"v":"string","variantElement(v, 'UUID')":null} +{"v":"c8619cca-0caa-445e-ae76-1d4f6e0b3927","variantElement(v, 'UUID')":"c8619cca-0caa-445e-ae76-1d4f6e0b3927"} +IPv4 +{"v":null,"variantElement(v, 'IPv4')":null} +{"v":"string","variantElement(v, 'IPv4')":null} +{"v":"127.0.0.1","variantElement(v, 'IPv4')":"127.0.0.1"} +IPv6 +{"v":null,"variantElement(v, 'IPv6')":null} +{"v":"string","variantElement(v, 'IPv6')":null} +{"v":"2001:db8:85a3::8a2e:370:7334","variantElement(v, 'IPv6')":"2001:db8:85a3::8a2e:370:7334"} +Enum +{"v":null,"variantElement(v, 'Enum(\\'a\\' = 1)')":null} +{"v":"string","variantElement(v, 'Enum(\\'a\\' = 1)')":null} +{"v":"a","variantElement(v, 'Enum(\\'a\\' = 1)')":"a"} +{"v":"a","variantElement(v, 'Enum(\\'a\\' = 1)')":"a"} +{"v":2,"variantElement(v, 'Enum(\\'a\\' = 1)')":null} +Map +{"v":null,"variantElement(v, 'Map(String, UInt64)')":{}} +{"v":"string","variantElement(v, 'Map(String, UInt64)')":{}} 
+{"v":{"a":"42","b":"43","c":"0"},"variantElement(v, 'Map(String, UInt64)')":{"a":"42","b":"43","c":"0"}} +{"v":"{\"c\" : 44, \"d\" : [1,2,3]}","variantElement(v, 'Map(String, UInt64)')":{}} +Tuple +{"v":null,"variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"0","b":"0"}} +{"v":"string","variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"0","b":"0"}} +{"v":{"a":"42","b":"0"},"variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"42","b":"0"}} +{"v":{"a":"44","b":"0"},"variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"44","b":"0"}} +\N (0,0) +string (0,0) +(42,0) (42,0) +{"a" : 44, "d" : 32} (0,0) +Array +{"v":null,"variantElement(v, 'Array(UInt64)')":[]} +{"v":"string","variantElement(v, 'Array(UInt64)')":[]} +{"v":["1","2","3"],"variantElement(v, 'Array(UInt64)')":["1","2","3"]} +{"v":["0","0","0"],"variantElement(v, 'Array(UInt64)')":["0","0","0"]} +{"v":"[1, 2, \"hello\"]","variantElement(v, 'Array(UInt64)')":[]} +LowCardinality +{"v":null,"variantElement(v, 'LowCardinality(String)')":null} +{"v":"string","variantElement(v, 'LowCardinality(String)')":"string"} +{"v":"42","variantElement(v, 'LowCardinality(String)')":null} +{"v":null,"variantElement(v, 'Array(LowCardinality(Nullable(String)))')":[]} +{"v":["string",null],"variantElement(v, 'Array(LowCardinality(Nullable(String)))')":["string",null]} +{"v":"42","variantElement(v, 'Array(LowCardinality(Nullable(String)))')":[]} +Nullable +{"v":null,"variantElement(v, 'Array(Nullable(String))')":[]} +{"v":"string","variantElement(v, 'Array(Nullable(String))')":[]} +{"v":["hello",null,"world"],"variantElement(v, 'Array(Nullable(String))')":["hello",null,"world"]} +{"repeat('-', 80)":"--------------------------------------------------------------------------------"} +CSV +String +\N,\N +"string","string" +"string","string" +42,\N +FixedString +\N,\N +"string",\N +"string",\N +"abcd","abcd" +Bool +\N,\N +"Truee",\N +true,true +Integers +\N,\N +"string",\N +-1,-1 +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +10000000000000000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000000000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +"42d42",\N +Floats +\N,\N +"string",\N +42.42,42.42 +"42.d42",\N +\N,\N +"string",\N +42.42,42.42 +"42.d42",\N +Decimals +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +Dates and DateTimes +\N,\N +"string",\N +"2020-01-d1",\N +"2020-01-01","2020-01-01" +"2020-01-01 00:00:00.999",\N +\N,\N +"string",\N +"2020-01-d1",\N +"1900-01-01","1900-01-01" +"2020-01-01 00:00:00.999",\N +\N,\N +"string",\N +"2020-01-d1",\N +"2020-01-01 
00:00:00","2020-01-01 00:00:00" +"2020-01-01 00:00:00.999",\N +\N,\N +"string",\N +"2020-01-d1",\N +"2020-01-01 00:00:00.999","2020-01-01 00:00:00.999" +"2020-01-01 00:00:00.999999999 ABC",\N +UUID +\N,\N +"string",\N +"c8619cca-0caa-445e-ae76-1d4f6e0b3927","c8619cca-0caa-445e-ae76-1d4f6e0b3927" +"c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA",\N +IPv4 +\N,\N +"string",\N +"127.0.0.1","127.0.0.1" +"127.0.0.1AAA",\N +IPv6 +\N,\N +"string",\N +"2001:db8:85a3::8a2e:370:7334","2001:db8:85a3::8a2e:370:7334" +"2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA",\N +Enum +\N,\N +"string",\N +"a","a" +"a","a" +2,\N +"aa",\N +Map +\N,"{}" +"string","{}" +"{'a':42,'b':43,'c':0}","{'a':42,'b':43,'c':0}" +"{'c' : 44, 'd' : [1,2,3]}","{}" +"{'c' : 44","{}" +Array +\N,"[]" +"string","[]" +"[1,2,3]","[1,2,3]" +"[0,0,0]","[0,0,0]" +"[1, 2, 'hello']","[]" +"[1, 2","[]" +LowCardinality +\N,\N +"string","string" +42,\N +\N,"[]" +"['string',NULL]","['string',NULL]" +"['string', nul]","[]" +42,"[]" +Nullable +\N,"[]" +"string","[]" +"['hello',NULL,'world']","['hello',NULL,'world']" +"['hello', nul]","[]" +{"repeat('-', 80)":"--------------------------------------------------------------------------------"} +TSV +String +\N \N +string string +42 \N +FixedString +\N \N +string \N +abcd abcd +Bool +\N \N +Truee \N +true true +Integers +\N \N +string \N +-1 -1 +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 \N +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 \N +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 \N +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +10000000000000000000000 \N +42d42 \N +\N \N +string \N +-1 \N +0 0 +10000000000000000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +42d42 \N +\N \N +string \N +-1 \N +0 0 +42d42 \N +Floats +\N \N +string \N +42.42 42.42 +42.d42 \N +\N \N +string \N +42.42 42.42 +42.d42 \N +Decimals +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +Dates and DateTimes +\N \N +string \N +2020-01-d1 \N +2020-01-01 2020-01-01 +2020-01-01 00:00:00.999 \N +\N \N +string \N +2020-01-d1 \N +1900-01-01 1900-01-01 +2020-01-01 00:00:00.999 \N +\N \N +string \N +2020-01-d1 \N +2020-01-01 00:00:00 2020-01-01 00:00:00 +2020-01-01 00:00:00.999 \N +\N \N +string \N +2020-01-d1 \N +2020-01-01 00:00:00.999 2020-01-01 00:00:00.999 +2020-01-01 00:00:00.999999999 ABC \N +UUID +\N \N +string \N +c8619cca-0caa-445e-ae76-1d4f6e0b3927 c8619cca-0caa-445e-ae76-1d4f6e0b3927 +c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA \N +IPv4 +\N \N +string \N +127.0.0.1 127.0.0.1 +127.0.0.1AAA \N +IPv6 +\N \N +string \N +2001:db8:85a3::8a2e:370:7334 2001:db8:85a3::8a2e:370:7334 +2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA \N +Enum +\N \N +string \N +a a +a a +2 \N +aa \N +Map +\N {} +string {} +{'a':42,'b':43,'c':0} {'a':42,'b':43,'c':0} +{\'c\' : 44, \'d\' : [1,2,3]} {} +{\'c\' : 44 {} +Array +\N [] +string [] +[1,2,3] 
[1,2,3] +[0,0,0] [0,0,0] +[1, 2, \'hello\'] [] +[1, 2 [] +LowCardinality +\N \N +string string +42 \N +\N [] +['string',NULL] ['string',NULL] +[\'string\', nul] [] +42 [] +Nullable +\N [] +string [] +['hello',NULL,'world'] ['hello',NULL,'world'] +[\'hello\', nul] [] +{"repeat('-', 80)":"--------------------------------------------------------------------------------"} +Values +String +(NULL,NULL),('string','string'),(42,NULL)FixedString +(NULL,NULL),('string',NULL),('abcd','abcd')Bool +(NULL,NULL),(true,true)Integers +(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000000000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000000000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0)(NULL,NULL),('string',NULL),(-1,NULL),(0,0)Floats +(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)Decimals +(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)Dates and DateTimes +(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('2020-01-01','2020-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('1900-01-01','1900-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('2020-01-01 00:00:00','2020-01-01 00:00:00'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000','1970-01-01 00:00:00.000'),('2020-01-01 00:00:00.999',NULL),('2020-01-01 00:00:00.999999999 ABC',NULL)UUID +(NULL,NULL),('string',NULL),('c8619cca-0caa-445e-ae76-1d4f6e0b3927','c8619cca-0caa-445e-ae76-1d4f6e0b3927'),('c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA',NULL)IPv4 +(NULL,NULL),('string',NULL),('127.0.0.1','127.0.0.1'),('127.0.0.1AAA',NULL)IPv6 +(NULL,NULL),('string',NULL),('2001:db8:85a3::8a2e:370:7334','2001:db8:85a3::8a2e:370:7334'),('2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA',NULL)Enum +(NULL,NULL),('string',NULL),('a','a'),(1,NULL),(2,NULL),('aa',NULL)Map +(NULL,{}),('string',{}),({'a':42,'b':43,'c':0},{'a':42,'b':43,'c':0})Array +(NULL,[]),('string',[]),([1,2,3],[1,2,3]),([0,0,0],[0,0,0])LowCardinality +(NULL,NULL),('string','string'),(42,NULL)(NULL,[]),(['string',NULL],['string',NULL]),(42,[])Nullable +(NULL,[]),('string',[]),(['hello',NULL,'world'],['hello',NULL,'world']) diff --git a/tests/queries/0_stateless/02940_variant_text_deserialization.sql b/tests/queries/0_stateless/02940_variant_text_deserialization.sql new file mode 100644 index 00000000000..041d02088ef --- /dev/null +++ b/tests/queries/0_stateless/02940_variant_text_deserialization.sql @@ -0,0 +1,266 @@ +set allow_experimental_variant_type = 1; +set session_timezone = 'UTC'; + +select 'JSON'; +select 'String'; +select v, variantElement(v, 'String') from format(JSONEachRow, 'v Variant(String, UInt64)', '{"v" : null}, {"v" : "string"}, {"v" : 42}') format JSONEachRow; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(JSONEachRow, 'v Variant(String, FixedString(4))', '{"v" : null}, {"v" : "string"}, {"v" : "abcd"}') format JSONEachRow; + +select 'Bool'; +select v, 
variantElement(v, 'Bool') from format(JSONEachRow, 'v Variant(String, Bool)', '{"v" : null}, {"v" : "string"}, {"v" : true}') format JSONEachRow; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(JSONEachRow, 'v Variant(String, Int8, UInt64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt8') from format(JSONEachRow, 'v Variant(String, UInt8, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'Int16') from format(JSONEachRow, 'v Variant(String, Int16, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt16') from format(JSONEachRow, 'v Variant(String, UInt16, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'Int32') from format(JSONEachRow, 'v Variant(String, Int32, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt32') from format(JSONEachRow, 'v Variant(String, UInt32, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'Int64') from format(JSONEachRow, 'v Variant(String, Int64, Int128)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000000000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt64') from format(JSONEachRow, 'v Variant(String, UInt64, Int128)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000000000000000}') format JSONEachRow; +select v, variantElement(v, 'Int128') from format(JSONEachRow, 'v Variant(String, Int128, Int256)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}') format JSONEachRow; +select v, variantElement(v, 'UInt128') from format(JSONEachRow, 'v Variant(String, UInt128, Int256)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}') format JSONEachRow; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(JSONEachRow, 'v Variant(String, Float32)', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}') format JSONEachRow; +select v, variantElement(v, 'Float64') from format(JSONEachRow, 'v Variant(String, Float64)', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}') format JSONEachRow; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(JSONEachRow, 'v Variant(String, Decimal32(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; +select v, variantElement(v, 'Decimal64(6)') from format(JSONEachRow, 'v Variant(String, Decimal64(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; +select v, variantElement(v, 'Decimal128(6)') from format(JSONEachRow, 'v Variant(String, Decimal128(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; +select v, variantElement(v, 'Decimal256(6)') from format(JSONEachRow, 'v Variant(String, Decimal256(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 
4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(JSONEachRow, 'v Variant(String, Date, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "2020-01-01"}, {"v" : "2020-01-01 00:00:00.999"}') format JSONEachRow; +select v, variantElement(v, 'Date32') from format(JSONEachRow, 'v Variant(String, Date32, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "1900-01-01"}, {"v" : "2020-01-01 00:00:00.999"}') format JSONEachRow; +select v, variantElement(v, 'DateTime') from format(JSONEachRow, 'v Variant(String, DateTime, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "2020-01-01 00:00:00"}, {"v" : "2020-01-01 00:00:00.999"}') format JSONEachRow; +select v, variantElement(v, 'DateTime64') from format(JSONEachRow, 'v Variant(String, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "2020-01-01 00:00:00.999"}, {"v" : "2020-01-01 00:00:00.999999999 ABC"}') format JSONEachRow; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(JSONEachRow, 'v Variant(String, UUID)', '{"v" : null}, {"v" : "string"}, {"v" : "c8619cca-0caa-445e-ae76-1d4f6e0b3927"}') format JSONEachRow; + +select 'IPv4'; +select v, variantElement(v, 'IPv4') from format(JSONEachRow, 'v Variant(String, IPv4)', '{"v" : null}, {"v" : "string"}, {"v" : "127.0.0.1"}') format JSONEachRow; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(JSONEachRow, 'v Variant(String, IPv6)', '{"v" : null}, {"v" : "string"}, {"v" : "2001:0db8:85a3:0000:0000:8a2e:0370:7334"}') format JSONEachRow; + +select 'Enum'; +select v, variantElement(v, 'Enum(''a'' = 1)') from format(JSONEachRow, 'v Variant(String, UInt32, Enum(''a'' = 1))', '{"v" : null}, {"v" : "string"}, {"v" : "a"}, {"v" : 1}, {"v" : 2}') format JSONEachRow; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(JSONEachRow, 'v Variant(String, Map(String, UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : {"a" : 42, "b" : 43, "c" : null}}, {"v" : {"c" : 44, "d" : [1,2,3]}}') format JSONEachRow; + +select 'Tuple'; +select v, variantElement(v, 'Tuple(a UInt64, b UInt64)') from format(JSONEachRow, 'v Variant(String, Tuple(a UInt64, b UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : {"a" : 42, "b" : null}}, {"v" : {"a" : 44, "d" : 32}}') format JSONEachRow; +select v, variantElement(v, 'Tuple(a UInt64, b UInt64)') from format(JSONEachRow, 'v Variant(String, Tuple(a UInt64, b UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : {"a" : 42, "b" : null}}, {"v" : {"a" : 44, "d" : 32}}') settings input_format_json_defaults_for_missing_elements_in_named_tuple=0; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(JSONEachRow, 'v Variant(String, Array(UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : [1, 2, 3]}, {"v" : [null, null, null]} {"v" : [1, 2, "hello"]}') format JSONEachRow; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(JSONEachRow, 'v Variant(LowCardinality(String), UInt64)', '{"v" : null}, {"v" : "string"}, {"v" : 42}') format JSONEachRow; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(JSONEachRow, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64)', '{"v" : null}, {"v" : ["string", null]}, {"v" : 42}') format JSONEachRow; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(JSONEachRow, 'v Variant(String, 
Array(Nullable(String)))', '{"v" : null}, {"v" : "string"}, {"v" : ["hello", null, "world"]}') format JSONEachRow; + +select repeat('-', 80) format JSONEachRow; + +select 'CSV'; +select 'String'; +select v, variantElement(v, 'String') from format(CSV, 'v Variant(String, UInt64)', '\\N\n"string"\nstring\n42') format CSV; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(CSV, 'v Variant(String, FixedString(4))', '\\N\n"string"\nstring\n"abcd"') format CSV; + +select 'Bool'; +select v, variantElement(v, 'Bool') from format(CSV, 'v Variant(String, Bool)', '\\N\nTruee\nTrue') format CSV; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(CSV, 'v Variant(String, Int8, UInt64)', '\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt8') from format(CSV, 'v Variant(String, UInt8, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'Int16') from format(CSV, 'v Variant(String, Int16, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt16') from format(CSV, 'v Variant(String, UInt16, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'Int32') from format(CSV, 'v Variant(String, Int32, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt32') from format(CSV, 'v Variant(String, UInt32, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'Int64') from format(CSV, 'v Variant(String, Int64, Int128)', '\\N\n"string"\n-1\n0\n10000000000000000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt64') from format(CSV, 'v Variant(String, UInt64, Int128)', '\\N\n"string"\n-1\n0\n10000000000000000000000\n42d42') format CSV; +select v, variantElement(v, 'Int128') from format(CSV, 'v Variant(String, Int128, Int256)', '\\N\n"string"\n-1\n0\n42d42') format CSV; +select v, variantElement(v, 'UInt128') from format(CSV, 'v Variant(String, UInt128, Int256)', '\\N\n"string"\n-1\n0\n42d42') format CSV; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(CSV, 'v Variant(String, Float32)', '\\N\n"string"\n42.42\n42.d42') format CSV; +select v, variantElement(v, 'Float64') from format(CSV, 'v Variant(String, Float64)', '\\N\n"string"\n42.42\n42.d42') format CSV; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(CSV, 'v Variant(String, Decimal32(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; +select v, variantElement(v, 'Decimal64(6)') from format(CSV, 'v Variant(String, Decimal64(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; +select v, variantElement(v, 'Decimal128(6)') from format(CSV, 'v Variant(String, Decimal128(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; +select v, variantElement(v, 'Decimal256(6)') from format(CSV, 'v Variant(String, Decimal256(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(CSV, 'v Variant(String, 
Date, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"2020-01-01"\n"2020-01-01 00:00:00.999"') format CSV; +select v, variantElement(v, 'Date32') from format(CSV, 'v Variant(String, Date32, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"1900-01-01"\n"2020-01-01 00:00:00.999"') format CSV; +select v, variantElement(v, 'DateTime') from format(CSV, 'v Variant(String, DateTime, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"2020-01-01 00:00:00"\n"2020-01-01 00:00:00.999"') format CSV; +select v, variantElement(v, 'DateTime64') from format(CSV, 'v Variant(String, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"2020-01-01 00:00:00.999"\n"2020-01-01 00:00:00.999999999 ABC"') format CSV; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(CSV, 'v Variant(String, UUID)', '\\N\n"string"\n"c8619cca-0caa-445e-ae76-1d4f6e0b3927"\nc8619cca-0caa-445e-ae76-1d4f6e0b3927AAA') format CSV; + +select 'IPv4'; +select v, variantElement(v, 'IPv4') from format(CSV, 'v Variant(String, IPv4)', '\\N\n"string"\n"127.0.0.1"\n"127.0.0.1AAA"') format CSV; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(CSV, 'v Variant(String, IPv6)', '\\N\n"string"\n"2001:0db8:85a3:0000:0000:8a2e:0370:7334"\n2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA') format CSV; + +select 'Enum'; +select v, variantElement(v, 'Enum(''a'' = 1)') from format(CSV, 'v Variant(String, UInt32, Enum(''a'' = 1))', '\\N\n"string"\n"a"\n1\n2\naa') format CSV; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(CSV, 'v Variant(String, Map(String, UInt64))', '\\N\n"string"\n"{''a'' : 42, ''b'' : 43, ''c'' : null}"\n"{''c'' : 44, ''d'' : [1,2,3]}"\n"{''c'' : 44"') format CSV; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(CSV, 'v Variant(String, Array(UInt64))', '\\N\n"string"\n"[1, 2, 3]"\n"[null, null, null]"\n"[1, 2, ''hello'']"\n"[1, 2"') format CSV; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(CSV, 'v Variant(LowCardinality(String), UInt64)', '\\N\n"string"\n42') format CSV; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(CSV, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64, String)', '\\N\n"[''string'', null]"\n"[''string'', nul]"\n42') format CSV; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(CSV, 'v Variant(String, Array(Nullable(String)))', '\\N\n"string"\n"[''hello'', null, ''world'']"\n"[''hello'', nul]"') format CSV; + +select repeat('-', 80) format JSONEachRow; + +select 'TSV'; +select 'String'; +select v, variantElement(v, 'String') from format(TSV, 'v Variant(String, UInt64)', '\\N\nstring\n42') format TSV; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(TSV, 'v Variant(String, FixedString(4))', '\\N\nstring\nabcd') format TSV; + +select 'Bool'; +select v, variantElement(v, 'Bool') from format(TSV, 'v Variant(String, Bool)', '\\N\nTruee\nTrue') format TSV; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(TSV, 'v Variant(String, Int8, UInt64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt8') from format(TSV, 'v Variant(String, UInt8, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'Int16') from format(TSV, 'v Variant(String, Int16, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt16') from format(TSV, 'v Variant(String, UInt16, Int64)', 
'\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'Int32') from format(TSV, 'v Variant(String, Int32, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt32') from format(TSV, 'v Variant(String, UInt32, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'Int64') from format(TSV, 'v Variant(String, Int64, Int128)', '\\N\nstring\n-1\n0\n10000000000000000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt64') from format(TSV, 'v Variant(String, UInt64, Int128)', '\\N\nstring\n-1\n0\n10000000000000000000000\n42d42') format TSV; +select v, variantElement(v, 'Int128') from format(TSV, 'v Variant(String, Int128, Int256)', '\\N\nstring\n-1\n0\n42d42') format TSV; +select v, variantElement(v, 'UInt128') from format(TSV, 'v Variant(String, UInt128, Int256)', '\\N\nstring\n-1\n0\n42d42') format TSV; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(TSV, 'v Variant(String, Float32)', '\\N\nstring\n42.42\n42.d42') format TSV; +select v, variantElement(v, 'Float64') from format(TSV, 'v Variant(String, Float64)', '\\N\nstring\n42.42\n42.d42') format TSV; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(TSV, 'v Variant(String, Decimal32(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; +select v, variantElement(v, 'Decimal64(6)') from format(TSV, 'v Variant(String, Decimal64(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; +select v, variantElement(v, 'Decimal128(6)') from format(TSV, 'v Variant(String, Decimal128(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; +select v, variantElement(v, 'Decimal256(6)') from format(TSV, 'v Variant(String, Decimal256(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(TSV, 'v Variant(String, Date, DateTime64)', '\\N\nstring\n2020-01-d1\n2020-01-01\n2020-01-01 00:00:00.999') format TSV; +select v, variantElement(v, 'Date32') from format(TSV, 'v Variant(String, Date32, DateTime64)', '\\N\nstring\n2020-01-d1\n1900-01-01\n2020-01-01 00:00:00.999') format TSV; +select v, variantElement(v, 'DateTime') from format(TSV, 'v Variant(String, DateTime, DateTime64)', '\\N\nstring\n2020-01-d1\n2020-01-01 00:00:00\n2020-01-01 00:00:00.999') format TSV; +select v, variantElement(v, 'DateTime64') from format(TSV, 'v Variant(String, DateTime64)', '\\N\nstring\n2020-01-d1\n2020-01-01 00:00:00.999\n2020-01-01 00:00:00.999999999 ABC') format TSV; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(TSV, 'v Variant(String, UUID)', '\\N\nstring\nc8619cca-0caa-445e-ae76-1d4f6e0b3927\nc8619cca-0caa-445e-ae76-1d4f6e0b3927AAA') format TSV; + +select 'IPv4'; +select v, variantElement(v, 'IPv4') from format(TSV, 'v Variant(String, IPv4)', '\\N\nstring\n127.0.0.1\n127.0.0.1AAA') format TSV; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(TSV, 'v Variant(String, IPv6)', '\\N\nstring\n2001:0db8:85a3:0000:0000:8a2e:0370:7334\n2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA') 
format TSV; + +select 'Enum'; +select v, variantElement(v, 'Enum(''a'' = 1)') from format(TSV, 'v Variant(String, UInt32, Enum(''a'' = 1))', '\\N\nstring\na\n1\n2\naa') format TSV; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(TSV, 'v Variant(String, Map(String, UInt64))', '\\N\nstring\n{''a'' : 42, ''b'' : 43, ''c'' : null}\n{''c'' : 44, ''d'' : [1,2,3]}\n{''c'' : 44') format TSV; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(TSV, 'v Variant(String, Array(UInt64))', '\\N\nstring\n[1, 2, 3]\n[null, null, null]\n[1, 2, ''hello'']\n[1, 2') format TSV; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(TSV, 'v Variant(LowCardinality(String), UInt64)', '\\N\nstring\n42') format TSV; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(TSV, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64, String)', '\\N\n[''string'', null]\n[''string'', nul]\n42') format TSV; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(TSV, 'v Variant(String, Array(Nullable(String)))', '\\N\nstring\n[''hello'', null, ''world'']\n[''hello'', nul]') format TSV; + +select repeat('-', 80) format JSONEachRow; + +select 'Values'; +select 'String'; +select v, variantElement(v, 'String') from format(Values, 'v Variant(String, UInt64)', '(NULL), (''string''), (42)') format Values; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(Values, 'v Variant(String, FixedString(4))', '(NULL), (''string''), (''abcd'')') format Values; + +select 'Bool'; +select v, variantElement(v, 'Bool') from format(Values, 'v Variant(String, Bool)', '(NULL), (True)') format Values; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(Values, 'v Variant(String, Int8, UInt64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'UInt8') from format(Values, 'v Variant(String, UInt8, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'Int16') from format(Values, 'v Variant(String, Int16, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'UInt16') from format(Values, 'v Variant(String, UInt16, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'Int32') from format(Values, 'v Variant(String, Int32, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'UInt32') from format(Values, 'v Variant(String, UInt32, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'Int64') from format(Values, 'v Variant(String, Int64, Int128)', '(NULL), (''string''), (-1), (0), (10000000000000000000000)') format Values; +select v, variantElement(v, 'UInt64') from format(Values, 'v Variant(String, UInt64, Int128)', '(NULL), (''string''), (-1), (0), (10000000000000000000000)') format Values; +select v, variantElement(v, 'Int128') from format(Values, 'v Variant(String, Int128, Int256)', '(NULL), (''string''), (-1), (0)') format Values; +select v, variantElement(v, 'UInt128') from format(Values, 'v Variant(String, UInt128, Int256)', '(NULL), (''string''), (-1), (0)') format Values; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(Values, 'v Variant(String, Float32)', '(NULL), (''string''), (42.42)') format Values; +select v, 
variantElement(v, 'Float64') from format(Values, 'v Variant(String, Float64)', '(NULL), (''string''), (42.42)') format Values; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(Values, 'v Variant(String, Decimal32(6))', '(NULL), (''string''), (42.42)') format Values; +select v, variantElement(v, 'Decimal64(6)') from format(Values, 'v Variant(String, Decimal64(6))', '(NULL), (''string''), (42.42)') format Values; +select v, variantElement(v, 'Decimal128(6)') from format(Values, 'v Variant(String, Decimal128(6))', '(NULL), (''string''), (42.42)') format Values; +select v, variantElement(v, 'Decimal256(6)') from format(Values, 'v Variant(String, Decimal256(6))', '(NULL), (''string''), (42.42)') format Values; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(Values, 'v Variant(String, Date, DateTime64)', '(NULL), (''string''), (''2020-01-d1''), (''2020-01-01''), (''2020-01-01 00:00:00.999'')') format Values; +select v, variantElement(v, 'Date32') from format(Values, 'v Variant(String, Date32, DateTime64)', '(NULL), (''string''), (''2020-01-d1''), (''1900-01-01''), (''2020-01-01 00:00:00.999'')') format Values; +select v, variantElement(v, 'DateTime') from format(Values, 'v Variant(String, DateTime, DateTime64)', '(NULL), (''string''), (''2020-01-d1''), (''2020-01-01 00:00:00''), (''2020-01-01 00:00:00.999'')') format Values; +select v, variantElement(v, 'DateTime64') from format(Values, 'v Variant(String, DateTime64)', '(NULL), (''string''), (''2020-01-d1''), (''2020-01-01 00:00:00.999''), (''2020-01-01 00:00:00.999999999 ABC'')') format Values; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(Values, 'v Variant(String, UUID)', '(NULL), (''string''), (''c8619cca-0caa-445e-ae76-1d4f6e0b3927''), (''c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA'')') format Values; + +select 'IPv4'; +select v, variantElement(v, 'IPv4') from format(Values, 'v Variant(String, IPv4)', '(NULL), (''string''), (''127.0.0.1''), (''127.0.0.1AAA'')') format Values; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(Values, 'v Variant(String, IPv6)', '(NULL), (''string''), (''2001:0db8:85a3:0000:0000:8a2e:0370:7334''), (''2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA'')') format Values; + +select 'Enum'; +select v, variantElement(v, 'Enum(''a'' = 1)') from format(Values, 'v Variant(String, UInt32, Enum(''a'' = 1))', '(NULL), (''string''), (''a''), (1), (2), (''aa'')') format Values; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(Values, 'v Variant(String, Map(String, UInt64))', '(NULL), (''string''), ({''a'' : 42, ''b'' : 43, ''c'' : null})') format Values; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(Values, 'v Variant(String, Array(UInt64))', '(NULL), (''string''), ([1, 2, 3]), ([null, null, null])') format Values; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(Values, 'v Variant(LowCardinality(String), UInt64)', '(NULL), (''string''), (42)') format Values; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(Values, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64, String)', '(NULL), ([''string'', null]), (42)') format Values; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(Values, 'v Variant(String, Array(Nullable(String)))', '(NULL), (''string''), ([''hello'', null, ''world''])') format Values; + +select ''; \ No newline at end of file diff 
--git a/tests/queries/0_stateless/02941_variant_type_1.reference b/tests/queries/0_stateless/02941_variant_type_1.reference new file mode 100644 index 00000000000..8a6e77d4f6d --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_1.reference @@ -0,0 +1,2472 @@ +Memory +test1 insert +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test3 insert +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +MergeTree compact +test1 insert +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) 
+(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test3 insert +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N 
+\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +MergeTree wide +test1 insert +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +\N +\N 
+\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test3 insert +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] 
+[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02941_variant_type_1.sh b/tests/queries/0_stateless/02941_variant_type_1.sh new file mode 100755 index 00000000000..774acb4bbef --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_1.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test1_insert() +{ + echo "test1 insert" + $CH_CLIENT -q "insert into test select number, NULL from numbers(3);" + $CH_CLIENT -q "insert into test select number + 3, number from numbers(3);" + $CH_CLIENT -q "insert into test select number + 6, 'str_' || toString(number) from numbers(3);" + $CH_CLIENT -q "insert into test select number + 9, ('lc_str_' || toString(number))::LowCardinality(String) from numbers(3);" + $CH_CLIENT -q "insert into test select number + 12, tuple(number, number + 1)::Tuple(a UInt32, b UInt32) from numbers(3);" + $CH_CLIENT -q "insert into test select number + 15, range(number + 1)::Array(UInt64) from numbers(3);" +} + +function test1_select() +{ + echo "test1 select" + $CH_CLIENT -q "select v from test order by id;" + $CH_CLIENT -q "select v.String from test order by id;" + $CH_CLIENT -q "select v.UInt64 from test order by id;" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function test2_insert() +{ + echo "test2 insert" + $CH_CLIENT -q "insert into test select number, NULL from numbers(3);" + $CH_CLIENT -q "insert into test select number + 3, number % 2 ? NULL : number from numbers(3);" + $CH_CLIENT -q "insert into test select number + 6, number % 2 ? NULL : 'str_' || toString(number) from numbers(3);" + $CH_CLIENT -q "insert into test select number + 9, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(('lc_str_' || toString(number))::LowCardinality(String), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(3);" + $CH_CLIENT -q "insert into test select number + 12, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(3);" + $CH_CLIENT -q "insert into test select number + 15, number % 2 ? 
CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(range(number + 1)::Array(UInt64), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(3);" +} + +function test2_select() +{ + echo "test2 select" + $CH_CLIENT -q "select v from test order by id;" + $CH_CLIENT -q "select v.String from test order by id;" + $CH_CLIENT -q "select v.UInt64 from test order by id;" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function test3_insert() +{ + echo "test3 insert" + $CH_CLIENT -q "insert into test with 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))' as type select number, multiIf(number % 6 == 0, CAST(NULL, type), number % 6 == 1, CAST('str_' || toString(number), type), number % 6 == 2, CAST(number, type), number % 6 == 3, CAST(('lc_str_' || toString(number))::LowCardinality(String), type), number % 6 == 4, CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), type), CAST(range(number + 1)::Array(UInt64), type)) as res from numbers(18);" +} + +function test3_select() +{ + echo "test3 select" + $CH_CLIENT -q "select v from test order by id;" + $CH_CLIENT -q "select v.String from test order by id;" + $CH_CLIENT -q "select v.UInt64 from test order by id;" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function run() +{ + test1_insert + test1_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test1_select + fi + $CH_CLIENT -q "truncate table test;" + test2_insert + test2_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test2_select + fi + $CH_CLIENT -q "truncate table test;" + test3_insert + test3_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test3_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q 
"create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02941_variant_type_2.reference b/tests/queries/0_stateless/02941_variant_type_2.reference new file mode 100644 index 00000000000..4b6d53c52ac --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_2.reference @@ -0,0 +1,51 @@ +Memory +test4 insert +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +MergeTree compact +test4 insert +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +MergeTree wide +test4 insert +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 diff --git a/tests/queries/0_stateless/02941_variant_type_2.sh b/tests/queries/0_stateless/02941_variant_type_2.sh new file mode 100755 index 00000000000..aef5bc3fe02 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_2.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test4_insert() +{ + echo "test4 insert" + $CH_CLIENT -q "insert into test select number, NULL from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 200000, number from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 400000, 'str_' || toString(number) from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 600000, ('lc_str_' || toString(number))::LowCardinality(String) from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 800000, tuple(number, number + 1)::Tuple(a UInt32, b UInt32) from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 1000000, range(number % 20 + 1)::Array(UInt64) from numbers(200000);" +} + +function test4_select +{ + echo "test4 select" + $CH_CLIENT -q "select v from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v);" + $CH_CLIENT -q "select v.String from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.String);" + $CH_CLIENT -q "select v.UInt64 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.UInt64);" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`LowCardinality(String)\`);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test format Null;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.a);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.b);" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test format Null;" + $CH_CLIENT -q "select count() from test where not empty(v.\`Array(UInt64)\`);" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test format Null;" + $CH_CLIENT -q "select count() from test 
where isNotNull(v.\`Array(UInt64)\`.size0);" +} + +function run() +{ + test4_insert + test4_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test4_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02941_variant_type_3.reference b/tests/queries/0_stateless/02941_variant_type_3.reference new file mode 100644 index 00000000000..1ccdb3acdff --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_3.reference @@ -0,0 +1,51 @@ +Memory +test5 insert +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +MergeTree compact +test5 insert +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +MergeTree wide +test5 insert +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 diff --git a/tests/queries/0_stateless/02941_variant_type_3.sh b/tests/queries/0_stateless/02941_variant_type_3.sh new file mode 100755 index 00000000000..d3692270deb --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_3.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test5_insert() +{ + echo "test5 insert" + $CH_CLIENT -q "insert into test select number, NULL from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 200000, number % 2 ? NULL : number from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 400000, number % 2 ? NULL : 'str_' || toString(number) from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 600000, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(('lc_str_' || toString(number))::LowCardinality(String), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 800000, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 1000000, number % 2 ? 
CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(range(number % 20 + 1)::Array(UInt64), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(200000);" +} + +function test5_select() +{ + echo "test5 select" + $CH_CLIENT -q "select v from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v);" + $CH_CLIENT -q "select v.String from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.String);" + $CH_CLIENT -q "select v.UInt64 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.UInt64);" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`LowCardinality(String)\`);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test format Null;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.a);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.b);" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test format Null;" + $CH_CLIENT -q "select count() from test where not empty(v.\`Array(UInt64)\`);" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Array(UInt64)\`.size0);" +} + +function run() +{ + test5_insert + test5_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test5_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02941_variant_type_4.reference b/tests/queries/0_stateless/02941_variant_type_4.reference new file mode 100644 index 00000000000..e13d5820343 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_4.reference @@ -0,0 +1,56 @@ +Memory +test6 insert +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- +MergeTree compact +test6 insert +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- 
+MergeTree wide +test6 insert +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02941_variant_type_4.sh b/tests/queries/0_stateless/02941_variant_type_4.sh new file mode 100755 index 00000000000..b3cc041bcd8 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_4.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test6_insert() +{ + echo "test6 insert" + $CH_CLIENT -q "insert into test with 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))' as type select number, multiIf(number % 6 == 0, CAST(NULL, type), number % 6 == 1, CAST('str_' || toString(number), type), number % 6 == 2, CAST(number, type), number % 6 == 3, CAST(('lc_str_' || toString(number))::LowCardinality(String), type), number % 6 == 4, CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), type), CAST(range(number % 20 + 1)::Array(UInt64), type)) as res from numbers(1200000);" +} + +function test6_select() +{ + echo "test6 select" + $CH_CLIENT -q "select v from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v);" + $CH_CLIENT -q "select v.String from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.String);" + $CH_CLIENT -q "select v.UInt64 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.UInt64);" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`LowCardinality(String)\`);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test format Null;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.a);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.b);" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test format Null;" + $CH_CLIENT -q "select count() from test where not empty(v.\`Array(UInt64)\`);" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Array(UInt64)\`.size0);" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function run() +{ + test6_insert + test6_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test6_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a 
UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02942_variant_cast.reference b/tests/queries/0_stateless/02942_variant_cast.reference new file mode 100644 index 00000000000..f3fd7a9ba33 --- /dev/null +++ b/tests/queries/0_stateless/02942_variant_cast.reference @@ -0,0 +1,25 @@ +\N +42 +0 +\N +2 +\N +Hello +Hello +NULL +Hello +Hello +\N +Hello +\N +0 +\N +42 +\N +Hello +2 +\N +Hello +5 +0 +1 diff --git a/tests/queries/0_stateless/02942_variant_cast.sql b/tests/queries/0_stateless/02942_variant_cast.sql new file mode 100644 index 00000000000..33587e3e438 --- /dev/null +++ b/tests/queries/0_stateless/02942_variant_cast.sql @@ -0,0 +1,23 @@ +set allow_experimental_variant_type=1; + +select NULL::Variant(String, UInt64); +select 42::UInt64::Variant(String, UInt64); +select 42::UInt32::Variant(String, UInt64); -- {serverError CANNOT_CONVERT_TYPE} +select now()::Variant(String, UInt64); -- {serverError CANNOT_CONVERT_TYPE} +select CAST(number % 2 ? NULL : number, 'Variant(String, UInt64)') from numbers(4); +select 'Hello'::LowCardinality(String)::Variant(LowCardinality(String), UInt64); +select 'Hello'::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64); +select 'NULL'::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64); +select 'Hello'::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64); +select CAST(CAST(number % 2 ? 
NULL : 'Hello', 'LowCardinality(Nullable(String))'), 'Variant(LowCardinality(String), UInt64)') from numbers(4); + +select NULL::Variant(String, UInt64)::UInt64; +select NULL::Variant(String, UInt64)::Nullable(UInt64); +select '42'::Variant(String, UInt64)::UInt64; +select 'str'::Variant(String, UInt64)::UInt64; -- {serverError CANNOT_PARSE_TEXT} +select CAST(multiIf(number % 3 == 0, NULL::Variant(String, UInt64), number % 3 == 1, 'Hello'::Variant(String, UInt64), number::Variant(String, UInt64)), 'Nullable(String)') from numbers(6); +select CAST(multiIf(number == 1, NULL::Variant(String, UInt64), number == 2, 'Hello'::Variant(String, UInt64), number::Variant(String, UInt64)), 'UInt64') from numbers(6); -- {serverError CANNOT_PARSE_TEXT} + + +select number::Variant(UInt64)::Variant(String, UInt64)::Variant(Array(String), String, UInt64) from numbers(2); +select 'str'::Variant(String, UInt64)::Variant(String, Array(UInt64)); -- {serverError CANNOT_CONVERT_TYPE} diff --git a/tests/queries/0_stateless/02943_variant_element.reference b/tests/queries/0_stateless/02943_variant_element.reference new file mode 100644 index 00000000000..ab8aaa8fdef --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_element.reference @@ -0,0 +1,44 @@ +\N +\N +\N +\N +0 +1 +2 +3 +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +[] +[[0]] +[[NULL]] +[[2]] +[[NULL]] diff --git a/tests/queries/0_stateless/02943_variant_element.sql b/tests/queries/0_stateless/02943_variant_element.sql new file mode 100644 index 00000000000..c8eff9775ad --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_element.sql @@ -0,0 +1,16 @@ +set allow_experimental_variant_type=1; +set use_variant_when_no_common_type_in_if=1; + +select variantElement(NULL::Variant(String, UInt64), 'UInt64') from numbers(4); +select variantElement(number::Variant(String, UInt64), 'UInt64') from numbers(4); +select variantElement(number::Variant(String, UInt64), 'String') from numbers(4); +select variantElement((number % 2 ? NULL : number)::Variant(String, UInt64), 'UInt64') from numbers(4); +select variantElement((number % 2 ? NULL : number)::Variant(String, UInt64), 'String') from numbers(4); +select variantElement((number % 2 ? NULL : 'str_' || toString(number))::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64), 'LowCardinality(String)') from numbers(4); +select variantElement(NULL::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64), 'LowCardinality(String)') from numbers(4); +select variantElement((number % 2 ? NULL : number)::Variant(Array(UInt64), UInt64), 'Array(UInt64)') from numbers(4); +select variantElement(NULL::Variant(Array(UInt64), UInt64), 'Array(UInt64)') from numbers(4); +select variantElement(number % 2 ? NULL : range(number + 1), 'Array(UInt64)') from numbers(4); + +select variantElement([[(number % 2 ? 
NULL : number)::Variant(String, UInt64)]], 'UInt64') from numbers(4); + diff --git a/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.reference b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.reference new file mode 100644 index 00000000000..3803f39253c --- /dev/null +++ b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.reference @@ -0,0 +1,96 @@ +Array(UInt8) [1,2,3] +Array(UInt8) [1,2,3] +String str_1 +Nullable(String) str_1 +String str_1 +Nullable(String) str_1 +Variant(Array(UInt8), String) str_1 +Variant(Array(UInt8), String) str_1 +Array(UInt8) [1,2,3] +Array(UInt8) [1,2,3] +String str_1 +Nullable(String) str_1 +String str_1 +Nullable(String) str_1 +Variant(Array(UInt8), String) str_1 +Variant(Array(UInt8), String) str_1 +Array(UInt8) [1,2,3] +Array(UInt8) [1,2,3] +String str_1 +Nullable(String) str_1 +String str_1 +Nullable(String) str_1 +Variant(Array(UInt8), String) str_1 +Variant(Array(UInt8), String) str_1 +String str_0 +String str_1 +String str_2 +String str_3 +Nullable(String) str_0 +Nullable(String) str_1 +Nullable(String) str_2 +Nullable(String) str_3 +Array(UInt64) [0] +Array(UInt64) [0,1] +Array(UInt64) [0,1,2] +Array(UInt64) [0,1,2,3] +Array(UInt64) [0] +Array(UInt64) [0,1] +Array(UInt64) [0,1,2] +Array(UInt64) [0,1,2,3] +String str_0 +String str_1 +String str_2 +String str_3 +Nullable(String) str_0 +Nullable(String) str_1 +Nullable(String) str_2 +Nullable(String) str_3 +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) str_1 +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) str_3 +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) str_1 +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) str_3 +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) [0] +Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 +Variant(Array(UInt64), String, UInt64) [0] +Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 +Variant(Array(UInt64), String, UInt64) [0] +Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 +Variant(Array(UInt64), String, UInt64) [0] +Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 diff --git a/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql 
b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql new file mode 100644 index 00000000000..da36863bfda --- /dev/null +++ b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql @@ -0,0 +1,64 @@ +set allow_experimental_variant_type=1; +set use_variant_when_no_common_type_in_if=1; + +select toTypeName(res), if(1, [1,2,3], 'str_1') as res; +select toTypeName(res), if(1, [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(0, [1,2,3], 'str_1') as res; +select toTypeName(res), if(0, [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(NULL, [1,2,3], 'str_1') as res; +select toTypeName(res), if(NULL, [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), [1,2,3], 'str_1') as res; +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(1, materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(1, materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(0, materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(0, materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(NULL, materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(NULL, materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(1, [1,2,3], materialize('str_1')) as res; +select toTypeName(res), if(1, [1,2,3], materialize('str_1')::Nullable(String)) as res; + +select toTypeName(res), if(0, [1,2,3], materialize('str_1')) as res; +select toTypeName(res), if(0, [1,2,3], materialize('str_1')::Nullable(String)) as res; + +select toTypeName(res), if(NULL, [1,2,3], materialize('str_1')) as res; +select toTypeName(res), if(NULL, [1,2,3], materialize('str_1')::Nullable(String)) as res; + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), [1,2,3], materialize('str_1')) as res; +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), [1,2,3], materialize('str_1')::Nullable(String)) as res; + + +select toTypeName(res), if(0, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(0, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(1, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(1, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(NULL, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(NULL, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(number % 2, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(number % 2, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from 
numbers(4); + +select toTypeName(res), if(number % 2, range(number + 1), ('str_' || toString(number))::LowCardinality(String)) as res from numbers(4); +select toTypeName(res), if(number % 2, range(number + 1), ('str_' || toString(number))::LowCardinality(Nullable(String))) as res from numbers(4); + + +select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, 'str_' || toString(number)) as res from numbers(6); +select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, ('str_' || toString(number))::Nullable(String)) as res from numbers(6); +select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, ('str_' || toString(number))::LowCardinality(String)) as res from numbers(6); +select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, ('str_' || toString(number))::LowCardinality(Nullable(String))) as res from numbers(6); + From 0a7ca36e7fbd02b4b64a30371fa3118144179e51 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 19 Dec 2023 17:08:35 +0000 Subject: [PATCH 0055/1081] Remove unneded changes in IColumn.h --- src/Columns/IColumn.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 0dcba5b310c..3f866e6213d 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -631,17 +631,6 @@ struct IsMutableColumns template <> struct IsMutableColumns<> { static const bool value = true; }; -template -struct IsMutableColumnsOrRvalueReferences; - -template -struct IsMutableColumnsOrRvalueReferences -{ - static const bool value = (std::is_assignable::value || std::is_rvalue_reference_v) && IsMutableColumnsOrRvalueReferences::value; -}; - -template <> -struct IsMutableColumnsOrRvalueReferences<> { static const bool value = true; }; template const Type * checkAndGetColumn(const IColumn & column) From bd84799aecb0f8103fd88e9fb1491720f9ec90c8 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 19 Dec 2023 17:21:45 +0000 Subject: [PATCH 0056/1081] Fix style --- src/Columns/ColumnVariant.h | 4 ++-- utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index 692fdd1709e..702107504f0 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -9,7 +9,7 @@ namespace DB namespace ErrorCodes { - extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; } /** @@ -263,7 +263,7 @@ public: bool hasOnlyNulls() const { /// If all variants are empty, we have only NULL values. - return std::all_of(variants.begin(), variants.end(), [](const auto & v){ return v->empty(); } ); + return std::all_of(variants.begin(), variants.end(), [](const WrappedPtr & v){ return v->empty(); }); } /// Check if local and global order is the same. 
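
A minimal standalone sketch of the behaviour the new 02943/02944 tests above pin down (illustrative only, not part of any patch hunk; it assumes a server built with these changes and reuses the experimental settings the tests themselves enable — result types for constant branches can differ under the analyzer, as a later commit in this series notes):

set allow_experimental_variant_type = 1;
set use_variant_when_no_common_type_in_if = 1;

-- if() with no common type between the branches now returns a Variant instead of throwing
select toTypeName(res), if(number % 2, range(number + 1), 'str_' || toString(number)) as res from numbers(4);

-- a single variant can be pulled out with variantElement; rows holding another variant come back as NULL (or [] for array variants)
select variantElement(if(number % 2, range(number + 1), 'str_' || toString(number)), 'String') from numbers(4);
select variantElement(if(number % 2, range(number + 1), 'str_' || toString(number)), 'Array(UInt64)') from numbers(4);
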
diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 637ab0ce6d4..bc03f4b39f8 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2594,6 +2594,7 @@ uuid varPop varSamp variadic +variantElement varint varpop varsamp From e74ae96dd006f8ff5fc8150eba5ab0beb47ddba3 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 19 Dec 2023 17:35:13 +0000 Subject: [PATCH 0057/1081] Fux typo --- src/DataTypes/Serializations/SerializationNullable.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index e7f0e61f2a5..05c70827c35 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -206,7 +206,7 @@ ReturnType safeAppendToNullMap(ColumnNullable & column, bool is_null) } /// Deserialize value into non-nullable column. In case of NULL, insert default and set is_null to true. -/// If ReturnType is bool, return true if parsing was succesfull and false in case of any error. +/// If ReturnType is bool, return true if parsing was successful and false in case of any error. template static ReturnType deserializeImpl(IColumn & column, ReadBuffer & buf, CheckForNull && check_for_null, DeserializeNested && deserialize_nested, bool & is_null) { From 9edbfb3a31e67722a6af3b418a119e9b2bbb164e Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 19 Dec 2023 17:55:42 +0000 Subject: [PATCH 0058/1081] Fix build after merging with master --- src/DataTypes/Serializations/SerializationEnum.cpp | 10 +++++----- src/DataTypes/Serializations/SerializationEnum.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationEnum.cpp b/src/DataTypes/Serializations/SerializationEnum.cpp index 6ad55913738..fb384547d64 100644 --- a/src/DataTypes/Serializations/SerializationEnum.cpp +++ b/src/DataTypes/Serializations/SerializationEnum.cpp @@ -47,7 +47,7 @@ bool SerializationEnum::tryDeserializeTextEscaped(IColumn & column, ReadBu { std::string field_name; readEscapedString(field_name, istr); - if (!this->tryGetValue(x, StringRef(field_name), true)) + if (!ref_enum_values.tryGetValue(x, StringRef(field_name), true)) return false; } @@ -75,7 +75,7 @@ bool SerializationEnum::tryDeserializeTextQuoted(IColumn & column, ReadBuf std::string field_name; readQuotedStringWithSQLStyle(field_name, istr); FieldType x; - if (!this->tryGetValue(x, StringRef(field_name))) + if (!ref_enum_values.tryGetValue(x, StringRef(field_name))) return false; assert_cast(column).getData().push_back(x); return true; @@ -111,7 +111,7 @@ bool SerializationEnum::tryDeserializeWholeText(IColumn & column, ReadBuff { std::string field_name; readStringUntilEOF(field_name, istr); - if (!this->tryGetValue(x, StringRef(field_name), true)) + if (!ref_enum_values.tryGetValue(x, StringRef(field_name), true)) return false; } @@ -157,7 +157,7 @@ bool SerializationEnum::tryDeserializeTextJSON(IColumn & column, ReadBuffe { std::string field_name; readJSONString(field_name, istr); - if (!this->tryGetValue(x, StringRef(field_name))) + if (!ref_enum_values.tryGetValue(x, StringRef(field_name))) return false; } @@ -198,7 +198,7 @@ bool SerializationEnum::tryDeserializeTextCSV(IColumn & column, ReadBuffer { std::string field_name; readCSVString(field_name, istr, settings.csv); - if (!this->tryGetValue(x, 
StringRef(field_name), true)) + if (!ref_enum_values.tryGetValue(x, StringRef(field_name), true)) return false; } diff --git a/src/DataTypes/Serializations/SerializationEnum.h b/src/DataTypes/Serializations/SerializationEnum.h index 708161dc5fd..5152a3fbc93 100644 --- a/src/DataTypes/Serializations/SerializationEnum.h +++ b/src/DataTypes/Serializations/SerializationEnum.h @@ -60,7 +60,7 @@ public: bool tryReadValue(ReadBuffer & istr, FieldType & x) const { - if (!tryReadText(x, istr) || !this->hasValue(x)) + if (!tryReadText(x, istr) || !ref_enum_values.hasValue(x)) return false; return true; From 3c9dd07f7b2c036f5d299869f16ae0a39621b25f Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 19 Dec 2023 21:17:39 +0000 Subject: [PATCH 0059/1081] Fix special builds, fix test --- src/Columns/tests/gtest_column_variant.cpp | 5 ++++- src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp | 7 +++---- tests/queries/0_stateless/02941_variant_type_1.sh | 1 + tests/queries/0_stateless/02941_variant_type_2.sh | 2 +- tests/queries/0_stateless/02941_variant_type_3.sh | 2 +- tests/queries/0_stateless/02941_variant_type_4.sh | 1 + 6 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/Columns/tests/gtest_column_variant.cpp b/src/Columns/tests/gtest_column_variant.cpp index b701e2d3183..0a6512c46b7 100644 --- a/src/Columns/tests/gtest_column_variant.cpp +++ b/src/Columns/tests/gtest_column_variant.cpp @@ -582,7 +582,10 @@ TEST(ColumnVariant, PermuteAndIndexOneColumnNoNulls) ASSERT_EQ((*permuted_column)[2].get(), 2); auto index = ColumnUInt64::create(); - index->getData() = std::move(permutation); + index->getData().push_back(1); + index->getData().push_back(3); + index->getData().push_back(2); + index->getData().push_back(0); auto indexed_column = column->index(*index, 3); ASSERT_EQ(indexed_column->size(), 3); ASSERT_EQ((*indexed_column)[0].get(), 1); diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp index 81c4af97401..dfcd24aff58 100644 --- a/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp @@ -15,10 +15,10 @@ void SerializationIP::deserializeText(DB::IColumn & column, DB::ReadBuffer IPv x; readText(x, istr); + assert_cast &>(column).getData().push_back(x); + if (whole && !istr.eof()) throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); - - assert_cast &>(column).getData().push_back(x); } template @@ -77,11 +77,10 @@ void SerializationIP::deserializeTextJSON(DB::IColumn & column, DB::ReadBuf /// this code looks weird, but we want to throw specific exception to match original behavior... 
if (istr.eof()) assertChar('"', istr); + assert_cast &>(column).getData().push_back(x); if (*istr.position() != '"') throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); istr.ignore(); - - assert_cast &>(column).getData().push_back(x); } template diff --git a/tests/queries/0_stateless/02941_variant_type_1.sh b/tests/queries/0_stateless/02941_variant_type_1.sh index 774acb4bbef..4cf8ad25122 100755 --- a/tests/queries/0_stateless/02941_variant_type_1.sh +++ b/tests/queries/0_stateless/02941_variant_type_1.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment diff --git a/tests/queries/0_stateless/02941_variant_type_2.sh b/tests/queries/0_stateless/02941_variant_type_2.sh index aef5bc3fe02..7064dfbf4ec 100755 --- a/tests/queries/0_stateless/02941_variant_type_2.sh +++ b/tests/queries/0_stateless/02941_variant_type_2.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# tags: long +# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment diff --git a/tests/queries/0_stateless/02941_variant_type_3.sh b/tests/queries/0_stateless/02941_variant_type_3.sh index d3692270deb..303039edef7 100755 --- a/tests/queries/0_stateless/02941_variant_type_3.sh +++ b/tests/queries/0_stateless/02941_variant_type_3.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# tags: long +# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment diff --git a/tests/queries/0_stateless/02941_variant_type_4.sh b/tests/queries/0_stateless/02941_variant_type_4.sh index b3cc041bcd8..169e43c6d69 100755 --- a/tests/queries/0_stateless/02941_variant_type_4.sh +++ b/tests/queries/0_stateless/02941_variant_type_4.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment From e832599dfab7ba2304a4a00175ce48f6a63ed701 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Wed, 20 Dec 2023 04:57:56 +0000 Subject: [PATCH 0060/1081] fix materialize column for compact parts Signed-off-by: Duc Canh Le --- src/Storages/MergeTree/MutateTask.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index a04d9cdb886..dd84aa0d98a 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -80,7 +80,11 @@ static void splitAndModifyMutationCommands( /// So we only mutated column if `command.column_name` is a materialized column or if the part does not have physical column file auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) + { + LOG_DEBUG(log, "Materializing column {}\n", command.column_name); + for_interpreter.push_back(command); mutated_columns.emplace(command.column_name); + } } if (command.type == MutationCommand::Type::MATERIALIZE_INDEX || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC @@ -92,7 +96,6 @@ static void splitAndModifyMutationCommands( for_interpreter.push_back(command); for (const auto & [column_name, expr] : command.column_to_update_expression) mutated_columns.emplace(column_name); - } else if (command.type == MutationCommand::Type::DROP_INDEX || command.type == MutationCommand::Type::DROP_PROJECTION From 7b49a0e530e2a2cb8629c249b96f43c6554ea51d Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Wed, 20 Dec 2023 04:59:03 +0000 Subject: [PATCH 
0061/1081] remove junk log Signed-off-by: Duc Canh Le --- src/Storages/MergeTree/MutateTask.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index dd84aa0d98a..bb41608eb00 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -81,7 +81,6 @@ static void splitAndModifyMutationCommands( auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) { - LOG_DEBUG(log, "Materializing column {}\n", command.column_name); for_interpreter.push_back(command); mutated_columns.emplace(command.column_name); } From 1efd65b8c73951e60e94f74ccc45141a5b39d85e Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 20 Dec 2023 17:43:04 +0000 Subject: [PATCH 0062/1081] Fix tests --- src/Columns/ColumnVariant.cpp | 10 ++++++++++ src/Columns/ColumnVariant.h | 1 + src/DataTypes/DataTypeVariant.cpp | 12 ++++++++++++ src/DataTypes/DataTypeVariant.h | 1 + src/DataTypes/IDataType.h | 2 +- .../Serializations/SerializationVariantElement.cpp | 2 +- 6 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp index 67754e77992..a3a0362b646 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -480,6 +480,16 @@ void ColumnVariant::insertFrom(const IColumn & src_, size_t n) } } +void ColumnVariant::insertIntoVariant(const DB::Field & x, Discriminator global_discr) +{ + if (global_discr > variants.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid global discriminator: {}. The number of variants is {}", size_t(global_discr), variants.size()); + auto & variant = getVariantByGlobalDiscriminator(global_discr); + variant.insert(x); + getLocalDiscriminators().push_back(localDiscriminatorByGlobal(global_discr)); + getOffsets().push_back(variant.size() - 1); +} + void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length) { const size_t num_variants = variants.size(); diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index 702107504f0..b388b118a69 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -174,6 +174,7 @@ public: StringRef getDataAt(size_t n) const override; void insertData(const char * pos, size_t length) override; void insert(const Field & x) override; + void insertIntoVariant(const Field & x, Discriminator global_discr); void insertFrom(const IColumn & src_, size_t n) override; void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; void insertManyFrom(const IColumn & src, size_t position, size_t length) override; diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index 77e1c504cf8..334ed2c7b10 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -1,9 +1,11 @@ #include +#include #include #include #include #include #include +#include #include #include #include @@ -94,6 +96,16 @@ MutableColumnPtr DataTypeVariant::createColumn() const return ColumnVariant::create(std::move(nested_columns)); } +ColumnPtr DataTypeVariant::createColumnConst(size_t size, const DB::Field & field) const +{ + auto field_type = applyVisitor(FieldToDataType(), field); + auto discr = tryGetVariantDiscriminator(field_type); + if (!discr) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot insert field \"{}\" into column with type {}", 
toString(field), getName()); + auto column = createColumn(); + assert_cast(*column).insertIntoVariant(field, *discr); + return ColumnConst::create(std::move(column), size); +} Field DataTypeVariant::getDefault() const { diff --git a/src/DataTypes/DataTypeVariant.h b/src/DataTypes/DataTypeVariant.h index 60113a188b0..ca15dff1476 100644 --- a/src/DataTypes/DataTypeVariant.h +++ b/src/DataTypes/DataTypeVariant.h @@ -37,6 +37,7 @@ public: MutableColumnPtr createColumn() const override; + ColumnPtr createColumnConst(size_t size, const Field & field) const override; Field getDefault() const override; bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index ccdf54f57c3..4533c23a89f 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -150,7 +150,7 @@ public: /** Create ColumnConst for corresponding type, with specified size and value. */ - ColumnPtr createColumnConst(size_t size, const Field & field) const; + virtual ColumnPtr createColumnConst(size_t size, const Field & field) const; ColumnPtr createColumnConstWithDefaultValue(size_t size) const; /** Get default value of data type. diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index 4b24ee5754e..1c0808db2a0 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -204,7 +204,7 @@ ColumnPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB: /// If this variant is empty, fill result column with default values. if (prev->empty()) { - auto res = IColumn::mutate(makeNullableOrLowCardinalityNullableSafe(prev)); + auto res = makeNullableOrLowCardinalityNullableSafe(prev)->cloneEmpty(); res->insertManyDefaults(local_discriminators->size()); return res; } From 4f8789927db4dd0d9c79a80bebc805895d82297c Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 21 Dec 2023 15:53:21 +0000 Subject: [PATCH 0063/1081] Fix tests with analyzer, add more tests --- src/DataTypes/DataTypeVariant.cpp | 18 +- ...different_local_and_global_order.reference | 244 ++++++++++++++++++ ...e_with_different_local_and_global_order.sh | 82 ++++++ .../02944_variant_as_if_multi_if_result.sql | 1 + 4 files changed, 340 insertions(+), 5 deletions(-) create mode 100644 tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference create mode 100755 tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index 334ed2c7b10..0575f220f22 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -98,12 +98,20 @@ MutableColumnPtr DataTypeVariant::createColumn() const ColumnPtr DataTypeVariant::createColumnConst(size_t size, const DB::Field & field) const { - auto field_type = applyVisitor(FieldToDataType(), field); - auto discr = tryGetVariantDiscriminator(field_type); - if (!discr) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot insert field \"{}\" into column with type {}", toString(field), getName()); auto column = createColumn(); - assert_cast(*column).insertIntoVariant(field, *discr); + if (field.isNull()) + { + column->insertDefault(); + } + else + { + auto field_type = applyVisitor(FieldToDataType(), field); + auto discr = tryGetVariantDiscriminator(field_type); + if (!discr) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot insert 
field \"{}\" into column with type {}", toString(field), getName()); + assert_cast(*column).insertIntoVariant(field, *discr); + } + return ColumnConst::create(std::move(column), size); } diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference new file mode 100644 index 00000000000..f2e355824f9 --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference @@ -0,0 +1,244 @@ +Memory +test1 insert +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +7000000 +1000000 +6000000 +----------------------------------------------------------------------------------------------------------- +MergeTree compact +test1 insert +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +7000000 +1000000 +6000000 +----------------------------------------------------------------------------------------------------------- +test2 select +7000000 +1000000 +6000000 +----------------------------------------------------------------------------------------------------------- +MergeTree wide +test1 insert +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 
+\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +7000000 +1000000 +6000000 +----------------------------------------------------------------------------------------------------------- +test2 select +7000000 +1000000 +6000000 +----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh new file mode 100755 index 00000000000..88bd2d3bd42 --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_when_no_common_type_in_if=1 " + + +function test1_insert() +{ + echo "test1 insert" + $CH_CLIENT -q "insert into test select number, number::Variant(UInt64)::Variant(UInt64, Array(UInt64)) from numbers(10) settings max_block_size=3" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(10, 10) settings max_block_size=3" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64)) as res from numbers(20, 10) settings max_block_size=3" + $CH_CLIENT -q "insert into test select number, if(number < 35, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from numbers(30, 10) settings max_block_size=3" +} + +function test1_select() +{ + echo "test1 select" + $CH_CLIENT -q "select v, v.String, v.UInt64 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function test2_insert() +{ + echo "test2 insert" + $CH_CLIENT -q "insert into test select number, number::Variant(UInt64)::Variant(UInt64, Array(UInt64)) from numbers(1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(1000000, 10000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64)) as res from numbers(2000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, if(number < 5, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from 
numbers(3000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + } + +function test2_select() +{ + echo "test2 select" + $CH_CLIENT -q "select v from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v);" + $CH_CLIENT -q "select v.String from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.String);" + $CH_CLIENT -q "select v.UInt64 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.UInt64);" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function run() +{ + test1_insert + test1_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test1_select + fi + $CH_CLIENT -q "truncate table test;" + test2_insert + test2_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test2_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql index da36863bfda..1121b21e383 100644 --- a/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql +++ b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql @@ -1,3 +1,4 @@ +set allow_experimental_analyzer=0; -- The result type for if function with constant is different with analyzer. set allow_experimental_variant_type=1; set use_variant_when_no_common_type_in_if=1; From 38ec9b5f719740b4e94758f9e5578acd562df939 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 22 Dec 2023 00:11:39 +0000 Subject: [PATCH 0064/1081] Fix variant element deserialization --- .../Serializations/SerializationVariant.cpp | 27 ++-- .../SerializationVariantElement.cpp | 149 ++++++++++-------- ...different_local_and_global_order.reference | 30 ++-- ...e_with_different_local_and_global_order.sh | 8 +- 4 files changed, 117 insertions(+), 97 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index ebd44fd6955..910ad1da303 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -277,13 +277,10 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to deserialize data into Variant column with not global variants order"); /// First, deserialize new discriminators. - /// We deserialize them into a separate column to be able to use substream cache, - /// so if we also need to deserialize some of sub columns, we will read discriminators only once. 
settings.path.push_back(Substream::VariantDiscriminators); - ColumnPtr discriminators; if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) { - discriminators = cached_discriminators; + col.getLocalDiscriminatorsPtr() = cached_discriminators; } else { @@ -291,29 +288,31 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( if (!discriminators_stream) throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariant::deserializeBinaryBulkWithMultipleStreams"); - discriminators = ColumnVariant::ColumnDiscriminators::create(); - SerializationNumber().deserializeBinaryBulk(*discriminators->assumeMutable(), *discriminators_stream, limit, 0); - addToSubstreamsCache(cache, settings.path, discriminators); + SerializationNumber().deserializeBinaryBulk(*col.getLocalDiscriminatorsPtr()->assumeMutable(), *discriminators_stream, limit, 0); + addToSubstreamsCache(cache, settings.path, col.getLocalDiscriminatorsPtr()); } settings.path.pop_back(); - /// Iterate through new discriminators, append them to column and calculate the limit for each variant. + /// Iterate through new discriminators and calculate the limit for each variant. /// While calculating limits we can also fill offsets column (we store offsets only in memory). - const auto & discriminators_data = assert_cast(*discriminators).getData(); - auto & local_discriminators = col.getLocalDiscriminators(); - local_discriminators.reserve(local_discriminators.size() + limit); + auto & discriminators_data = col.getLocalDiscriminators(); auto & offsets = col.getOffsets(); offsets.reserve(offsets.size() + limit); std::vector variant_limits(variants.size(), 0); - for (size_t i = 0; i != limit; ++i) + size_t discriminators_offset = discriminators_data.size() - limit; + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) { ColumnVariant::Discriminator discr = discriminators_data[i]; - local_discriminators.push_back(discr); if (discr == ColumnVariant::NULL_DISCRIMINATOR) + { offsets.emplace_back(); + } else - offsets.push_back(col.getVariantByLocalDiscriminator(discr).size() + variant_limits[discr]++); + { + offsets.push_back(col.getVariantByLocalDiscriminator(discr).size() + variant_limits[discr]); + ++variant_limits[discr]; + } } /// Now we can deserialize variants according to their limits. diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index 1c0808db2a0..e06a20d2990 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -40,11 +40,31 @@ void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinary ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElement"); } +struct DeserializeBinaryBulkStateVariantElement : public ISerialization::DeserializeBinaryBulkState +{ + /// During deserialization discriminators and variant streams can be shared. + /// For example we can read several variant elements together: "select v.UInt32, v.String from table", + /// or we can read the whole variant and some of variant elements: "select v, v.UInt32 from table". + /// To read the same column from the same stream more than once we use substream cache, + /// but this cache stores the whole column, not only the current range. 
+ /// During deserialization of variant element discriminators and variant columns are not stored + /// in the result column, so we need to store them inside deserialization state, so we can use + /// substream cache correctly. + ColumnPtr discriminators; + ColumnPtr variant; + + ISerialization::DeserializeBinaryBulkStatePtr variant_element_state; +}; + void SerializationVariantElement::deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const { + auto variant_element_state = std::make_shared(); + addVariantToPath(settings.path); - nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, variant_element_state->variant_element_state); removeVariantFromPath(settings.path); + + state = std::move(variant_element_state); } void SerializationVariantElement::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const @@ -53,22 +73,19 @@ void SerializationVariantElement::serializeBinaryBulkWithMultipleStreams(const I } void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( - ColumnPtr & column, + ColumnPtr & result_column, size_t limit, DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const { - auto mutable_column = column->assumeMutable(); - ColumnNullable * nullable_col = typeid_cast(mutable_column.get()); - NullMap * null_map = nullable_col ? &nullable_col->getNullMapData() : nullptr; + auto * variant_element_state = checkAndGetState(state); /// First, deserialize discriminators from Variant column. settings.path.push_back(Substream::VariantDiscriminators); - ColumnPtr discriminators; if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) { - discriminators = cached_discriminators; + variant_element_state->discriminators = cached_discriminators; } else { @@ -76,85 +93,87 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( if (!discriminators_stream) throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams"); - discriminators = ColumnVariant::ColumnDiscriminators::create(); - SerializationNumber().deserializeBinaryBulk(*discriminators->assumeMutable(), *discriminators_stream, limit, 0); - addToSubstreamsCache(cache, settings.path, discriminators); + /// If we started to read a new column, reinitialize discriminators column in deserialization state. + if (!variant_element_state->discriminators || result_column->empty()) + variant_element_state->discriminators = ColumnVariant::ColumnDiscriminators::create(); + + SerializationNumber().deserializeBinaryBulk(*variant_element_state->discriminators->assumeMutable(), *discriminators_stream, limit, 0); + addToSubstreamsCache(cache, settings.path, variant_element_state->discriminators); } settings.path.pop_back(); - /// Iterate through discriminators to calculate the size of the variant. - const auto & discriminators_data = assert_cast(*discriminators).getData(); - size_t variant_size = 0; - for (auto discr : discriminators_data) - variant_size += discr == variant_discriminator; + /// Iterate through new discriminators to calculate the limit for our variant. 
+ const auto & discriminators_data = assert_cast(*variant_element_state->discriminators).getData(); + size_t discriminators_offset = variant_element_state->discriminators->size() - limit; + size_t variant_limit = 0; + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) + variant_limit += (discriminators_data[i] == variant_discriminator); - /// Now we know the size of the variant and can deserialize it. + /// Now we know the limit for our variant and can deserialize it. - /// If the size of variant column is the same as the size of discriminators, - /// we can deserialize new values directly into our column. - if (variant_size == discriminators_data.size()) + /// If result column is Nullable, fill null map and extract nested column. + MutableColumnPtr mutable_column = result_column->assumeMutable(); + if (isColumnNullable(*mutable_column)) { - addVariantToPath(settings.path); - /// Special case when our result column is LowCardinality(Nullable(T)). - /// In this case the variant type is LowCardinality(T), and we cannot just - /// deserialize its values directly into LowCardinality(Nullable(T)) column. - /// We create a separate column with type LowCardinality(T), deserialize - /// values into it and then insert into result column using insertRangeFrom. - if (isColumnLowCardinalityNullable(*column)) + auto & nullable_column = assert_cast(*mutable_column); + NullMap & null_map = nullable_column.getNullMapData(); + /// If we have only our discriminator in range, fill null map with 0. + if (variant_limit == limit) { - ColumnPtr variant_col = mutable_column->cloneEmpty(); - /// LowCardinality(Nullable(T)) -> LowCardinality(T) - assert_cast(*variant_col->assumeMutable()).nestedRemoveNullable(); - nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_col, limit, settings, state, cache); - mutable_column->insertRangeFrom(*variant_col, 0, variant_col->size()); + null_map.resize_fill(null_map.size() + limit, 0); } + /// If our discriminator is not in the current range, fill the null map with 1. + else if (variant_limit == 0) + { + null_map.resize_fill(null_map.size() + limit, 1); + } + /// Otherwise we should iterate through discriminators to fill null map. else { - nested_serialization->deserializeBinaryBulkWithMultipleStreams(nullable_col ? nullable_col->getNestedColumnPtr() : column, limit, settings, state, cache); + null_map.reserve(null_map.size() + limit); + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) + null_map.push_back(discriminators_data[i] != variant_discriminator); } - if (nullable_col) - null_map->resize_fill(null_map->size() + limit, 0); - removeVariantFromPath(settings.path); - return; + + mutable_column = nullable_column.getNestedColumnPtr()->assumeMutable(); } - /// If variant size is 0, just fill column with default values. - if (variant_size == 0) + /// If we started to read a new column, reinitialize variant column in deserialization state. + if (!variant_element_state->variant || result_column->empty()) { - mutable_column->insertManyDefaults(limit); - return; + variant_element_state->variant = mutable_column->cloneEmpty(); + + /// When result column is LowCardinality(Nullable(T)) we should + /// remove Nullable from variant column before deserialization.
+ if (isColumnLowCardinalityNullable(*mutable_column)) + assert_cast(*variant_element_state->variant->assumeMutable()).nestedRemoveNullable(); } - /// In general case we should deserialize variant into a separate column, - /// iterate through discriminators and insert values from variant only when - /// row contains its discriminator and default value otherwise. - mutable_column->reserve(mutable_column->size() + limit); - mutable_column = nullable_col ? nullable_col->getNestedColumnPtr()->assumeMutable() : std::move(mutable_column); - ColumnPtr variant_col = mutable_column->cloneEmpty(); - - /// Special case when our result column is LowCardinality(Nullable(T)). - /// We should remove Nullable from variant column before deserialization. - if (isColumnLowCardinalityNullable(*column)) - assert_cast(*variant_col->assumeMutable()).nestedRemoveNullable(); - addVariantToPath(settings.path); - nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_col, variant_size, settings, state, cache); + nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, variant_limit, settings, variant_element_state->variant_element_state, cache); removeVariantFromPath(settings.path); - size_t variant_index = 0; - for (auto discr : discriminators_data) + size_t variant_offset = variant_element_state->variant->size() - variant_limit; + + /// If we don't have our discriminator in the range, just insert defaults. + if (variant_limit == 0) { - if (discr == variant_discriminator) + mutable_column->insertManyDefaults(limit); + } + /// If we have only our discriminator in the range, insert the whole range into the result column. + else if (variant_limit == limit) + { + mutable_column->insertRangeFrom(*variant_element_state->variant, variant_offset, variant_limit); + } + /// Otherwise iterate through discriminators and insert value from variant or default value depending on the discriminator.
+ else + { + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) { - if (null_map) - null_map->push_back(0); - mutable_column->insertFrom(*variant_col, variant_index++); - } - else - { - if (null_map) - null_map->push_back(1); - mutable_column->insertDefault(); + if (discriminators_data[i] == variant_discriminator) + mutable_column->insertFrom(*variant_element_state->variant, variant_offset++); + else + mutable_column->insertDefault(); } } } diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference index f2e355824f9..1736a307c42 100644 --- a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference @@ -44,9 +44,9 @@ str_38 str_38 \N ----------------------------------------------------------------------------------------------------------- test2 insert test2 select -7000000 -1000000 -6000000 +2500000 +750000 +1750000 ----------------------------------------------------------------------------------------------------------- MergeTree compact test1 insert @@ -136,14 +136,14 @@ str_38 str_38 \N ----------------------------------------------------------------------------------------------------------- test2 insert test2 select -7000000 -1000000 -6000000 +2500000 +750000 +1750000 ----------------------------------------------------------------------------------------------------------- test2 select -7000000 -1000000 -6000000 +2500000 +750000 +1750000 ----------------------------------------------------------------------------------------------------------- MergeTree wide test1 insert @@ -233,12 +233,12 @@ str_38 str_38 \N ----------------------------------------------------------------------------------------------------------- test2 insert test2 select -7000000 -1000000 -6000000 +2500000 +750000 +1750000 ----------------------------------------------------------------------------------------------------------- test2 select -7000000 -1000000 -6000000 +2500000 +750000 +1750000 ----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh index 88bd2d3bd42..9f4df8d7466 100755 --- a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment @@ -29,14 +30,15 @@ function test2_insert() { echo "test2 insert" $CH_CLIENT -q "insert into test select number, number::Variant(UInt64)::Variant(UInt64, Array(UInt64)) from numbers(1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" - $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(1000000, 10000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(1000000, 1000000) settings 
max_insert_block_size = 100000, min_insert_block_size_rows=100000" $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64)) as res from numbers(2000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" - $CH_CLIENT -q "insert into test select number, if(number < 5, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from numbers(3000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" - } + $CH_CLIENT -q "insert into test select number, if(number < 3500000, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from numbers(3000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" +} function test2_select() { echo "test2 select" + $CH_CLIENT -q "select v, v.String, v.UInt64 from test format Null;" $CH_CLIENT -q "select v from test format Null;" $CH_CLIENT -q "select count() from test where isNotNull(v);" $CH_CLIENT -q "select v.String from test format Null;" From 319c20091efe8eebee5bde9bb8bae67e58a589d9 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 22 Dec 2023 00:15:44 +0000 Subject: [PATCH 0065/1081] Fix comments --- src/DataTypes/Serializations/SerializationVariant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 910ad1da303..3b51c51872f 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -276,7 +276,7 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( if (!col.hasGlobalVariantsOrder()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to deserialize data into Variant column with not global variants order"); - /// First, deserialize new discriminators. + /// First, deserialize discriminators. settings.path.push_back(Substream::VariantDiscriminators); if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) { @@ -451,7 +451,7 @@ std::unordered_map getTypesTextDeserializePriorityMap() /// then for types with the same depth we sort by the types priority, and last we sort by the depth of LowCardinality/Nullable types, /// so if we have types with the same level of nesting and the same priority, we will first try to deserialize LowCardinality/Nullable types /// (for example if we have types Array(Array(String)) and Array(Array(Nullable(String))). -/// This is just a batch of heuristics, +/// This is just a batch of heuristics. std::tuple getTypeTextDeserializePriority(const DataTypePtr & type, size_t nested_depth, size_t simple_nested_depth, std::unordered_map & priority_map) { if (const auto * nullable_type = typeid_cast(type.get())) @@ -553,7 +553,7 @@ bool SerializationVariant::tryDeserializeImpl( for (size_t global_discr : deserialize_text_order) { ReadBufferFromString variant_buf(field); - /// Usually try_deserialize_variant should not throw an exception, but let's use try/catch just in case. + /// Usually try_deserialize_variant should not throw any exception, but let's use try/catch just in case. 
try { auto & variant_column = column_variant.getVariantByGlobalDiscriminator(global_discr); From a61efedba8854e8f06b549deb595315ee40eb303 Mon Sep 17 00:00:00 2001 From: avogar Date: Sat, 23 Dec 2023 00:10:58 +0000 Subject: [PATCH 0066/1081] Fix serialization again, add more tests --- src/Columns/ColumnVariant.h | 2 +- src/Columns/ColumnVector.cpp | 2 +- src/DataTypes/DataTypeVariant.cpp | 15 +++++- .../Serializations/ISerialization.cpp | 7 +++ src/DataTypes/Serializations/ISerialization.h | 1 + .../Serializations/SerializationVariant.cpp | 52 ++++++++++++++----- .../02943_variant_read_subcolumns_1.reference | 6 +++ .../02943_variant_read_subcolumns_1.sh | 38 ++++++++++++++ .../02943_variant_read_subcolumns_2.reference | 6 +++ .../02943_variant_read_subcolumns_2.sh | 38 ++++++++++++++ 10 files changed, 150 insertions(+), 17 deletions(-) create mode 100644 tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference create mode 100755 tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh create mode 100644 tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference create mode 100755 tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index b388b118a69..ec58553f5f3 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -163,7 +163,7 @@ public: size_t size() const override { - return local_discriminators->size(); + return offsets->size(); } Field operator[](size_t n) const override; diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index 37e62c76596..b4e3fee5e42 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -469,7 +469,7 @@ void ColumnVector::insertRangeFrom(const IColumn & src, size_t start, size_t const ColumnVector & src_vec = assert_cast(src); if (start + length > src_vec.data.size()) - throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND, + throw Exception(ErrorCodes::LOGICAL_ERROR, "Parameters start = {}, length = {} are out of bound " "in ColumnVector::insertRangeFrom method (data.size() = {}).", toString(start), toString(length), toString(src_vec.data.size())); diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index 0575f220f22..5dc42cc7443 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -108,7 +108,20 @@ ColumnPtr DataTypeVariant::createColumnConst(size_t size, const DB::Field & fiel auto field_type = applyVisitor(FieldToDataType(), field); auto discr = tryGetVariantDiscriminator(field_type); if (!discr) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot insert field \"{}\" into column with type {}", toString(field), getName()); + { + for (size_t i = 0; i != variants.size(); ++i) + { + if (field.getType() == variants[i]->getDefault().getType()) + { + discr = i; + break; + } + } + } + + if (!discr) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot insert field \"{}\" with type {} into column with type {}", toString(field), field.getTypeName(), getName()); + assert_cast(*column).insertIntoVariant(field, *discr); } diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 86a37949dc8..46353fffb48 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -124,15 +124,20 @@ void ISerialization::deserializeBinaryBulkWithMultipleStreams( DeserializeBinaryBulkStatePtr & /* state */, SubstreamsCache * cache) const { + 
LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialize path {}. Initial column size: {}", settings.path.toString(), column->size()); + auto cached_column = getFromSubstreamsCache(cache, settings.path); if (cached_column) { column = cached_column; + LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Use column from cache. Size: {}", cached_column->size()); } else if (ReadBuffer * stream = settings.getter(settings.path)) { auto mutable_column = column->assumeMutable(); + LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialize column. Initial size: {}", mutable_column->size()); deserializeBinaryBulk(*mutable_column, *stream, limit, settings.avg_value_size_hint); + LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialized column. Size: {}", mutable_column->size()); column = std::move(mutable_column); addToSubstreamsCache(cache, settings.path, column); } @@ -177,6 +182,8 @@ String getNameForSubstreamPath( } else if (it->type == Substream::VariantDiscriminators) stream_name += ".discr"; + else if (it->type == Substream::VariantOffsets) + stream_name += ".variant_offsets"; else if (it->type == Substream::VariantElement) stream_name += "." + it->variant_element_name; } diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index f0273f59d1f..5c6fe31ed9e 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -153,6 +153,7 @@ public: ObjectData, VariantDiscriminators, + VariantOffsets, VariantElements, VariantElement, diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 3b51c51872f..d36151fe8e9 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -291,28 +291,17 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( SerializationNumber().deserializeBinaryBulk(*col.getLocalDiscriminatorsPtr()->assumeMutable(), *discriminators_stream, limit, 0); addToSubstreamsCache(cache, settings.path, col.getLocalDiscriminatorsPtr()); } - settings.path.pop_back(); - /// Iterate through new discriminators and calculate the limit for each variant. - /// While calculating limits we can also fill offsets column (we store offsets only in memory). - auto & discriminators_data = col.getLocalDiscriminators(); - auto & offsets = col.getOffsets(); - offsets.reserve(offsets.size() + limit); + /// Second, calculate limits for each variant by iterating through new discriminators. std::vector variant_limits(variants.size(), 0); + auto & discriminators_data = col.getLocalDiscriminators(); size_t discriminators_offset = discriminators_data.size() - limit; for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) { ColumnVariant::Discriminator discr = discriminators_data[i]; - if (discr == ColumnVariant::NULL_DISCRIMINATOR) - { - offsets.emplace_back(); - } - else - { - offsets.push_back(col.getVariantByLocalDiscriminator(discr).size() + variant_limits[discr]); + if (discr != ColumnVariant::NULL_DISCRIMINATOR) ++variant_limits[discr]; - } } /// Now we can deserialize variants according to their limits. @@ -325,6 +314,41 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( settings.path.pop_back(); } settings.path.pop_back(); + + /// Fill offsets column. 
+ /// It's important to do it after deserialization of all variants, because to fill offsets we need + /// initial variant sizes without values in current range, but some variants can be shared with + /// other columns via substream cache and they can already contain values from this range even + /// before we call deserialize for them. So, before deserialization we cannot know for sure if + /// variant columns already contain values from current range or not. But after calling deserialize + /// we know for sure that they contain these values, so we can use variant limits and their + /// new sizes to calculate correct offsets. + settings.path.push_back(Substream::VariantOffsets); + if (auto cached_offsets = getFromSubstreamsCache(cache, settings.path)) + { + col.getOffsetsPtr() = cached_offsets; + } + else + { + auto & offsets = col.getOffsets(); + offsets.reserve(offsets.size() + limit); + std::vector variant_offsets; + variant_offsets.reserve(variants.size()); + for (size_t i = 0; i != variants.size(); ++i) + variant_offsets.push_back(col.getVariantByLocalDiscriminator(i).size() - variant_limits[i]); + + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) + { + ColumnVariant::Discriminator discr = discriminators_data[i]; + if (discr == ColumnVariant::NULL_DISCRIMINATOR) + offsets.emplace_back(); + else + offsets.push_back(variant_offsets[discr]++); + } + + addToSubstreamsCache(cache, settings.path, col.getOffsetsPtr()); + } + settings.path.pop_back(); } void SerializationVariant::addVariantElementToPath(DB::ISerialization::SubstreamPath & path, size_t i) const diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference b/tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference new file mode 100644 index 00000000000..4b93782cddf --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference @@ -0,0 +1,6 @@ +Memory +test +MergeTree compact +test +MergeTree wide +test diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh b/tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh new file mode 100755 index 00000000000..9ccad55191f --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_when_no_common_type_in_if=1 " + + +function test() +{ + echo "test" + $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 2, NULL, number % 3 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10))) from numbers(1000000) settings min_insert_block_size_rows=100000" + $CH_CLIENT -q "select v, v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" + $CH_CLIENT -q "select v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" + $CH_CLIENT -q "select v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64, v.\`Array(Variant(String, UInt64))\`.String from test order by id format Null" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=Memory" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" + diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference new file mode 100644 index 00000000000..4b93782cddf --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference @@ -0,0 +1,6 @@ +Memory +test +MergeTree compact +test +MergeTree wide +test diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh new file mode 100755 index 00000000000..9ccad55191f --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_when_no_common_type_in_if=1 " + + +function test() +{ + echo "test" + $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 2, NULL, number % 3 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10))) from numbers(1000000) settings min_insert_block_size_rows=100000" + $CH_CLIENT -q "select v, v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" + $CH_CLIENT -q "select v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" + $CH_CLIENT -q "select v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64, v.\`Array(Variant(String, UInt64))\`.String from test order by id format Null" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=Memory" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" + From 4931b363079aa5dd4fbc35ff6faea62efaf218de Mon Sep 17 00:00:00 2001 From: avogar Date: Sat, 23 Dec 2023 00:32:28 +0000 Subject: [PATCH 0067/1081] Fix style --- src/Columns/ColumnVector.cpp | 2 +- src/DataTypes/Serializations/SerializationArray.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index b4e3fee5e42..37e62c76596 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -469,7 +469,7 @@ void ColumnVector::insertRangeFrom(const IColumn & src, size_t start, size_t const ColumnVector & src_vec = assert_cast(src); if (start + length > src_vec.data.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, + throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND, "Parameters start = {}, length = {} are out of bound " "in ColumnVector::insertRangeFrom method (data.size() = {}).", toString(start), toString(length), toString(src_vec.data.size())); diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index be23278ef25..bb22af16c69 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -348,6 +348,7 @@ void SerializationArray::deserializeBinaryBulkWithMultipleStreams( { auto mutable_column = column->assumeMutable(); ColumnArray & column_array = typeid_cast(*mutable_column); + size_t prev_last_offset = column_array.getOffsets().back(); settings.path.push_back(Substream::ArraySizes); if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) @@ -371,9 +372,9 @@ void SerializationArray::deserializeBinaryBulkWithMultipleStreams( /// Number 
of values corresponding with `offset_values` must be read. size_t last_offset = offset_values.back(); - if (last_offset < nested_column->size()) + if (last_offset < prev_last_offset) throw Exception(ErrorCodes::LOGICAL_ERROR, "Nested column is longer than last offset"); - size_t nested_limit = last_offset - nested_column->size(); + size_t nested_limit = last_offset - prev_last_offset; if (unlikely(nested_limit > MAX_ARRAYS_SIZE)) throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array sizes are too large: {}", nested_limit); From 4e4aa90430d02f1fcc17b517946799f23c59b83e Mon Sep 17 00:00:00 2001 From: avogar Date: Sat, 23 Dec 2023 00:35:20 +0000 Subject: [PATCH 0068/1081] Remove debug logging --- src/DataTypes/Serializations/ISerialization.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 46353fffb48..08575f06f2a 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -124,20 +124,15 @@ void ISerialization::deserializeBinaryBulkWithMultipleStreams( DeserializeBinaryBulkStatePtr & /* state */, SubstreamsCache * cache) const { - LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialize path {}. Initial column size: {}", settings.path.toString(), column->size()); - auto cached_column = getFromSubstreamsCache(cache, settings.path); if (cached_column) { column = cached_column; - LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Use column from cache. Size: {}", cached_column->size()); } else if (ReadBuffer * stream = settings.getter(settings.path)) { auto mutable_column = column->assumeMutable(); - LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialize column. Initial size: {}", mutable_column->size()); deserializeBinaryBulk(*mutable_column, *stream, limit, settings.avg_value_size_hint); - LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialized column. Size: {}", mutable_column->size()); column = std::move(mutable_column); addToSubstreamsCache(cache, settings.path, column); } From bc757559c9f3fd1943bf338dc4fdac9e0e61240a Mon Sep 17 00:00:00 2001 From: una Date: Sat, 23 Dec 2023 18:10:42 +0800 Subject: [PATCH 0069/1081] feat:add InitialQuery event --- src/Common/ProfileEvents.cpp | 1 + src/Databases/DatabaseReplicatedWorker.cpp | 7 +++++-- src/Interpreters/DDLWorker.cpp | 2 +- .../queries/0_stateless/02950_initialquery_event.reference | 1 + tests/queries/0_stateless/02950_initialquery_event.sql | 1 + 5 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02950_initialquery_event.reference create mode 100644 tests/queries/0_stateless/02950_initialquery_event.sql diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index f342a19b2aa..a2dc7f5ecd6 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -8,6 +8,7 @@ M(Query, "Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. 
Does not count subqueries.") \ M(SelectQuery, "Same as Query, but only for SELECT queries.") \ M(InsertQuery, "Same as Query, but only for INSERT queries.") \ + M(InitialQuery, "Same as Query, but only counts initial queries (see is_initial_query).")\ M(QueriesWithSubqueries, "Count queries with all subqueries") \ M(SelectQueriesWithSubqueries, "Count SELECT queries with all subqueries") \ M(InsertQueriesWithSubqueries, "Count INSERT queries with all subqueries") \ diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 2056b403ff6..c90af7d4ea8 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -6,7 +6,10 @@ #include namespace fs = std::filesystem; - +namespace ProfileEvents +{ + extern const Event InitialQuery; +} namespace DB { @@ -264,7 +267,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr chassert(!task->entry.query.empty()); assert(!zookeeper->exists(task->getFinishedNodePath())); task->is_initial_query = true; - + ProfileEvents::increment(ProfileEvents::InitialQuery); LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); UInt64 timeout = query_context->getSettingsRef().database_replicated_initial_query_timeout_sec; { diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index f08fd72ff7f..ac3af6e441c 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -490,7 +490,7 @@ bool DDLWorker::tryExecuteQuery(DDLTaskBase & task, const ZooKeeperPtr & zookeep if (!task.is_initial_query) query_scope.emplace(query_context); - + executeQuery(istr, ostr, !task.is_initial_query, query_context, {}, QueryFlags{ .internal = false, .distributed_backup_restore = task.entry.is_backup_restore }); if (auto txn = query_context->getZooKeeperMetadataTransaction()) diff --git a/tests/queries/0_stateless/02950_initialquery_event.reference b/tests/queries/0_stateless/02950_initialquery_event.reference new file mode 100644 index 00000000000..7ad67a1e7e4 --- /dev/null +++ b/tests/queries/0_stateless/02950_initialquery_event.reference @@ -0,0 +1 @@ +InitialQuery 6 Same as Query, but only counts initial queries (see is_initial_query). diff --git a/tests/queries/0_stateless/02950_initialquery_event.sql b/tests/queries/0_stateless/02950_initialquery_event.sql new file mode 100644 index 00000000000..2b03607c5c7 --- /dev/null +++ b/tests/queries/0_stateless/02950_initialquery_event.sql @@ -0,0 +1 @@ +SELECT * FROM system.events where event = 'InitialQuery' \ No newline at end of file From b38e7060ef455e6ae569d371203309a1ad992c66 Mon Sep 17 00:00:00 2001 From: una Date: Sat, 23 Dec 2023 18:36:23 +0800 Subject: [PATCH 0070/1081] feat:add InitialQuery event --- src/Common/ProfileEvents.cpp | 1 + src/Databases/DatabaseReplicatedWorker.cpp | 7 +++++-- src/Interpreters/DDLWorker.cpp | 2 +- .../queries/0_stateless/02950_initialquery_event.reference | 1 + tests/queries/0_stateless/02950_initialquery_event.sql | 1 + 5 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02950_initialquery_event.reference create mode 100644 tests/queries/0_stateless/02950_initialquery_event.sql diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index f342a19b2aa..a2dc7f5ecd6 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -8,6 +8,7 @@ M(Query, "Number of queries to be interpreted and potentially executed. 
Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.") \ M(SelectQuery, "Same as Query, but only for SELECT queries.") \ M(InsertQuery, "Same as Query, but only for INSERT queries.") \ + M(InitialQuery, "Same as Query, but only counts initial queries (see is_initial_query).")\ M(QueriesWithSubqueries, "Count queries with all subqueries") \ M(SelectQueriesWithSubqueries, "Count SELECT queries with all subqueries") \ M(InsertQueriesWithSubqueries, "Count INSERT queries with all subqueries") \ diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 2056b403ff6..c90af7d4ea8 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -6,7 +6,10 @@ #include namespace fs = std::filesystem; - +namespace ProfileEvents +{ + extern const Event InitialQuery; +} namespace DB { @@ -264,7 +267,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr chassert(!task->entry.query.empty()); assert(!zookeeper->exists(task->getFinishedNodePath())); task->is_initial_query = true; - + ProfileEvents::increment(ProfileEvents::InitialQuery); LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); UInt64 timeout = query_context->getSettingsRef().database_replicated_initial_query_timeout_sec; { diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index f08fd72ff7f..ac3af6e441c 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -490,7 +490,7 @@ bool DDLWorker::tryExecuteQuery(DDLTaskBase & task, const ZooKeeperPtr & zookeep if (!task.is_initial_query) query_scope.emplace(query_context); - + executeQuery(istr, ostr, !task.is_initial_query, query_context, {}, QueryFlags{ .internal = false, .distributed_backup_restore = task.entry.is_backup_restore }); if (auto txn = query_context->getZooKeeperMetadataTransaction()) diff --git a/tests/queries/0_stateless/02950_initialquery_event.reference b/tests/queries/0_stateless/02950_initialquery_event.reference new file mode 100644 index 00000000000..7ad67a1e7e4 --- /dev/null +++ b/tests/queries/0_stateless/02950_initialquery_event.reference @@ -0,0 +1 @@ +InitialQuery 6 Same as Query, but only counts initial queries (see is_initial_query). 
diff --git a/tests/queries/0_stateless/02950_initialquery_event.sql b/tests/queries/0_stateless/02950_initialquery_event.sql new file mode 100644 index 00000000000..2b03607c5c7 --- /dev/null +++ b/tests/queries/0_stateless/02950_initialquery_event.sql @@ -0,0 +1 @@ +SELECT * FROM system.events where event = 'InitialQuery' \ No newline at end of file From 3e22f29b4529b6fefd5e92616ce9ef1ac33966d0 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sat, 23 Dec 2023 11:40:58 +0100 Subject: [PATCH 0071/1081] Fixed parameters --- docs/en/operations/backup.md | 2 +- .../registerBackupEngineAzureBlobStorage.cpp | 25 +++++++++++++++---- .../test.py | 2 +- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index 15d953249a0..4871f97c270 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -463,7 +463,7 @@ To write backups to an AzureBlobStorage container you need the following pieces The destination for a backup will be specified like this: ``` -AzureBlobStorage('/', '', '', '', ') +AzureBlobStorage('/', '', '', '', '') ``` ```sql diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp index 6f7b5f38c28..ef95206831f 100644 --- a/src/Backups/registerBackupEngineAzureBlobStorage.cpp +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -22,6 +22,7 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int SUPPORT_IS_DISABLED; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } #if USE_AZURE_BLOB_STORAGE @@ -54,20 +55,34 @@ void registerBackupEngineAzureBlobStorage(BackupFactory & factory) StorageAzureBlob::Configuration configuration; - if (args.size() == 4) + if (args.size() == 3) { configuration.connection_url = args[0].safeGet(); configuration.is_connection_string = true; configuration.container = args[1].safeGet(); configuration.blob_path = args[2].safeGet(); - configuration.format = args[3].safeGet(); LOG_TRACE(&Poco::Logger::get("registerBackupEngineAzureBlobStorage"), "configuration.connection_url = {}" "configuration.container = {}" - "configuration.blob_path = {}" - "configuration.format = {}", - configuration.connection_url, configuration.container, configuration.blob_path, configuration.format); + "configuration.blob_path = {}", + configuration.connection_url, configuration.container, configuration.blob_path); + } + else if (args.size() == 5) + { + configuration.connection_url = args[0].safeGet(); + configuration.is_connection_string = false; + + configuration.container = args[1].safeGet(); + configuration.blob_path = args[2].safeGet(); + configuration.account_name = args[3].safeGet(); + configuration.account_key = args[4].safeGet(); + + } + else + { + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Backup AzureBlobStorage requires 3 or 5 arguments: connection string>/ Date: Sat, 23 Dec 2023 18:42:41 +0800 Subject: [PATCH 0072/1081] feat:add InitialQuery event --- src/Interpreters/DDLWorker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index ac3af6e441c..f08fd72ff7f 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -490,7 +490,7 @@ bool DDLWorker::tryExecuteQuery(DDLTaskBase & task, const ZooKeeperPtr & zookeep if (!task.is_initial_query) query_scope.emplace(query_context); - + executeQuery(istr, ostr, !task.is_initial_query, query_context, {}, QueryFlags{ .internal 
= false, .distributed_backup_restore = task.entry.is_backup_restore }); if (auto txn = query_context->getZooKeeperMetadataTransaction()) From f594ab34f50c1bcd860bd3b950c8d74ffe09662d Mon Sep 17 00:00:00 2001 From: avogar Date: Sat, 23 Dec 2023 12:56:11 +0000 Subject: [PATCH 0073/1081] Fix special build --- src/Columns/ColumnVariant.cpp | 8 +++--- src/Columns/ColumnVariant.h | 2 +- .../Serializations/SerializationArray.cpp | 5 ++-- .../Serializations/SerializationVariant.cpp | 26 +++++++------------ src/Functions/if.cpp | 2 +- 5 files changed, 17 insertions(+), 26 deletions(-) diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp index a3a0362b646..f90ebfc54bb 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -102,7 +102,7 @@ ColumnVariant::ColumnVariant(MutableColumnPtr local_discriminators_, MutableColu { } -ColumnVariant::ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_, const std::vector & global_discriminators) : ColumnVariant(std::move(local_discriminators_), nullptr, std::move(variants_), global_discriminators) +ColumnVariant::ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_, const std::vector & local_to_global_discriminators_) : ColumnVariant(std::move(local_discriminators_), nullptr, std::move(variants_), local_to_global_discriminators_) { } @@ -449,12 +449,12 @@ void ColumnVariant::insertData(const char *, size_t) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertData is not supported for {}", getName()); } -void ColumnVariant::insert(const Field & field) +void ColumnVariant::insert(const Field & x) { - if (field.isNull()) + if (x.isNull()) insertDefault(); else - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot insert field {} to column {}", toString(field), getName()); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot insert field {} to column {}", toString(x), getName()); } void ColumnVariant::insertFrom(const IColumn & src_, size_t n) diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index ec58553f5f3..eb96205924c 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -194,7 +194,7 @@ public: template ColumnPtr indexImpl(const PaddedPODArray & indexes, size_t limit) const; ColumnPtr replicate(const Offsets & replicate_offsets) const override; - MutableColumns scatter(ColumnIndex num_variants, const Selector & selector) const override; + MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override; void gather(ColumnGathererStream & gatherer_stream) override; /// Variant type is not comparable. diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index bb22af16c69..be23278ef25 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -348,7 +348,6 @@ void SerializationArray::deserializeBinaryBulkWithMultipleStreams( { auto mutable_column = column->assumeMutable(); ColumnArray & column_array = typeid_cast(*mutable_column); - size_t prev_last_offset = column_array.getOffsets().back(); settings.path.push_back(Substream::ArraySizes); if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) @@ -372,9 +371,9 @@ void SerializationArray::deserializeBinaryBulkWithMultipleStreams( /// Number of values corresponding with `offset_values` must be read. 
size_t last_offset = offset_values.back(); - if (last_offset < prev_last_offset) + if (last_offset < nested_column->size()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Nested column is longer than last offset"); - size_t nested_limit = last_offset - prev_last_offset; + size_t nested_limit = last_offset - nested_column->size(); if (unlikely(nested_limit > MAX_ARRAYS_SIZE)) throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array sizes are too large: {}", nested_limit); diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index d36151fe8e9..c88dd8e9e0d 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -564,7 +564,7 @@ bool SerializationVariant::tryDeserializeImpl( IColumn & column, const String & field, std::function check_for_null, - std::function try_deserialize_variant) const + std::function try_deserialize_nested) const { auto & column_variant = assert_cast(column); ReadBufferFromString null_buf(field); @@ -577,25 +577,17 @@ bool SerializationVariant::tryDeserializeImpl( for (size_t global_discr : deserialize_text_order) { ReadBufferFromString variant_buf(field); - /// Usually try_deserialize_variant should not throw any exception, but let's use try/catch just in case. - try + auto & variant_column = column_variant.getVariantByGlobalDiscriminator(global_discr); + size_t prev_size = variant_column.size(); + if (try_deserialize_nested(variant_column, variants[global_discr], variant_buf) && variant_buf.eof()) { - auto & variant_column = column_variant.getVariantByGlobalDiscriminator(global_discr); - size_t prev_size = variant_column.size(); - if (try_deserialize_variant(variant_column, variants[global_discr], variant_buf) && variant_buf.eof()) - { - column_variant.getLocalDiscriminators().push_back(column_variant.localDiscriminatorByGlobal(global_discr)); - column_variant.getOffsets().push_back(prev_size); - return true; - } - else if (variant_column.size() > prev_size) - { - variant_column.popBack(1); - } + column_variant.getLocalDiscriminators().push_back(column_variant.localDiscriminatorByGlobal(global_discr)); + column_variant.getOffsets().push_back(prev_size); + return true; } - catch (...) + else if (variant_column.size() > prev_size) { - /// Try next variant. 
+ variant_column.popBack(1); } } diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index b15bc5938be..9ca4b487119 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -224,7 +224,7 @@ public: return std::make_shared(context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_when_no_common_type_in_if); } - FunctionIf(bool use_variant_when_no_common_type_ = false) : FunctionIfBase(), use_variant_when_no_common_type(use_variant_when_no_common_type_) {} + explicit FunctionIf(bool use_variant_when_no_common_type_ = false) : FunctionIfBase(), use_variant_when_no_common_type(use_variant_when_no_common_type_) {} private: bool use_variant_when_no_common_type = false; From fa5dde0bff8f34ebe85e1cc6e929f834c5e6b496 Mon Sep 17 00:00:00 2001 From: una Date: Wed, 27 Dec 2023 12:37:06 +0800 Subject: [PATCH 0074/1081] feat: Add initial query event --- src/Databases/DatabaseReplicatedWorker.cpp | 6 +-- src/Interpreters/InterpreterFactory.cpp | 5 +- ..._distributed_initial_query_event.reference | 6 +++ .../02950_distributed_initial_query_event.sh | 54 +++++++++++++++++++ .../02950_initialquery_event.reference | 1 - .../0_stateless/02950_initialquery_event.sql | 1 - 6 files changed, 66 insertions(+), 7 deletions(-) create mode 100644 tests/queries/0_stateless/02950_distributed_initial_query_event.reference create mode 100644 tests/queries/0_stateless/02950_distributed_initial_query_event.sh delete mode 100644 tests/queries/0_stateless/02950_initialquery_event.reference delete mode 100644 tests/queries/0_stateless/02950_initialquery_event.sql diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index c90af7d4ea8..317cda3cd3d 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -7,9 +7,7 @@ namespace fs = std::filesystem; namespace ProfileEvents -{ - extern const Event InitialQuery; -} + namespace DB { @@ -267,7 +265,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr chassert(!task->entry.query.empty()); assert(!zookeeper->exists(task->getFinishedNodePath())); task->is_initial_query = true; - ProfileEvents::increment(ProfileEvents::InitialQuery); + LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); UInt64 timeout = query_context->getSettingsRef().database_replicated_initial_query_timeout_sec; { diff --git a/src/Interpreters/InterpreterFactory.cpp b/src/Interpreters/InterpreterFactory.cpp index e32cbe4ccad..fdf7e8ebfbb 100644 --- a/src/Interpreters/InterpreterFactory.cpp +++ b/src/Interpreters/InterpreterFactory.cpp @@ -120,6 +120,7 @@ namespace ProfileEvents { extern const Event Query; + extern const Event InitialQuery; extern const Event QueriesWithSubqueries; extern const Event SelectQuery; extern const Event InsertQuery; @@ -137,7 +138,9 @@ namespace ErrorCodes std::unique_ptr InterpreterFactory::get(ASTPtr & query, ContextMutablePtr context, const SelectQueryOptions & options) { ProfileEvents::increment(ProfileEvents::Query); - + + if (context->getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) + ProfileEvents::increment(ProfileEvents::InitialQuery); /// SELECT and INSERT query will handle QueriesWithSubqueries on their own. 
if (!(query->as() || query->as() || diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.reference b/tests/queries/0_stateless/02950_distributed_initial_query_event.reference new file mode 100644 index 00000000000..af8542c7204 --- /dev/null +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.reference @@ -0,0 +1,6 @@ +Local situation +Initial Query Difference: 1 +Query Difference: 1 +Distributed situation +Initial Query Difference: 1 +Query Difference: 2 diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh new file mode 100644 index 00000000000..3a01aa63d87 --- /dev/null +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh @@ -0,0 +1,54 @@ +-- Tags: distributed + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh +# CREATE TABLE local (x UInt8) Engine=Memory; +# CREATE TABLE distributed ON CLUSTER cluster (p Date, i Int32) ENGINE = Distributed(test_cluster_two_shards, currentDatabase(), x) +$CLICKHOUSE_CLIENT -n -q " +DROP TABLE IF EXISTS local; +DROP TABLE IF EXISTS distributed; +CREATE TABLE local (x UInt8) Engine=Memory; +CREATE TABLE distributed AS local ENGINE = Distributed(test_cluster_two_shards, currentDatabase(), local, x); +INSERT INTO distributed SELECT number FROM numbers(10); +SYSTEM FLUSH DISTRIBUTED distributed; +" +echo "Local situation" +# before SELECT * FROM local +query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Execute SELECT * FROM local +$CLICKHOUSE_CLIENT -q "SELECT * FROM local" > /dev/null + +# Counts after SELECT * FROM local +After_query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +After_query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Calculate the differences +Initial_query_diff=$(($After_query_countI-$query_countI-2)) +query_diff=$(($After_query_countQ-$query_countQ-2)) + +echo "Initial Query Difference: $Initial_query_diff" +echo "Query Difference: $query_diff" +echo "Distributed situation" + +# before SELECT * FROM distributed +query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Execute SELECT * FROM distributed +$CLICKHOUSE_CLIENT -q "SELECT * FROM distributed" > /dev/null + +# Counts after SELECT * FROM distributed +After_query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +After_query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Calculate the differences +Initial_query_diff=$(($After_query_countI-$query_countI-2)) +query_diff=$(($After_query_countQ-$query_countQ-2)) + +echo "Initial Query Difference: $Initial_query_diff" +echo "Query Difference: $query_diff" + + diff --git a/tests/queries/0_stateless/02950_initialquery_event.reference b/tests/queries/0_stateless/02950_initialquery_event.reference deleted file mode 100644 index 7ad67a1e7e4..00000000000 --- a/tests/queries/0_stateless/02950_initialquery_event.reference +++ /dev/null @@ -1 +0,0 @@ -InitialQuery 6 Same as Query, but only counts initial queries (see is_initial_query). 
diff --git a/tests/queries/0_stateless/02950_initialquery_event.sql b/tests/queries/0_stateless/02950_initialquery_event.sql deleted file mode 100644 index 2b03607c5c7..00000000000 --- a/tests/queries/0_stateless/02950_initialquery_event.sql +++ /dev/null @@ -1 +0,0 @@ -SELECT * FROM system.events where event = 'InitialQuery' \ No newline at end of file From 1464c3d1aab8c6ecdc369facceb1b9f6cf4b36fb Mon Sep 17 00:00:00 2001 From: una Date: Wed, 27 Dec 2023 15:13:21 +0800 Subject: [PATCH 0075/1081] feat: Add initial query event --- src/Databases/DatabaseReplicatedWorker.cpp | 3 +-- .../02950_distributed_initial_query_event.reference | 2 +- .../0_stateless/02950_distributed_initial_query_event.sh | 7 +++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 317cda3cd3d..2056b403ff6 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -6,7 +6,6 @@ #include namespace fs = std::filesystem; -namespace ProfileEvents namespace DB { @@ -265,7 +264,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr chassert(!task->entry.query.empty()); assert(!zookeeper->exists(task->getFinishedNodePath())); task->is_initial_query = true; - + LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); UInt64 timeout = query_context->getSettingsRef().database_replicated_initial_query_timeout_sec; { diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.reference b/tests/queries/0_stateless/02950_distributed_initial_query_event.reference index af8542c7204..cf10427e9b3 100644 --- a/tests/queries/0_stateless/02950_distributed_initial_query_event.reference +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.reference @@ -3,4 +3,4 @@ Initial Query Difference: 1 Query Difference: 1 Distributed situation Initial Query Difference: 1 -Query Difference: 2 +Query Difference: 3 diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh index 3a01aa63d87..c8a955c4fe5 100644 --- a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh @@ -1,4 +1,5 @@ --- Tags: distributed +#!/usr/bin/env bash +# Tags:no-parallel shard CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -38,7 +39,7 @@ query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE even query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") # Execute SELECT * FROM distributed -$CLICKHOUSE_CLIENT -q "SELECT * FROM distributed" > /dev/null +$CLICKHOUSE_CLIENT -q "SELECT * FROM distributed SETTINGS prefer_localhost_replica = 0" > /dev/null # Counts after SELECT * FROM distributed After_query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") @@ -50,5 +51,3 @@ query_diff=$(($After_query_countQ-$query_countQ-2)) echo "Initial Query Difference: $Initial_query_diff" echo "Query Difference: $query_diff" - - From 22e1bcb9d638d5df0c43585b1d78228beedb0dc8 Mon Sep 17 00:00:00 2001 From: una Date: Wed, 27 Dec 2023 16:12:10 +0800 Subject: [PATCH 0076/1081] feat:add InitialQuery event Signed-off-by: una --- .../0_stateless/02950_distributed_initial_query_event.sh | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh index c8a955c4fe5..ddd0fb1e408 100644 --- a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags:no-parallel shard +# Tags:no-parallel, shard CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From a6f2eaf5a6ba2a26943d0c1c53c7cf7460a7471d Mon Sep 17 00:00:00 2001 From: una Date: Wed, 27 Dec 2023 16:19:06 +0800 Subject: [PATCH 0077/1081] fix:use , to split tags Signed-off-by: una --- .../0_stateless/02950_distributed_initial_query_event.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh index ddd0fb1e408..7f690a681c4 100644 --- a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags:no-parallel, shard +# Tags:no-parallel,shard CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From d46d91452176414426e40f598a7a1aa989f1a584 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 27 Dec 2023 10:28:52 +0100 Subject: [PATCH 0078/1081] Updated thread name --- src/Backups/BackupIO_AzureBlobStorage.cpp | 8 +- src/Backups/BackupIO_AzureBlobStorage.h | 81 +++++++++---------- .../copyAzureBlobStorageFile.cpp | 25 +++--- 3 files changed, 59 insertions(+), 55 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index d41d23e3c36..a1fd5bd8327 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -35,7 +35,7 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( const WriteSettings & write_settings_, const ContextPtr & context_) : BackupReaderDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupReaderAzureBlobStorage")) - , data_source_description{DataSourceType::AzureBlobStorage, "AzureBlobStorage", false, false} + , data_source_description{DataSourceType::AzureBlobStorage, configuration_.container, false, false} , configuration(configuration_) { client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); @@ -160,7 +160,7 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( const WriteSettings & write_settings_, const ContextPtr & context_) : BackupWriterDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupWriterAzureBlobStorage")) - , data_source_description{DataSourceType::AzureBlobStorage, "AzureBlobStorage", false, false} + , data_source_description{DataSourceType::AzureBlobStorage,configuration_.container, false, false} , configuration(configuration_) { client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); @@ -209,7 +209,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu settings, read_settings, {}, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterAzureBlobStorage")); + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterS3")); return; /// copied! 
} } @@ -221,7 +221,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu void BackupWriterAzureBlobStorage::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) { copyDataToAzureBlobStorageFile(create_read_buffer, start_pos, length, client, configuration.container, path_in_backup, settings, {}, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterAzureBlobStorage")); + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterS3")); } BackupWriterAzureBlobStorage::~BackupWriterAzureBlobStorage() = default; diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 6ef66fc432d..65affb9f079 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -12,57 +12,54 @@ namespace DB { -// using AzureClientPtr = std::shared_ptr; - /// Represents a backup stored to Azure - class BackupReaderAzureBlobStorage : public BackupReaderDefault - { - public: - BackupReaderAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); - ~BackupReaderAzureBlobStorage() override; +class BackupReaderAzureBlobStorage : public BackupReaderDefault +{ +public: + BackupReaderAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); + ~BackupReaderAzureBlobStorage() override; - bool fileExists(const String & file_name) override; - UInt64 getFileSize(const String & file_name) override; - std::unique_ptr readFile(const String & file_name) override; + bool fileExists(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; + std::unique_ptr readFile(const String & file_name) override; - void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, - DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; + void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; - private: - const DataSourceDescription data_source_description; - std::shared_ptr client; - StorageAzureBlob::Configuration configuration; - std::unique_ptr object_storage; - std::shared_ptr settings; - }; +private: + const DataSourceDescription data_source_description; + std::shared_ptr client; + StorageAzureBlob::Configuration configuration; + std::unique_ptr object_storage; + std::shared_ptr settings; +}; +class BackupWriterAzureBlobStorage : public BackupWriterDefault +{ +public: + BackupWriterAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); + ~BackupWriterAzureBlobStorage() override; - class BackupWriterAzureBlobStorage : public BackupWriterDefault - { - public: - BackupWriterAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); - ~BackupWriterAzureBlobStorage() override; + bool fileExists(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; + std::unique_ptr writeFile(const String & file_name) override; - bool 
fileExists(const String & file_name) override; - UInt64 getFileSize(const String & file_name) override; - std::unique_ptr writeFile(const String & file_name) override; + void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) override; + void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) override; - void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) override; - void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, - bool copy_encrypted, UInt64 start_pos, UInt64 length) override; + void removeFile(const String & file_name) override; + void removeFiles(const Strings & file_names) override; - void removeFile(const String & file_name) override; - void removeFiles(const Strings & file_names) override; - - private: - std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; - void removeFilesBatch(const Strings & file_names); - const DataSourceDescription data_source_description; - std::shared_ptr client; - StorageAzureBlob::Configuration configuration; - std::unique_ptr object_storage; - std::shared_ptr settings; - }; +private: + std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; + void removeFilesBatch(const Strings & file_names); + const DataSourceDescription data_source_description; + std::shared_ptr client; + StorageAzureBlob::Configuration configuration; + std::unique_ptr object_storage; + std::shared_ptr settings; +}; } diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index bf0bcac664b..0a0a080b5cb 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -22,6 +22,11 @@ namespace ProfileEvents extern const Event DiskAzureUploadPart; } +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + namespace DB { @@ -44,7 +49,8 @@ namespace std::shared_ptr settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunner schedule_, - bool for_disk_azure_blob_storage_) + bool for_disk_azure_blob_storage_, + const Poco::Logger * log_) : create_read_buffer(create_read_buffer_) , client(client_) , offset (offset_) @@ -55,7 +61,7 @@ namespace , object_metadata(object_metadata_) , schedule(schedule_) , for_disk_azure_blob_storage(for_disk_azure_blob_storage_) - , log(&Poco::Logger::get("azureBlobStorageUploadHelper")) + , log(log_) , max_single_part_upload_size(settings_.get()->max_single_part_upload_size) { } @@ -179,11 +185,11 @@ namespace try { auto read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); - auto buffer = std::make_unique(std::move(read_buffer), part_size); task->data = new char[part_size]; task->size = part_size; - buffer->read(task->data,part_size); - task->block_id = getRandomASCIIString(64); + size_t n = read_buffer->read(task->data,part_size); + if (n != part_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected size"); schedule([this, task, task_finish_notify]() { @@ -208,9 +214,10 @@ namespace { UploadPartTask task; auto read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); - auto buffer = std::make_unique(std::move(read_buffer), part_size); task.data = new char[part_size]; - 
buffer->read(task.data,part_size); + size_t n = read_buffer->read(task.data,part_size); + if (n != part_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected size"); task.size = part_size; processUploadTask(task); block_ids.emplace_back(task.block_id); @@ -274,7 +281,7 @@ void copyDataToAzureBlobStorageFile( ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) { - UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage}; + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyDataToAzureBlobStorageFile")}; helper.performCopy(); } @@ -314,7 +321,7 @@ void copyAzureBlobStorageFile( settings->max_single_download_retries); }; - UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage}; + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyAzureBlobStorageFile")}; helper.performCopy(); } } From 0181bab23c38c2d1c15f199d522a4743b11586d6 Mon Sep 17 00:00:00 2001 From: una Date: Wed, 27 Dec 2023 19:59:23 +0800 Subject: [PATCH 0079/1081] fix:style Signed-off-by: una --- src/Interpreters/InterpreterFactory.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Interpreters/InterpreterFactory.cpp b/src/Interpreters/InterpreterFactory.cpp index fdf7e8ebfbb..c5d7f0f891c 100644 --- a/src/Interpreters/InterpreterFactory.cpp +++ b/src/Interpreters/InterpreterFactory.cpp @@ -138,7 +138,6 @@ namespace ErrorCodes std::unique_ptr InterpreterFactory::get(ASTPtr & query, ContextMutablePtr context, const SelectQueryOptions & options) { ProfileEvents::increment(ProfileEvents::Query); - if (context->getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) ProfileEvents::increment(ProfileEvents::InitialQuery); /// SELECT and INSERT query will handle QueriesWithSubqueries on their own. 
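For reference, the behaviour the InitialQuery patches above assert reduces to one counting rule: every executed query increments the `Query` event, while only a query arriving directly from a client (`ClientInfo::QueryKind::INITIAL_QUERY`) increments `InitialQuery`. The sketch below is a minimal standalone model of that rule, not ClickHouse code; it assumes, as in the test, a two-shard distributed table with `prefer_localhost_replica = 0` whose secondary shard queries land on the same single-node server, which is what yields the expected differences of 1/1 for the local table and 1/3 for the distributed one.

```cpp
// Standalone sketch (not ClickHouse code) of the InitialQuery / Query counting rule.
#include <cstdint>
#include <iostream>

enum class QueryKind { Initial, Secondary };

struct Events
{
    uint64_t query = 0;
    uint64_t initial_query = 0;

    void onQueryStart(QueryKind kind)
    {
        ++query;                       // every query is counted
        if (kind == QueryKind::Initial)
            ++initial_query;           // only client-issued (initial) queries are counted here
    }
};

int main()
{
    Events events;

    // "Local situation": SELECT * FROM local -- one initial query, nothing fanned out.
    Events before = events;
    events.onQueryStart(QueryKind::Initial);
    std::cout << "Local: InitialQuery diff = " << events.initial_query - before.initial_query
              << ", Query diff = " << events.query - before.query << '\n';   // 1 and 1

    // "Distributed situation": one initial query plus one secondary query per shard,
    // all served by the same node in the single-server test setup.
    before = events;
    events.onQueryStart(QueryKind::Initial);
    events.onQueryStart(QueryKind::Secondary);   // shard 1
    events.onQueryStart(QueryKind::Secondary);   // shard 2
    std::cout << "Distributed: InitialQuery diff = " << events.initial_query - before.initial_query
              << ", Query diff = " << events.query - before.query << '\n';   // 1 and 3
}
```

The shell test additionally subtracts 2 from each raw difference, which can be read as discounting the two bookkeeping SELECTs on `system.events` that themselves run between the before/after snapshots.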
From 2c1513540768eaed34a13fd643c4ace491421c0e Mon Sep 17 00:00:00 2001 From: una Date: Wed, 27 Dec 2023 20:53:30 +0800 Subject: [PATCH 0080/1081] fix test-file permissions Signed-off-by: una --- .../queries/0_stateless/02950_distributed_initial_query_event.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/queries/0_stateless/02950_distributed_initial_query_event.sh diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh old mode 100644 new mode 100755 From 5497fa79edfa6fdc2559d516486f80f88af40c68 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 27 Dec 2023 14:11:53 +0000 Subject: [PATCH 0081/1081] Fix tests --- src/DataTypes/Serializations/SerializationEnum.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationEnum.cpp b/src/DataTypes/Serializations/SerializationEnum.cpp index fb384547d64..14b1a33e2ce 100644 --- a/src/DataTypes/Serializations/SerializationEnum.cpp +++ b/src/DataTypes/Serializations/SerializationEnum.cpp @@ -73,7 +73,9 @@ template bool SerializationEnum::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { std::string field_name; - readQuotedStringWithSQLStyle(field_name, istr); + if (!tryReadQuotedStringWithSQLStyle(field_name, istr)) + return false; + FieldType x; if (!ref_enum_values.tryGetValue(x, StringRef(field_name))) return false; From 4b2a0b99fc094e6b70e516af0360f126f62a886d Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 27 Dec 2023 20:02:50 +0100 Subject: [PATCH 0082/1081] Update docs/en/sql-reference/functions/other-functions.md --- docs/en/sql-reference/functions/other-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index d69d692d055..ebc80e4d308 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2839,7 +2839,7 @@ Extracts a column with specified type from a `Variant` column. **Syntax** ```sql -tupleElement(variant, type_name, [, default_value]) +variantElement(variant, type_name, [, default_value]) ``` **Arguments** From 275fbe3e986c8faee3bd396e3ed87e3707f0f25f Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 27 Dec 2023 20:04:14 +0000 Subject: [PATCH 0083/1081] Support function to subcolumns optimization for Variant, better text priority for reading Bool --- .../Passes/FunctionToSubcolumnsPass.cpp | 17 +++++++++++++++++ .../Serializations/SerializationVariant.cpp | 4 ++++ .../RewriteFunctionToSubcolumnVisitor.cpp | 15 +++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp index cd635f87e0e..c74c1038173 100644 --- a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp +++ b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp @@ -176,6 +176,23 @@ public: node = std::make_shared(column, column_source); } + else if (function_name == "variantElement" && isVariant(column_type) && second_argument_constant_node) + { + /// Replace `variantElement(variant_argument, type_name)` with `variant_argument.type_name`. 
+ const auto & variant_element_constant_value = second_argument_constant_node->getValue(); + String subcolumn_name; + + if (variant_element_constant_value.getType() != Field::Types::String) + return; + + subcolumn_name = variant_element_constant_value.get(); + + column.name += '.'; + column.name += subcolumn_name; + column.type = function_node->getResultType(); + + node = std::make_shared(column, column_source); + } else if (function_name == "mapContains" && column_type.isMap()) { const auto & data_type_map = assert_cast(*column.type); diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index c88dd8e9e0d..49ecb2fc546 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -534,6 +534,10 @@ std::tuple getTypeTextDeserializePriority(const DataType return {max_depth, max_priority, max_simple_nested_depth}; } + /// Bool type should have priority higher then all integers. + if (isBool(type)) + return {nested_depth, priority_map[TypeIndex::Int8] + 1 , simple_nested_depth}; + return {nested_depth, priority_map[type->getTypeId()], simple_nested_depth}; } diff --git a/src/Interpreters/RewriteFunctionToSubcolumnVisitor.cpp b/src/Interpreters/RewriteFunctionToSubcolumnVisitor.cpp index 506fa13b7ba..0717abd4782 100644 --- a/src/Interpreters/RewriteFunctionToSubcolumnVisitor.cpp +++ b/src/Interpreters/RewriteFunctionToSubcolumnVisitor.cpp @@ -122,6 +122,21 @@ void RewriteFunctionToSubcolumnData::visit(ASTFunction & function, ASTPtr & ast) ast = transformToSubcolumn(name_in_storage, subcolumn_name); ast->setAlias(alias); } + else if (function.name == "variantElement" && column_type_id == TypeIndex::Variant) + { + const auto * literal = arguments[1]->as(); + if (!literal) + return; + + String subcolumn_name; + auto value_type = literal->value.getType(); + if (value_type != Field::Types::String) + return; + + subcolumn_name = literal->value.get(); + ast = transformToSubcolumn(name_in_storage, subcolumn_name); + ast->setAlias(alias); + } else { auto it = binary_function_to_subcolumn.find(function.name); From 8b4157141c0501d4498278947b468d03638cdf8a Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 27 Dec 2023 20:36:10 +0000 Subject: [PATCH 0084/1081] Fix style --- src/DataTypes/Serializations/SerializationVariant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 49ecb2fc546..9cfc4b9e26f 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -536,7 +536,7 @@ std::tuple getTypeTextDeserializePriority(const DataType /// Bool type should have priority higher then all integers. 
if (isBool(type)) - return {nested_depth, priority_map[TypeIndex::Int8] + 1 , simple_nested_depth}; + return {nested_depth, priority_map[TypeIndex::Int8] + 1, simple_nested_depth}; return {nested_depth, priority_map[type->getTypeId()], simple_nested_depth}; } From 32ff152f2d7e4798a7bbc916808cc9ca883cf13e Mon Sep 17 00:00:00 2001 From: flynn Date: Thu, 28 Dec 2023 11:41:06 +0000 Subject: [PATCH 0085/1081] Support negtive position arguments --- .../replaceForPositionalArguments.cpp | 24 ++++- .../0_stateless/01798_having_push_down.sql | 3 +- .../02006_test_positional_arguments.reference | 94 +++++++++++++++++++ .../02006_test_positional_arguments.sql | 21 +++++ .../02932_group_by_null_fuzzer.sql | 1 + 5 files changed, 137 insertions(+), 6 deletions(-) diff --git a/src/Interpreters/replaceForPositionalArguments.cpp b/src/Interpreters/replaceForPositionalArguments.cpp index 241dd7cf92c..bea87ad913a 100644 --- a/src/Interpreters/replaceForPositionalArguments.cpp +++ b/src/Interpreters/replaceForPositionalArguments.cpp @@ -27,14 +27,28 @@ bool replaceForPositionalArguments(ASTPtr & argument, const ASTSelectQuery * sel return false; auto which = ast_literal->value.getType(); - if (which != Field::Types::UInt64) + if (which != Field::Types::UInt64 && which != Field::Types::Int64) return false; - auto pos = ast_literal->value.get(); + UInt64 pos; + + if (which == Field::Types::UInt64) + { + pos = ast_literal->value.get(); + } + else if (which == Field::Types::Int64) + { + auto value = ast_literal->value.get(); + pos = value > 0 ? value : columns.size() + value + 1; + } + else + { + return false; + } + if (!pos || pos > columns.size()) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Positional argument out of bounds: {} (expected in range [1, {}]", - pos, columns.size()); + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Positional argument out of bounds: {} (expected in range [1, {}]", pos, columns.size()); const auto & column = columns[--pos]; if (typeid_cast(column.get()) || typeid_cast(column.get())) diff --git a/tests/queries/0_stateless/01798_having_push_down.sql b/tests/queries/0_stateless/01798_having_push_down.sql index b3a77c8f5b5..c0c3447f5ab 100644 --- a/tests/queries/0_stateless/01798_having_push_down.sql +++ b/tests/queries/0_stateless/01798_having_push_down.sql @@ -8,11 +8,12 @@ SELECT sum(c0 = 0), min(c0 + 1), sum(c0 + 2) FROM t_having GROUP BY c0 HAVING c0 = 0 SETTINGS enable_optimize_predicate_expression=0; +SET enable_positional_arguments=0; + SELECT c0 + -1, sum(intDivOrZero(intDivOrZero(NULL, NULL), '2'), intDivOrZero(10000000000., intDivOrZero(intDivOrZero(intDivOrZero(NULL, NULL), 10), NULL))) FROM t_having GROUP BY c0 = 2, c0 = 10, intDivOrZero(intDivOrZero(intDivOrZero(NULL, NULL), NULL), NULL), c0 HAVING c0 = 2 SETTINGS enable_optimize_predicate_expression = 0; SELECT sum(c0 + 257) FROM t_having GROUP BY c0 = -9223372036854775808, NULL, -2147483649, c0 HAVING c0 = -9223372036854775808 SETTINGS enable_optimize_predicate_expression = 0; -SET enable_positional_arguments=0; SELECT c0 + -2, c0 + -9223372036854775807, c0 = NULL FROM t_having GROUP BY c0 = 0.9998999834060669, 1023, c0 HAVING c0 = 0.9998999834060669 SETTINGS enable_optimize_predicate_expression = 0; DROP TABLE t_having; diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.reference b/tests/queries/0_stateless/02006_test_positional_arguments.reference index 40100e8d5be..079bd071103 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.reference +++ 
b/tests/queries/0_stateless/02006_test_positional_arguments.reference @@ -3,18 +3,50 @@ select x3, x2, x1 from test order by 1; 1 100 100 10 1 10 100 10 1 +select x3, x2, x1 from test order by -3; +1 100 100 +10 1 10 +100 10 1 select x3, x2, x1 from test order by x3; 1 100 100 10 1 10 100 10 1 +select x3, x2, x1 from test order by 3; +100 10 1 +10 1 10 +1 100 100 +select x3, x2, x1 from test order by -1; +100 10 1 +10 1 10 +1 100 100 +select x3, x2, x1 from test order by x1; +100 10 1 +10 1 10 +1 100 100 select x3, x2, x1 from test order by 1 desc; 100 10 1 10 1 10 1 100 100 +select x3, x2, x1 from test order by -3 desc; +100 10 1 +10 1 10 +1 100 100 select x3, x2, x1 from test order by x3 desc; 100 10 1 10 1 10 1 100 100 +select x3, x2, x1 from test order by 3 desc; +1 100 100 +10 1 10 +100 10 1 +select x3, x2, x1 from test order by -1 desc; +1 100 100 +10 1 10 +100 10 1 +select x3, x2, x1 from test order by x1 desc; +1 100 100 +10 1 10 +100 10 1 insert into test values (1, 10, 100), (10, 1, 10), (100, 100, 1); select x3, x2 from test group by x3, x2 order by x3; 1 100 @@ -54,6 +86,20 @@ SELECT x1 FROM test ORDER BY x3 + 1 ASC +explain syntax select x3, x2, x1 from test order by -1; +SELECT + x3, + x2, + x1 +FROM test +ORDER BY x1 ASC +explain syntax select x3 + 1, x2, x1 from test order by -1; +SELECT + x3 + 1, + x2, + x1 +FROM test +ORDER BY x1 ASC explain syntax select x3, x3 - x2, x2, x1 from test order by 2; SELECT x3, @@ -62,6 +108,14 @@ SELECT x1 FROM test ORDER BY x3 - x2 ASC +explain syntax select x3, x3 - x2, x2, x1 from test order by -2; +SELECT + x3, + x3 - x2, + x2, + x1 +FROM test +ORDER BY x2 ASC explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by 2; SELECT x3, @@ -69,12 +123,28 @@ SELECT x1 + x2 FROM test ORDER BY if(x3 > 10, x3, x1 + x2) ASC +explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by -2; +SELECT + x3, + if(x3 > 10, x3, x1 + x2), + x1 + x2 +FROM test +ORDER BY if(x3 > 10, x3, x1 + x2) ASC explain syntax select max(x1), x2 from test group by 2 order by 1, 2; SELECT max(x1), x2 FROM test GROUP BY x2 +ORDER BY + max(x1) ASC, + x2 ASC +explain syntax select max(x1), x2 from test group by -1 order by -2, -1; +SELECT + max(x1), + x2 +FROM test +GROUP BY x2 ORDER BY max(x1) ASC, x2 ASC @@ -83,16 +153,34 @@ SELECT 1 + greatest(x1, 1), x2 FROM test +GROUP BY + 1 + greatest(x1, 1), + x2 +explain syntax select 1 + greatest(x1, 1), x2 from test group by -2, -1; +SELECT + 1 + greatest(x1, 1), + x2 +FROM test GROUP BY 1 + greatest(x1, 1), x2 select max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } select 1 + max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } +select max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } +select 1 + max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } explain syntax select x1 + x3, x3 from test group by 1, 2; SELECT x1 + x3, x3 FROM test +GROUP BY + x1 + x3, + x3 +explain syntax select x1 + x3, x3 from test group by -2, -1; +SELECT + x1 + x3, + x3 +FROM test GROUP BY x1 + x3, x3 @@ -102,8 +190,14 @@ select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, 1 2 10 100 10 20 1 10 100 200 100 1 +select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, -1 desc, -2 asc; +1 2 10 100 +10 20 1 10 +100 200 100 1 select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t group by 1,2,3,4,5,6 order by a; 44 88 13 14 15 16 +select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 
15 e, 16 f) t group by 1,2,3,-3,-2,-1 order by a; +44 88 13 14 15 16 explain syntax select plus(1, 1) as a group by a; SELECT 1 + 1 AS a GROUP BY a diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.sql b/tests/queries/0_stateless/02006_test_positional_arguments.sql index 159ad6bd427..6f427e0298d 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.sql +++ b/tests/queries/0_stateless/02006_test_positional_arguments.sql @@ -9,11 +9,21 @@ insert into test values (1, 10, 100), (10, 1, 10), (100, 100, 1); -- { echo } select x3, x2, x1 from test order by 1; +select x3, x2, x1 from test order by -3; select x3, x2, x1 from test order by x3; +select x3, x2, x1 from test order by 3; +select x3, x2, x1 from test order by -1; +select x3, x2, x1 from test order by x1; + select x3, x2, x1 from test order by 1 desc; +select x3, x2, x1 from test order by -3 desc; select x3, x2, x1 from test order by x3 desc; +select x3, x2, x1 from test order by 3 desc; +select x3, x2, x1 from test order by -1 desc; +select x3, x2, x1 from test order by x1 desc; + insert into test values (1, 10, 100), (10, 1, 10), (100, 100, 1); select x3, x2 from test group by x3, x2 order by x3; select x3, x2 from test group by 1, 2 order by x3; @@ -25,21 +35,32 @@ select x1, x2, x3 from test order by 3 limit 1 by 1; explain syntax select x3, x2, x1 from test order by 1; explain syntax select x3 + 1, x2, x1 from test order by 1; +explain syntax select x3, x2, x1 from test order by -1; +explain syntax select x3 + 1, x2, x1 from test order by -1; explain syntax select x3, x3 - x2, x2, x1 from test order by 2; +explain syntax select x3, x3 - x2, x2, x1 from test order by -2; explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by 2; +explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by -2; explain syntax select max(x1), x2 from test group by 2 order by 1, 2; +explain syntax select max(x1), x2 from test group by -1 order by -2, -1; explain syntax select 1 + greatest(x1, 1), x2 from test group by 1, 2; +explain syntax select 1 + greatest(x1, 1), x2 from test group by -2, -1; select max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } select 1 + max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } +select max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } +select 1 + max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } explain syntax select x1 + x3, x3 from test group by 1, 2; +explain syntax select x1 + x3, x3 from test group by -2, -1; create table test2(x1 Int, x2 Int, x3 Int) engine=Memory; insert into test2 values (1, 10, 100), (10, 1, 10), (100, 100, 1); select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, 4 desc, 3 asc; +select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, -1 desc, -2 asc; select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t group by 1,2,3,4,5,6 order by a; +select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t group by 1,2,3,-3,-2,-1 order by a; explain syntax select plus(1, 1) as a group by a; select substr('aaaaaaaaaaaaaa', 8) as a group by a order by a; diff --git a/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql b/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql index 0c28c120d40..603c7783ef8 100644 --- a/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql +++ b/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql @@ -1,5 +1,6 @@ -- 
https://github.com/ClickHouse/ClickHouse/issues/43202 -- Queries are generated by the fuzzer, so don't expect them to make sense +SET enable_positional_arguments=0; SELECT NULL, '' FROM (SELECT toNullable(''), NULL AS key GROUP BY GROUPING SETS ((NULL))) AS s1 ALL LEFT JOIN (SELECT '' AS key, NULL AS value GROUP BY GROUPING SETS (('')) WITH TOTALS UNION ALL SELECT NULL AS key, toNullable(NULL) AS value GROUP BY '', NULL, '' WITH TOTALS) AS s2 USING (key); SELECT NULL GROUP BY NULL WITH TOTALS; SELECT 1048575, NULL, b FROM (SELECT '25.5' AS a, NULL, NULL AS b GROUP BY GROUPING SETS ((0.0001)) WITH TOTALS) AS js1 ANY RIGHT JOIN (SELECT NULL AS a, NULL AS b WHERE NULL GROUP BY NULL, -9223372036854775807 WITH CUBE WITH TOTALS UNION ALL SELECT NULL AS a, NULL AS b GROUP BY 1, '21474836.46' WITH TOTALS) AS js2 USING (a, b) ORDER BY nan DESC NULLS LAST, '9223372036854775807' DESC NULLS LAST, a ASC NULLS LAST; From 4bb63f0a6f066bca972b5b3754a20f0a56354b8d Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Dec 2023 13:05:15 +0100 Subject: [PATCH 0086/1081] Update test --- .../02916_broken_projection.reference | 124 ------------------ .../0_stateless/02916_broken_projection.sh | 16 +-- 2 files changed, 8 insertions(+), 132 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index 358304de74a..d340326455a 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -2,11 +2,6 @@ insert new part insert new part insert new part insert new part -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -21,11 +16,6 @@ check table 1 0 broke metadata of part 'proj' (parent part: all_2_2_0) -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -46,11 +36,6 @@ all_2_2_0 broke data of part 'proj_2' (parent part: all_2_2_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -63,11 +48,6 @@ check table broken projections info all_2_2_0 proj FILE_DOESNT_EXIST all_2_2_0 proj_2 NO_FILE_IN_DATA_PART -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -94,14 +74,6 @@ broken projections info all_2_2_0 proj FILE_DOESNT_EXIST all_2_2_0 proj_2 NO_FILE_IN_DATA_PART all_3_3_0 proj_2 NO_FILE_IN_DATA_PART -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 0 ['proj','proj_2'] -all_3_5_1 1 ['proj'] -all_4_4_0 0 ['proj','proj_2'] -all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -131,14 +103,6 @@ all_1_1_0 proj_2 FILE_DOESNT_EXIST all_2_2_0 proj NO_FILE_IN_DATA_PART all_2_2_0 proj_2 FILE_DOESNT_EXIST all_3_3_0 proj_2 FILE_DOESNT_EXIST -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 0 ['proj','proj_2'] -all_3_5_1 1 ['proj'] -all_4_4_0 0 ['proj','proj_2'] -all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -155,18 +119,6 @@ check table full 
(test - all_1_1_0) all_1_1_0 materialize projection proj check table full (test - ) -system.parts -all_0_0_0 0 ['proj','proj_2'] -all_0_0_0_6 1 ['proj','proj_2'] -all_1_1_0 0 ['proj','proj_2'] -all_1_1_0_6 1 ['proj','proj_2'] -all_2_2_0 0 ['proj','proj_2'] -all_2_2_0_6 1 ['proj','proj_2'] -all_3_3_0 0 ['proj','proj_2'] -all_3_5_1 0 ['proj'] -all_3_5_1_6 1 ['proj'] -all_4_4_0 0 ['proj','proj_2'] -all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -189,25 +141,6 @@ OPTIMIZE TABLE test FINAL insert new part optimize OPTIMIZE TABLE test FINAL -system.parts -all_0_0_0 0 ['proj','proj_2'] -all_0_0_0_6 0 ['proj','proj_2'] -all_0_0_0_7 0 ['proj','proj_2'] -all_0_8_2_7 1 ['proj_2'] -all_1_1_0 0 ['proj','proj_2'] -all_1_1_0_6 0 ['proj','proj_2'] -all_1_1_0_7 0 ['proj','proj_2'] -all_2_2_0 0 ['proj','proj_2'] -all_2_2_0_6 0 ['proj','proj_2'] -all_2_2_0_7 0 ['proj','proj_2'] -all_3_3_0 0 ['proj','proj_2'] -all_3_5_1 0 ['proj'] -all_3_5_1_6 0 ['proj'] -all_3_5_1_7 0 ['proj','proj_2'] -all_4_4_0 0 ['proj','proj_2'] -all_5_5_0 0 ['proj','proj_2'] -all_8_8_0 0 ['proj','proj_2'] -all_9_9_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -224,9 +157,6 @@ insert new part insert new part insert new part insert new part -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj @@ -235,7 +165,6 @@ used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 -system.parts select from projection 'proj' used projections SELECT c FROM test2_replica WHERE d == 12 OR d == 16 ORDER BY c; proj @@ -248,9 +177,6 @@ check table broke data of part 'proj' (parent part: all_0_0_0) check table full (test2 - all_0_0_0) all_0_0_0 -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj @@ -262,9 +188,6 @@ check table broke data of part 'all_0_0_0' check table full (test2 - all_0_0_0) all_0_0_0 -system.parts -all_0_0_0 0 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj @@ -273,9 +196,6 @@ used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj @@ -288,11 +208,6 @@ insert new part insert new part insert new part insert new part -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -307,11 +222,6 @@ check table 1 0 broke data of part 'proj' (parent part: all_2_2_0) -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj FILE_DOESNT_EXIST select from projection 'proj_2' @@ -325,11 +235,6 @@ broken projections info all_2_2_0 proj NO_FILE_IN_DATA_PART BACKUP_CREATED RESTORED -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -345,11 +250,6 @@ check table broken projections info 0 broke all data of part 'proj' (parent 
part: all_2_2_0) -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj STD_EXCEPTION select from projection 'proj_2' @@ -363,15 +263,6 @@ broken projections info all_2_2_0 proj FILE_DOESNT_EXIST FILE_DOESNT_EXIST materialize projection proj -system.parts -all_0_0_0 0 ['proj','proj_2'] -all_0_0_0_4 1 ['proj','proj_2'] -all_1_1_0 0 ['proj','proj_2'] -all_1_1_0_4 1 ['proj','proj_2'] -all_2_2_0 0 ['proj','proj_2'] -all_2_2_0_4 1 ['proj','proj_2'] -all_3_3_0 0 ['proj','proj_2'] -all_3_3_0_4 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -388,11 +279,6 @@ broken projections info all_2_2_0 proj FILE_DOESNT_EXIST BACKUP_CREATED RESTORED -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -407,11 +293,6 @@ check table 1 0 broke all data of part 'proj' (parent part: all_1_1_0) -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj select from projection 'proj_2' 12 @@ -424,11 +305,6 @@ broken projections info all_1_1_0 proj FILE_DOESNT_EXIST BACKUP_CREATED RESTORED -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 55e613b8f3a..a1df5dc858d 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-random-merge-tree-settings, no-random-settings +# Tags: long, no-random-merge-tree-settings, no-random-settings, no-s3-storage # shellcheck disable=SC2046 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) @@ -149,12 +149,12 @@ function check() expected_error=$3 fi - echo 'system.parts' - $CLICKHOUSE_CLIENT -q " - SELECT name, active, projections - FROM system.parts - WHERE table='$table' AND database=currentDatabase() - ORDER BY name;" + #echo 'system.parts' + #$CLICKHOUSE_CLIENT -q " + #SELECT name, active, projections + #FROM system.parts + #WHERE table='$table' AND database=currentDatabase() + #ORDER BY name;" query_id=$(random 8) @@ -447,7 +447,7 @@ function test3() break_projection test proj all_2_2_0 part - check test proj STD_EXCEPTION + check test broken_projections_info test From e2f4219c12c216ab32a267b153969b758126a077 Mon Sep 17 00:00:00 2001 From: flynn Date: Thu, 28 Dec 2023 12:22:30 +0000 Subject: [PATCH 0087/1081] Fix --- src/Interpreters/TreeOptimizer.cpp | 7 +++---- .../02943_positional_arguments_bugs.reference | 11 ++++++++++- .../0_stateless/02943_positional_arguments_bugs.sql | 13 +++++++------ 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index 729e2ed6007..57dba3eef89 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -76,11 +76,10 @@ const std::unordered_set possibly_injective_function_names */ void appendUnusedGroupByColumn(ASTSelectQuery * select_query) { - /// You must insert a constant that is not the name of the column in the table. Such a case is rare, but it happens. 
- /// Also start unused_column integer must not intersect with ([1, source_columns.size()]) - /// might be in positional GROUP BY. + /// Since ASTLiteral is different from ASTIdentifier, so we can use a special constant String Literal for this, + /// and do not need to worry about it conflict with the name of the column in the table. select_query->setExpression(ASTSelectQuery::Expression::GROUP_BY, std::make_shared()); - select_query->groupBy()->children.emplace_back(std::make_shared(static_cast(-1))); + select_query->groupBy()->children.emplace_back(std::make_shared("__unused_group_by_column")); } /// Eliminates injective function calls and constant expressions from group by statement. diff --git a/tests/queries/0_stateless/02943_positional_arguments_bugs.reference b/tests/queries/0_stateless/02943_positional_arguments_bugs.reference index 702e1261186..47e8df9e382 100644 --- a/tests/queries/0_stateless/02943_positional_arguments_bugs.reference +++ b/tests/queries/0_stateless/02943_positional_arguments_bugs.reference @@ -1,2 +1,11 @@ -45 1 +0 0 +4 4 +3 3 +2 2 +5 5 +1 1 +6 6 +7 7 +9 9 +8 8 processed 99 0 diff --git a/tests/queries/0_stateless/02943_positional_arguments_bugs.sql b/tests/queries/0_stateless/02943_positional_arguments_bugs.sql index b8cf73da42d..8cc3fb4b17d 100644 --- a/tests/queries/0_stateless/02943_positional_arguments_bugs.sql +++ b/tests/queries/0_stateless/02943_positional_arguments_bugs.sql @@ -2,18 +2,19 @@ DROP TABLE IF EXISTS t; CREATE TABLE t ( - `n` int + `n` int, + `__unused_group_by_column` int ) - ENGINE = MergeTree - ORDER BY n AS -SELECT * +ENGINE = MergeTree +ORDER BY n AS +SELECT number, number FROM numbers(10); SELECT sum(n), - 1 AS x + __unused_group_by_column FROM t -GROUP BY x; +GROUP BY __unused_group_by_column; SELECT 'processed' AS type, From 3d2e95dbf5f81185d2a091d5e58490f66ed04bef Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Dec 2023 13:49:49 +0100 Subject: [PATCH 0088/1081] Fix build --- src/Storages/MergeTree/checkDataPart.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index ea46b6f0d56..5b60f0a7fc2 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -332,7 +332,7 @@ static IMergeTreeDataPart::Checksums checkDataPart( if (throw_on_broken_projection && !broken_projections_message.empty()) { - throw Exception(ErrorCodes::BROKEN_PROJECTION, broken_projections_message.data()); + throw Exception(ErrorCodes::BROKEN_PROJECTION, "{}", broken_projections_message); } if (require_checksums && !projections_on_disk.empty()) From 2e9cdd17ef136f064042b541dbc68ef64ba8194f Mon Sep 17 00:00:00 2001 From: flynn Date: Thu, 28 Dec 2023 14:08:14 +0000 Subject: [PATCH 0089/1081] Fix flaky test --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 26 ++++++++++++++----- .../02943_positional_arguments_bugs.reference | 11 ++++---- .../02943_positional_arguments_bugs.sql | 6 +++-- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 3290d918a8b..9ec6d9e358c 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -2156,19 +2156,31 @@ void QueryAnalyzer::replaceNodesWithPositionalArguments(QueryTreeNodePtr & node_ node_to_replace = &sort_node->getExpression(); auto * constant_node = (*node_to_replace)->as(); - if (!constant_node || constant_node->getValue().getType() 
!= Field::Types::UInt64) + + if (!constant_node + || (constant_node->getValue().getType() != Field::Types::UInt64 && constant_node->getValue().getType() != Field::Types::Int64)) continue; - UInt64 positional_argument_number = constant_node->getValue().get(); - if (positional_argument_number == 0 || positional_argument_number > projection_nodes.size()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, + UInt64 pos; + if (constant_node->getValue().getType() == Field::Types::UInt64) + { + pos = constant_node->getValue().get(); + } + else // Int64 + { + auto value = constant_node->getValue().get(); + pos = value > 0 ? value : projection_nodes.size() + value + 1; + } + + if (!pos || pos > projection_nodes.size()) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "Positional argument number {} is out of bounds. Expected in range [1, {}]. In scope {}", - positional_argument_number, + pos, projection_nodes.size(), scope.scope_node->formatASTForErrorMessage()); - --positional_argument_number; - *node_to_replace = projection_nodes[positional_argument_number]; + *node_to_replace = projection_nodes[--pos]; } } diff --git a/tests/queries/0_stateless/02943_positional_arguments_bugs.reference b/tests/queries/0_stateless/02943_positional_arguments_bugs.reference index 47e8df9e382..08310b7cf27 100644 --- a/tests/queries/0_stateless/02943_positional_arguments_bugs.reference +++ b/tests/queries/0_stateless/02943_positional_arguments_bugs.reference @@ -1,11 +1,12 @@ 0 0 -4 4 -3 3 -2 2 -5 5 1 1 +2 2 +3 3 +4 4 +5 5 6 6 7 7 -9 9 8 8 +9 9 +45 1 processed 99 0 diff --git a/tests/queries/0_stateless/02943_positional_arguments_bugs.sql b/tests/queries/0_stateless/02943_positional_arguments_bugs.sql index 8cc3fb4b17d..9b1b872ae40 100644 --- a/tests/queries/0_stateless/02943_positional_arguments_bugs.sql +++ b/tests/queries/0_stateless/02943_positional_arguments_bugs.sql @@ -3,7 +3,7 @@ DROP TABLE IF EXISTS t; CREATE TABLE t ( `n` int, - `__unused_group_by_column` int + `__unused_group_by_column` int ) ENGINE = MergeTree ORDER BY n AS @@ -14,7 +14,9 @@ SELECT sum(n), __unused_group_by_column FROM t -GROUP BY __unused_group_by_column; +GROUP BY __unused_group_by_column ORDER BY __unused_group_by_column; + +SELECT sum(n), 1 as x from t group by x; SELECT 'processed' AS type, From 493f938c455e9bd507d521b7974b1e7a9e7c81b2 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 28 Dec 2023 17:29:25 +0100 Subject: [PATCH 0090/1081] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index a1df5dc858d..ca62d275189 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -447,7 +447,7 @@ function test3() break_projection test proj all_2_2_0 part - check test + check test proj ErrnoException broken_projections_info test From 91657185c8fc4349cb8825ac2e5d6126fddb8289 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 29 Dec 2023 13:05:15 +0100 Subject: [PATCH 0091/1081] Fxi --- tests/queries/0_stateless/02916_broken_projection.reference | 2 +- tests/queries/0_stateless/02916_broken_projection.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index 
d340326455a..beaca49f99c 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -251,7 +251,7 @@ broken projections info 0 broke all data of part 'proj' (parent part: all_2_2_0) select from projection 'proj', expect error: proj -STD_EXCEPTION +Errno select from projection 'proj_2' 12 16 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index ca62d275189..99e54b08b74 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -447,7 +447,7 @@ function test3() break_projection test proj all_2_2_0 part - check test proj ErrnoException + check test proj Errno broken_projections_info test From e0f0100332085f3075951a6d9bf5c8d69f6d9940 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Fri, 29 Dec 2023 15:38:15 +0100 Subject: [PATCH 0092/1081] Update 02916_broken_projection.reference --- tests/queries/0_stateless/02916_broken_projection.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index beaca49f99c..3967215e5de 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -252,6 +252,7 @@ broken projections info broke all data of part 'proj' (parent part: all_2_2_0) select from projection 'proj', expect error: proj Errno +Errno select from projection 'proj_2' 12 16 From b70ff6d8ea71d4633cdcdbe3ef486707e70c1abb Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 1 Jan 2024 11:02:57 +0100 Subject: [PATCH 0093/1081] Fix build --- src/Backups/BackupIO_AzureBlobStorage.cpp | 33 +++++++++++++++++++++-- src/Backups/BackupIO_AzureBlobStorage.h | 2 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index a1fd5bd8327..bd4efcf63ae 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -218,10 +218,39 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu BackupWriterDefault::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); } +void BackupWriterAzureBlobStorage::copyFile(const String & destination, const String & source, size_t size) +{ + std::shared_ptr src_client; + std::shared_ptr dest_client; + StorageAzureBlob::Configuration src_configuration = configuration; + src_configuration.container = source; + src_client = StorageAzureBlob::createClient(src_configuration, /* is_read_only */ false); + + StorageAzureBlob::Configuration dest_configuration = configuration; + dest_configuration.container = destination; + dest_client = StorageAzureBlob::createClient(dest_configuration, /* is_read_only */ false); + + LOG_TRACE(log, "Copying file inside backup from {} to {} ", source, destination); + copyAzureBlobStorageFile( + src_client, + dest_client, + configuration.container, + fs::path(configuration.blob_path), + 0, + size, + /* dest_bucket= */ destination, + /* dest_key= */ configuration.blob_path, + settings, + read_settings, + {}, + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupRDAzure"), + /* for_disk_azure_blob_storage= */ true); +} + void BackupWriterAzureBlobStorage::copyDataToFile(const 
String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) { copyDataToAzureBlobStorageFile(create_read_buffer, start_pos, length, client, configuration.container, path_in_backup, settings, {}, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterS3")); + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWRAzure")); } BackupWriterAzureBlobStorage::~BackupWriterAzureBlobStorage() = default; @@ -257,7 +286,7 @@ UInt64 BackupWriterAzureBlobStorage::getFileSize(const String & file_name) RelativePathsWithMetadata children; object_storage->listObjects(key,children,/*max_keys*/0); if (children.empty()) - throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Object {} must exist"); + throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Object must exist"); return children[0].metadata.size_bytes; } diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 65affb9f079..87a6c3ef675 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -48,6 +48,8 @@ public: void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, bool copy_encrypted, UInt64 start_pos, UInt64 length) override; + void copyFile(const String & destination, const String & source, size_t size) override; + void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; From 4122de97213d835de5202d4ca741b4972973884b Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 2 Jan 2024 20:19:01 +0100 Subject: [PATCH 0094/1081] Updated tests and added settings --- src/Backups/BackupIO_AzureBlobStorage.cpp | 6 +- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 5 +- .../AzureBlobStorage/AzureObjectStorage.h | 11 ++- .../copyAzureBlobStorageFile.cpp | 68 +++++++++++++++++-- src/Storages/StorageAzureBlob.cpp | 2 +- .../configs/config.xml | 11 --- .../configs/disable_profilers.xml | 13 ---- .../configs/users.xml | 8 --- .../test.py | 2 - 9 files changed, 80 insertions(+), 46 deletions(-) delete mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml delete mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml delete mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index bd4efcf63ae..15e8e92a85d 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -139,7 +139,7 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, settings, read_settings, object_attributes, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupReaderAzureBlobStorage"), + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupRDAzure"), /* for_disk_azure_blob_storage= */ true); return file_size; @@ -209,7 +209,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu settings, read_settings, {}, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterS3")); + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWRAzure")); return; /// copied! 
} } @@ -243,7 +243,7 @@ void BackupWriterAzureBlobStorage::copyFile(const String & destination, const St settings, read_settings, {}, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupRDAzure"), + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWRAzure"), /* for_disk_azure_blob_storage= */ true); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 6075b385a6c..9e703d6fc5e 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -164,7 +164,10 @@ std::unique_ptr getAzureBlobStorageSettings(const Po config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), config.getInt(config_prefix + ".max_single_read_retries", 3), config.getInt(config_prefix + ".max_single_download_retries", 3), - config.getInt(config_prefix + ".list_object_keys_size", 1000) + config.getInt(config_prefix + ".list_object_keys_size", 1000), + config.getUInt64(config_prefix + ".min_upload_part_size", 16 * 1024 * 1024), + config.getUInt64(config_prefix + ".max_upload_part_size", 5ULL * 1024 * 1024 * 1024), + config.getUInt64(config_prefix + ".max_part_number", 10000) ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 8e3d50418d3..55c81b4b7d9 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -24,12 +24,18 @@ struct AzureObjectStorageSettings uint64_t min_bytes_for_seek_, int max_single_read_retries_, int max_single_download_retries_, - int list_object_keys_size_) + int list_object_keys_size_, + size_t min_upload_part_size_, + size_t max_upload_part_size_, + size_t max_part_number_) : max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , max_single_read_retries(max_single_read_retries_) , max_single_download_retries(max_single_download_retries_) , list_object_keys_size(list_object_keys_size_) + , min_upload_part_size(min_upload_part_size_) + , max_upload_part_size(max_upload_part_size_) + , max_part_number(max_part_number_) { } @@ -40,6 +46,9 @@ struct AzureObjectStorageSettings size_t max_single_read_retries = 3; size_t max_single_download_retries = 3; int list_object_keys_size = 1000; + size_t min_upload_part_size = 16 * 1024 * 1024; + size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; + size_t max_part_number = 10000; }; using AzureClient = Azure::Storage::Blobs::BlobContainerClient; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 0a0a080b5cb..5ca30fa8071 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -22,15 +22,17 @@ namespace ProfileEvents extern const Event DiskAzureUploadPart; } -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int INVALID_CONFIG_PARAMETER; +} + + size_t max_single_operation_copy_size = 256 * 1024 * 1024; @@ -106,6 +108,60 @@ namespace std::mutex bg_tasks_mutex; std::condition_variable bg_tasks_condvar; + void calculatePartSize() + { + if (!total_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Chosen multipart upload for an empty file. 
This must not happen"); + + auto max_part_number = settings.get()->max_part_number; + auto min_upload_part_size = settings.get()->min_upload_part_size; + auto max_upload_part_size = settings.get()->max_upload_part_size; + + if (!max_part_number) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_part_number must not be 0"); + else if (!min_upload_part_size) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "min_upload_part_size must not be 0"); + else if (max_upload_part_size < min_upload_part_size) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_upload_part_size must not be less than min_upload_part_size"); + + size_t part_size = min_upload_part_size; + size_t num_parts = (total_size + part_size - 1) / part_size; + + if (num_parts > max_part_number) + { + part_size = (total_size + max_part_number - 1) / max_part_number; + num_parts = (total_size + part_size - 1) / part_size; + } + + if (part_size > max_upload_part_size) + { + part_size = max_upload_part_size; + num_parts = (total_size + part_size - 1) / part_size; + } + + if (num_parts < 1 || num_parts > max_part_number || part_size < min_upload_part_size || part_size > max_upload_part_size) + { + String msg; + if (num_parts < 1) + msg = "Number of parts is zero"; + else if (num_parts > max_part_number) + msg = fmt::format("Number of parts {} exceeds {}", num_parts, max_part_number); + else if (part_size < min_upload_part_size) + msg = fmt::format("Size of a part {} is less than {}", part_size, min_upload_part_size); + else + msg = fmt::format("Size of a part {} exceeds {}", part_size, max_upload_part_size); + + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + "{} while writing {} bytes to AzureBlobStorage. Check max_part_number = {}, " + "min_upload_part_size = {}, max_upload_part_size = {}", + msg, total_size, max_part_number, min_upload_part_size, max_upload_part_size); + } + + /// We've calculated the size of a normal part (the final part can be smaller). + normal_part_size = part_size; + } + public: void performCopy() { @@ -120,7 +176,7 @@ namespace void performMultipartUpload() { - normal_part_size = 1024; + calculatePartSize(); size_t position = offset; size_t end_position = offset + total_size; diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 1b28a2c2fac..f1070c8c31e 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -1139,7 +1139,7 @@ StorageAzureBlobSource::ReaderHolder StorageAzureBlobSource::createReader() QueryPipelineBuilder builder; std::shared_ptr source; std::unique_ptr read_buf; - std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files + std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files ?
tryGetNumRowsFromCache(path_with_metadata) : std::nullopt; if (num_rows_from_cache) { diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml b/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml deleted file mode 100644 index 5725dce40cd..00000000000 --- a/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml +++ /dev/null @@ -1,11 +0,0 @@ - - 1 - 0 - 0.0 - 0 - 1 - 1 - 0 - 16 - 16 - \ No newline at end of file diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml b/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml deleted file mode 100644 index b74bb1502ce..00000000000 --- a/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - 0 - 0 - 0 - 1000 - 1 - 1 - - - diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml b/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml deleted file mode 100644 index c12eb2f79f4..00000000000 --- a/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - default - - - diff --git a/tests/integration/test_backup_restore_azure_blob_storage/test.py b/tests/integration/test_backup_restore_azure_blob_storage/test.py index 0a48d3523f0..06c18d7468f 100644 --- a/tests/integration/test_backup_restore_azure_blob_storage/test.py +++ b/tests/integration/test_backup_restore_azure_blob_storage/test.py @@ -24,8 +24,6 @@ def cluster(): cluster = ClickHouseCluster(__file__) cluster.add_instance( "node", - main_configs=["configs/config.xml"], - user_configs=["configs/disable_profilers.xml", "configs/users.xml"], with_azurite=True, ) cluster.start() From df221f7db65fd17af6a71704f756e47ceec7a928 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 3 Jan 2024 11:35:06 +0100 Subject: [PATCH 0095/1081] Renamed Bucket-Key to Container-Blob --- src/Backups/BackupIO_AzureBlobStorage.cpp | 14 +++--- .../copyAzureBlobStorageFile.cpp | 44 +++++++++---------- .../copyAzureBlobStorageFile.h | 10 ++--- 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 15e8e92a85d..de40fc6b33b 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -134,8 +134,8 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, fs::path(configuration.blob_path) / path_in_backup, 0, file_size, - /* dest_bucket= */ blob_path[1], - /* dest_key= */ blob_path[0], + /* dest_container */ blob_path[1], + /* dest_path */ blob_path[0], settings, read_settings, object_attributes, @@ -178,7 +178,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu auto source_data_source_description = src_disk->getDataSourceDescription(); if (source_data_source_description.sameKind(data_source_description) && (source_data_source_description.is_encrypted == copy_encrypted)) { - /// getBlobPath() can return more than 3 elements if the file is stored as multiple objects in AzureBlobStorage bucket. + /// getBlobPath() can return more than 3 elements if the file is stored as multiple objects in AzureBlobStorage container. /// In this case we can't use the native copy. 
if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 2) { @@ -200,8 +200,8 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu copyAzureBlobStorageFile( src_client, client, - /* src_bucket */ blob_path[1], - /* src_key= */ blob_path[0], + /* src_container */ blob_path[1], + /* src_path */ blob_path[0], start_pos, length, configuration.container, @@ -238,8 +238,8 @@ void BackupWriterAzureBlobStorage::copyFile(const String & destination, const St fs::path(configuration.blob_path), 0, size, - /* dest_bucket= */ destination, - /* dest_key= */ configuration.blob_path, + /* dest_container */ destination, + /* dest_path */ configuration.blob_path, settings, read_settings, {}, diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 5ca30fa8071..df1341efdd1 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -46,8 +46,8 @@ namespace std::shared_ptr client_, size_t offset_, size_t total_size_, - const String & dest_bucket_, - const String & dest_key_, + const String & dest_container_, + const String & dest_blob_, std::shared_ptr settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunner schedule_, @@ -57,8 +57,8 @@ namespace , client(client_) , offset (offset_) , total_size (total_size_) - , dest_bucket(dest_bucket_) - , dest_key(dest_key_) + , dest_container(dest_container_) + , dest_blob(dest_blob_) , settings(settings_) , object_metadata(object_metadata_) , schedule(schedule_) @@ -75,8 +75,8 @@ namespace std::shared_ptr client; size_t offset; size_t total_size; - const String & dest_bucket; - const String & dest_key; + const String & dest_container; + const String & dest_blob; std::shared_ptr settings; const std::optional> & object_metadata; ThreadPoolCallbackRunner schedule; @@ -170,7 +170,7 @@ namespace void completeMultipartUpload() { - auto block_blob_client = client->GetBlockBlobClient(dest_key); + auto block_blob_client = client->GetBlockBlobClient(dest_blob); block_blob_client.CommitBlockList(block_ids); } @@ -207,7 +207,7 @@ namespace void uploadPart(size_t part_offset, size_t part_size) { - LOG_TRACE(log, "Writing part. Bucket: {}, Key: {}, Size: {}", dest_bucket, dest_key, part_size); + LOG_TRACE(log, "Writing part. Container: {}, Blob: {}, Size: {}", dest_container, dest_blob, part_size); if (!part_size) { @@ -286,7 +286,7 @@ namespace std::lock_guard lock(bg_tasks_mutex); /// Protect bg_tasks from race task.block_id = block_id; - LOG_TRACE(log, "Writing part finished. Bucket: {}, Key: {}, block_id: {}, Parts: {}", dest_bucket, dest_key, block_id, bg_tasks.size()); + LOG_TRACE(log, "Writing part finished. 
Container: {}, Blob: {}, block_id: {}, Parts: {}", dest_container, dest_blob, block_id, bg_tasks.size()); } String processUploadPartRequest(UploadPartTask & task) @@ -295,7 +295,7 @@ namespace if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureUploadPart); - auto block_blob_client = client->GetBlockBlobClient(dest_key); + auto block_blob_client = client->GetBlockBlobClient(dest_blob); task.block_id = getRandomASCIIString(64); Azure::Core::IO::MemoryBodyStream memory(reinterpret_cast(task.data), task.size); block_blob_client.StageBlock(task.block_id, memory); @@ -330,14 +330,14 @@ void copyDataToAzureBlobStorageFile( size_t offset, size_t size, std::shared_ptr & dest_client, - const String & dest_bucket, - const String & dest_key, + const String & dest_container, + const String & dest_blob, std::shared_ptr settings, const std::optional> & object_metadata, ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) { - UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyDataToAzureBlobStorageFile")}; + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container, dest_blob, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyDataToAzureBlobStorageFile")}; helper.performCopy(); } @@ -345,12 +345,12 @@ void copyDataToAzureBlobStorageFile( void copyAzureBlobStorageFile( std::shared_ptr src_client, std::shared_ptr dest_client, - const String & src_bucket, - const String & src_key, + const String & src_container, + const String & src_blob, size_t offset, size_t size, - const String & dest_bucket, - const String & dest_key, + const String & dest_container, + const String & dest_blob, std::shared_ptr settings, const ReadSettings & read_settings, const std::optional> & object_metadata, @@ -363,21 +363,21 @@ void copyAzureBlobStorageFile( ProfileEvents::increment(ProfileEvents::AzureCopyObject); if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); - auto block_blob_client_src = src_client->GetBlockBlobClient(src_key); - auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_key); + auto block_blob_client_src = src_client->GetBlockBlobClient(src_blob); + auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_blob); auto uri = block_blob_client_src.GetUrl(); block_blob_client_dest.CopyFromUri(uri); } else { - LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Bucket: {}, Key: {}", src_bucket, src_key); + LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Container: {}, Blob: {}", src_container, src_blob); auto create_read_buffer = [&] { - return std::make_unique(src_client, src_key, read_settings, settings->max_single_read_retries, + return std::make_unique(src_client, src_blob, read_settings, settings->max_single_read_retries, settings->max_single_download_retries); }; - UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyAzureBlobStorageFile")}; + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container, dest_blob, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyAzureBlobStorageFile")}; helper.performCopy(); } } diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h 
b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index 31228fbcb23..059d0318f57 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -23,12 +23,12 @@ using CreateReadBuffer = std::function()>; void copyAzureBlobStorageFile( std::shared_ptr src_client, std::shared_ptr dest_client, - const String & src_bucket, - const String & src_key, + const String & src_container, + const String & src_path, size_t src_offset, size_t src_size, - const String & dest_bucket, - const String & dest_key, + const String & dest_container, + const String & dest_path, std::shared_ptr settings, const ReadSettings & read_settings, const std::optional> & object_metadata = std::nullopt, @@ -46,8 +46,8 @@ void copyDataToAzureBlobStorageFile( size_t offset, size_t size, std::shared_ptr & client, + const String & dest_container, const String & dest_bucket, - const String & dest_key, std::shared_ptr settings, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunner schedule_ = {}, From 91bad5bc39963e9450f284dfc6b45fd69fa146de Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 4 Jan 2024 16:06:36 +0100 Subject: [PATCH 0096/1081] Updated to use MultiVersion for BlobContainerClient in Backups and updated to get client from disk --- src/Backups/BackupIO_AzureBlobStorage.cpp | 72 +++++-------------- src/Backups/BackupIO_AzureBlobStorage.h | 4 +- .../AzureBlobStorage/AzureObjectStorage.h | 5 ++ .../copyAzureBlobStorageFile.cpp | 20 +++--- .../copyAzureBlobStorageFile.h | 6 +- 5 files changed, 37 insertions(+), 70 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index de40fc6b33b..968a60c566f 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -27,8 +27,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -//using AzureClientPtr = std::shared_ptr; - BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, @@ -38,12 +36,13 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( , data_source_description{DataSourceType::AzureBlobStorage, configuration_.container, false, false} , configuration(configuration_) { - client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); + auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupReaderAzureBlobStorage", - std::make_unique(*client.get()), + std::move(client_ptr), std::move(settings_as_unique_ptr)); + client = object_storage->getClient(); } BackupReaderAzureBlobStorage::~BackupReaderAzureBlobStorage() = default; @@ -89,7 +88,7 @@ std::unique_ptr BackupReaderAzureBlobStorage::readFile(const key = file_name; } return std::make_unique( - client, key, read_settings, settings->max_single_read_retries, + client.get(), key, read_settings, settings->max_single_read_retries, settings->max_single_download_retries); } @@ -113,23 +112,9 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, "Blob writing function called with unexpected blob_path.size={} or mode={}", blob_path.size(), mode); - std::shared_ptr dest_client; - if (configuration.container == blob_path[1]) - { - dest_client = client; - } - else - { - 
StorageAzureBlob::Configuration dest_configuration = configuration; - dest_configuration.container = blob_path[1]; - dest_configuration.blob_path = blob_path[0]; - dest_client = StorageAzureBlob::createClient(dest_configuration, /* is_read_only */ false); - } - - copyAzureBlobStorageFile( client, - dest_client, + reinterpret_cast(destination_disk->getObjectStorage().get())->getClient(), configuration.container, fs::path(configuration.blob_path) / path_in_backup, 0, @@ -163,12 +148,13 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( , data_source_description{DataSourceType::AzureBlobStorage,configuration_.container, false, false} , configuration(configuration_) { - client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); + auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupWriterAzureBlobStorage", - std::make_unique(*client.get()), - std::move(settings_as_unique_ptr)); + std::move(client_ptr), + std::move(settings_as_unique_ptr)); + client = object_storage->getClient(); } void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, @@ -182,23 +168,9 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu /// In this case we can't use the native copy. if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 2) { - - std::shared_ptr src_client; - if (configuration.container == blob_path[1]) - { - src_client = client; - } - else - { - StorageAzureBlob::Configuration src_configuration = configuration; - src_configuration.container = blob_path[1]; - src_configuration.blob_path = blob_path[0]; - src_client = StorageAzureBlob::createClient(src_configuration, /* is_read_only */ false); - } - LOG_TRACE(log, "Copying file {} from disk {} to AzureBlobStorag", src_path, src_disk->getName()); copyAzureBlobStorageFile( - src_client, + reinterpret_cast(src_disk->getObjectStorage().get())->getClient(), client, /* src_container */ blob_path[1], /* src_path */ blob_path[0], @@ -220,26 +192,16 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu void BackupWriterAzureBlobStorage::copyFile(const String & destination, const String & source, size_t size) { - std::shared_ptr src_client; - std::shared_ptr dest_client; - StorageAzureBlob::Configuration src_configuration = configuration; - src_configuration.container = source; - src_client = StorageAzureBlob::createClient(src_configuration, /* is_read_only */ false); - - StorageAzureBlob::Configuration dest_configuration = configuration; - dest_configuration.container = destination; - dest_client = StorageAzureBlob::createClient(dest_configuration, /* is_read_only */ false); - LOG_TRACE(log, "Copying file inside backup from {} to {} ", source, destination); copyAzureBlobStorageFile( - src_client, - dest_client, + client, + client, configuration.container, - fs::path(configuration.blob_path), + fs::path(source), 0, size, - /* dest_container */ destination, - /* dest_path */ configuration.blob_path, + /* dest_container */ configuration.container, + /* dest_path */ destination, settings, read_settings, {}, @@ -303,7 +265,7 @@ std::unique_ptr BackupWriterAzureBlobStorage::readFile(const String } return std::make_unique( - client, key, read_settings, 
settings->max_single_read_retries, + client.get(), key, read_settings, settings->max_single_read_retries, settings->max_single_download_retries); } @@ -319,7 +281,7 @@ std::unique_ptr BackupWriterAzureBlobStorage::writeFile(const Strin key = file_name; } return std::make_unique( - client, + client.get(), key, settings->max_single_part_upload_size, DBMS_DEFAULT_BUFFER_SIZE, diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 87a6c3ef675..12bf073cd08 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -28,7 +28,7 @@ public: private: const DataSourceDescription data_source_description; - std::shared_ptr client; + MultiVersion client; StorageAzureBlob::Configuration configuration; std::unique_ptr object_storage; std::shared_ptr settings; @@ -57,7 +57,7 @@ private: std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; void removeFilesBatch(const Strings & file_names); const DataSourceDescription data_source_description; - std::shared_ptr client; + MultiVersion client; StorageAzureBlob::Configuration configuration; std::unique_ptr object_storage; std::shared_ptr settings; diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 55c81b4b7d9..1ff4537742f 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -134,6 +134,11 @@ public: bool isRemote() const override { return true; } + MultiVersion & getClient() + { + return client; + } + private: const String name; /// client used to access the files in the Blob Storage cloud diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index df1341efdd1..4ec90d2830e 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -43,7 +43,7 @@ namespace public: UploadHelper( const CreateReadBuffer & create_read_buffer_, - std::shared_ptr client_, + MultiVersion & client_, size_t offset_, size_t total_size_, const String & dest_container_, @@ -72,7 +72,7 @@ namespace protected: std::function()> create_read_buffer; - std::shared_ptr client; + MultiVersion & client; size_t offset; size_t total_size; const String & dest_container; @@ -170,7 +170,7 @@ namespace void completeMultipartUpload() { - auto block_blob_client = client->GetBlockBlobClient(dest_blob); + auto block_blob_client = client.get()->GetBlockBlobClient(dest_blob); block_blob_client.CommitBlockList(block_ids); } @@ -295,7 +295,7 @@ namespace if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureUploadPart); - auto block_blob_client = client->GetBlockBlobClient(dest_blob); + auto block_blob_client = client.get()->GetBlockBlobClient(dest_blob); task.block_id = getRandomASCIIString(64); Azure::Core::IO::MemoryBodyStream memory(reinterpret_cast(task.data), task.size); block_blob_client.StageBlock(task.block_id, memory); @@ -329,7 +329,7 @@ void copyDataToAzureBlobStorageFile( const std::function()> & create_read_buffer, size_t offset, size_t size, - std::shared_ptr & dest_client, + MultiVersion & dest_client, const String & dest_container, const String & dest_blob, std::shared_ptr settings, @@ -343,8 +343,8 @@ void copyDataToAzureBlobStorageFile( void copyAzureBlobStorageFile( - std::shared_ptr src_client, - std::shared_ptr 
dest_client, + MultiVersion & src_client, + MultiVersion & dest_client, const String & src_container, const String & src_blob, size_t offset, @@ -363,8 +363,8 @@ void copyAzureBlobStorageFile( ProfileEvents::increment(ProfileEvents::AzureCopyObject); if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); - auto block_blob_client_src = src_client->GetBlockBlobClient(src_blob); - auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_blob); + auto block_blob_client_src = src_client.get()->GetBlockBlobClient(src_blob); + auto block_blob_client_dest = dest_client.get()->GetBlockBlobClient(dest_blob); auto uri = block_blob_client_src.GetUrl(); block_blob_client_dest.CopyFromUri(uri); } @@ -373,7 +373,7 @@ void copyAzureBlobStorageFile( LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Container: {}, Blob: {}", src_container, src_blob); auto create_read_buffer = [&] { - return std::make_unique(src_client, src_blob, read_settings, settings->max_single_read_retries, + return std::make_unique(src_client.get(), src_blob, read_settings, settings->max_single_read_retries, settings->max_single_download_retries); }; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index 059d0318f57..a6502541db1 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -21,8 +21,8 @@ using CreateReadBuffer = std::function()>; /// Copies a file from AzureBlobStorage to AzureBlobStorage. /// The parameters `src_offset` and `src_size` specify a part in the source to copy. void copyAzureBlobStorageFile( - std::shared_ptr src_client, - std::shared_ptr dest_client, + MultiVersion & src_client, + MultiVersion & dest_client, const String & src_container, const String & src_path, size_t src_offset, @@ -45,7 +45,7 @@ void copyDataToAzureBlobStorageFile( const std::function()> & create_read_buffer, size_t offset, size_t size, - std::shared_ptr & client, + MultiVersion & client, const String & dest_container, const String & dest_bucket, std::shared_ptr settings, From c14605caa7f403531a6ff0663c242aa5d466ab07 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 4 Jan 2024 18:27:54 +0100 Subject: [PATCH 0097/1081] Added flag use_native_copy and updated to use StartCopyFromUri for native copy with large files --- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 3 ++- .../AzureBlobStorage/AzureObjectStorage.h | 10 +++---- .../copyAzureBlobStorageFile.cpp | 26 ++++++++++++++++--- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 9e703d6fc5e..e29def06363 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -167,7 +167,8 @@ std::unique_ptr getAzureBlobStorageSettings(const Po config.getInt(config_prefix + ".list_object_keys_size", 1000), config.getUInt64(config_prefix + ".min_upload_part_size", 16 * 1024 * 1024), config.getUInt64(config_prefix + ".max_upload_part_size", 5ULL * 1024 * 1024 * 1024), - config.getUInt64(config_prefix + ".max_part_number", 10000) + config.getUInt64(config_prefix + ".max_part_number", 10000), + config.getBool(config_prefix + ".use_native_copy", false) ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h 
b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 1ff4537742f..436b48c0ad4 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -27,7 +27,8 @@ struct AzureObjectStorageSettings int list_object_keys_size_, size_t min_upload_part_size_, size_t max_upload_part_size_, - size_t max_part_number_) + size_t max_part_number_, + bool use_native_copy_) : max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , max_single_read_retries(max_single_read_retries_) @@ -36,6 +37,7 @@ struct AzureObjectStorageSettings , min_upload_part_size(min_upload_part_size_) , max_upload_part_size(max_upload_part_size_) , max_part_number(max_part_number_) + , use_native_copy(use_native_copy_) { } @@ -49,6 +51,7 @@ struct AzureObjectStorageSettings size_t min_upload_part_size = 16 * 1024 * 1024; size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; size_t max_part_number = 10000; + bool use_native_copy = false; }; using AzureClient = Azure::Storage::Blobs::BlobContainerClient; @@ -134,10 +137,7 @@ public: bool isRemote() const override { return true; } - MultiVersion & getClient() - { - return client; - } + MultiVersion & getClient() { return client; } private: const String name; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 4ec90d2830e..9db5ddb476a 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -30,6 +30,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int INVALID_CONFIG_PARAMETER; + extern const int AZURE_BLOB_STORAGE_ERROR; } @@ -358,15 +359,34 @@ void copyAzureBlobStorageFile( bool for_disk_azure_blob_storage) { - if (size < max_single_operation_copy_size) + if (settings->use_native_copy ) { ProfileEvents::increment(ProfileEvents::AzureCopyObject); if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); + auto block_blob_client_src = src_client.get()->GetBlockBlobClient(src_blob); auto block_blob_client_dest = dest_client.get()->GetBlockBlobClient(dest_blob); - auto uri = block_blob_client_src.GetUrl(); - block_blob_client_dest.CopyFromUri(uri); + auto source_uri = block_blob_client_src.GetUrl(); + + if (size < max_single_operation_copy_size) + { + block_blob_client_dest.CopyFromUri(source_uri); + } + else + { + Azure::Storage::Blobs::StartBlobCopyOperation operation = block_blob_client_dest.StartCopyFromUri(source_uri); + + // Wait for the operation to finish, checking the status every 100 milliseconds.
+ auto copy_response = operation.PollUntilDone(std::chrono::milliseconds(100)); + auto properties_model = copy_response.Value; + + if (properties_model.CopySource.HasValue()) + { + throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Copy failed"); + } + + } } else { From 2ee68933123583fe585093868e65c3562d36d66a Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 5 Jan 2024 10:58:04 +0100 Subject: [PATCH 0098/1081] Updated to return container for getObjectsNamespace --- src/Backups/BackupIO_AzureBlobStorage.cpp | 6 ++++-- .../ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp | 7 +++++-- .../ObjectStorages/AzureBlobStorage/AzureObjectStorage.h | 6 ++++-- .../AzureBlobStorage/registerDiskAzureBlobStorage.cpp | 4 +++- src/Storages/StorageAzureBlob.cpp | 2 +- src/TableFunctions/TableFunctionAzureBlobStorage.cpp | 4 ++-- .../TableFunctionAzureBlobStorageCluster.cpp | 4 ++-- 7 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 968a60c566f..5ddbb42e2c0 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -41,7 +41,8 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupReaderAzureBlobStorage", std::move(client_ptr), - std::move(settings_as_unique_ptr)); + std::move(settings_as_unique_ptr), + configuration_.container); client = object_storage->getClient(); } @@ -153,7 +154,8 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupWriterAzureBlobStorage", std::move(client_ptr), - std::move(settings_as_unique_ptr)); + std::move(settings_as_unique_ptr), + configuration_.container); client = object_storage->getClient(); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 068e2aebab1..1f92ef48350 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -92,10 +92,12 @@ private: AzureObjectStorage::AzureObjectStorage( const String & name_, AzureClientPtr && client_, - SettingsPtr && settings_) + SettingsPtr && settings_, + const String & container_) : name(name_) , client(std::move(client_)) , settings(std::move(settings_)) + , container(container_) , log(&Poco::Logger::get("AzureObjectStorage")) { data_source_description.type = DataSourceType::AzureBlobStorage; @@ -379,7 +381,8 @@ std::unique_ptr AzureObjectStorage::cloneObjectStorage(const std return std::make_unique( name, getAzureBlobContainerClient(config, config_prefix), - getAzureBlobStorageSettings(config, config_prefix, context) + getAzureBlobStorageSettings(config, config_prefix, context), + container ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 436b48c0ad4..660d4a30889 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -66,7 +66,8 @@ public: AzureObjectStorage( const String & name_, AzureClientPtr && client_, - SettingsPtr && settings_); + SettingsPtr && settings_, + const String & container_); void listObjects(const std::string & path, 
RelativePathsWithMetadata & children, int max_keys) const override; @@ -125,7 +126,7 @@ public: const std::string & config_prefix, ContextPtr context) override; - String getObjectsNamespace() const override { return ""; } + String getObjectsNamespace() const override { return container ; } std::unique_ptr cloneObjectStorage( const std::string & new_namespace, @@ -144,6 +145,7 @@ private: /// client used to access the files in the Blob Storage cloud MultiVersion client; MultiVersion settings; + const String container; Poco::Logger * log; diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp index 7ba9d21db62..2ffd910f92a 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp @@ -26,10 +26,12 @@ void registerDiskAzureBlobStorage(DiskFactory & factory, bool global_skip_access { auto [metadata_path, metadata_disk] = prepareForLocalMetadata(name, config, config_prefix, context); + String container_name = config.getString(config_prefix + ".container_name", "default-container"); ObjectStoragePtr azure_object_storage = std::make_unique( name, getAzureBlobContainerClient(config, config_prefix), - getAzureBlobStorageSettings(config, config_prefix, context)); + getAzureBlobStorageSettings(config, config_prefix, context), + container_name); String key_prefix; auto metadata_storage = std::make_shared(metadata_disk, key_prefix); diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index f1070c8c31e..fcd7074b9d2 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -314,7 +314,7 @@ void registerStorageAzureBlob(StorageFactory & factory) return std::make_shared( std::move(configuration), - std::make_unique("AzureBlobStorage", std::move(client), std::move(settings)), + std::make_unique("AzureBlobStorage", std::move(client), std::move(settings),configuration.container), args.getContext(), args.table_id, args.columns, diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index d394c836369..b098cac5144 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -262,7 +262,7 @@ ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(Contex auto client = StorageAzureBlob::createClient(configuration, !is_insert_query); auto settings = StorageAzureBlob::createSettings(context); - auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings)); + auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings), configuration.container); return StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context, false); } @@ -293,7 +293,7 @@ StoragePtr TableFunctionAzureBlobStorage::executeImpl(const ASTPtr & /*ast_funct StoragePtr storage = std::make_shared( configuration, - std::make_unique(table_name, std::move(client), std::move(settings)), + std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), context, StorageID(getDatabaseName(), table_name), columns, diff --git a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp index 
eee585967c2..1c3b302a186 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp @@ -40,7 +40,7 @@ StoragePtr TableFunctionAzureBlobStorageCluster::executeImpl( /// On worker node this filename won't contains globs storage = std::make_shared( configuration, - std::make_unique(table_name, std::move(client), std::move(settings)), + std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), context, StorageID(getDatabaseName(), table_name), columns, @@ -55,7 +55,7 @@ StoragePtr TableFunctionAzureBlobStorageCluster::executeImpl( storage = std::make_shared( cluster_name, configuration, - std::make_unique(table_name, std::move(client), std::move(settings)), + std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, From b250acff789620be57e21977d8f3d4a3468070d5 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 5 Jan 2024 11:26:32 +0100 Subject: [PATCH 0099/1081] Fixed style check --- src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 9db5ddb476a..3399f1705f4 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -359,7 +359,7 @@ void copyAzureBlobStorageFile( bool for_disk_azure_blob_storage) { - if (settings->use_native_copy ) + if (settings->use_native_copy) { ProfileEvents::increment(ProfileEvents::AzureCopyObject); if (for_disk_azure_blob_storage) From 356fc0aadb8f7c0f15f72c3b72955e1db7046e48 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sun, 7 Jan 2024 14:49:24 +0100 Subject: [PATCH 0100/1081] Fix tests --- src/Storages/StorageView.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index 2f7267e3701..1898e49de86 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -112,8 +112,14 @@ StorageView::StorageView( : IStorage(table_id_) { StorageInMemoryMetadata storage_metadata; - if (is_parameterized_view_ && !query.isParameterizedView()) + if (is_parameterized_view_) + { + if (!query.isParameterizedView()) + storage_metadata.setColumns(columns_); + } + else storage_metadata.setColumns(columns_); + storage_metadata.setComment(comment); if (!query.select) From fd92c1961e5f09411d83b21c4fe9f00b78be22ba Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sun, 7 Jan 2024 16:33:48 +0100 Subject: [PATCH 0101/1081] Fix clang tidy build --- src/Backups/BackupIO_AzureBlobStorage.cpp | 12 ++++++------ src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp | 10 +++++----- src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h | 6 +++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 5ddbb42e2c0..8c6c1040eec 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -305,21 +305,21 @@ void BackupWriterAzureBlobStorage::removeFile(const String & file_name) object_storage->removeObjectIfExists(object); } -void BackupWriterAzureBlobStorage::removeFiles(const Strings & keys) +void BackupWriterAzureBlobStorage::removeFiles(const Strings & file_names) { 
StoredObjects objects; - for (const auto & key : keys) - objects.emplace_back(key); + for (const auto & file_name : file_names) + objects.emplace_back(file_name); object_storage->removeObjectsIfExist(objects); } -void BackupWriterAzureBlobStorage::removeFilesBatch(const Strings & keys) +void BackupWriterAzureBlobStorage::removeFilesBatch(const Strings & file_names) { StoredObjects objects; - for (const auto & key : keys) - objects.emplace_back(key); + for (const auto & file_name : file_names) + objects.emplace_back(file_name); object_storage->removeObjectsIfExist(objects); } diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 3399f1705f4..272be914cc1 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -65,11 +65,11 @@ namespace , schedule(schedule_) , for_disk_azure_blob_storage(for_disk_azure_blob_storage_) , log(log_) - , max_single_part_upload_size(settings_.get()->max_single_part_upload_size) + , max_single_part_upload_size(settings_->max_single_part_upload_size) { } - ~UploadHelper() {} + virtual ~UploadHelper() = default; protected: std::function()> create_read_buffer; @@ -114,9 +114,9 @@ namespace if (!total_size) throw Exception(ErrorCodes::LOGICAL_ERROR, "Chosen multipart upload for an empty file. This must not happen"); - auto max_part_number = settings.get()->max_part_number; - auto min_upload_part_size = settings.get()->min_upload_part_size; - auto max_upload_part_size = settings.get()->max_upload_part_size; + auto max_part_number = settings->max_part_number; + auto min_upload_part_size = settings->min_upload_part_size; + auto max_upload_part_size = settings->max_upload_part_size; if (!max_part_number) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_part_number must not be 0"); diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index a6502541db1..b022151d32d 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -24,11 +24,11 @@ void copyAzureBlobStorageFile( MultiVersion & src_client, MultiVersion & dest_client, const String & src_container, - const String & src_path, + const String & src_blob, size_t src_offset, size_t src_size, const String & dest_container, - const String & dest_path, + const String & dest_blob, std::shared_ptr settings, const ReadSettings & read_settings, const std::optional> & object_metadata = std::nullopt, @@ -47,7 +47,7 @@ void copyDataToAzureBlobStorageFile( size_t size, MultiVersion & client, const String & dest_container, - const String & dest_bucket, + const String & dest_blob, std::shared_ptr settings, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunner schedule_ = {}, From f50f7f56949021d01ba692f6788e50d411ca8af9 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 8 Jan 2024 14:25:33 +0100 Subject: [PATCH 0102/1081] Removed unwanted includes --- .../registerBackupEngineAzureBlobStorage.cpp | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp index ef95206831f..810da5adb3f 100644 --- a/src/Backups/registerBackupEngineAzureBlobStorage.cpp +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -10,13 +10,11 @@ #include #include #include -#include #endif namespace DB { -namespace fs = 
std::filesystem; namespace ErrorCodes { @@ -25,23 +23,6 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } -#if USE_AZURE_BLOB_STORAGE -namespace -{ - String removeFileNameFromURL(String & url) - { - Poco::URI url2{url}; - String path = url2.getPath(); - size_t slash_pos = path.find_last_of('/'); - String file_name = path.substr(slash_pos + 1); - path.resize(slash_pos + 1); - url2.setPath(path); - url = url2.toString(); - return file_name; - } -} -#endif - void registerBackupEngineAzureBlobStorage(BackupFactory & factory) { From 2d914721e5101215c2c63c97151552cb7c8ff746 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 8 Jan 2024 15:10:37 +0100 Subject: [PATCH 0103/1081] Fix build --- .../registerBackupEngineAzureBlobStorage.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp index 810da5adb3f..3480ea75f1f 100644 --- a/src/Backups/registerBackupEngineAzureBlobStorage.cpp +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #endif @@ -23,6 +24,22 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } +#if USE_AZURE_BLOB_STORAGE +namespace +{ + String removeFileNameFromURL(String & url) + { + Poco::URI url2{url}; + String path = url2.getPath(); + size_t slash_pos = path.find_last_of('/'); + String file_name = path.substr(slash_pos + 1); + path.resize(slash_pos + 1); + url2.setPath(path); + url = url2.toString(); + return file_name; + } +} +#endif void registerBackupEngineAzureBlobStorage(BackupFactory & factory) { From 3de5b27c48483962285de0b16f152cc35eadd1a6 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 8 Jan 2024 16:50:17 +0100 Subject: [PATCH 0104/1081] Fix conflicts --- .../Serializations/SerializationString.cpp | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index a6bf29336b7..b2c254e63c5 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -381,7 +381,7 @@ void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & ist str_value = "false"; } - read(column, [&](ColumnString::Chars & data) { data.insert(str_value.begin(), str_value.end()); }); + read(column, [&](ColumnString::Chars & data) { data.insert(str_value.begin(), str_value.end()); }); } else if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') { @@ -406,6 +406,26 @@ bool SerializationString::tryDeserializeTextJSON(IColumn & column, ReadBuffer & if (settings.json.read_arrays_as_strings && !istr.eof() && *istr.position() == '[') return read(column, [&](ColumnString::Chars & data) { return readJSONArrayInto(data, istr); }); + if (settings.json.read_bools_as_strings && !istr.eof() && (*istr.position() == 't' || *istr.position() == 'f')) + { + String str_value; + if (*istr.position() == 't') + { + if (!checkString("true", istr)) + return false; + str_value = "true"; + } + else if (*istr.position() == 'f') + { + if (!checkString("false", istr)) + return false; + str_value = "false"; + } + + read(column, [&](ColumnString::Chars & data) { data.insert(str_value.begin(), str_value.end()); }); + return true; + } + if (settings.json.read_numbers_as_strings && !istr.eof() && 
*istr.position() != '"') { String field; From c5bf722ee2d2b50d1b0691112b769e3e67612214 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 8 Jan 2024 21:24:44 +0300 Subject: [PATCH 0105/1081] Create ch/chc/chl symlinks by cmake as well (for develop mode) Before, they had been created only by install target. Follow-up for: #56634 Signed-off-by: Azat Khuzhin --- programs/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index b3a5af6d6c9..6e544bac81c 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -328,6 +328,10 @@ set (CLICKHOUSE_BUNDLE) if (ENABLE_CLICKHOUSE_SELF_EXTRACTING) list(APPEND CLICKHOUSE_BUNDLE self-extracting) endif () + +if (NOT BUILD_STANDALONE_KEEPER) + add_custom_target (ch ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse ch DEPENDS clickhouse) +endif() if (ENABLE_CLICKHOUSE_SERVER) add_custom_target (clickhouse-server ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-server DEPENDS clickhouse) install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-server" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) @@ -335,11 +339,13 @@ if (ENABLE_CLICKHOUSE_SERVER) endif () if (ENABLE_CLICKHOUSE_CLIENT) add_custom_target (clickhouse-client ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-client DEPENDS clickhouse) + add_custom_target (chc ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse chc DEPENDS clickhouse) install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-client" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) list(APPEND CLICKHOUSE_BUNDLE clickhouse-client) endif () if (ENABLE_CLICKHOUSE_LOCAL) add_custom_target (clickhouse-local ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-local DEPENDS clickhouse) + add_custom_target (chl ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse chl DEPENDS clickhouse) install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-local" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) list(APPEND CLICKHOUSE_BUNDLE clickhouse-local) endif () From 21e4b453dfc7df905ed304c5513b50f57ef19228 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 8 Jan 2024 22:02:40 +0100 Subject: [PATCH 0106/1081] Fix pretty type name --- src/DataTypes/DataTypeVariant.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index 5dc42cc7443..2bc4dfa5a7a 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -71,17 +71,17 @@ std::string DataTypeVariant::doGetPrettyName(size_t indent) const { size_t size = variants.size(); WriteBufferFromOwnString s; - s << "Variant(\n"; + s << "Variant("; for (size_t i = 0; i != size; ++i) { if (i != 0) - s << ",\n"; + s << ", "; - s << fourSpaceIndent(indent + 1) << variants[i]->getPrettyName(indent + 1); + s << variants[i]->getPrettyName(indent); } - s << '\n' << fourSpaceIndent(indent) << ')'; + s << ')'; return s.str(); } From 629d4b921e5cf2d709d2ca7a55658d95407e2ff7 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 9 Jan 2024 15:38:04 +0000 Subject: [PATCH 0107/1081] Fix style --- src/Analyzer/Passes/IfConstantConditionPass.cpp | 2 +- src/Storages/StorageMerge.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Analyzer/Passes/IfConstantConditionPass.cpp b/src/Analyzer/Passes/IfConstantConditionPass.cpp index f3b8b712dbf..6b24eb1d539 
100644 --- a/src/Analyzer/Passes/IfConstantConditionPass.cpp +++ b/src/Analyzer/Passes/IfConstantConditionPass.cpp @@ -57,7 +57,7 @@ public: } -void IfConstantConditionPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) +void IfConstantConditionPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { IfConstantConditionVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 15ca6e65482..ffbf98e85c7 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -92,7 +92,6 @@ namespace ErrorCodes extern const int SAMPLING_NOT_SUPPORTED; extern const int ALTER_OF_COLUMN_IS_FORBIDDEN; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; - extern const int LOGICAL_ERROR; } StorageMerge::DatabaseNameOrRegexp::DatabaseNameOrRegexp( From 633b4a5dcfcf63bec8e2b5a1b5f38e648348639d Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 9 Jan 2024 19:23:34 +0100 Subject: [PATCH 0108/1081] Apply suggestions from code review Co-authored-by: Antonio Andelic --- src/Columns/ColumnNullable.cpp | 2 +- src/Columns/ColumnVariant.cpp | 8 +++++--- src/Columns/ColumnVariant.h | 2 +- src/DataTypes/EnumValues.cpp | 4 +--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index d2a579d6800..25b0e35e15e 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -928,7 +928,7 @@ ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column) return assert_cast(*column).cloneNullable(); if (column->canBeInsideNullable()) - return makeNullableSafe(column); + return makeNullable(column); return column; } diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp index f90ebfc54bb..10d79f59d37 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -631,9 +631,9 @@ void ColumnVariant::popBack(size_t n) size_t size = local_discriminators_data.size(); const size_t num_variants = variants.size(); std::vector nested_n(num_variants, 0); - for (size_t i = 0; i != n; ++i) + for (size_t i = size - n; i < size; ++i) { - Discriminator discr = local_discriminators_data[size - i - 1]; + Discriminator discr = local_discriminators_data[i]; if (discr != NULL_DISCRIMINATOR) ++nested_n[discr]; } @@ -966,7 +966,7 @@ ColumnPtr ColumnVariant::replicate(const Offsets & replicate_offsets) const { new_offsets_data.reserve(new_size); for (size_t i = old_size; i < new_size; ++i) - new_offsets_data.push_back(new_offsets_data[i - 1] + 1); + new_offsets_data.push_back(i); } else { @@ -1260,6 +1260,8 @@ std::optional ColumnVariant::getLocalDiscriminator { if (variants[i]->size() == local_discriminators->size()) return i; + if (!variants[i]->empty()) + return std::nullopt } return std::nullopt; diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index eb96205924c..8f0c5a6eef9 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -205,7 +205,7 @@ public: void compareColumn(const IColumn &, size_t, PaddedPODArray *, PaddedPODArray &, int, int) const override { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method compareColumn is not supported for ColumnAggregateFunction"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method compareColumn is not supported for ColumnVariant"); } bool hasEqualValues() const override; diff --git a/src/DataTypes/EnumValues.cpp 
b/src/DataTypes/EnumValues.cpp index 8a4b1304d5e..a15136b9335 100644 --- a/src/DataTypes/EnumValues.cpp +++ b/src/DataTypes/EnumValues.cpp @@ -85,9 +85,7 @@ bool EnumValues::tryGetValue(T & x, StringRef field_name, bool try_treat_as_i if (try_treat_as_id) { ReadBufferFromMemory tmp_buf(field_name.data, field_name.size); - if (!tryReadText(x, tmp_buf) || !tmp_buf.eof() || !value_to_name_map.contains(x)) - return false; - return true; + return tryReadText(x, tmp_buf) && tmp_buf.eof() && value_to_name_map.contains(x); } return false; } From fb758e48b04c5f799a5169af584f6a562866640d Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 9 Jan 2024 19:02:20 +0000 Subject: [PATCH 0109/1081] Apply suggestions --- src/Columns/ColumnVariant.cpp | 172 +++++++++++++++------------------- 1 file changed, 74 insertions(+), 98 deletions(-) diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp index 10d79f59d37..a707ec8e153 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -204,10 +204,13 @@ ColumnVariant::ColumnVariant(DB::MutableColumnPtr local_discriminators_, DB::Mut } } -ColumnVariant::Ptr ColumnVariant::create(const Columns & variants, const std::vector & local_to_global_discriminators) +namespace +{ + +MutableColumns getVariantsAssumeMutable(const Columns & variants) { MutableColumns mutable_variants; - mutable_variants.reserve(variants.size()); + for (const auto & variant : variants) { if (isColumnConst(*variant)) @@ -215,35 +218,24 @@ ColumnVariant::Ptr ColumnVariant::create(const Columns & variants, const std::ve mutable_variants.emplace_back(variant->assumeMutable()); } - return ColumnVariant::create(std::move(mutable_variants), local_to_global_discriminators); + return mutable_variants; +} + +} + +ColumnVariant::Ptr ColumnVariant::create(const Columns & variants, const std::vector & local_to_global_discriminators) +{ + return ColumnVariant::create(getVariantsAssumeMutable(variants), local_to_global_discriminators); } ColumnVariant::Ptr ColumnVariant::create(const DB::ColumnPtr & local_discriminators, const DB::Columns & variants, const std::vector & local_to_global_discriminators) { - MutableColumns mutable_variants; - mutable_variants.reserve(variants.size()); - for (const auto & variant : variants) - { - if (isColumnConst(*variant)) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element"); - mutable_variants.emplace_back(variant->assumeMutable()); - } - - return ColumnVariant::create(local_discriminators->assumeMutable(), std::move(mutable_variants), local_to_global_discriminators); + return ColumnVariant::create(local_discriminators->assumeMutable(), getVariantsAssumeMutable(variants), local_to_global_discriminators); } ColumnVariant::Ptr ColumnVariant::create(const DB::ColumnPtr & local_discriminators, const DB::ColumnPtr & offsets, const DB::Columns & variants, const std::vector & local_to_global_discriminators) { - MutableColumns mutable_variants; - mutable_variants.reserve(variants.size()); - for (const auto & variant : variants) - { - if (isColumnConst(*variant)) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element"); - mutable_variants.emplace_back(variant->assumeMutable()); - } - - return ColumnVariant::create(local_discriminators->assumeMutable(), offsets->assumeMutable(), std::move(mutable_variants), local_to_global_discriminators); + return ColumnVariant::create(local_discriminators->assumeMutable(), offsets->assumeMutable(), 
getVariantsAssumeMutable(variants), local_to_global_discriminators); } MutableColumnPtr ColumnVariant::cloneEmpty() const @@ -309,104 +301,88 @@ MutableColumnPtr ColumnVariant::cloneResized(size_t new_size) const const auto & local_discriminators_data = getLocalDiscriminators(); const auto & offsets_data = getOffsets(); - /// We can find all variants sizes by scanning all new_size local_discriminators and calculating - /// sizes for all new variants. This code is below and commented. - -// std::vector new_nested_sizes(num_variants, 0); -// for (size_t i = 0; i != new_size; ++i) -// { -// Discriminator discr = local_discriminators_data[i]; -// if (discr != NULL_DISCRIMINATOR) -// ++new_nested_sizes[discr]; -// } -// -// MutableColumns new_variants; -// new_variants.reserve(num_variants); -// for (size_t i = 0; i != num_variants; ++i) -// { -// if (new_nested_sizes[i]) -// new_variants.emplace_back(variants[i]->cloneResized(new_nested_sizes[i])); -// else -// new_variants.emplace_back(variants[i]->cloneEmpty()); -// } -// -// return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); - + /// We can find all variants sizes by scanning all new_size local_discriminators and calculating sizes for all new variants. /// But instead we are trying to optimize it using offsets column: /// For all non-empty variants we are trying to find last occurrence of its discriminator in local_discriminators[:new_size] or - /// first occurrence in local_discriminators[new_size:]. The same row in offsets column will contain the desired size (or size - 1) of variant. + /// first occurrence in local_discriminators[new_size:] depending on what range is smaller. The same row in offsets column will + /// contain the desired size (or size - 1) of variant. /// All empty variants will remain empty. - /// Not sure how good this optimization is, feel free to remove it and use simpler version above. + /// Not sure how good this optimization is, feel free to remove it and use simpler version without using offsets. MutableColumns new_variants(num_variants); - std::unordered_set seen_variants; + std::vector seen_variants(num_variants, 0); + size_t number_of_seen_variants = 0; /// First, check which variants are empty. They will remain empty. for (Discriminator i = 0; i != num_variants; ++i) { if (variants[i]->empty()) { - seen_variants.insert(i); + seen_variants[i] = 1; + ++number_of_seen_variants; new_variants[i] = variants[i]->cloneEmpty(); } } - /// Now, iterate through local discriminators using two pointers. - /// First will go from new_size - 1 to 0, second from new_size to size. - /// Finish when we find all variants or hit lower or upper bound. - ssize_t i = new_size - 1; - size_t j = new_size; - while (i != -1 && j != size) + /// Now, choose what range is smaller and use it. + /// [0, new_size) + if (2 * new_size <= size) { - Discriminator i_discr = local_discriminators_data[i]; - if (i_discr != NULL_DISCRIMINATOR) + for (ssize_t i = new_size - 1; i > -1; --i) { - auto [_, inserted] = seen_variants.insert(i_discr); - /// If this is the first occurrence of this discriminator, - /// we can get new size for this variant. 
- if (inserted) + Discriminator discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) { - new_variants[i_discr] = variants[i_discr]->cloneResized(offsets_data[i] + 1); - if (seen_variants.size() == num_variants) - break; + /// If this is the first occurrence of this discriminator, + /// we can get new size for this variant. + if (!seen_variants[discr]) + { + seen_variants[discr] = 1; + ++number_of_seen_variants; + new_variants[discr] = variants[discr]->cloneResized(offsets_data[i] + 1); + /// Break if we found sizes for all variants. + if (number_of_seen_variants == num_variants) + break; + } } } - Discriminator j_discr = local_discriminators_data[j]; - if (j_discr != NULL_DISCRIMINATOR) + /// All variants that weren't found in range [0, new_size] will be empty in the result column. + if (number_of_seen_variants != num_variants) { - auto [_, inserted] = seen_variants.insert(j_discr); - /// If this is the first occurrence of this discriminator, - /// we can get new size for this variant. - if (inserted) - { - new_variants[j_discr] = variants[j_discr]->cloneResized(offsets_data[j]); - if (seen_variants.size() == num_variants) - break; - } - } - - --i; - ++j; - } - - /// We can finish in 3 cases: - /// 1) seen_variants.size() == num_variants - we found local_discriminators of all variants, nothing to do. - /// 2) i == -1 - we scanned all values in local_discriminators[:new_size]. Not found variants doesn't have - /// values in local_discriminators[:new_size], so they should be empty in the resized version. - /// 3) j == size - we scanned all values in local_discriminators[new_size:]. Not found variants doesn't have - /// values in local_discriminators[new_size:], so, we should use the full variant in the resized version. - if (seen_variants.size() != num_variants) - { - for (size_t discr = 0; discr != num_variants; ++discr) - { - if (!seen_variants.contains(discr)) - { - if (i == -1) + for (size_t discr = 0; discr != num_variants; ++discr) + if (!seen_variants[discr]) new_variants[discr] = variants[discr]->cloneEmpty(); - else - new_variants[discr] = IColumn::mutate(variants[discr]); + } + } + /// [new_size, size) + else + { + for (size_t i = new_size; i < size; ++i) + { + Discriminator discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + { + /// If this is the first occurrence of this discriminator, + /// we can get new size for this variant. + if (!seen_variants[discr]) + { + seen_variants[discr] = 1; + ++number_of_seen_variants; + new_variants[discr] = variants[discr]->cloneResized(offsets_data[i]); + /// Break if we found sizes for all variants. + if (number_of_seen_variants == num_variants) + break; + } } } + + if (number_of_seen_variants != num_variants) + { + /// All variants that weren't found in range [new_size, size) will not change their sizes. 
+ for (size_t discr = 0; discr != num_variants; ++discr) + if (!seen_variants[discr]) + new_variants[discr] = IColumn::mutate(variants[discr]); + } } return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); @@ -1261,7 +1237,7 @@ std::optional ColumnVariant::getLocalDiscriminator if (variants[i]->size() == local_discriminators->size()) return i; if (!variants[i]->empty()) - return std::nullopt + return std::nullopt; } return std::nullopt; From 10af0d406fb536917a84d23f4bacba073ea9443e Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 10 Jan 2024 16:55:58 +0100 Subject: [PATCH 0110/1081] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 99e54b08b74..fbd26e59f6f 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-random-merge-tree-settings, no-random-settings, no-s3-storage +# Tags: long, no-random-merge-tree-settings, no-random-settings, no-s3-storage, no-parallel # shellcheck disable=SC2046 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) From 1deaaf5466a2633d58fba87521435491546df0a2 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 11 Jan 2024 15:20:06 +0100 Subject: [PATCH 0111/1081] Apply suggestions from code review Co-authored-by: Antonio Andelic --- .../Serializations/SerializationDateTime64.cpp | 6 +++--- src/DataTypes/Serializations/SerializationEnum.h | 5 +---- .../Serializations/SerializationNamed.cpp | 1 - .../Serializations/SerializationTuple.cpp | 15 +++------------ 4 files changed, 7 insertions(+), 20 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp index a19619bf8d3..442e29edd52 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -50,7 +50,7 @@ void SerializationDateTime64::deserializeText(IColumn & column, ReadBuffer & ist bool SerializationDateTime64::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const { DateTime64 result = 0; - if (tryReadDateTime64Text(result, scale, istr, time_zone) || (whole && istr.eof())) + if (!tryReadDateTime64Text(result, scale, istr, time_zone) || (whole && !istr.eof())) return false; assert_cast(column).getData().push_back(result); @@ -151,7 +151,7 @@ bool SerializationDateTime64::tryDeserializeTextQuoted(IColumn & column, ReadBuf DateTime64 x = 0; if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' { - if (tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar('\'', istr)) + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar('\'', istr)) return false; } else /// Just 1504193808 or 01504193808 @@ -265,7 +265,7 @@ bool SerializationDateTime64::tryDeserializeTextCSV(IColumn & column, ReadBuffer { if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic) { - if (tryReadText(x, scale, istr, settings, time_zone, utc_time_zone)) + if 
(!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone)) return false; } else diff --git a/src/DataTypes/Serializations/SerializationEnum.h b/src/DataTypes/Serializations/SerializationEnum.h index 5152a3fbc93..bb720ee9b1f 100644 --- a/src/DataTypes/Serializations/SerializationEnum.h +++ b/src/DataTypes/Serializations/SerializationEnum.h @@ -60,10 +60,7 @@ public: bool tryReadValue(ReadBuffer & istr, FieldType & x) const { - if (!tryReadText(x, istr) || !ref_enum_values.hasValue(x)) - return false; - - return true; + return tryReadText(x, istr) && ref_enum_values.hasValue(x); } std::optional> own_enum_values; diff --git a/src/DataTypes/Serializations/SerializationNamed.cpp b/src/DataTypes/Serializations/SerializationNamed.cpp index 1a9cbe9a37d..ca60948ce68 100644 --- a/src/DataTypes/Serializations/SerializationNamed.cpp +++ b/src/DataTypes/Serializations/SerializationNamed.cpp @@ -1,5 +1,4 @@ #include -#include namespace DB { diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index c0b0658e6b4..79b7fa84242 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -212,10 +212,7 @@ ReturnType SerializationTuple::deserializeTextImpl(IColumn & column, ReadBuffer return ReturnType(true); }; - if constexpr (throw_exception) - addElementSafe(elems.size(), column, impl); - else - return addElementSafe(elems.size(), column, impl); + return addElementSafe(elems.size(), column, impl); } void SerializationTuple::deserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const @@ -457,10 +454,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf return ReturnType(true); }; - if constexpr (throw_exception) - addElementSafe(elems.size(), column, impl); - else - return addElementSafe(elems.size(), column, impl); + return addElementSafe(elems.size(), column, impl); } else { @@ -502,10 +496,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf return ReturnType(true); }; - if constexpr (throw_exception) - addElementSafe(elems.size(), column, impl); - else - return addElementSafe(elems.size(), column, impl); + return addElementSafe(elems.size(), column, impl); } } From f05d89bc2b26206b1b6854ad48dd35840b82a123 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 11 Jan 2024 14:48:57 +0000 Subject: [PATCH 0112/1081] Apply review suggestions --- .../Serializations/ISerialization.cpp | 47 +++--- .../Serializations/SerializationTuple.cpp | 3 + .../Serializations/SerializationVariant.cpp | 135 +++++++++--------- 3 files changed, 101 insertions(+), 84 deletions(-) diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 08575f06f2a..c699b3b0748 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -176,7 +176,7 @@ String getNameForSubstreamPath( stream_name += "." 
+ it->tuple_element_name; } else if (it->type == Substream::VariantDiscriminators) - stream_name += ".discr"; + stream_name += ".variant_discr"; else if (it->type == Substream::VariantOffsets) stream_name += ".variant_offsets"; else if (it->type == Substream::VariantElement) @@ -261,43 +261,51 @@ bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path) return true; } -#define TRY_DESERIALIZE_TEXT(deserialize) \ - size_t prev_size = column.size(); \ - try \ - { \ - deserialize(column, istr, settings); \ - return true; \ - } \ - catch (...) \ - { \ - if (column.size() > prev_size) \ - column.popBack(column.size() - prev_size); \ - return false; \ - } \ +namespace +{ + +template +bool tryDeserializeText(const F deserialize, DB::IColumn & column) +{ + size_t prev_size = column.size(); + try + { + deserialize(column); + return true; + } + catch (...) + { + if (column.size() > prev_size) + column.popBack(column.size() - prev_size); + return false; + } +} + +} bool ISerialization::tryDeserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - TRY_DESERIALIZE_TEXT(deserializeTextCSV) + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextCSV(my_column, istr, settings); }, column); } bool ISerialization::tryDeserializeTextEscaped(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - TRY_DESERIALIZE_TEXT(deserializeTextEscaped) + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextEscaped(my_column, istr, settings); }, column); } bool ISerialization::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - TRY_DESERIALIZE_TEXT(deserializeTextJSON) + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextJSON(my_column, istr, settings); }, column); } bool ISerialization::tryDeserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - TRY_DESERIALIZE_TEXT(deserializeTextQuoted) + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextQuoted(my_column, istr, settings); }, column); } bool ISerialization::tryDeserializeWholeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - TRY_DESERIALIZE_TEXT(deserializeWholeText) + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeWholeText(my_column, istr, settings); }, column); } void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -346,7 +354,6 @@ bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t pref return path[last_elem].type == Substream::NullMap || path[last_elem].type == Substream::TupleElement || path[last_elem].type == Substream::ArraySizes - || path[last_elem].type == Substream::VariantDiscriminators || path[last_elem].type == Substream::VariantElement; } diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 79b7fa84242..c249ee69e46 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -76,7 +76,10 @@ static ReturnType addElementSafe(size_t num_elems, IColumn & column, F && impl) { auto & element_column = extractElementColumn(column, i); if (element_column.size() > old_size) + { + chassert(old_size - element_column.size() == 1); element_column.popBack(1); + } } }; diff --git 
a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 9cfc4b9e26f..64fcb63d604 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -397,70 +397,76 @@ void SerializationVariant::deserializeBinary(IColumn & column, ReadBuffer & istr namespace { -std::unordered_map getTypesTextDeserializePriorityMap() +const std::unordered_map & getTypesTextDeserializePriorityMap() { - static const std::vector priorities = { - /// Complex types have highest priority. - TypeIndex::Array, - TypeIndex::Tuple, - TypeIndex::Map, - TypeIndex::AggregateFunction, + static std::unordered_map priority_map = [] + { + static constexpr std::array priorities = { + /// Complex types have highest priority. + TypeIndex::Array, + TypeIndex::Tuple, + TypeIndex::Map, + TypeIndex::AggregateFunction, - /// Enums can be parsed both from strings and numbers. - /// So they have high enough priority. - TypeIndex::Enum8, - TypeIndex::Enum16, + /// Enums can be parsed both from strings and numbers. + /// So they have high enough priority. + TypeIndex::Enum8, + TypeIndex::Enum16, - /// Types that can be parsed from strings. - TypeIndex::UUID, - TypeIndex::IPv4, - TypeIndex::IPv6, + /// Types that can be parsed from strings. + TypeIndex::UUID, + TypeIndex::IPv4, + TypeIndex::IPv6, - /// Types that can be parsed from numbers. - /// The order: - /// 1) Integers - /// 2) Big Integers - /// 3) Decimals - /// 4) Floats - /// In each group small types have higher priority. - TypeIndex::Int8, - TypeIndex::UInt8, - TypeIndex::Int16, - TypeIndex::UInt16, - TypeIndex::Int32, - TypeIndex::UInt32, - TypeIndex::Int64, - TypeIndex::UInt64, - TypeIndex::Int128, - TypeIndex::UInt128, - TypeIndex::Int256, - TypeIndex::UInt256, - TypeIndex::Decimal32, - TypeIndex::Decimal64, - TypeIndex::Decimal128, - TypeIndex::Decimal256, - TypeIndex::Float32, - TypeIndex::Float64, + /// Types that can be parsed from numbers. + /// The order: + /// 1) Integers + /// 2) Big Integers + /// 3) Decimals + /// 4) Floats + /// In each group small types have higher priority. + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Decimal32, + TypeIndex::Decimal64, + TypeIndex::Decimal128, + TypeIndex::Decimal256, + TypeIndex::Float32, + TypeIndex::Float64, - /// Dates and DateTimes. More simple Date types have higher priority. - /// They have lower priority as numbers as some DateTimes sometimes can - /// be also parsed from numbers, but we don't want it usually. - TypeIndex::Date, - TypeIndex::Date32, - TypeIndex::DateTime, - TypeIndex::DateTime64, + /// Dates and DateTimes. More simple Date types have higher priority. + /// They have lower priority as numbers as some DateTimes sometimes can + /// be also parsed from numbers, but we don't want it usually. + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::DateTime64, - /// String types have almost the lowest priority, - /// as in text formats almost all data can - /// be deserialized into String type. - TypeIndex::FixedString, - TypeIndex::String, - }; + /// String types have almost the lowest priority, + /// as in text formats almost all data can + /// be deserialized into String type. 
+ TypeIndex::FixedString, + TypeIndex::String, + }; + + std::unordered_map pm; + + pm.reserve(priorities.size()); + for (size_t i = 0; i != priorities.size(); ++i) + pm[priorities[i]] = priorities.size() - i; + return pm; + }(); - std::unordered_map priority_map; - priority_map.reserve(priorities.size()); - for (size_t i = 0; i != priorities.size(); ++i) - priority_map[priorities[i]] = priorities.size() - i; return priority_map; } @@ -476,7 +482,7 @@ std::unordered_map getTypesTextDeserializePriorityMap() /// so if we have types with the same level of nesting and the same priority, we will first try to deserialize LowCardinality/Nullable types /// (for example if we have types Array(Array(String)) and Array(Array(Nullable(String))). /// This is just a batch of heuristics. -std::tuple getTypeTextDeserializePriority(const DataTypePtr & type, size_t nested_depth, size_t simple_nested_depth, std::unordered_map & priority_map) +std::tuple getTypeTextDeserializePriority(const DataTypePtr & type, size_t nested_depth, size_t simple_nested_depth, const std::unordered_map & priority_map) { if (const auto * nullable_type = typeid_cast(type.get())) return getTypeTextDeserializePriority(nullable_type->getNestedType(), nested_depth, simple_nested_depth + 1, priority_map); @@ -487,7 +493,7 @@ std::tuple getTypeTextDeserializePriority(const DataType if (const auto * array_type = typeid_cast(type.get())) { auto [elements_nested_depth, elements_priority, elements_simple_nested_depth] = getTypeTextDeserializePriority(array_type->getNestedType(), nested_depth + 1, simple_nested_depth, priority_map); - return {elements_nested_depth, elements_priority + priority_map[TypeIndex::Array], elements_simple_nested_depth}; + return {elements_nested_depth, elements_priority + priority_map.at(TypeIndex::Array), elements_simple_nested_depth}; } if (const auto * tuple_type = typeid_cast(type.get())) @@ -505,14 +511,14 @@ std::tuple getTypeTextDeserializePriority(const DataType max_simple_nested_depth = elem_simple_nested_depth; } - return {max_nested_depth, sum_priority + priority_map[TypeIndex::Tuple], max_simple_nested_depth}; + return {max_nested_depth, sum_priority + priority_map.at(TypeIndex::Tuple), max_simple_nested_depth}; } if (const auto * map_type = typeid_cast(type.get())) { auto [key_max_depth, key_priority, key_simple_nested_depth] = getTypeTextDeserializePriority(map_type->getKeyType(), nested_depth + 1, simple_nested_depth, priority_map); auto [value_max_depth, value_priority, value_simple_nested_depth] = getTypeTextDeserializePriority(map_type->getValueType(), nested_depth + 1, simple_nested_depth, priority_map); - return {std::max(key_max_depth, value_max_depth), key_priority + value_priority + priority_map[TypeIndex::Map], std::max(key_simple_nested_depth, value_simple_nested_depth)}; + return {std::max(key_max_depth, value_max_depth), key_priority + value_priority + priority_map.at(TypeIndex::Map), std::max(key_simple_nested_depth, value_simple_nested_depth)}; } if (const auto * variant_type = typeid_cast(type.get())) @@ -536,9 +542,10 @@ std::tuple getTypeTextDeserializePriority(const DataType /// Bool type should have priority higher then all integers. 
if (isBool(type)) - return {nested_depth, priority_map[TypeIndex::Int8] + 1, simple_nested_depth}; + return {nested_depth, priority_map.at(TypeIndex::Int8) + 1, simple_nested_depth}; - return {nested_depth, priority_map[type->getTypeId()], simple_nested_depth}; + auto it = priority_map.find(type->getTypeId()); + return {nested_depth, it == priority_map.end() ? 0 : it->second, simple_nested_depth}; } } @@ -549,7 +556,7 @@ std::vector SerializationVariant::getVariantsDeserializeTextOrder(const priorities.reserve(variant_types.size()); std::vector order; order.reserve(variant_types.size()); - auto priority_map = getTypesTextDeserializePriorityMap(); + const auto & priority_map = getTypesTextDeserializePriorityMap(); for (size_t i = 0; i != variant_types.size(); ++i) { priorities.push_back(getTypeTextDeserializePriority(variant_types[i], 0, 0, priority_map)); From 9e639df12e69c7373e400115977c432b8fdf31f2 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 11 Jan 2024 18:44:05 +0000 Subject: [PATCH 0113/1081] Add fixes, add new mode to getLeastSupertype and use it in if/multiIf --- .../Serializations/SerializationTuple.cpp | 2 +- src/DataTypes/getLeastSupertype.cpp | 73 ++++++++++++++++--- src/DataTypes/getLeastSupertype.h | 12 +++ src/Functions/if.cpp | 14 +--- src/Functions/multiIf.cpp | 8 +- ...940_variant_text_deserialization.reference | 2 +- 6 files changed, 79 insertions(+), 32 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index c249ee69e46..5d8c84b70bf 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -77,7 +77,7 @@ static ReturnType addElementSafe(size_t num_elems, IColumn & column, F && impl) auto & element_column = extractElementColumn(column, i); if (element_column.size() > old_size) { - chassert(old_size - element_column.size() == 1); + chassert(element_column.size() - old_size == 1); element_column.popBack(1); } } diff --git a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index e5bdb4b267f..5d67f888c4b 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace DB @@ -58,6 +59,25 @@ DataTypePtr throwOrReturn(const DataTypes & types, std::string_view message_suff if constexpr (on_error == LeastSupertypeOnError::String) return std::make_shared(); + if constexpr (on_error == LeastSupertypeOnError::Variant && std::is_same_v>) + { + DataTypes variants; + for (const auto & type : types) + { + if (isVariant(type)) + { + const DataTypes & nested_variants = assert_cast(*type).getVariants(); + variants.insert(variants.end(), nested_variants.begin(), nested_variants.end()); + } + else + { + variants.push_back(removeNullableOrLowCardinalityNullable(type)); + } + } + + return std::make_shared(variants); + } + if constexpr (on_error == LeastSupertypeOnError::Null) return nullptr; @@ -67,8 +87,8 @@ DataTypePtr throwOrReturn(const DataTypes & types, std::string_view message_suff throw Exception(error_code, "There is no supertype for types {} {}", getExceptionMessagePrefix(types), message_suffix); } -template -DataTypePtr getNumericType(const TypeIndexSet & types) +template +DataTypePtr getNumericType(const TypeIndexSet & types, ThrowOrReturnFunc throwOrReturnFunc) { bool all_numbers = true; @@ -119,7 +139,7 @@ DataTypePtr getNumericType(const TypeIndexSet & types) if (max_bits_of_signed_integer || 
max_bits_of_unsigned_integer || max_mantissa_bits_of_floating) { if (!all_numbers) - return throwOrReturn(types, "because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return throwOrReturnFunc(types, "because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE); /// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit. /// Example, common of Int32, UInt32 = Int64. @@ -134,7 +154,7 @@ DataTypePtr getNumericType(const TypeIndexSet & types) if (min_bit_width_of_integer != 64) ++min_bit_width_of_integer; else - return throwOrReturn(types, + return throwOrReturnFunc(types, "because some of them are signed integers and some are unsigned integers," " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE); @@ -149,7 +169,7 @@ DataTypePtr getNumericType(const TypeIndexSet & types) else if (min_mantissa_bits <= 53) return std::make_shared(); else - return throwOrReturn(types, + return throwOrReturnFunc(types, " because some of them are integers and some are floating point," " but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE); } @@ -170,7 +190,7 @@ DataTypePtr getNumericType(const TypeIndexSet & types) else if (min_bit_width_of_integer <= 256) return std::make_shared(); else - return throwOrReturn(types, + return throwOrReturnFunc(types, " because some of them are signed integers and some are unsigned integers," " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE); } @@ -190,7 +210,7 @@ DataTypePtr getNumericType(const TypeIndexSet & types) else if (min_bit_width_of_integer <= 256) return std::make_shared(); else - return throwOrReturn(types, + return throwOrReturnFunc(types, " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE); } } @@ -382,7 +402,18 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (!all_maps) return throwOrReturn(types, "because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE); - auto keys_common_type = getLeastSupertype(key_types); + DataTypePtr keys_common_type; + if constexpr (on_error == LeastSupertypeOnError::Variant) + { + keys_common_type = getLeastSupertype(key_types); + if (!keys_common_type) + return throwOrReturn(types, "", ErrorCodes::NO_COMMON_TYPE); + } + else + { + keys_common_type = getLeastSupertype(key_types); + } + auto values_common_type = getLeastSupertype(value_types); /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype for keys or values, /// keys_common_type or values_common_type will be nullptr, we should return nullptr in this case. @@ -423,7 +454,18 @@ DataTypePtr getLeastSupertype(const DataTypes & types) return getLeastSupertype(nested_types); else { - auto nested_type = getLeastSupertype(nested_types); + DataTypePtr nested_type; + if constexpr (on_error == LeastSupertypeOnError::Variant) + { + nested_type = getLeastSupertype(nested_types); + if (!nested_type) + return throwOrReturn(types, "", ErrorCodes::NO_COMMON_TYPE); + } + else + { + nested_type = getLeastSupertype(nested_types); + } + /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype, /// nested_type will be nullptr, we should return nullptr in this case. 
if (!nested_type) @@ -456,6 +498,8 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_nullable) { auto nested_type = getLeastSupertype(nested_types); + if (isVariant(nested_type)) + return nested_type; /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype, /// nested_type will be nullptr, we should return nullptr in this case. if (!nested_type) @@ -623,7 +667,8 @@ DataTypePtr getLeastSupertype(const DataTypes & types) { /// First, if we have signed integers, try to convert all UInt64 to Int64 if possible. convertUInt64toInt64IfPossible(types, type_ids); - auto numeric_type = getNumericType(type_ids); + auto throw_or_return = [&](const TypeIndexSet &, std::string_view message_suffix, int error_code){ return throwOrReturn(types, message_suffix, error_code); }; + auto numeric_type = getNumericType(type_ids, throw_or_return); if (numeric_type) return numeric_type; } @@ -637,6 +682,11 @@ DataTypePtr getLeastSupertypeOrString(const DataTypes & types) return getLeastSupertype(types); } +DataTypePtr getLeastSupertypeOrVariant(const DataTypes & types) +{ + return getLeastSupertype(types); +} + DataTypePtr tryGetLeastSupertype(const DataTypes & types) { return getLeastSupertype(types); @@ -676,7 +726,8 @@ DataTypePtr getLeastSupertype(const TypeIndexSet & types) return std::make_shared(); } - auto numeric_type = getNumericType(types); + auto throw_or_return = [](const TypeIndexSet & type_ids, std::string_view message_suffix, int error_code){ return throwOrReturn(type_ids, message_suffix, error_code); }; + auto numeric_type = getNumericType(types, throw_or_return); if (numeric_type) return numeric_type; diff --git a/src/DataTypes/getLeastSupertype.h b/src/DataTypes/getLeastSupertype.h index 2ef4a0e6850..d949fad69c5 100644 --- a/src/DataTypes/getLeastSupertype.h +++ b/src/DataTypes/getLeastSupertype.h @@ -8,6 +8,7 @@ enum class LeastSupertypeOnError { Throw, String, + Variant, Null, }; @@ -24,6 +25,17 @@ DataTypePtr getLeastSupertype(const DataTypes & types); /// All types can be casted to String, because they can be serialized to String. DataTypePtr getLeastSupertypeOrString(const DataTypes & types); +/// Same as getLeastSupertype but in case when there is no supertype for some types +/// it uses Variant of these types as a supertype. Any type can be casted to a Variant +/// that contains this type. +/// As nested Variants are not allowed, if one of the types is Variant, it's variants +/// are used in the resulting Variant. +/// Examples: +/// (UInt64, String) -> Variant(UInt64, String) +/// (Array(UInt64), Array(String)) -> Array(Variant(UInt64, String)) +/// (Variant(UInt64, String), Array(UInt32)) -> Variant(UInt64, String, Array(UInt32)) +DataTypePtr getLeastSupertypeOrVariant(const DataTypes & types); + /// Same as above but return nullptr instead of throwing exception. 
DataTypePtr tryGetLeastSupertype(const DataTypes & types); diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index 1dc7443f124..c247938f885 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -688,15 +688,9 @@ private: DataTypePtr common_type; if (use_variant_when_no_common_type) - { - common_type = tryGetLeastSupertype(DataTypes{arg1.type, arg2.type}); - if (!common_type) - common_type = std::make_shared(DataTypes{removeNullableOrLowCardinalityNullable(arg1.type), removeNullableOrLowCardinalityNullable(arg2.type)}); - } + common_type = getLeastSupertypeOrVariant(DataTypes{arg1.type, arg2.type}); else - { common_type = getLeastSupertype(DataTypes{arg1.type, arg2.type}); - } ColumnPtr col_then = castColumn(arg1, common_type); ColumnPtr col_else = castColumn(arg2, common_type); @@ -1118,11 +1112,7 @@ public: "Must be UInt8.", arguments[0]->getName()); if (use_variant_when_no_common_type) - { - if (auto res = tryGetLeastSupertype(DataTypes{arguments[1], arguments[2]})) - return res; - return std::make_shared(DataTypes{removeNullableOrLowCardinalityNullable(arguments[1]), removeNullableOrLowCardinalityNullable(arguments[2])}); - } + return getLeastSupertypeOrVariant(DataTypes{arguments[1], arguments[2]}); return getLeastSupertype(DataTypes{arguments[1], arguments[2]}); } diff --git a/src/Functions/multiIf.cpp b/src/Functions/multiIf.cpp index 7a2e9444b2c..cefbea9f352 100644 --- a/src/Functions/multiIf.cpp +++ b/src/Functions/multiIf.cpp @@ -119,13 +119,7 @@ public: }); if (context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_when_no_common_type_in_if) - { - if (auto res = tryGetLeastSupertype(types_of_branches)) - return res; - for (auto & type : types_of_branches) - type = removeNullableOrLowCardinalityNullable(type); - return std::make_shared(types_of_branches); - } + return getLeastSupertypeOrVariant(types_of_branches); return getLeastSupertype(types_of_branches); } diff --git a/tests/queries/0_stateless/02940_variant_text_deserialization.reference b/tests/queries/0_stateless/02940_variant_text_deserialization.reference index 98725917567..8836e6c4e57 100644 --- a/tests/queries/0_stateless/02940_variant_text_deserialization.reference +++ b/tests/queries/0_stateless/02940_variant_text_deserialization.reference @@ -505,7 +505,7 @@ String (NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000000000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000000000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0)(NULL,NULL),('string',NULL),(-1,NULL),(0,0)Floats (NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)Decimals (NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)Dates and DateTimes -(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('2020-01-01','2020-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('1900-01-01','1900-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('2020-01-01 
00:00:00','2020-01-01 00:00:00'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000','1970-01-01 00:00:00.000'),('2020-01-01 00:00:00.999',NULL),('2020-01-01 00:00:00.999999999 ABC',NULL)UUID +(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('2020-01-01','2020-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('1900-01-01','1900-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('2020-01-01 00:00:00','2020-01-01 00:00:00'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('2020-01-01 00:00:00.999','2020-01-01 00:00:00.999'),('2020-01-01 00:00:00.999999999 ABC',NULL)UUID (NULL,NULL),('string',NULL),('c8619cca-0caa-445e-ae76-1d4f6e0b3927','c8619cca-0caa-445e-ae76-1d4f6e0b3927'),('c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA',NULL)IPv4 (NULL,NULL),('string',NULL),('127.0.0.1','127.0.0.1'),('127.0.0.1AAA',NULL)IPv6 (NULL,NULL),('string',NULL),('2001:db8:85a3::8a2e:370:7334','2001:db8:85a3::8a2e:370:7334'),('2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA',NULL)Enum From c30736d415fcdaccb68a1c0e37e8c4de9242e014 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 12 Jan 2024 15:31:15 +0000 Subject: [PATCH 0114/1081] Cosmetics --- src/Storages/MergeTree/MutateTask.cpp | 8 +-- ...mn_must_not_override_past_values.reference | 33 ++++++++++++ ...e_column_must_not_override_past_values.sql | 53 +++++++++++++++++++ ..._column_not_override_past_values.reference | 29 ---------- ...ialize_column_not_override_past_values.sql | 49 ----------------- 5 files changed, 90 insertions(+), 82 deletions(-) create mode 100644 tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference create mode 100644 tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql delete mode 100644 tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference delete mode 100644 tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index bb41608eb00..25fa45e7b68 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -76,8 +76,8 @@ static void splitAndModifyMutationCommands( { if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) { - /// For ordinary column with default expression, materialize column should not override past values - /// So we only mutated column if `command.column_name` is a materialized column or if the part does not have physical column file + /// For ordinary column with default or materialized expression, MATERIALIZE COLUMN should not override past values + /// So we only mutate column if `command.column_name` is a default/materialized column or if the part does not have physical column file auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) { @@ -206,8 +206,8 @@ static void splitAndModifyMutationCommands( { if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) { - /// For ordinary column with default expression, materialize column should not override past values - /// So we only mutated column if `command.column_name` is a materialized column or if the part does not have physical column file + /// For ordinary column with default or materialized expression, MATERIALIZE COLUMN should 
not override past values + /// So we only mutate column if `command.column_name` is a default/materialized column or if the part does not have physical column file auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) for_interpreter.push_back(command); diff --git a/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference new file mode 100644 index 00000000000..a5a0370620b --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference @@ -0,0 +1,33 @@ +-- Compact parts +Origin +1 2 +2 54321 +After materialize +1 2 +2 54321 +-- Wide parts +Origin +1 2 +2 54321 +After materialize +1 2 +2 54321 +-- Nullable column != physically absent +Origin +1 2 +2 \N +3 54321 +After materialize +1 2 +2 \N +3 54321 +-- Parts with renamed column +Origin +1 2 +2 54321 +After rename +1 2 +2 54321 +After materialize +1 2 +2 54321 diff --git a/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql new file mode 100644 index 00000000000..825c7eab048 --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql @@ -0,0 +1,53 @@ +SET mutations_sync = 2; + +SELECT '-- Compact parts'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO tab (id, dflt) VALUES (1, 2); +INSERT INTO tab (id) VALUES (2); +SELECT 'Origin'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Wide parts'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO tab (id, dflt) VALUES (1, 2); +INSERT INTO tab (id) VALUES (2); +SELECT 'Origin'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Nullable column != physically absent'; + +CREATE TABLE tab (id Int64, dflt Nullable(Int64) DEFAULT 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO tab (id, dflt) VALUES (1, 2); +INSERT INTO tab (id, dflt) VALUES (2, NULL); +INSERT INTO tab (id) VALUES (3); +SELECT 'Origin'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Parts with renamed column'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO tab (id, dflt) VALUES (1, 2); +INSERT INTO tab (id) VALUES (2); +SELECT 'Origin'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab RENAME COLUMN dflt TO dflt2; +SELECT 'After rename'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt2; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference deleted file mode 100644 index 6b0d88bd09b..00000000000 ---
a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference +++ /dev/null @@ -1,29 +0,0 @@ ---Origin-- -1 2 -2 54321 ---After materialize-- -1 2 -2 54321 ---Origin-- -1 2 -2 54321 ---After materialize-- -1 2 -2 54321 ---Origin-- -1 2 -2 \N -3 54321 ---After materialize-- -1 2 -2 \N -3 54321 ---Origin-- -1 2 -2 54321 ---After rename-- -1 2 -2 54321 ---After materialize-- -1 2 -2 54321 diff --git a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql deleted file mode 100644 index 1815661e097..00000000000 --- a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql +++ /dev/null @@ -1,49 +0,0 @@ - -SET mutations_sync = 2; --- Compact parts -CREATE TABLE test (id Int64, foo Int64 default 54321) ENGINE MergeTree ORDER BY id; -INSERT INTO test ( id, foo ) values ( 1, 2 ); -INSERT INTO test ( id ) values ( 2 ); -SELECT '--Origin--'; -SELECT * FROM test ORDER BY id; -ALTER TABLE test MATERIALIZE COLUMN foo; -SELECT '--After materialize--'; -SELECT * FROM test ORDER BY id; -DROP TABLE test; - --- Wide parts -CREATE TABLE test (id Int64, foo Nullable(Int64) default 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; -INSERT INTO test ( id, foo ) values ( 1, 2 ); -INSERT INTO test ( id ) values ( 2 ); -SELECT '--Origin--'; -SELECT * FROM test ORDER BY id; -ALTER TABLE test MATERIALIZE COLUMN foo; -SELECT '--After materialize--'; -SELECT * FROM test ORDER BY id; -DROP TABLE test; - --- Nullable column != physically absent -CREATE TABLE test (id Int64, foo Nullable(Int64) default 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; -INSERT INTO test ( id, foo ) values ( 1, 2 ); -INSERT INTO test ( id, foo ) values ( 2, NULL ); -INSERT INTO test ( id ) values ( 3 ); -SELECT '--Origin--'; -SELECT * FROM test ORDER BY id; -ALTER TABLE test MATERIALIZE COLUMN foo; -SELECT '--After materialize--'; -SELECT * FROM test ORDER BY id; -DROP TABLE test; - --- Parts with renamed column -CREATE TABLE test (id Int64, foo Int64 default 54321) ENGINE MergeTree ORDER BY id; -INSERT INTO test ( id, foo ) values ( 1, 2 ); -INSERT INTO test ( id ) values ( 2 ); -SELECT '--Origin--'; -SELECT * FROM test ORDER BY id; -ALTER TABLE test RENAME COLUMN foo TO bar; -SELECT '--After rename--'; -SELECT * FROM test ORDER BY id; -ALTER TABLE test MATERIALIZE COLUMN bar; -SELECT '--After materialize--'; -SELECT * FROM test ORDER BY id; -DROP TABLE test; \ No newline at end of file From 1dacfc53ff97fbab6ee349c6df27b3ad2f9df1e8 Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Fri, 12 Jan 2024 17:28:45 +0000 Subject: [PATCH 0115/1081] weather data --- .../getting-started/example-datasets/noaa.md | 340 ++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 docs/en/getting-started/example-datasets/noaa.md diff --git a/docs/en/getting-started/example-datasets/noaa.md b/docs/en/getting-started/example-datasets/noaa.md new file mode 100644 index 00000000000..8d34ff8d3ee --- /dev/null +++ b/docs/en/getting-started/example-datasets/noaa.md @@ -0,0 +1,340 @@ +--- +slug: /en/getting-started/example-datasets/noaa +sidebar_label: NOAA Global Historical Climatology Network +sidebar_position: 1 +description: 2.5 billion rows of climate data for the last 120 yrs +--- + +# NOAA Global Historical Climatology Network + +This dataset contains weather measurements for the last 120 years. 
Each row is a measurement for a point in time and station. + +More precisely and according to the [origin of this data](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn): + +> GHCN-Daily is a dataset that contains daily observations over global land areas. It contains station-based measurements from land-based stations worldwide, about two thirds of which are for precipitation measurements only (Menne et al., 2012). GHCN-Daily is a composite of climate records from numerous sources that were merged together and subjected to a common suite of quality assurance reviews (Durre et al., 2010). The archive includes the following meteorological elements: + + - Daily maximum temperature + - Daily minimum temperature + - Temperature at the time of observation + - Precipitation (i.e., rain, melted snow) + - Snowfall + - Snow depth + - Other elements where available + +## Downloading the data + +The data is available in two forms: + +- A [pre-prepared version](#pre-prepared-data) of the data for ClickHouse, which has been cleansed, re-structured, and enriched. This data covers the years 1900 to 2022. +- [Download the original data](#original-data) and convert it to the format required by ClickHouse. Users wanting to add their own columns may wish to explore this approach. + +### Pre-prepared data + +More specifically, rows that failed any of NOAA's quality assurance checks have been removed. The data has also been restructured from a measurement per line to a row per station id and date i.e. + +```csv +"station_id","date","tempAvg","tempMax","tempMin","precipitation","snowfall","snowDepth","percentDailySun","averageWindSpeed","maxWindSpeed","weatherType" +"AEM00041194","2022-07-30",347,0,308,0,0,0,0,0,0,0 +"AEM00041194","2022-07-31",371,413,329,0,0,0,0,0,0,0 +"AEM00041194","2022-08-01",384,427,357,0,0,0,0,0,0,0 +"AEM00041194","2022-08-02",381,424,352,0,0,0,0,0,0,0 +``` + +This is simpler to query and ensures the resulting table is less sparse. Finally, the data has also been enriched with latitude and longitude. + +This data is available in the following S3 location. Either download the data to your local filesystem (and insert using the ClickHouse client) or insert directly into ClickHouse (see [Inserting from S3](#inserting-from-s3)). + +To download: + +```bash +wget https://datasets-documentation.s3.eu-west-3.amazonaws.com/noaa/noaa_enriched.parquet +``` + +### Original data + +The following details the steps to download and transform the original data in preparation for loading into ClickHouse. + +#### Download + +To download the original data: + +```bash +for i in {1900..2023}; do wget https://noaa-ghcn-pds.s3.amazonaws.com/csv.gz/${i}.csv.gz; done +``` + +#### Sampling the data + +```bash +zcat 2021.csv.gz | head +AE000041196,20210101,TMAX,278,,,S, +AE000041196,20210101,PRCP,0,D,,S, +AE000041196,20210101,TAVG,214,H,,S, +AEM00041194,20210101,TMAX,266,,,S, +AEM00041194,20210101,TMIN,178,,,S, +AEM00041194,20210101,PRCP,0,,,S, +AEM00041194,20210101,TAVG,217,H,,S, +AEM00041217,20210101,TMAX,262,,,S, +AEM00041217,20210101,TMIN,155,,,S, +AEM00041217,20210101,TAVG,202,H,,S, +``` + +Summarizing the [format documentation](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn), the columns in order: + + - An 11 character station identification code. This itself encodes some useful information + - YEAR/MONTH/DAY = 8 character date in YYYYMMDD format (e.g. 19860529 = May 29, 1986) + - ELEMENT = 4 character indicator of element type.
Effectively the measurement type. While there are many measurements available, we select the following: + - PRCP - Precipitation (tenths of mm) + - SNOW - Snowfall (mm) + - SNWD - Snow depth (mm) + - TMAX - Maximum temperature (tenths of degrees C) + - TAVG - Average temperature (tenths of a degrees C) + - TMIN - Minimum temperature (tenths of degrees C) + - PSUN - Daily percent of possible sunshine (percent) + - AWND - Average daily wind speed (tenths of meters per second) + - WSFG - Peak gust wind speed (tenths of meters per second) + - WT** = Weather Type where ** defines the weather type. Full list of weather types here. +- DATA VALUE = 5 character data value for ELEMENT i.e. the value of the measurement. +- M-FLAG = 1 character Measurement Flag. This has 10 possible values. Some of these values indicate questionable data accuracy. We accept data where this is set to “P” - identified as missing presumed zero, as this is only relevant to the PRCP, SNOW and SNWD measurements. +- Q-FLAG is the measurement quality flag with 14 possible values. We are only interested in data with an empty value i.e. it did not fail any quality assurance checks. +- S-FLAG is the source flag for the observation. Not useful for our analysis and ignored. +- OBS-TIME = 4-character time of observation in hour-minute format (i.e. 0700 =7:00 am). Typically not present in older data. We ignore this for our purposes. + +A measurement per line would result in a sparse table structure in ClickHouse. We should transform to a row per time and station, with measurements as columns. First, we limit the dataset to those rows without issues i.e. where `qFlag` is equal to an empty string. + +#### Clean the data + +Using [ClickHouse local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) we can filter rows that represent measurements of interest and pass our quality requirements: + +```bash +clickhouse local --query "SELECT count() +FROM file('*.csv.gz', CSV, 'station_id String, date String, measurement String, value Int64, mFlag String, qFlag String, sFlag String, obsTime String') WHERE qFlag = '' AND (measurement IN ('PRCP', 'SNOW', 'SNWD', 'TMAX', 'TAVG', 'TMIN', 'PSUN', 'AWND', 'WSFG') OR startsWith(measurement, 'WT'))" + +2679264563 +``` + +With over 2.6 billion rows, this isn’t a fast query since it involves parsing all the files. On our 8 core machine, this takes around 160 seconds. + + +### Pivot data + +While the measurement per line structure can be used with ClickHouse, it will unnecessarily complicate future queries. Ideally, we need a row per station id and date, where each measurement type and associated value are a column i.e. + +```csv +"station_id","date","tempAvg","tempMax","tempMin","precipitation","snowfall","snowDepth","percentDailySun","averageWindSpeed","maxWindSpeed","weatherType" +"AEM00041194","2022-07-30",347,0,308,0,0,0,0,0,0,0 +"AEM00041194","2022-07-31",371,413,329,0,0,0,0,0,0,0 +"AEM00041194","2022-08-01",384,427,357,0,0,0,0,0,0,0 +"AEM00041194","2022-08-02",381,424,352,0,0,0,0,0,0,0 +``` + +Using ClickHouse local and a simple `GROUP BY`, we can repivot our data to this structure. To limit memory overhead, we do this one file at a time. 
+ +```bash +for i in {1900..2022} +do +clickhouse-local --query "SELECT station_id, + toDate32(date) as date, + anyIf(value, measurement = 'TAVG') as tempAvg, + anyIf(value, measurement = 'TMAX') as tempMax, + anyIf(value, measurement = 'TMIN') as tempMin, + anyIf(value, measurement = 'PRCP') as precipitation, + anyIf(value, measurement = 'SNOW') as snowfall, + anyIf(value, measurement = 'SNWD') as snowDepth, + anyIf(value, measurement = 'PSUN') as percentDailySun, + anyIf(value, measurement = 'AWND') as averageWindSpeed, + anyIf(value, measurement = 'WSFG') as maxWindSpeed, + toUInt8OrZero(replaceOne(anyIf(measurement, startsWith(measurement, 'WT') AND value = 1), 'WT', '')) as weatherType +FROM file('$i.csv.gz', CSV, 'station_id String, date String, measurement String, value Int64, mFlag String, qFlag String, sFlag String, obsTime String') + WHERE qFlag = '' AND (measurement IN ('PRCP', 'SNOW', 'SNWD', 'TMAX', 'TAVG', 'TMIN', 'PSUN', 'AWND', 'WSFG') OR startsWith(measurement, 'WT')) +GROUP BY station_id, date +ORDER BY station_id, date FORMAT CSV" >> "noaa.csv"; +done +``` + +This query produces a single 50GB file `noaa.csv`. + +### Enriching the data + +The data has no indication of location aside from a station id, which includes a prefix country code. Ideally, each station would have a latitude and longitude associated with it. To achieve this, NOAA conveniently provides the details of each station as a separate [ghcnd-stations.txt](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn#format-of-ghcnd-stationstxt-file). This file has [several columns](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn#format-of-ghcnd-stationstxt-file), of which five are useful to our future analysis: id, latitude, longitude, elevation, and name. + +```bash +wget http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-stations.txt +``` + +```bash +clickhouse local --query "WITH stations AS (SELECT id, lat, lon, elevation, splitByString(' GSN ',name)[1] as name FROM file('ghcnd-stations.txt', Regexp, 'id String, lat Float64, lon Float64, elevation Float32, name String')) +SELECT station_id, + date, + tempAvg, + tempMax, + tempMin, + precipitation, + snowfall, + snowDepth, + percentDailySun, + averageWindSpeed, + maxWindSpeed, + weatherType, + tuple(lon, lat) as location, + elevation, + name +FROM file('noaa.csv', CSV, + 'station_id String, date Date32, tempAvg Int32, tempMax Int32, tempMin Int32, precipitation Int32, snowfall Int32, snowDepth Int32, percentDailySun Int8, averageWindSpeed Int32, maxWindSpeed Int32, weatherType UInt8') as noaa LEFT OUTER + JOIN stations ON noaa.station_id = stations.id INTO OUTFILE 'noaa_enriched.parquet' FORMAT Parquet SETTINGS format_regexp='^(.{11})\s+(\-?\d{1,2}\.\d{4})\s+(\-?\d{1,3}\.\d{1,4})\s+(\-?\d*\.\d*)\s+(.*)\s+(?:[\d]*)'" +``` +This query takes a few minutes to run and produces an 6.4GB file `noaa_enriched.parquet`. + +## Create table + +Create a MergeTree table in ClickHouse (from the ClickHouse client). 
+ +```sql +CREATE TABLE noaa +( + `station_id` LowCardinality(String), + `date` Date32, + `tempAvg` Int32 COMMENT 'Average temperature (tenths of a degrees C)', + `tempMax` Int32 COMMENT 'Maximum temperature (tenths of degrees C)', + `tempMin` Int32 COMMENT 'Minimum temperature (tenths of degrees C)', + `precipitation` UInt32 COMMENT 'Precipitation (tenths of mm)', + `snowfall` UInt32 COMMENT 'Snowfall (mm)', + `snowDepth` UInt32 COMMENT 'Snow depth (mm)', + `percentDailySun` UInt8 COMMENT 'Daily percent of possible sunshine (percent)', + `averageWindSpeed` UInt32 COMMENT 'Average daily wind speed (tenths of meters per second)', + `maxWindSpeed` UInt32 COMMENT 'Peak gust wind speed (tenths of meters per second)', + `weatherType` Enum8('Normal' = 0, 'Fog' = 1, 'Heavy Fog' = 2, 'Thunder' = 3, 'Small Hail' = 4, 'Hail' = 5, 'Glaze' = 6, 'Dust/Ash' = 7, 'Smoke/Haze' = 8, 'Blowing/Drifting Snow' = 9, 'Tornado' = 10, 'High Winds' = 11, 'Blowing Spray' = 12, 'Mist' = 13, 'Drizzle' = 14, 'Freezing Drizzle' = 15, 'Rain' = 16, 'Freezing Rain' = 17, 'Snow' = 18, 'Unknown Precipitation' = 19, 'Ground Fog' = 21, 'Freezing Fog' = 22), + `location` Point, + `elevation` Float32, + `name` LowCardinality(String) +) ENGINE = MergeTree() ORDER BY (station_id, date); + +``` + +## Inserting into ClickHouse + +### Inserting from local file + +Data can be inserted from a local file as follows (from the ClickHouse client): + +```sql +INSERT INTO noaa FROM INFILE '<path>/noaa_enriched.parquet' +``` + +where `<path>` represents the full path to the local file on disk. + +See [here](https://clickhouse.com/blog/real-world-data-noaa-climate-data#load-the-data) for how to speed this load up. + +### Inserting from S3 + +```sql +INSERT INTO noaa SELECT * +FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/noaa/noaa_enriched.parquet') + +``` +For how to speed this up, see our blog post on [tuning large data loads](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part2). + +## Sample queries + +### Highest temperature ever + +```sql +SELECT + tempMax / 10 AS maxTemp, + location, + name, + date +FROM noaa +WHERE tempMax > 500 +ORDER BY + tempMax DESC, + date ASC +LIMIT 5 + +┌─maxTemp─┬─location──────────┬─name───────────────────────────────────────────┬───────date─┐ +│ 56.7 │ (-116.8667,36.45) │ CA GREENLAND RCH │ 1913-07-10 │ +│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1949-08-20 │ +│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1949-09-18 │ +│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1952-07-17 │ +│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1952-09-04 │ +└─────────┴───────────────────┴────────────────────────────────────────────────┴────────────┘ + +5 rows in set. Elapsed: 0.514 sec. Processed 1.06 billion rows, 4.27 GB (2.06 billion rows/s., 8.29 GB/s.) +``` + +Reassuringly consistent with the [documented record](https://en.wikipedia.org/wiki/List_of_weather_records#Highest_temperatures_ever_recorded) at [Furnace Creek](https://www.google.com/maps/place/36%C2%B027'00.0%22N+116%C2%B052'00.1%22W/@36.1329666,-116.1104099,8.95z/data=!4m5!3m4!1s0x0:0xf2ed901b860f4446!8m2!3d36.45!4d-116.8667) as of 2023. + +### Best ski resorts + +Using a [list of ski resorts](https://gist.githubusercontent.com/gingerwizard/dd022f754fd128fdaf270e58fa052e35/raw/622e03c37460f17ef72907afe554cb1c07f91f23/ski_resort_stats.csv) in the United States and their respective locations, we join these against the top 1000 weather stations with the most snow in any month in the last 5 yrs.
Sorting this join by [geoDistance](https://clickhouse.com/docs/en/sql-reference/functions/geo/coordinates/#geodistance) and restricting the results to those where the distance is less than 20km, we select the top result per resort and sort this by total snow. Note we also restrict resorts to those above 1800m, as a broad indicator of good skiing conditions. + +```sql +SELECT + resort_name, + total_snow / 1000 AS total_snow_m, + resort_location, + month_year +FROM +( + WITH resorts AS + ( + SELECT + resort_name, + state, + (lon, lat) AS resort_location, + 'US' AS code + FROM url('https://gist.githubusercontent.com/gingerwizard/dd022f754fd128fdaf270e58fa052e35/raw/622e03c37460f17ef72907afe554cb1c07f91f23/ski_resort_stats.csv', CSVWithNames) + ) + SELECT + resort_name, + highest_snow.station_id, + geoDistance(resort_location.1, resort_location.2, station_location.1, station_location.2) / 1000 AS distance_km, + highest_snow.total_snow, + resort_location, + station_location, + month_year + FROM + ( + SELECT + sum(snowfall) AS total_snow, + station_id, + any(location) AS station_location, + month_year, + substring(station_id, 1, 2) AS code + FROM noaa + WHERE (date > '2017-01-01') AND (code = 'US') AND (elevation > 1800) + GROUP BY + station_id, + toYYYYMM(date) AS month_year + ORDER BY total_snow DESC + LIMIT 1000 + ) AS highest_snow + INNER JOIN resorts ON highest_snow.code = resorts.code + WHERE distance_km < 20 + ORDER BY + resort_name ASC, + total_snow DESC + LIMIT 1 BY + resort_name, + station_id +) +ORDER BY total_snow DESC +LIMIT 5 + +┌─resort_name──────────┬─total_snow_m─┬─resort_location─┬─month_year─┐ +│ Sugar Bowl, CA │ 7.799 │ (-120.3,39.27) │ 201902 │ +│ Donner Ski Ranch, CA │ 7.799 │ (-120.34,39.31) │ 201902 │ +│ Boreal, CA │ 7.799 │ (-120.35,39.33) │ 201902 │ +│ Homewood, CA │ 4.926 │ (-120.17,39.08) │ 201902 │ +│ Alpine Meadows, CA │ 4.926 │ (-120.22,39.17) │ 201902 │ +└──────────────────────┴──────────────┴─────────────────┴────────────┘ + +5 rows in set. Elapsed: 0.750 sec. Processed 689.10 million rows, 3.20 GB (918.20 million rows/s., 4.26 GB/s.) +Peak memory usage: 67.66 MiB. +``` + +## Credits + +We would like to acknowledge the efforts of the Global Historical Climatology Network for preparing, cleansing, and distributing this data. We appreciate your efforts. + +Menne, M.J., I. Durre, B. Korzeniewski, S. McNeal, K. Thomas, X. Yin, S. Anthony, R. Ray, R.S. Vose, B.E.Gleason, and T.G. Houston, 2012: Global Historical Climatology Network - Daily (GHCN-Daily), Version 3. [indicate subset used following decimal, e.g. Version 3.25]. NOAA National Centers for Environmental Information. 
http://doi.org/10.7289/V5D21VHZ [17/08/2020] From ffde721f08359e0437c44026881e2514012a4966 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Jan 2024 23:09:10 +0300 Subject: [PATCH 0116/1081] Update 02932_set_ttl_where.sql --- tests/queries/0_stateless/02932_set_ttl_where.sql | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/queries/0_stateless/02932_set_ttl_where.sql b/tests/queries/0_stateless/02932_set_ttl_where.sql index bf2b317c4bf..ee8473e1af2 100644 --- a/tests/queries/0_stateless/02932_set_ttl_where.sql +++ b/tests/queries/0_stateless/02932_set_ttl_where.sql @@ -1,3 +1,5 @@ +-- Tags: no-ordinary-database + create or replace table t_temp ( a UInt32, timestamp DateTime @@ -12,3 +14,5 @@ select sleep(1); insert into t_temp select rand(), now() from system.numbers limit 1_000_000; select sleep(1); optimize table t_temp final; + +DROP TABLE t_temp; From 12585ea0e4cae1771ee6b51dd85a309e5923f12c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Jan 2024 23:10:27 +0300 Subject: [PATCH 0117/1081] Update TTLDescription.cpp --- src/Storages/TTLDescription.cpp | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index bfd3afc30d8..3db5269b617 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -117,11 +117,6 @@ TTLDescription::TTLDescription(const TTLDescription & other) , if_exists(other.if_exists) , recompression_codec(other.recompression_codec) { - // if (other.expression) - // expression = other.expression->clone(); - - // if (other.where_expression) - // where_expression = other.where_expression->clone(); } TTLDescription & TTLDescription::operator=(const TTLDescription & other) @@ -135,11 +130,6 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) else expression_ast.reset(); - // if (other.expression) - // expression = other.expression->clone(); - // else - // expression.reset(); - expression_columns = other.expression_columns; result_column = other.result_column; @@ -148,11 +138,6 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) else where_expression_ast.reset(); - // if (other.where_expression) - // where_expression = other.where_expression->clone(); - // else - // where_expression.reset(); - where_expression_columns = other.where_expression_columns; where_result_column = other.where_result_column; group_by_keys = other.group_by_keys; @@ -179,7 +164,6 @@ static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndType auto dag = analyzer.getActionsDAG(false); const auto * col = &dag->findInOutputs(ast->getColumnName()); - // std::cerr << "buildExpressionAndSets " << ttl_string << std::endl; if (col->result_name != ttl_string) col = &dag->addAlias(*col, ttl_string); @@ -189,10 +173,6 @@ static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndType result.expression = std::make_shared(dag, ExpressionActionsSettings::fromContext(context)); result.sets = analyzer.getPreparedSets(); - // std::cerr << "--------- buildExpressionAndSets\n"; - // std::cerr << result.expression->dumpActions() << std::endl; - // std::cerr << result.sets->getSubqueries().size() << std::endl; - return result; } @@ -232,8 +212,6 @@ TTLDescription TTLDescription::getTTLFromAST( auto expression = buildExpressionAndSets(ttl_ast, columns.getAllPhysical(), context).expression; result.expression_columns = expression->getRequiredColumnsWithTypes(); - // auto syntax_analyzer_result = 
TreeRewriter(context).analyze(ttl_ast, columns.getAllPhysical()); - // result.expression = ExpressionAnalyzer(ttl_ast, syntax_analyzer_result, context).getActions(false); result.result_column = expression->getSampleBlock().safeGetByPosition(0).name; ExpressionActionsPtr where_expression; @@ -256,9 +234,6 @@ TTLDescription TTLDescription::getTTLFromAST( { result.where_expression_ast = where_expr_ast->clone(); where_expression = buildExpressionAndSets(where_expr_ast, columns.getAllPhysical(), context).expression; - // auto where_syntax_result = TreeRewriter(context).analyze(where_expr_ast, columns.getAllPhysical()); - // result.where_expression = ExpressionAnalyzer(where_expr_ast, where_syntax_result, context).getActions(false); - result.where_expression_columns = where_expression->getRequiredColumnsWithTypes(); result.where_result_column = where_expression->getSampleBlock().safeGetByPosition(0).name; } From 776ea26ce71287735897b00c65b47d73e8d9811c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 02:45:51 +0300 Subject: [PATCH 0118/1081] Update PreparedSets.h --- src/Interpreters/PreparedSets.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Interpreters/PreparedSets.h b/src/Interpreters/PreparedSets.h index 30bfda4700d..4f5ca337c5b 100644 --- a/src/Interpreters/PreparedSets.h +++ b/src/Interpreters/PreparedSets.h @@ -115,7 +115,6 @@ public: SetPtr buildSetInplace(const ContextPtr & context); std::unique_ptr build(const ContextPtr & context); - void buildSetInplace(const ContextPtr & context); QueryTreeNodePtr detachQueryTree() { return std::move(query_tree); } void setQueryPlan(std::unique_ptr source_); From 1afc5e8c01685d1bb3e86b5a0fff55618db517b0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 03:45:42 +0100 Subject: [PATCH 0119/1081] Enable coverage for debug build --- docker/packager/packager | 8 ++++++++ tests/ci/build_check.py | 2 ++ tests/ci/ci_config.py | 2 ++ 3 files changed, 12 insertions(+) diff --git a/docker/packager/packager b/docker/packager/packager index ade36a55591..4c443896f4a 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -121,6 +121,7 @@ def is_release_build(debug_build: bool, package_type: str, sanitizer: str) -> bo def parse_env_variables( debug_build: bool, + coverage: bool, compiler: str, sanitizer: str, package_type: str, @@ -287,6 +288,9 @@ def parse_env_variables( else: result.append("BUILD_TYPE=None") + if coverage: + result.append("SANITIZE_COVERAGE=1") + if not cache: cmake_flags.append("-DCOMPILER_CACHE=disabled") @@ -415,6 +419,9 @@ def parse_args() -> argparse.Namespace: choices=("address", "thread", "memory", "undefined", ""), default="", ) + parser.add_argument( + "--coverage", action="store_true", help="enable granular coverage with introspection" + ) parser.add_argument("--clang-tidy", action="store_true") parser.add_argument( @@ -507,6 +514,7 @@ def main() -> None: env_prepared = parse_env_variables( args.debug_build, + args.coverage, args.compiler, args.sanitizer, args.package_type, diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 27243aac4f1..fe4308f5933 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -73,6 +73,8 @@ def get_packager_cmd( cmd += " --debug-build" if build_config.sanitizer: cmd += f" --sanitizer={build_config.sanitizer}" + if build_config.coverage: + cmd += " --coverage" if build_config.tidy: cmd += " --clang-tidy" diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index e3319fe4a72..b8dff3f0a28 100644 --- 
a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -62,6 +62,7 @@ class BuildConfig: package_type: Literal["deb", "binary", "fuzzers"] additional_pkgs: bool = False debug_build: bool = False + coverage: bool = False sanitizer: str = "" tidy: bool = False sparse_checkout: bool = False @@ -473,6 +474,7 @@ CI_CONFIG = CiConfig( name="package_debug", compiler="clang-17", debug_build=True, + coverage=True, package_type="deb", sparse_checkout=True, ), From 0219d58d925bd3f7901f9251c2abca76c1ae00dc Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sun, 14 Jan 2024 02:56:50 +0000 Subject: [PATCH 0120/1081] Automatic style fix --- docker/packager/packager | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index 4c443896f4a..3e7f1ba447e 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -420,7 +420,9 @@ def parse_args() -> argparse.Namespace: default="", ) parser.add_argument( - "--coverage", action="store_true", help="enable granular coverage with introspection" + "--coverage", + action="store_true", + help="enable granular coverage with introspection", ) parser.add_argument("--clang-tidy", action="store_true") From 6405decbb0ad0e80fe20b22a9956481abbe3b479 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 05:14:54 +0100 Subject: [PATCH 0121/1081] Fix Python --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index 4c443896f4a..2e2b6550636 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -289,7 +289,7 @@ def parse_env_variables( result.append("BUILD_TYPE=None") if coverage: - result.append("SANITIZE_COVERAGE=1") + cmake_flags.append("-DSANITIZE_COVERAGE=1") if not cache: cmake_flags.append("-DCOMPILER_CACHE=disabled") From 5ba6def57d0e256be75b729678fc37d4c8989f7e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 07:29:28 +0300 Subject: [PATCH 0122/1081] Update noaa.md --- .../getting-started/example-datasets/noaa.md | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/docs/en/getting-started/example-datasets/noaa.md b/docs/en/getting-started/example-datasets/noaa.md index 8d34ff8d3ee..bc2e9fecae1 100644 --- a/docs/en/getting-started/example-datasets/noaa.md +++ b/docs/en/getting-started/example-datasets/noaa.md @@ -11,7 +11,7 @@ This dataset contains weather measurements for the last 120 years. Each row is a More precisely and according to the [origin of this data](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn): -> GHCN-Daily is a dataset that contains daily observations over global land areas. It contains station-based measurements from land-based stations worldwide, about two thirds of which are for precipitation measurements only (Menne et al., 2012). GHCN-Daily is a composite of climate records from numerous sources that were merged together and subjected to a common suite of quality assurance reviews (Durre et al., 2010). The archive includes the following meteorological elements: +> GHCN-Daily is a dataset that contains daily observations over global land areas. It contains station-based measurements from land-based stations worldwide, about two-thirds of which are for precipitation measurements only (Menne et al., 2012). 
GHCN-Daily is a composite of climate records from numerous sources that were merged together and subjected to a common suite of quality assurance reviews (Durre et al., 2010). The archive includes the following meteorological elements: - Daily maximum temperature - Daily minimum temperature @@ -28,7 +28,7 @@ More precisely and according to the [origin of this data](https://github.com/aws ### Pre-prepared data -More specifically, rows have been removed that did not fail any quality assurance checks by Noaa. The data has also been restructured from a measurement per line to a row per station id and date i.e. +More specifically, rows have been removed that did not fail any quality assurance checks by Noaa. The data has also been restructured from a measurement per line to a row per station id and date, i.e. ```csv "station_id","date","tempAvg","tempMax","tempMin","precipitation","snowfall","snowDepth","percentDailySun","averageWindSpeed","maxWindSpeed","weatherType" @@ -63,17 +63,19 @@ for i in {1900..2023}; do wget https://noaa-ghcn-pds.s3.amazonaws.com/csv.gz/${i #### Sampling the data ```bash -zcat 2021.csv.gz | head -AE000041196,20210101,TMAX,278,,,S, -AE000041196,20210101,PRCP,0,D,,S, -AE000041196,20210101,TAVG,214,H,,S, -AEM00041194,20210101,TMAX,266,,,S, -AEM00041194,20210101,TMIN,178,,,S, -AEM00041194,20210101,PRCP,0,,,S, -AEM00041194,20210101,TAVG,217,H,,S, -AEM00041217,20210101,TMAX,262,,,S, -AEM00041217,20210101,TMIN,155,,,S, -AEM00041217,20210101,TAVG,202,H,,S, +$ clickhouse-local --query "SELECT * FROM '2021.csv.gz' LIMIT 10" --format PrettyCompact +┌─c1──────────┬───────c2─┬─c3───┬──c4─┬─c5───┬─c6───┬─c7─┬───c8─┐ +│ AE000041196 │ 20210101 │ TMAX │ 278 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AE000041196 │ 20210101 │ PRCP │ 0 │ D │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AE000041196 │ 20210101 │ TAVG │ 214 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ TMAX │ 266 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ TMIN │ 178 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ PRCP │ 0 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ TAVG │ 217 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041217 │ 20210101 │ TMAX │ 262 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041217 │ 20210101 │ TMIN │ 155 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041217 │ 20210101 │ TAVG │ 202 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +└─────────────┴──────────┴──────┴─────┴──────┴──────┴────┴──────┘ ``` Summarizing the [format documentation](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn): @@ -88,7 +90,7 @@ Summarizing the format documentation and the columns in order: - SNOW - Snowfall (mm) - SNWD - Snow depth (mm) - TMAX - Maximum temperature (tenths of degrees C) - - TAVG - Average temperature (tenths of a degrees C) + - TAVG - Average temperature (tenths of a degree C) - TMIN - Minimum temperature (tenths of degrees C) - PSUN - Daily percent of possible sunshine (percent) - AWND - Average daily wind speed (tenths of meters per second) @@ -215,7 +217,7 @@ CREATE TABLE noaa ### Inserting from local file -Data can be inserted from local file as follows (from the ClickHouse client): +Data can be inserted from a local file as follows (from the ClickHouse client): ```sql INSERT INTO noaa FROM INFILE '/noaa_enriched.parquet' From 9f5a7c51175dc3d4cfe46065b4912e7973a30983 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 08:05:57 +0100 Subject: [PATCH 0123/1081] Fix error --- cmake/sanitize.cmake | 1 + contrib/jemalloc-cmake/CMakeLists.txt | 3 +++ 2 files changed, 4 insertions(+) diff --git a/cmake/sanitize.cmake 
b/cmake/sanitize.cmake index 3f7a8498059..3882b51227e 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -82,3 +82,4 @@ if (SANITIZE_COVERAGE) endif() set (WITHOUT_COVERAGE_FLAGS "-fno-profile-instr-generate -fno-coverage-mapping -fno-sanitize-coverage=trace-pc-guard,pc-table") +set (WITHOUT_COVERAGE_FLAGS_LIST -fno-profile-instr-generate -fno-coverage-mapping -fno-sanitize-coverage=trace-pc-guard,pc-table) diff --git a/contrib/jemalloc-cmake/CMakeLists.txt b/contrib/jemalloc-cmake/CMakeLists.txt index 15e965ed841..f85a38dcf8a 100644 --- a/contrib/jemalloc-cmake/CMakeLists.txt +++ b/contrib/jemalloc-cmake/CMakeLists.txt @@ -161,6 +161,9 @@ target_include_directories(_jemalloc SYSTEM PRIVATE target_compile_definitions(_jemalloc PRIVATE -DJEMALLOC_NO_PRIVATE_NAMESPACE) +# Because our coverage callbacks call malloc, and recursive call of malloc could not work. +target_compile_options(_jemalloc PRIVATE ${WITHOUT_COVERAGE_FLAGS_LIST}) + if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") target_compile_definitions(_jemalloc PRIVATE -DJEMALLOC_DEBUG=1 From 3d904cbf81eb6ce2472eabdcd0be5f6955984ce5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 08:09:08 +0100 Subject: [PATCH 0124/1081] Slightly better --- base/base/coverage.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index d70c3bcd82b..ac8055e836c 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -1,4 +1,5 @@ #include "coverage.h" +#include #pragma GCC diagnostic ignored "-Wreserved-identifier" @@ -57,6 +58,14 @@ namespace uintptr_t * all_addresses_array = nullptr; size_t all_addresses_array_size = 0; + + uintptr_t * allocate(size_t size) + { + void * map = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (MAP_FAILED == map) + return nullptr; + return static_cast(map); + } } extern "C" @@ -79,7 +88,7 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t * start, uint32_t * stop) coverage_array_size = stop - start; /// Note: we will leak this. 
- coverage_array = static_cast(malloc(sizeof(uintptr_t) * coverage_array_size)); + coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); resetCoverage(); } @@ -92,7 +101,7 @@ void __sanitizer_cov_pcs_init(const uintptr_t * pcs_begin, const uintptr_t * pcs return; pc_table_initialized = true; - all_addresses_array = static_cast(malloc(sizeof(uintptr_t) * coverage_array_size)); + all_addresses_array = allocate(sizeof(uintptr_t) * coverage_array_size); all_addresses_array_size = pcs_end - pcs_begin; /// They are not a real pointers, but also contain a flag in the most significant bit, From 33d9a1d4e83d58f15e36ea6e88908c8410f03c40 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 22:03:47 +0100 Subject: [PATCH 0125/1081] Documentation --- src/Functions/coverage.cpp | 48 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/src/Functions/coverage.cpp b/src/Functions/coverage.cpp index 8a62469fa54..86de047a76b 100644 --- a/src/Functions/coverage.cpp +++ b/src/Functions/coverage.cpp @@ -85,8 +85,52 @@ public: REGISTER_FUNCTION(Coverage) { - factory.registerFunction("coverage", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Current)); }); - factory.registerFunction("coverageAll", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::All)); }); + factory.registerFunction("coverage", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Current)); }, + FunctionDocumentation + { + .description=R"( +This function is only available if ClickHouse was built with the SANITIZE_COVERAGE=1 option. + +It returns an array of unique addresses (a subset of the instrumented points in code) in the code +encountered at runtime after the previous coverage reset (with the `SYSTEM RESET COVERAGE` query) or after server startup. + +[example:functions] + +The order of array elements is undetermined. + +You can use another function, `coverageAll` to find all instrumented addresses in the code to compare and calculate the percentage. + +You can process the addresses with the `addressToSymbol` (possibly with `demangle`) and `addressToLine` functions +to calculate symbol-level, file-level, or line-level coverage. + +If you run multiple tests sequentially and reset the coverage with the `SYSTEM RESET COVERAGE` query between the tests, +you can obtain a coverage information for every test in isolation, to find which functions are covered by which tests and vise-versa. + +By default, every *basic block* in the code is covered, which roughly means - a sequence of instructions without jumps, +e.g. a body of for loop without ifs, or a single branch of if. + +See https://clang.llvm.org/docs/SanitizerCoverage.html for more information. +)", + .examples{ + {"functions", "SELECT DISTINCT demangle(addressToSymbol(arrayJoin(coverage())))", ""}}, + .categories{"Introspection"} + }); + + factory.registerFunction("coverageAll", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::All)); }, + FunctionDocumentation + { + .description=R"( +This function is only available if ClickHouse was built with the SANITIZE_COVERAGE=1 option. + +It returns an array of all unique addresses in the code instrumented for coverage +- all possible addresses that can appear in the result of the `coverage` function. + +You can use this function, and the `coverage` function to compare and calculate the coverage percentage. + +See the `coverage` function for the details. 
+)", + .categories{"Introspection"} + }); } } From 3bd2c7e384d07d07da8768aa4708c7726b828db5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 22:06:25 +0100 Subject: [PATCH 0126/1081] Report coverage if available --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index c7049b0e0c8..2d278f18176 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -2840,7 +2840,7 @@ def parse_args(): parser.add_argument( "--collect-per-test-coverage", action="store_true", - default=False, + default=True, help="Create `system.coverage` table on the server and collect information about low-level code coverage on a per test basis there", ) parser.add_argument( From 1c1e1512bf92c4788ce17f38cf228d4525cdb9eb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 01:29:38 +0300 Subject: [PATCH 0127/1081] Update noaa.md --- docs/en/getting-started/example-datasets/noaa.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/getting-started/example-datasets/noaa.md b/docs/en/getting-started/example-datasets/noaa.md index bc2e9fecae1..9a3ec7791b6 100644 --- a/docs/en/getting-started/example-datasets/noaa.md +++ b/docs/en/getting-started/example-datasets/noaa.md @@ -185,7 +185,7 @@ FROM file('noaa.csv', CSV, 'station_id String, date Date32, tempAvg Int32, tempMax Int32, tempMin Int32, precipitation Int32, snowfall Int32, snowDepth Int32, percentDailySun Int8, averageWindSpeed Int32, maxWindSpeed Int32, weatherType UInt8') as noaa LEFT OUTER JOIN stations ON noaa.station_id = stations.id INTO OUTFILE 'noaa_enriched.parquet' FORMAT Parquet SETTINGS format_regexp='^(.{11})\s+(\-?\d{1,2}\.\d{4})\s+(\-?\d{1,3}\.\d{1,4})\s+(\-?\d*\.\d*)\s+(.*)\s+(?:[\d]*)'" ``` -This query takes a few minutes to run and produces an 6.4GB file `noaa_enriched.parquet`. +This query takes a few minutes to run and produces a 6.4 GB file, `noaa_enriched.parquet`. ## Create table From 9141e1693f03f39d2eda37423918d2b2d873877a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:24:17 +0100 Subject: [PATCH 0128/1081] Calculate cumulative coverage by default. --- tests/clickhouse-test | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 2d278f18176..f1b20a3a43e 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1322,6 +1322,7 @@ class TestCase: # We want to calculate per-test code coverage. That's why we reset it before each test. if ( args.collect_per_test_coverage + and args.reset_coverage_before_every_test and BuildFlags.SANITIZE_COVERAGE in args.build_flags ): clickhouse_execute( @@ -2843,6 +2844,12 @@ def parse_args(): default=True, help="Create `system.coverage` table on the server and collect information about low-level code coverage on a per test basis there", ) + parser.add_argument( + "--reset-coverage-before-every-test", + action="store_true", + default=False, + help="Collect isolated test coverage for every test instead of a cumulative. 
Useful only when tests are run sequentially.", + ) parser.add_argument( "--report-logs-stats", action="store_true", From f7abeff0857ec231a7107d2a006b5f98b60a689f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:25:07 +0100 Subject: [PATCH 0129/1081] Slightly better reporting --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index f1b20a3a43e..e480957e5f4 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1259,7 +1259,7 @@ class TestCase: retry_error_codes=True, ).decode() - description_full += f" Coverage: {coverage}" + description_full += f" (coverage: {coverage})" description_full += "\n" From 3e09feda336a355173b46ec85a9cd86d640f3348 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:36:07 +0100 Subject: [PATCH 0130/1081] More functions --- base/base/coverage.cpp | 21 +++++++++++++++------ base/base/coverage.h | 5 ++++- src/Functions/coverage.cpp | 33 ++++++++++++++++++++++++++++----- tests/clickhouse-test | 6 +++--- 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index ac8055e836c..499e384d21f 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -53,7 +53,8 @@ namespace uint32_t * guards_start = nullptr; uint32_t * guards_end = nullptr; - uintptr_t * coverage_array = nullptr; + uintptr_t * current_coverage_array = nullptr; + uintptr_t * cumulative_coverage_array = nullptr; size_t coverage_array_size = 0; uintptr_t * all_addresses_array = nullptr; @@ -88,7 +89,8 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t * start, uint32_t * stop) coverage_array_size = stop - start; /// Note: we will leak this. - coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); + current_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); + cumulative_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); resetCoverage(); } @@ -126,15 +128,22 @@ void __sanitizer_cov_trace_pc_guard(uint32_t * guard) /// and use them to dereference an array or a bit vector. void * pc = __builtin_return_address(0); - coverage_array[guard - guards_start] = reinterpret_cast(pc); + current_coverage_array[guard - guards_start] = reinterpret_cast(pc); + cumulative_coverage_array[guard - guards_start] = reinterpret_cast(pc); } } __attribute__((no_sanitize("coverage"))) -std::span getCoverage() +std::span getCurrentCoverage() { - return {coverage_array, coverage_array_size}; + return {current_coverage_array, coverage_array_size}; +} + +__attribute__((no_sanitize("coverage"))) +std::span getCumulativeCoverage() +{ + return {cumulative_coverage_array, coverage_array_size}; } __attribute__((no_sanitize("coverage"))) @@ -146,7 +155,7 @@ std::span getAllInstrumentedAddresses() __attribute__((no_sanitize("coverage"))) void resetCoverage() { - memset(coverage_array, 0, coverage_array_size * sizeof(*coverage_array)); + memset(current_coverage_array, 0, coverage_array_size * sizeof(*current_coverage_array)); /// The guard defines whether the __sanitizer_cov_trace_pc_guard should be called. /// For example, you can unset it after first invocation to prevent excessive work. 
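For reference, with the split into a current and a cumulative array, all three views become queryable through the SQL functions registered in src/Functions/coverage.cpp further down in this patch. A minimal sketch of inspecting them from a client, assuming a binary built with -DSANITIZE_COVERAGE=1 (the symbolization query additionally requires introspection functions to be allowed):

```sql
-- Instrumented points hit since the last `SYSTEM RESET COVERAGE`, since startup,
-- and the total number of instrumented points in this binary.
SELECT
    length(coverageCurrent())    AS hit_since_reset,
    length(coverageCumulative()) AS hit_since_startup,
    length(coverageAll())        AS instrumented_total;

-- Symbolize the currently covered addresses.
SELECT DISTINCT demangle(addressToSymbol(arrayJoin(coverageCurrent()))) AS covered_symbol
SETTINGS allow_introspection_functions = 1;
```
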
diff --git a/base/base/coverage.h b/base/base/coverage.h index f75ed2d3553..a6e5a6848d7 100644 --- a/base/base/coverage.h +++ b/base/base/coverage.h @@ -15,7 +15,10 @@ void dumpCoverageReportIfPossible(); /// Get accumulated unique program addresses of the instrumented parts of the code, /// seen so far after program startup or after previous reset. /// The returned span will be represented as a sparse map, containing mostly zeros, which you should filter away. -std::span getCoverage(); +std::span getCurrentCoverage(); + +/// Similar but not being reset. +std::span getCumulativeCoverage(); /// Get all instrumented addresses that could be in the coverage. std::span getAllInstrumentedAddresses(); diff --git a/src/Functions/coverage.cpp b/src/Functions/coverage.cpp index 86de047a76b..f4cac26df78 100644 --- a/src/Functions/coverage.cpp +++ b/src/Functions/coverage.cpp @@ -21,11 +21,14 @@ namespace enum class Kind { Current, + Cumulative, All }; /** If ClickHouse is build with coverage instrumentation, returns an array - * of currently accumulated (`coverage`) / all possible (`coverageAll`) unique code addresses. + * of currently accumulated (`coverageCurrent`) + * or accumulated since the startup (`coverageCumulative`) + * or all possible (`coverageAll`) unique code addresses. */ class FunctionCoverage : public IFunction { @@ -64,7 +67,11 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override { - auto coverage_table = kind == Kind::Current ? getCoverage() : getAllInstrumentedAddresses(); + auto coverage_table = kind == Kind::Current + ? getCurrentCoverage() + : (kind == Kind::Cumulative + ? getCumulativeCoverage() + : getAllInstrumentedAddresses()); auto column_addresses = ColumnUInt64::create(); auto & data = column_addresses->getData(); @@ -85,7 +92,7 @@ public: REGISTER_FUNCTION(Coverage) { - factory.registerFunction("coverage", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Current)); }, + factory.registerFunction("coverageCurrent", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Current)); }, FunctionDocumentation { .description=R"( @@ -112,7 +119,23 @@ e.g. a body of for loop without ifs, or a single branch of if. See https://clang.llvm.org/docs/SanitizerCoverage.html for more information. )", .examples{ - {"functions", "SELECT DISTINCT demangle(addressToSymbol(arrayJoin(coverage())))", ""}}, + {"functions", "SELECT DISTINCT demangle(addressToSymbol(arrayJoin(coverageCurrent())))", ""}}, + .categories{"Introspection"} + }); + + factory.registerFunction("coverageCumulative", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Cumulative)); }, + FunctionDocumentation + { + .description=R"( +This function is only available if ClickHouse was built with the SANITIZE_COVERAGE=1 option. + +It returns an array of unique addresses (a subset of the instrumented points in code) in the code +encountered at runtime after server startup. + +In contrast to `coverageCurrent` it cannot be reset with the `SYSTEM RESET COVERAGE`. + +See the `coverageCurrent` function for the details. +)", .categories{"Introspection"} }); @@ -127,7 +150,7 @@ It returns an array of all unique addresses in the code instrumented for coverag You can use this function, and the `coverage` function to compare and calculate the coverage percentage. -See the `coverage` function for the details. +See the `coverageCurrent` function for the details. 
)", .categories{"Introspection"} }); diff --git a/tests/clickhouse-test b/tests/clickhouse-test index e480957e5f4..a39c90947ba 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1249,13 +1249,13 @@ class TestCase: ): clickhouse_execute( args, - f"INSERT INTO system.coverage SELECT now(), '{self.case}', coverage()", + f"INSERT INTO system.coverage SELECT now(), '{self.case}', coverageCurrent()", retry_error_codes=True, ) coverage = clickhouse_execute( args, - "SELECT length(coverage())", + "SELECT length(coverageCurrent())", retry_error_codes=True, ).decode() @@ -2460,7 +2460,7 @@ def main(args): # Coverage collected at the system startup before running any tests: clickhouse_execute( args, - "INSERT INTO system.coverage SELECT now(), '', coverage()", + "INSERT INTO system.coverage SELECT now(), '', coverageCurrent()", ) total_tests_run = 0 From e4cd02ea39642dd9b8d519aee0426b752423c3bf Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:36:24 +0100 Subject: [PATCH 0131/1081] Fix typo --- src/IO/OpenedFile.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/IO/OpenedFile.h b/src/IO/OpenedFile.h index 10c36d9e1d3..4c4de2265bc 100644 --- a/src/IO/OpenedFile.h +++ b/src/IO/OpenedFile.h @@ -21,7 +21,7 @@ public: OpenedFile(const std::string & file_name_, int flags_); ~OpenedFile(); - /// Close prematurally. + /// Close prematurely. void close(); int getFD() const; @@ -40,4 +40,3 @@ private: }; } - From 30c362909089d6f7fe93b639dfdf1666d5bcfc7c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:36:44 +0100 Subject: [PATCH 0132/1081] An option to dump coverage to a file at exit --- programs/main.cpp | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/programs/main.cpp b/programs/main.cpp index 7d07112de66..4852ed8990e 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ #include #include +#include /// Universal executable for various clickhouse applications @@ -512,6 +514,49 @@ int main(int argc_, char ** argv_) if (main_func == printHelp && !argv.empty() && (argv.size() == 1 || argv[1][0] == '-')) main_func = mainEntryClickHouseLocal; - return main_func(static_cast(argv.size()), argv.data()); + int exit_code = main_func(static_cast(argv.size()), argv.data()); + +#if defined(SANITIZE_COVERAGE) + /// A user can request to dump the coverage information into files at exit. + /// This is useful for non-server applications such as clickhouse-format or clickhouse-client, + /// that cannot introspect it with SQL functions at runtime. + + /// The CLICKHOUSE_WRITE_COVERAGE environment variable defines a prefix for two filenames: + /// 'prefix.covered' and 'prefix.all' which will contain + /// the list of addresses of covered and all instrumented addresses, respectively. + + /// The format is even simpler than Clang's "sancov": an array of 64-bit addresses, native byte order, no header. + + if (const char * coverage_filename_prefix = getenv("CLICKHOUSE_WRITE_COVERAGE")) // NOLINT(concurrency-mt-unsafe) + { + auto dumpCoverage = [](const std::string & name, auto span) + { + /// Write only non-zeros. 
+ std::vector data; + data.reserve(span.size()); + for (auto addr : span) + if (addr) + data.push_back(addr); + + int fd = ::open(name.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0400); + if (-1 == fd) + { + writeError("Cannot open a file to write the coverage data\n"); + } + else + { + if (!writeRetry(fd, reinterpret_cast(data.data()), data.size() * sizeof(data[0]))) + writeError("Cannot write the coverage data to a file\n"); + if (0 != ::close(fd)) + writeError("Cannot close the file with coverage data\n"); + } + }; + + dumpCoverage(coverage_filename_prefix + std::string(".covered"), getCumulativeCoverage()); + dumpCoverage(coverage_filename_prefix + std::string(".all"), getAllInstrumentedAddresses()); + } +#endif + + return exit_code; } #endif From fe952fb64c460c260c77336142b5eb4bd05b46d8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:38:42 +0100 Subject: [PATCH 0133/1081] Rename to system.coverage_log to simplify export --- tests/clickhouse-test | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index a39c90947ba..eb85bdff0f5 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1249,7 +1249,7 @@ class TestCase: ): clickhouse_execute( args, - f"INSERT INTO system.coverage SELECT now(), '{self.case}', coverageCurrent()", + f"INSERT INTO system.coverage_log SELECT now(), '{self.case}', coverageCurrent()", retry_error_codes=True, ) @@ -2448,7 +2448,7 @@ def main(args): clickhouse_execute( args, """ - CREATE TABLE IF NOT EXISTS system.coverage + CREATE TABLE IF NOT EXISTS system.coverage_log ( time DateTime, test_name String, @@ -2460,7 +2460,7 @@ def main(args): # Coverage collected at the system startup before running any tests: clickhouse_execute( args, - "INSERT INTO system.coverage SELECT now(), '', coverageCurrent()", + "INSERT INTO system.coverage_log SELECT now(), '', coverageCurrent()", ) total_tests_run = 0 @@ -2842,7 +2842,7 @@ def parse_args(): "--collect-per-test-coverage", action="store_true", default=True, - help="Create `system.coverage` table on the server and collect information about low-level code coverage on a per test basis there", + help="Create `system.coverage_log` table on the server and collect information about low-level code coverage on a per test basis there", ) parser.add_argument( "--reset-coverage-before-every-test", From 7662628393f97dd1c094b3346cc55c71f10ad193 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:41:14 +0100 Subject: [PATCH 0134/1081] Export coverage to the CI database --- docker/test/base/setup_export_logs.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index ea82e071112..659bf29b057 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -129,6 +129,19 @@ function setup_logs_replication debug_or_sanitizer_build=$(clickhouse-client -q "WITH ((SELECT value FROM system.build_options WHERE name='BUILD_TYPE') AS build, (SELECT value FROM system.build_options WHERE name='CXX_FLAGS') as flags) SELECT build='Debug' OR flags LIKE '%fsanitize%'") echo "Build is debug or sanitizer: $debug_or_sanitizer_build" + # We will pre-create a table system.coverage_log. 
+ # It is normally created by clickhouse-test rather than the server, + # so we will create it in advance to make it be picked up by the next commands: + + clickhouse-client --query " + CREATE TABLE IF NOT EXISTS system.coverage_log + ( + time DateTime, + test_name String, + coverage Array(UInt64) + ) ENGINE = MergeTree ORDER BY test_name + " + # For each system log table: echo 'Create %_log tables' clickhouse-client --query "SHOW TABLES FROM system LIKE '%\\_log'" | while read -r table From 97200e2c5d65693ad5d1e6a7c7dea3d5cac0e23d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:46:01 +0100 Subject: [PATCH 0135/1081] Symbolization --- docker/test/base/setup_export_logs.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 659bf29b057..e141bc00a77 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -138,7 +138,8 @@ function setup_logs_replication ( time DateTime, test_name String, - coverage Array(UInt64) + coverage Array(UInt64), + symbols Array(LowCardinality(String)) MATERIALIZED arrayMap(x -> demangle(addressToSymbol(x)), coverage) ) ENGINE = MergeTree ORDER BY test_name " From bf2e5748575ad2eb74eb057e0ee242a149edecdb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:48:47 +0100 Subject: [PATCH 0136/1081] Symbolization --- docker/test/base/setup_export_logs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index e141bc00a77..20dd864318f 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -188,7 +188,7 @@ function setup_logs_replication echo "Creating table system.${table}_sender" >&2 # Create Distributed table and materialized view to watch on the original table: - clickhouse-client --query " + clickhouse-client --asterisk_include_materialized_columns 1 --query " CREATE TABLE system.${table}_sender ENGINE = Distributed(${CLICKHOUSE_CI_LOGS_CLUSTER}, default, ${table}_${hash}) SETTINGS flush_on_detach=0 @@ -199,7 +199,7 @@ function setup_logs_replication echo "Creating materialized view system.${table}_watcher" >&2 - clickhouse-client --query " + clickhouse-client --asterisk_include_materialized_columns 1 --query " CREATE MATERIALIZED VIEW system.${table}_watcher TO system.${table}_sender AS SELECT ${EXTRA_COLUMNS_EXPRESSION_FOR_TABLE}, * FROM system.${table} From c5dfae1bcade85289b78f0bb760c92bcee078743 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 05:07:23 +0100 Subject: [PATCH 0137/1081] Fix error --- docker/test/base/setup_export_logs.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 20dd864318f..26fcd10d666 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -23,6 +23,10 @@ EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name, "} EXTRA_COLUMNS_TRACE_LOG="${EXTRA_COLUMNS} symbols Array(LowCardinality(String)), lines Array(LowCardinality(String)), " EXTRA_COLUMNS_EXPRESSION_TRACE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x -> demangle(addressToSymbol(x)), trace)::Array(LowCardinality(String)) AS symbols, arrayMap(x -> addressToLine(x), trace)::Array(LowCardinality(String)) AS lines" +# coverage_log needs more columns for symbolization, but only symbol 
names (the line numbers are too heavy to calculate) +EXTRA_COLUMNS_COVERAGE_LOG="${EXTRA_COLUMNS} symbols Array(LowCardinality(String)), " +EXTRA_COLUMNS_EXPRESSION_COVERAGE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x -> demangle(addressToSymbol(x)), coverage)::Array(LowCardinality(String)) AS symbols" + function __set_connection_args { @@ -138,8 +142,7 @@ function setup_logs_replication ( time DateTime, test_name String, - coverage Array(UInt64), - symbols Array(LowCardinality(String)) MATERIALIZED arrayMap(x -> demangle(addressToSymbol(x)), coverage) + coverage Array(UInt64) ) ENGINE = MergeTree ORDER BY test_name " @@ -158,7 +161,10 @@ function setup_logs_replication else EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION_TRACE_LOG}" fi - else + elif [[ "$table" = "coverage_log" ]] + EXTRA_COLUMNS_FOR_TABLE="${EXTRA_COLUMNS_COVERAGE_LOG}" + EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION_COVERAGE_LOG}" + then EXTRA_COLUMNS_FOR_TABLE="${EXTRA_COLUMNS}" EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION}" fi From e13ca48bce836a2534047e59a4e922395a8f6a87 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 05:13:43 +0100 Subject: [PATCH 0138/1081] Better dump on exit --- programs/main.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/programs/main.cpp b/programs/main.cpp index 4852ed8990e..8958d84e243 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -521,9 +521,8 @@ int main(int argc_, char ** argv_) /// This is useful for non-server applications such as clickhouse-format or clickhouse-client, /// that cannot introspect it with SQL functions at runtime. - /// The CLICKHOUSE_WRITE_COVERAGE environment variable defines a prefix for two filenames: - /// 'prefix.covered' and 'prefix.all' which will contain - /// the list of addresses of covered and all instrumented addresses, respectively. + /// The CLICKHOUSE_WRITE_COVERAGE environment variable defines a prefix for a filename 'prefix.pid' + /// containing the list of addresses of covered . /// The format is even simpler than Clang's "sancov": an array of 64-bit addresses, native byte order, no header. 
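Because the dump is nothing more than consecutive native-endian 64-bit addresses with no header, it can be loaded back for analysis with the `file()` table function and the `RowBinary` format, mirroring what `clickhouse-test` does with `input()` in the next patch. A sketch under the assumption that a dump named `coverage.12345` (a hypothetical PID suffix) was copied into the server's `user_files` directory and that the same build of the binary is used for symbolization:

```sql
-- Addresses are only meaningful for the build that produced them.
SELECT DISTINCT demangle(addressToSymbol(addr)) AS covered_symbol
FROM file('coverage.12345', 'RowBinary', 'addr UInt64')
LIMIT 20
SETTINGS allow_introspection_functions = 1;
```
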
@@ -552,8 +551,7 @@ int main(int argc_, char ** argv_) } }; - dumpCoverage(coverage_filename_prefix + std::string(".covered"), getCumulativeCoverage()); - dumpCoverage(coverage_filename_prefix + std::string(".all"), getAllInstrumentedAddresses()); + dumpCoverage(fmt::format("{}.{}", coverage_filename_prefix, getpid()), getCumulativeCoverage()); } #endif From e49cfbef089499a457c8793724629e2e94c8dc37 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 05:40:03 +0100 Subject: [PATCH 0139/1081] Coverage for non-server tools --- tests/clickhouse-test | 23 +++++++++++++++++++++++ tests/queries/shell_config.sh | 4 ++++ 2 files changed, 27 insertions(+) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index eb85bdff0f5..bd796dbfdf2 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -12,6 +12,7 @@ import itertools import sys import os import os.path +import glob import platform import signal import re @@ -74,6 +75,10 @@ def stringhash(s): # only during process invocation https://stackoverflow.com/a/42089311 return zlib.crc32(s.encode("utf-8")) +def read_file_as_binary_string(file_path): + with open(file_path, 'rb') as file: + binary_data = file.read() + return binary_data # First and last lines of the log def trim_for_log(s): @@ -101,6 +106,7 @@ class HTTPError(Exception): def clickhouse_execute_http( base_args, query, + body=None, timeout=30, settings=None, default_format=None, @@ -140,6 +146,7 @@ def clickhouse_execute_http( client.request( "POST", f"/?{base_args.client_options_query_str}{urllib.parse.urlencode(params)}", + body=body ) res = client.getresponse() data = res.read() @@ -160,6 +167,7 @@ def clickhouse_execute_http( def clickhouse_execute( base_args, query, + body=None, timeout=30, settings=None, max_http_retries=5, @@ -168,6 +176,7 @@ def clickhouse_execute( return clickhouse_execute_http( base_args, query, + body, timeout, settings, max_http_retries=max_http_retries, @@ -181,6 +190,7 @@ def clickhouse_execute_json( data = clickhouse_execute_http( base_args, query, + None, timeout, settings, "JSONEachRow", @@ -1253,6 +1263,19 @@ class TestCase: retry_error_codes=True, ) + # Check for dumped coverage files + file_pattern = "coverage.*" + matching_files = glob.glob(file_pattern) + for file_path in matching_files: + body = read_file_as_binary_string(file_path) + clickhouse_execute( + args, + f"INSERT INTO system.coverage_log SELECT now(), '{self.case}', groupArray(data) FROM input('data UInt64') FORMAT RowBinary", + body=body, + retry_error_codes=True, + ) + os.remove(file_path) + coverage = clickhouse_execute( args, "SELECT length(coverageCurrent())", diff --git a/tests/queries/shell_config.sh b/tests/queries/shell_config.sh index c687a63623f..614bfcece8f 100644 --- a/tests/queries/shell_config.sh +++ b/tests/queries/shell_config.sh @@ -4,6 +4,10 @@ # Don't check for ODR violation, since we may test shared build with ASAN export ASAN_OPTIONS=detect_odr_violation=0 +# If ClickHouse was built with coverage - dump the coverage information at exit +# (in other cases this environment variable has no effect) +export CLICKHOUSE_WRITE_COVERAGE="coverage" + export CLICKHOUSE_DATABASE=${CLICKHOUSE_DATABASE:="test"} export CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL:="warning"} From 678a32cedee768b6c1a6748e96a0d103e853d8bc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 06:12:58 +0100 Subject: [PATCH 0140/1081] Obey Python's quirky formatter --- tests/integration/ci-runner.py | 13 +- 
.../test_async_insert_memory/test.py | 2 +- tests/integration/test_check_table/test.py | 76 +++-- .../test_cluster_discovery/test.py | 2 +- .../test_ldap_external_user_directory/test.py | 26 +- tests/integration/test_mysql_protocol/test.py | 16 +- tests/integration/test_partition/test.py | 4 +- .../test_replicated_database/test.py | 9 +- .../test.py | 9 +- .../s3_mocks/unstable_server.py | 2 +- tests/integration/test_storage_s3/test.py | 17 +- tests/integration/test_storage_url/test.py | 22 +- tests/integration/test_system_merges/test.py | 45 ++- utils/grpc-client/pb2/clickhouse_grpc_pb2.py | 271 ++++++++++-------- .../pb2/clickhouse_grpc_pb2_grpc.py | 237 +++++++++------ 15 files changed, 433 insertions(+), 318 deletions(-) diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py index 7c922e339fe..d54ed2bb767 100755 --- a/tests/integration/ci-runner.py +++ b/tests/integration/ci-runner.py @@ -305,14 +305,11 @@ class ClickhouseIntegrationTestsRunner: def _pre_pull_images(self, repo_path): image_cmd = self._get_runner_image_cmd(repo_path) - cmd = ( - "cd {repo_path}/tests/integration && " - "timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} --pre-pull --command '{command}' ".format( - repo_path=repo_path, - runner_opts=self._get_runner_opts(), - image_cmd=image_cmd, - command=r""" echo Pre Pull finished """, - ) + cmd = "cd {repo_path}/tests/integration && " "timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} --pre-pull --command '{command}' ".format( + repo_path=repo_path, + runner_opts=self._get_runner_opts(), + image_cmd=image_cmd, + command=r""" echo Pre Pull finished """, ) for i in range(5): diff --git a/tests/integration/test_async_insert_memory/test.py b/tests/integration/test_async_insert_memory/test.py index 5d2e5503680..f897007f7bb 100644 --- a/tests/integration/test_async_insert_memory/test.py +++ b/tests/integration/test_async_insert_memory/test.py @@ -43,7 +43,7 @@ def test_memory_usage(): response = node.get_query_request( "SELECT groupArray(number) FROM numbers(1000000) SETTINGS max_memory_usage_for_user={}".format( - 30 * (2**23) + 30 * (2 ** 23) ), user="A", ) diff --git a/tests/integration/test_check_table/test.py b/tests/integration/test_check_table/test.py index 021977fb6b6..ebf404e698b 100644 --- a/tests/integration/test_check_table/test.py +++ b/tests/integration/test_check_table/test.py @@ -95,15 +95,25 @@ def test_check_normal_table_corruption(started_cluster, merge_tree_settings): node1, "non_replicated_mt", "201902_1_1_0", database="default" ) - assert node1.query( - "CHECK TABLE non_replicated_mt", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ).strip().split("\t")[0:2] == ["201902_1_1_0", "0"] + assert ( + node1.query( + "CHECK TABLE non_replicated_mt", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) + .strip() + .split("\t")[0:2] + == ["201902_1_1_0", "0"] + ) - assert node1.query( - "CHECK TABLE non_replicated_mt", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ).strip().split("\t")[0:2] == ["201902_1_1_0", "0"] + assert ( + node1.query( + "CHECK TABLE non_replicated_mt", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) + .strip() + .split("\t")[0:2] + == ["201902_1_1_0", "0"] + ) node1.query( "INSERT INTO non_replicated_mt VALUES (toDate('2019-01-01'), 1, 10), (toDate('2019-01-01'), 2, 12)" @@ -123,10 +133,15 @@ def test_check_normal_table_corruption(started_cluster, merge_tree_settings): remove_checksums_on_disk(node1, 
"default", "non_replicated_mt", "201901_2_2_0") - assert node1.query( - "CHECK TABLE non_replicated_mt PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ).strip().split("\t")[0:2] == ["201901_2_2_0", "0"] + assert ( + node1.query( + "CHECK TABLE non_replicated_mt PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) + .strip() + .split("\t")[0:2] + == ["201901_2_2_0", "0"] + ) @pytest.mark.parametrize("merge_tree_settings, zk_path_suffix", [("", "_0")]) @@ -194,12 +209,15 @@ def test_check_replicated_table_simple( == "201901_0_0_0\t1\t\n" ) - assert sorted( - node2.query( - "CHECK TABLE replicated_mt", - settings={"check_query_single_value_result": 0}, - ).split("\n") - ) == ["", "201901_0_0_0\t1\t", "201902_0_0_0\t1\t"] + assert ( + sorted( + node2.query( + "CHECK TABLE replicated_mt", + settings={"check_query_single_value_result": 0}, + ).split("\n") + ) + == ["", "201901_0_0_0\t1\t", "201902_0_0_0\t1\t"] + ) with pytest.raises(QueryRuntimeException) as exc: node2.query( @@ -273,10 +291,13 @@ def test_check_replicated_table_corruption( ) node1.query_with_retry("SYSTEM SYNC REPLICA replicated_mt_1") - assert node1.query( - "CHECK TABLE replicated_mt_1 PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) == "{}\t1\t\n".format(part_name) + assert ( + node1.query( + "CHECK TABLE replicated_mt_1 PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) + == "{}\t1\t\n".format(part_name) + ) assert node1.query("SELECT count() from replicated_mt_1") == "4\n" remove_part_from_disk(node2, "replicated_mt_1", part_name) @@ -288,10 +309,13 @@ def test_check_replicated_table_corruption( ) node1.query("SYSTEM SYNC REPLICA replicated_mt_1") - assert node1.query( - "CHECK TABLE replicated_mt_1 PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) == "{}\t1\t\n".format(part_name) + assert ( + node1.query( + "CHECK TABLE replicated_mt_1 PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) + == "{}\t1\t\n".format(part_name) + ) assert node1.query("SELECT count() from replicated_mt_1") == "4\n" diff --git a/tests/integration/test_cluster_discovery/test.py b/tests/integration/test_cluster_discovery/test.py index ad3deb5b142..a2e7e15b956 100644 --- a/tests/integration/test_cluster_discovery/test.py +++ b/tests/integration/test_cluster_discovery/test.py @@ -61,7 +61,7 @@ def check_on_cluster( print(f"Retry {retry}/{retries} unsuccessful, result: {node_results}") if retry != retries: - time.sleep(2**retry) + time.sleep(2 ** retry) else: msg = msg or f"Wrong '{what}' result" raise Exception( diff --git a/tests/integration/test_ldap_external_user_directory/test.py b/tests/integration/test_ldap_external_user_directory/test.py index 39753794d63..c9642c293ee 100644 --- a/tests/integration/test_ldap_external_user_directory/test.py +++ b/tests/integration/test_ldap_external_user_directory/test.py @@ -76,11 +76,14 @@ def test_role_mapping(ldap_cluster): "select currentUser()", user="johndoe", password="qwertz" ) == TSV([["johndoe"]]) - assert instance.query( - "select role_name from system.current_roles ORDER BY role_name", - user="johndoe", - password="qwertz", - ) == TSV([["role_1"], ["role_2"]]) + assert ( + instance.query( + "select role_name from system.current_roles ORDER BY role_name", + user="johndoe", + password="qwertz", + ) + == TSV([["role_1"], ["role_2"]]) + ) instance.query("CREATE 
ROLE role_3") add_ldap_group(ldap_cluster, group_cn="clickhouse-role_3", member_cn="johndoe") @@ -88,8 +91,11 @@ def test_role_mapping(ldap_cluster): # See https://github.com/ClickHouse/ClickHouse/issues/54318 add_ldap_group(ldap_cluster, group_cn="clickhouse-role_4", member_cn="johndoe") - assert instance.query( - "select role_name from system.current_roles ORDER BY role_name", - user="johndoe", - password="qwertz", - ) == TSV([["role_1"], ["role_2"], ["role_3"]]) + assert ( + instance.query( + "select role_name from system.current_roles ORDER BY role_name", + user="johndoe", + password="qwertz", + ) + == TSV([["role_1"], ["role_2"], ["role_3"]]) + ) diff --git a/tests/integration/test_mysql_protocol/test.py b/tests/integration/test_mysql_protocol/test.py index 7a69d07633c..61e76c0dc97 100644 --- a/tests/integration/test_mysql_protocol/test.py +++ b/tests/integration/test_mysql_protocol/test.py @@ -854,14 +854,14 @@ def test_types(started_cluster): result = cursor.fetchall()[0] expected = [ - ("Int8_column", -(2**7)), - ("UInt8_column", 2**8 - 1), - ("Int16_column", -(2**15)), - ("UInt16_column", 2**16 - 1), - ("Int32_column", -(2**31)), - ("UInt32_column", 2**32 - 1), - ("Int64_column", -(2**63)), - ("UInt64_column", 2**64 - 1), + ("Int8_column", -(2 ** 7)), + ("UInt8_column", 2 ** 8 - 1), + ("Int16_column", -(2 ** 15)), + ("UInt16_column", 2 ** 16 - 1), + ("Int32_column", -(2 ** 31)), + ("UInt32_column", 2 ** 32 - 1), + ("Int64_column", -(2 ** 63)), + ("UInt64_column", 2 ** 64 - 1), ("String_column", "тест"), ("FixedString_column", "тест"), ("Float32_column", 1.5), diff --git a/tests/integration/test_partition/test.py b/tests/integration/test_partition/test.py index 054418a8ba9..d39787f8924 100644 --- a/tests/integration/test_partition/test.py +++ b/tests/integration/test_partition/test.py @@ -561,9 +561,7 @@ def test_make_clone_in_detached(started_cluster): ["cp", "-r", path + "all_0_0_0", path + "detached/broken_all_0_0_0"] ) assert_eq_with_retry(instance, "select * from clone_in_detached", "\n") - assert [ - "broken_all_0_0_0", - ] == sorted( + assert ["broken_all_0_0_0",] == sorted( instance.exec_in_container(["ls", path + "detached/"]).strip().split("\n") ) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 1fc3fe37044..16425c9bd9e 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -506,12 +506,9 @@ def test_alters_from_different_replicas(started_cluster): dummy_node.stop_clickhouse(kill=True) settings = {"distributed_ddl_task_timeout": 5} - assert ( - "There are 1 unfinished hosts (0 of them are currently executing the task" - in competing_node.query_and_get_error( - "ALTER TABLE alters_from_different_replicas.concurrent_test ADD COLUMN Added0 UInt32;", - settings=settings, - ) + assert "There are 1 unfinished hosts (0 of them are currently executing the task" in competing_node.query_and_get_error( + "ALTER TABLE alters_from_different_replicas.concurrent_test ADD COLUMN Added0 UInt32;", + settings=settings, ) settings = { "distributed_ddl_task_timeout": 5, diff --git a/tests/integration/test_replicated_database_cluster_groups/test.py b/tests/integration/test_replicated_database_cluster_groups/test.py index 647626d8014..5a315707efb 100644 --- a/tests/integration/test_replicated_database_cluster_groups/test.py +++ b/tests/integration/test_replicated_database_cluster_groups/test.py @@ -95,12 +95,9 @@ def test_cluster_groups(started_cluster): 
# Exception main_node_2.stop_clickhouse() settings = {"distributed_ddl_task_timeout": 5} - assert ( - "There are 1 unfinished hosts (0 of them are currently executing the task)" - in main_node_1.query_and_get_error( - "CREATE TABLE cluster_groups.table_2 (d Date, k UInt64) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);", - settings=settings, - ) + assert "There are 1 unfinished hosts (0 of them are currently executing the task)" in main_node_1.query_and_get_error( + "CREATE TABLE cluster_groups.table_2 (d Date, k UInt64) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);", + settings=settings, ) # 3. After start both groups are synced diff --git a/tests/integration/test_storage_s3/s3_mocks/unstable_server.py b/tests/integration/test_storage_s3/s3_mocks/unstable_server.py index 5ef781bdc9e..3632fa15d8a 100644 --- a/tests/integration/test_storage_s3/s3_mocks/unstable_server.py +++ b/tests/integration/test_storage_s3/s3_mocks/unstable_server.py @@ -9,7 +9,7 @@ import time def gen_n_digit_number(n): assert 0 < n < 19 - return random.randint(10 ** (n - 1), 10**n - 1) + return random.randint(10 ** (n - 1), 10 ** n - 1) sum_in_4_column = 0 diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 2549cb0d473..e941356261a 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -553,16 +553,13 @@ def test_multipart(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) # select uploaded data from many threads - select_query = ( - "select sum(column1), sum(column2), sum(column3) " - "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( - host=started_cluster.minio_redirect_host, - port=started_cluster.minio_redirect_port, - bucket=bucket, - filename=filename, - auth=maybe_auth, - table_format=table_format, - ) + select_query = "select sum(column1), sum(column2), sum(column3) " "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( + host=started_cluster.minio_redirect_host, + port=started_cluster.minio_redirect_port, + bucket=bucket, + filename=filename, + auth=maybe_auth, + table_format=table_format, ) try: select_result = run_query( diff --git a/tests/integration/test_storage_url/test.py b/tests/integration/test_storage_url/test.py index 7ff7a871413..771df49cbac 100644 --- a/tests/integration/test_storage_url/test.py +++ b/tests/integration/test_storage_url/test.py @@ -79,15 +79,21 @@ def test_table_function_url_access_rights(): f"SELECT * FROM url('http://nginx:80/test_1', 'TSV')", user="u1" ) - assert node1.query( - f"DESCRIBE TABLE url('http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", - user="u1", - ) == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) + assert ( + node1.query( + f"DESCRIBE TABLE url('http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", + user="u1", + ) + == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) + ) - assert node1.query( - f"DESCRIBE TABLE url('http://nginx:80/not-exist', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", - user="u1", - ) == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) + assert ( + node1.query( + f"DESCRIBE TABLE url('http://nginx:80/not-exist', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", + user="u1", + ) + == 
TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) + ) expected_error = "necessary to have the grant URL ON *.*" assert expected_error in node1.query_and_get_error( diff --git a/tests/integration/test_system_merges/test.py b/tests/integration/test_system_merges/test.py index 6dbe6c891f2..bacb0eb500d 100644 --- a/tests/integration/test_system_merges/test.py +++ b/tests/integration/test_system_merges/test.py @@ -204,36 +204,33 @@ def test_mutation_simple(started_cluster, replicated): sleep_time=0.1, ) - assert ( - split_tsv( - node_check.query( - """ + assert split_tsv( + node_check.query( + """ SELECT database, table, num_parts, source_part_names, source_part_paths, result_part_name, result_part_path, partition_id, is_mutation FROM system.merges WHERE table = '{name}' """.format( - name=table_name - ) + name=table_name ) ) - == [ - [ - db_name, - table_name, - "1", - "['{}']".format(part), - "['{clickhouse}/{table_path}/{}/']".format( - part, clickhouse=clickhouse_path, table_path=table_path - ), - result_part, - "{clickhouse}/{table_path}/{}/".format( - result_part, clickhouse=clickhouse_path, table_path=table_path - ), - "all", - "1", - ], - ] - ) + ) == [ + [ + db_name, + table_name, + "1", + "['{}']".format(part), + "['{clickhouse}/{table_path}/{}/']".format( + part, clickhouse=clickhouse_path, table_path=table_path + ), + result_part, + "{clickhouse}/{table_path}/{}/".format( + result_part, clickhouse=clickhouse_path, table_path=table_path + ), + "all", + "1", + ], + ] t.join() assert ( diff --git a/utils/grpc-client/pb2/clickhouse_grpc_pb2.py b/utils/grpc-client/pb2/clickhouse_grpc_pb2.py index 6218047af3c..9bf7817c7d3 100644 --- a/utils/grpc-client/pb2/clickhouse_grpc_pb2.py +++ b/utils/grpc-client/pb2/clickhouse_grpc_pb2.py @@ -8,16 +8,17 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x15\x63lickhouse_grpc.proto\x12\x0f\x63lickhouse.grpc")\n\x0bNameAndType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t"\xf5\x01\n\rExternalTable\x12\x0c\n\x04name\x18\x01 \x01(\t\x12-\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\x12\x0e\n\x06\x66ormat\x18\x04 \x01(\t\x12\x18\n\x10\x63ompression_type\x18\x06 \x01(\t\x12>\n\x08settings\x18\x05 \x03(\x0b\x32,.clickhouse.grpc.ExternalTable.SettingsEntry\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\x85\x03\n\x1cObsoleteTransportCompression\x12U\n\talgorithm\x18\x01 \x01(\x0e\x32\x42.clickhouse.grpc.ObsoleteTransportCompression.CompressionAlgorithm\x12M\n\x05level\x18\x02 \x01(\x0e\x32>.clickhouse.grpc.ObsoleteTransportCompression.CompressionLevel"R\n\x14\x43ompressionAlgorithm\x12\x12\n\x0eNO_COMPRESSION\x10\x00\x12\x0b\n\x07\x44\x45\x46LATE\x10\x01\x12\x08\n\x04GZIP\x10\x02\x12\x0f\n\x0bSTREAM_GZIP\x10\x03"k\n\x10\x43ompressionLevel\x12\x14\n\x10\x43OMPRESSION_NONE\x10\x00\x12\x13\n\x0f\x43OMPRESSION_LOW\x10\x01\x12\x16\n\x12\x43OMPRESSION_MEDIUM\x10\x02\x12\x14\n\x10\x43OMPRESSION_HIGH\x10\x03"\x8e\x06\n\tQueryInfo\x12\r\n\x05query\x18\x01 \x01(\t\x12\x10\n\x08query_id\x18\x02 \x01(\t\x12:\n\x08settings\x18\x03 
\x03(\x0b\x32(.clickhouse.grpc.QueryInfo.SettingsEntry\x12\x10\n\x08\x64\x61tabase\x18\x04 \x01(\t\x12\x12\n\ninput_data\x18\x05 \x01(\x0c\x12\x1c\n\x14input_data_delimiter\x18\x06 \x01(\x0c\x12\x15\n\routput_format\x18\x07 \x01(\t\x12\x1b\n\x13send_output_columns\x18\x18 \x01(\x08\x12\x37\n\x0f\x65xternal_tables\x18\x08 \x03(\x0b\x32\x1e.clickhouse.grpc.ExternalTable\x12\x11\n\tuser_name\x18\t \x01(\t\x12\x10\n\x08password\x18\n \x01(\t\x12\r\n\x05quota\x18\x0b \x01(\t\x12\x12\n\nsession_id\x18\x0c \x01(\t\x12\x15\n\rsession_check\x18\r \x01(\x08\x12\x17\n\x0fsession_timeout\x18\x0e \x01(\r\x12\x0e\n\x06\x63\x61ncel\x18\x0f \x01(\x08\x12\x17\n\x0fnext_query_info\x18\x10 \x01(\x08\x12\x1e\n\x16input_compression_type\x18\x14 \x01(\t\x12\x1f\n\x17output_compression_type\x18\x15 \x01(\t\x12 \n\x18output_compression_level\x18\x13 \x01(\x05\x12"\n\x1atransport_compression_type\x18\x16 \x01(\t\x12#\n\x1btransport_compression_level\x18\x17 \x01(\x05\x12R\n\x1bobsolete_result_compression\x18\x11 \x01(\x0b\x32-.clickhouse.grpc.ObsoleteTransportCompression\x12!\n\x19obsolete_compression_type\x18\x12 \x01(\t\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\xa1\x01\n\x08LogEntry\x12\x0c\n\x04time\x18\x01 \x01(\r\x12\x19\n\x11time_microseconds\x18\x02 \x01(\r\x12\x11\n\tthread_id\x18\x03 \x01(\x04\x12\x10\n\x08query_id\x18\x04 \x01(\t\x12)\n\x05level\x18\x05 \x01(\x0e\x32\x1a.clickhouse.grpc.LogsLevel\x12\x0e\n\x06source\x18\x06 \x01(\t\x12\x0c\n\x04text\x18\x07 \x01(\t"z\n\x08Progress\x12\x11\n\tread_rows\x18\x01 \x01(\x04\x12\x12\n\nread_bytes\x18\x02 \x01(\x04\x12\x1a\n\x12total_rows_to_read\x18\x03 \x01(\x04\x12\x14\n\x0cwritten_rows\x18\x04 \x01(\x04\x12\x15\n\rwritten_bytes\x18\x05 \x01(\x04"p\n\x05Stats\x12\x0c\n\x04rows\x18\x01 \x01(\x04\x12\x0e\n\x06\x62locks\x18\x02 \x01(\x04\x12\x17\n\x0f\x61llocated_bytes\x18\x03 \x01(\x04\x12\x15\n\rapplied_limit\x18\x04 \x01(\x08\x12\x19\n\x11rows_before_limit\x18\x05 \x01(\x04"R\n\tException\x12\x0c\n\x04\x63ode\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x14\n\x0c\x64isplay_text\x18\x03 \x01(\t\x12\x13\n\x0bstack_trace\x18\x04 \x01(\t"\xeb\x02\n\x06Result\x12\x10\n\x08query_id\x18\t \x01(\t\x12\x11\n\ttime_zone\x18\n \x01(\t\x12\x15\n\routput_format\x18\x0b \x01(\t\x12\x34\n\x0eoutput_columns\x18\x0c \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0e\n\x06output\x18\x01 \x01(\x0c\x12\x0e\n\x06totals\x18\x02 \x01(\x0c\x12\x10\n\x08\x65xtremes\x18\x03 \x01(\x0c\x12\'\n\x04logs\x18\x04 \x03(\x0b\x32\x19.clickhouse.grpc.LogEntry\x12+\n\x08progress\x18\x05 \x01(\x0b\x32\x19.clickhouse.grpc.Progress\x12%\n\x05stats\x18\x06 \x01(\x0b\x32\x16.clickhouse.grpc.Stats\x12-\n\texception\x18\x07 \x01(\x0b\x32\x1a.clickhouse.grpc.Exception\x12\x11\n\tcancelled\x18\x08 
\x01(\x08*\x9d\x01\n\tLogsLevel\x12\x0c\n\x08LOG_NONE\x10\x00\x12\r\n\tLOG_FATAL\x10\x01\x12\x10\n\x0cLOG_CRITICAL\x10\x02\x12\r\n\tLOG_ERROR\x10\x03\x12\x0f\n\x0bLOG_WARNING\x10\x04\x12\x0e\n\nLOG_NOTICE\x10\x05\x12\x13\n\x0fLOG_INFORMATION\x10\x06\x12\r\n\tLOG_DEBUG\x10\x07\x12\r\n\tLOG_TRACE\x10\x08\x32\xdb\x02\n\nClickHouse\x12\x45\n\x0c\x45xecuteQuery\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00\x12V\n\x1b\x45xecuteQueryWithStreamInput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00(\x01\x12W\n\x1c\x45xecuteQueryWithStreamOutput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00\x30\x01\x12U\n\x18\x45xecuteQueryWithStreamIO\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00(\x01\x30\x01\x62\x06proto3' +) - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x63lickhouse_grpc.proto\x12\x0f\x63lickhouse.grpc\")\n\x0bNameAndType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\"\xf5\x01\n\rExternalTable\x12\x0c\n\x04name\x18\x01 \x01(\t\x12-\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\x12\x0e\n\x06\x66ormat\x18\x04 \x01(\t\x12\x18\n\x10\x63ompression_type\x18\x06 \x01(\t\x12>\n\x08settings\x18\x05 \x03(\x0b\x32,.clickhouse.grpc.ExternalTable.SettingsEntry\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\x85\x03\n\x1cObsoleteTransportCompression\x12U\n\talgorithm\x18\x01 \x01(\x0e\x32\x42.clickhouse.grpc.ObsoleteTransportCompression.CompressionAlgorithm\x12M\n\x05level\x18\x02 \x01(\x0e\x32>.clickhouse.grpc.ObsoleteTransportCompression.CompressionLevel\"R\n\x14\x43ompressionAlgorithm\x12\x12\n\x0eNO_COMPRESSION\x10\x00\x12\x0b\n\x07\x44\x45\x46LATE\x10\x01\x12\x08\n\x04GZIP\x10\x02\x12\x0f\n\x0bSTREAM_GZIP\x10\x03\"k\n\x10\x43ompressionLevel\x12\x14\n\x10\x43OMPRESSION_NONE\x10\x00\x12\x13\n\x0f\x43OMPRESSION_LOW\x10\x01\x12\x16\n\x12\x43OMPRESSION_MEDIUM\x10\x02\x12\x14\n\x10\x43OMPRESSION_HIGH\x10\x03\"\x8e\x06\n\tQueryInfo\x12\r\n\x05query\x18\x01 \x01(\t\x12\x10\n\x08query_id\x18\x02 \x01(\t\x12:\n\x08settings\x18\x03 \x03(\x0b\x32(.clickhouse.grpc.QueryInfo.SettingsEntry\x12\x10\n\x08\x64\x61tabase\x18\x04 \x01(\t\x12\x12\n\ninput_data\x18\x05 \x01(\x0c\x12\x1c\n\x14input_data_delimiter\x18\x06 \x01(\x0c\x12\x15\n\routput_format\x18\x07 \x01(\t\x12\x1b\n\x13send_output_columns\x18\x18 \x01(\x08\x12\x37\n\x0f\x65xternal_tables\x18\x08 \x03(\x0b\x32\x1e.clickhouse.grpc.ExternalTable\x12\x11\n\tuser_name\x18\t \x01(\t\x12\x10\n\x08password\x18\n \x01(\t\x12\r\n\x05quota\x18\x0b \x01(\t\x12\x12\n\nsession_id\x18\x0c \x01(\t\x12\x15\n\rsession_check\x18\r \x01(\x08\x12\x17\n\x0fsession_timeout\x18\x0e \x01(\r\x12\x0e\n\x06\x63\x61ncel\x18\x0f \x01(\x08\x12\x17\n\x0fnext_query_info\x18\x10 \x01(\x08\x12\x1e\n\x16input_compression_type\x18\x14 \x01(\t\x12\x1f\n\x17output_compression_type\x18\x15 \x01(\t\x12 \n\x18output_compression_level\x18\x13 \x01(\x05\x12\"\n\x1atransport_compression_type\x18\x16 \x01(\t\x12#\n\x1btransport_compression_level\x18\x17 \x01(\x05\x12R\n\x1bobsolete_result_compression\x18\x11 \x01(\x0b\x32-.clickhouse.grpc.ObsoleteTransportCompression\x12!\n\x19obsolete_compression_type\x18\x12 \x01(\t\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xa1\x01\n\x08LogEntry\x12\x0c\n\x04time\x18\x01 \x01(\r\x12\x19\n\x11time_microseconds\x18\x02 \x01(\r\x12\x11\n\tthread_id\x18\x03 
\x01(\x04\x12\x10\n\x08query_id\x18\x04 \x01(\t\x12)\n\x05level\x18\x05 \x01(\x0e\x32\x1a.clickhouse.grpc.LogsLevel\x12\x0e\n\x06source\x18\x06 \x01(\t\x12\x0c\n\x04text\x18\x07 \x01(\t\"z\n\x08Progress\x12\x11\n\tread_rows\x18\x01 \x01(\x04\x12\x12\n\nread_bytes\x18\x02 \x01(\x04\x12\x1a\n\x12total_rows_to_read\x18\x03 \x01(\x04\x12\x14\n\x0cwritten_rows\x18\x04 \x01(\x04\x12\x15\n\rwritten_bytes\x18\x05 \x01(\x04\"p\n\x05Stats\x12\x0c\n\x04rows\x18\x01 \x01(\x04\x12\x0e\n\x06\x62locks\x18\x02 \x01(\x04\x12\x17\n\x0f\x61llocated_bytes\x18\x03 \x01(\x04\x12\x15\n\rapplied_limit\x18\x04 \x01(\x08\x12\x19\n\x11rows_before_limit\x18\x05 \x01(\x04\"R\n\tException\x12\x0c\n\x04\x63ode\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x14\n\x0c\x64isplay_text\x18\x03 \x01(\t\x12\x13\n\x0bstack_trace\x18\x04 \x01(\t\"\xeb\x02\n\x06Result\x12\x10\n\x08query_id\x18\t \x01(\t\x12\x11\n\ttime_zone\x18\n \x01(\t\x12\x15\n\routput_format\x18\x0b \x01(\t\x12\x34\n\x0eoutput_columns\x18\x0c \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0e\n\x06output\x18\x01 \x01(\x0c\x12\x0e\n\x06totals\x18\x02 \x01(\x0c\x12\x10\n\x08\x65xtremes\x18\x03 \x01(\x0c\x12\'\n\x04logs\x18\x04 \x03(\x0b\x32\x19.clickhouse.grpc.LogEntry\x12+\n\x08progress\x18\x05 \x01(\x0b\x32\x19.clickhouse.grpc.Progress\x12%\n\x05stats\x18\x06 \x01(\x0b\x32\x16.clickhouse.grpc.Stats\x12-\n\texception\x18\x07 \x01(\x0b\x32\x1a.clickhouse.grpc.Exception\x12\x11\n\tcancelled\x18\x08 \x01(\x08*\x9d\x01\n\tLogsLevel\x12\x0c\n\x08LOG_NONE\x10\x00\x12\r\n\tLOG_FATAL\x10\x01\x12\x10\n\x0cLOG_CRITICAL\x10\x02\x12\r\n\tLOG_ERROR\x10\x03\x12\x0f\n\x0bLOG_WARNING\x10\x04\x12\x0e\n\nLOG_NOTICE\x10\x05\x12\x13\n\x0fLOG_INFORMATION\x10\x06\x12\r\n\tLOG_DEBUG\x10\x07\x12\r\n\tLOG_TRACE\x10\x08\x32\xdb\x02\n\nClickHouse\x12\x45\n\x0c\x45xecuteQuery\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00\x12V\n\x1b\x45xecuteQueryWithStreamInput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00(\x01\x12W\n\x1c\x45xecuteQueryWithStreamOutput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00\x30\x01\x12U\n\x18\x45xecuteQueryWithStreamIO\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00(\x01\x30\x01\x62\x06proto3') - -_LOGSLEVEL = DESCRIPTOR.enum_types_by_name['LogsLevel'] +_LOGSLEVEL = DESCRIPTOR.enum_types_by_name["LogsLevel"] LogsLevel = enum_type_wrapper.EnumTypeWrapper(_LOGSLEVEL) LOG_NONE = 0 LOG_FATAL = 1 @@ -30,134 +31,180 @@ LOG_DEBUG = 7 LOG_TRACE = 8 -_NAMEANDTYPE = DESCRIPTOR.message_types_by_name['NameAndType'] -_EXTERNALTABLE = DESCRIPTOR.message_types_by_name['ExternalTable'] -_EXTERNALTABLE_SETTINGSENTRY = _EXTERNALTABLE.nested_types_by_name['SettingsEntry'] -_OBSOLETETRANSPORTCOMPRESSION = DESCRIPTOR.message_types_by_name['ObsoleteTransportCompression'] -_QUERYINFO = DESCRIPTOR.message_types_by_name['QueryInfo'] -_QUERYINFO_SETTINGSENTRY = _QUERYINFO.nested_types_by_name['SettingsEntry'] -_LOGENTRY = DESCRIPTOR.message_types_by_name['LogEntry'] -_PROGRESS = DESCRIPTOR.message_types_by_name['Progress'] -_STATS = DESCRIPTOR.message_types_by_name['Stats'] -_EXCEPTION = DESCRIPTOR.message_types_by_name['Exception'] -_RESULT = DESCRIPTOR.message_types_by_name['Result'] -_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM = _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name['CompressionAlgorithm'] -_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL = _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name['CompressionLevel'] -NameAndType = 
_reflection.GeneratedProtocolMessageType('NameAndType', (_message.Message,), { - 'DESCRIPTOR' : _NAMEANDTYPE, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.NameAndType) - }) +_NAMEANDTYPE = DESCRIPTOR.message_types_by_name["NameAndType"] +_EXTERNALTABLE = DESCRIPTOR.message_types_by_name["ExternalTable"] +_EXTERNALTABLE_SETTINGSENTRY = _EXTERNALTABLE.nested_types_by_name["SettingsEntry"] +_OBSOLETETRANSPORTCOMPRESSION = DESCRIPTOR.message_types_by_name[ + "ObsoleteTransportCompression" +] +_QUERYINFO = DESCRIPTOR.message_types_by_name["QueryInfo"] +_QUERYINFO_SETTINGSENTRY = _QUERYINFO.nested_types_by_name["SettingsEntry"] +_LOGENTRY = DESCRIPTOR.message_types_by_name["LogEntry"] +_PROGRESS = DESCRIPTOR.message_types_by_name["Progress"] +_STATS = DESCRIPTOR.message_types_by_name["Stats"] +_EXCEPTION = DESCRIPTOR.message_types_by_name["Exception"] +_RESULT = DESCRIPTOR.message_types_by_name["Result"] +_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM = ( + _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name["CompressionAlgorithm"] +) +_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL = ( + _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name["CompressionLevel"] +) +NameAndType = _reflection.GeneratedProtocolMessageType( + "NameAndType", + (_message.Message,), + { + "DESCRIPTOR": _NAMEANDTYPE, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.NameAndType) + }, +) _sym_db.RegisterMessage(NameAndType) -ExternalTable = _reflection.GeneratedProtocolMessageType('ExternalTable', (_message.Message,), { - - 'SettingsEntry' : _reflection.GeneratedProtocolMessageType('SettingsEntry', (_message.Message,), { - 'DESCRIPTOR' : _EXTERNALTABLE_SETTINGSENTRY, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable.SettingsEntry) - }) - , - 'DESCRIPTOR' : _EXTERNALTABLE, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable) - }) +ExternalTable = _reflection.GeneratedProtocolMessageType( + "ExternalTable", + (_message.Message,), + { + "SettingsEntry": _reflection.GeneratedProtocolMessageType( + "SettingsEntry", + (_message.Message,), + { + "DESCRIPTOR": _EXTERNALTABLE_SETTINGSENTRY, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable.SettingsEntry) + }, + ), + "DESCRIPTOR": _EXTERNALTABLE, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable) + }, +) _sym_db.RegisterMessage(ExternalTable) _sym_db.RegisterMessage(ExternalTable.SettingsEntry) -ObsoleteTransportCompression = _reflection.GeneratedProtocolMessageType('ObsoleteTransportCompression', (_message.Message,), { - 'DESCRIPTOR' : _OBSOLETETRANSPORTCOMPRESSION, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ObsoleteTransportCompression) - }) +ObsoleteTransportCompression = _reflection.GeneratedProtocolMessageType( + "ObsoleteTransportCompression", + (_message.Message,), + { + "DESCRIPTOR": _OBSOLETETRANSPORTCOMPRESSION, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ObsoleteTransportCompression) + }, +) _sym_db.RegisterMessage(ObsoleteTransportCompression) -QueryInfo = _reflection.GeneratedProtocolMessageType('QueryInfo', (_message.Message,), { - - 'SettingsEntry' : _reflection.GeneratedProtocolMessageType('SettingsEntry', (_message.Message,), { - 
'DESCRIPTOR' : _QUERYINFO_SETTINGSENTRY, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo.SettingsEntry) - }) - , - 'DESCRIPTOR' : _QUERYINFO, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo) - }) +QueryInfo = _reflection.GeneratedProtocolMessageType( + "QueryInfo", + (_message.Message,), + { + "SettingsEntry": _reflection.GeneratedProtocolMessageType( + "SettingsEntry", + (_message.Message,), + { + "DESCRIPTOR": _QUERYINFO_SETTINGSENTRY, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo.SettingsEntry) + }, + ), + "DESCRIPTOR": _QUERYINFO, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo) + }, +) _sym_db.RegisterMessage(QueryInfo) _sym_db.RegisterMessage(QueryInfo.SettingsEntry) -LogEntry = _reflection.GeneratedProtocolMessageType('LogEntry', (_message.Message,), { - 'DESCRIPTOR' : _LOGENTRY, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.LogEntry) - }) +LogEntry = _reflection.GeneratedProtocolMessageType( + "LogEntry", + (_message.Message,), + { + "DESCRIPTOR": _LOGENTRY, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.LogEntry) + }, +) _sym_db.RegisterMessage(LogEntry) -Progress = _reflection.GeneratedProtocolMessageType('Progress', (_message.Message,), { - 'DESCRIPTOR' : _PROGRESS, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Progress) - }) +Progress = _reflection.GeneratedProtocolMessageType( + "Progress", + (_message.Message,), + { + "DESCRIPTOR": _PROGRESS, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Progress) + }, +) _sym_db.RegisterMessage(Progress) -Stats = _reflection.GeneratedProtocolMessageType('Stats', (_message.Message,), { - 'DESCRIPTOR' : _STATS, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Stats) - }) +Stats = _reflection.GeneratedProtocolMessageType( + "Stats", + (_message.Message,), + { + "DESCRIPTOR": _STATS, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Stats) + }, +) _sym_db.RegisterMessage(Stats) -Exception = _reflection.GeneratedProtocolMessageType('Exception', (_message.Message,), { - 'DESCRIPTOR' : _EXCEPTION, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Exception) - }) +Exception = _reflection.GeneratedProtocolMessageType( + "Exception", + (_message.Message,), + { + "DESCRIPTOR": _EXCEPTION, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Exception) + }, +) _sym_db.RegisterMessage(Exception) -Result = _reflection.GeneratedProtocolMessageType('Result', (_message.Message,), { - 'DESCRIPTOR' : _RESULT, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Result) - }) +Result = _reflection.GeneratedProtocolMessageType( + "Result", + (_message.Message,), + { + "DESCRIPTOR": _RESULT, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Result) + }, +) _sym_db.RegisterMessage(Result) -_CLICKHOUSE = DESCRIPTOR.services_by_name['ClickHouse'] +_CLICKHOUSE = DESCRIPTOR.services_by_name["ClickHouse"] if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - 
_EXTERNALTABLE_SETTINGSENTRY._options = None - _EXTERNALTABLE_SETTINGSENTRY._serialized_options = b'8\001' - _QUERYINFO_SETTINGSENTRY._options = None - _QUERYINFO_SETTINGSENTRY._serialized_options = b'8\001' - _LOGSLEVEL._serialized_start=2363 - _LOGSLEVEL._serialized_end=2520 - _NAMEANDTYPE._serialized_start=42 - _NAMEANDTYPE._serialized_end=83 - _EXTERNALTABLE._serialized_start=86 - _EXTERNALTABLE._serialized_end=331 - _EXTERNALTABLE_SETTINGSENTRY._serialized_start=284 - _EXTERNALTABLE_SETTINGSENTRY._serialized_end=331 - _OBSOLETETRANSPORTCOMPRESSION._serialized_start=334 - _OBSOLETETRANSPORTCOMPRESSION._serialized_end=723 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_start=532 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_end=614 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_start=616 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_end=723 - _QUERYINFO._serialized_start=726 - _QUERYINFO._serialized_end=1508 - _QUERYINFO_SETTINGSENTRY._serialized_start=284 - _QUERYINFO_SETTINGSENTRY._serialized_end=331 - _LOGENTRY._serialized_start=1511 - _LOGENTRY._serialized_end=1672 - _PROGRESS._serialized_start=1674 - _PROGRESS._serialized_end=1796 - _STATS._serialized_start=1798 - _STATS._serialized_end=1910 - _EXCEPTION._serialized_start=1912 - _EXCEPTION._serialized_end=1994 - _RESULT._serialized_start=1997 - _RESULT._serialized_end=2360 - _CLICKHOUSE._serialized_start=2523 - _CLICKHOUSE._serialized_end=2870 + DESCRIPTOR._options = None + _EXTERNALTABLE_SETTINGSENTRY._options = None + _EXTERNALTABLE_SETTINGSENTRY._serialized_options = b"8\001" + _QUERYINFO_SETTINGSENTRY._options = None + _QUERYINFO_SETTINGSENTRY._serialized_options = b"8\001" + _LOGSLEVEL._serialized_start = 2363 + _LOGSLEVEL._serialized_end = 2520 + _NAMEANDTYPE._serialized_start = 42 + _NAMEANDTYPE._serialized_end = 83 + _EXTERNALTABLE._serialized_start = 86 + _EXTERNALTABLE._serialized_end = 331 + _EXTERNALTABLE_SETTINGSENTRY._serialized_start = 284 + _EXTERNALTABLE_SETTINGSENTRY._serialized_end = 331 + _OBSOLETETRANSPORTCOMPRESSION._serialized_start = 334 + _OBSOLETETRANSPORTCOMPRESSION._serialized_end = 723 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_start = 532 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_end = 614 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_start = 616 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_end = 723 + _QUERYINFO._serialized_start = 726 + _QUERYINFO._serialized_end = 1508 + _QUERYINFO_SETTINGSENTRY._serialized_start = 284 + _QUERYINFO_SETTINGSENTRY._serialized_end = 331 + _LOGENTRY._serialized_start = 1511 + _LOGENTRY._serialized_end = 1672 + _PROGRESS._serialized_start = 1674 + _PROGRESS._serialized_end = 1796 + _STATS._serialized_start = 1798 + _STATS._serialized_end = 1910 + _EXCEPTION._serialized_start = 1912 + _EXCEPTION._serialized_end = 1994 + _RESULT._serialized_start = 1997 + _RESULT._serialized_end = 2360 + _CLICKHOUSE._serialized_start = 2523 + _CLICKHOUSE._serialized_end = 2870 # @@protoc_insertion_point(module_scope) diff --git a/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py b/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py index 1c71218bbe5..25643a243b3 100644 --- a/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py +++ b/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py @@ -15,25 +15,25 @@ class ClickHouseStub(object): channel: A grpc.Channel. 
""" self.ExecuteQuery = channel.unary_unary( - '/clickhouse.grpc.ClickHouse/ExecuteQuery', - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + "/clickhouse.grpc.ClickHouse/ExecuteQuery", + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamInput = channel.stream_unary( - '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput', - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput", + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamOutput = channel.unary_stream( - '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput', - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput", + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamIO = channel.stream_stream( - '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO', - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO", + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) class ClickHouseServicer(object): @@ -42,124 +42,173 @@ class ClickHouseServicer(object): def ExecuteQuery(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def ExecuteQueryWithStreamInput(self, request_iterator, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def ExecuteQueryWithStreamOutput(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def ExecuteQueryWithStreamIO(self, request_iterator, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def add_ClickHouseServicer_to_server(servicer, server): rpc_method_handlers = { - 'ExecuteQuery': grpc.unary_unary_rpc_method_handler( - servicer.ExecuteQuery, - 
request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - 'ExecuteQueryWithStreamInput': grpc.stream_unary_rpc_method_handler( - servicer.ExecuteQueryWithStreamInput, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - 'ExecuteQueryWithStreamOutput': grpc.unary_stream_rpc_method_handler( - servicer.ExecuteQueryWithStreamOutput, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - 'ExecuteQueryWithStreamIO': grpc.stream_stream_rpc_method_handler( - servicer.ExecuteQueryWithStreamIO, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), + "ExecuteQuery": grpc.unary_unary_rpc_method_handler( + servicer.ExecuteQuery, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + "ExecuteQueryWithStreamInput": grpc.stream_unary_rpc_method_handler( + servicer.ExecuteQueryWithStreamInput, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + "ExecuteQueryWithStreamOutput": grpc.unary_stream_rpc_method_handler( + servicer.ExecuteQueryWithStreamOutput, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + "ExecuteQueryWithStreamIO": grpc.stream_stream_rpc_method_handler( + servicer.ExecuteQueryWithStreamIO, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - 'clickhouse.grpc.ClickHouse', rpc_method_handlers) + "clickhouse.grpc.ClickHouse", rpc_method_handlers + ) server.add_generic_rpc_handlers((generic_handler,)) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class ClickHouse(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def ExecuteQuery(request, + def ExecuteQuery( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/clickhouse.grpc.ClickHouse/ExecuteQuery', + "/clickhouse.grpc.ClickHouse/ExecuteQuery", clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def ExecuteQueryWithStreamInput(request_iterator, + def ExecuteQueryWithStreamInput( + request_iterator, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.stream_unary( + request_iterator, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.stream_unary(request_iterator, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput', + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput", clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def ExecuteQueryWithStreamOutput(request, + def ExecuteQueryWithStreamOutput( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_stream( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_stream(request, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput', + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput", clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def ExecuteQueryWithStreamIO(request_iterator, + def ExecuteQueryWithStreamIO( + request_iterator, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.stream_stream( + request_iterator, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - 
wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.stream_stream(request_iterator, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO', + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO", clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) From e42d10fa9ccf4296732941e9f1b333d692e83384 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 06:25:20 +0100 Subject: [PATCH 0141/1081] Revert "Obey Python's quirky formatter" This reverts commit 678a32cedee768b6c1a6748e96a0d103e853d8bc. --- tests/integration/ci-runner.py | 13 +- .../test_async_insert_memory/test.py | 2 +- tests/integration/test_check_table/test.py | 76 ++--- .../test_cluster_discovery/test.py | 2 +- .../test_ldap_external_user_directory/test.py | 26 +- tests/integration/test_mysql_protocol/test.py | 16 +- tests/integration/test_partition/test.py | 4 +- .../test_replicated_database/test.py | 9 +- .../test.py | 9 +- .../s3_mocks/unstable_server.py | 2 +- tests/integration/test_storage_s3/test.py | 17 +- tests/integration/test_storage_url/test.py | 22 +- tests/integration/test_system_merges/test.py | 45 +-- utils/grpc-client/pb2/clickhouse_grpc_pb2.py | 271 ++++++++---------- .../pb2/clickhouse_grpc_pb2_grpc.py | 237 ++++++--------- 15 files changed, 318 insertions(+), 433 deletions(-) diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py index d54ed2bb767..7c922e339fe 100755 --- a/tests/integration/ci-runner.py +++ b/tests/integration/ci-runner.py @@ -305,11 +305,14 @@ class ClickhouseIntegrationTestsRunner: def _pre_pull_images(self, repo_path): image_cmd = self._get_runner_image_cmd(repo_path) - cmd = "cd {repo_path}/tests/integration && " "timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} --pre-pull --command '{command}' ".format( - repo_path=repo_path, - runner_opts=self._get_runner_opts(), - image_cmd=image_cmd, - command=r""" echo Pre Pull finished """, + cmd = ( + "cd {repo_path}/tests/integration && " + "timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} --pre-pull --command '{command}' ".format( + repo_path=repo_path, + runner_opts=self._get_runner_opts(), + image_cmd=image_cmd, + command=r""" echo Pre Pull finished """, + ) ) for i in range(5): diff --git a/tests/integration/test_async_insert_memory/test.py b/tests/integration/test_async_insert_memory/test.py index f897007f7bb..5d2e5503680 100644 --- a/tests/integration/test_async_insert_memory/test.py +++ b/tests/integration/test_async_insert_memory/test.py @@ -43,7 +43,7 @@ def test_memory_usage(): response = node.get_query_request( "SELECT groupArray(number) FROM numbers(1000000) SETTINGS max_memory_usage_for_user={}".format( - 30 * (2 ** 23) + 30 * (2**23) ), user="A", ) diff --git a/tests/integration/test_check_table/test.py b/tests/integration/test_check_table/test.py index ebf404e698b..021977fb6b6 100644 --- a/tests/integration/test_check_table/test.py +++ b/tests/integration/test_check_table/test.py @@ -95,25 +95,15 @@ def test_check_normal_table_corruption(started_cluster, merge_tree_settings): node1, "non_replicated_mt", "201902_1_1_0", database="default" ) - assert ( - node1.query( - "CHECK TABLE non_replicated_mt", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - 
) - .strip() - .split("\t")[0:2] - == ["201902_1_1_0", "0"] - ) + assert node1.query( + "CHECK TABLE non_replicated_mt", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ).strip().split("\t")[0:2] == ["201902_1_1_0", "0"] - assert ( - node1.query( - "CHECK TABLE non_replicated_mt", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) - .strip() - .split("\t")[0:2] - == ["201902_1_1_0", "0"] - ) + assert node1.query( + "CHECK TABLE non_replicated_mt", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ).strip().split("\t")[0:2] == ["201902_1_1_0", "0"] node1.query( "INSERT INTO non_replicated_mt VALUES (toDate('2019-01-01'), 1, 10), (toDate('2019-01-01'), 2, 12)" @@ -133,15 +123,10 @@ def test_check_normal_table_corruption(started_cluster, merge_tree_settings): remove_checksums_on_disk(node1, "default", "non_replicated_mt", "201901_2_2_0") - assert ( - node1.query( - "CHECK TABLE non_replicated_mt PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) - .strip() - .split("\t")[0:2] - == ["201901_2_2_0", "0"] - ) + assert node1.query( + "CHECK TABLE non_replicated_mt PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ).strip().split("\t")[0:2] == ["201901_2_2_0", "0"] @pytest.mark.parametrize("merge_tree_settings, zk_path_suffix", [("", "_0")]) @@ -209,15 +194,12 @@ def test_check_replicated_table_simple( == "201901_0_0_0\t1\t\n" ) - assert ( - sorted( - node2.query( - "CHECK TABLE replicated_mt", - settings={"check_query_single_value_result": 0}, - ).split("\n") - ) - == ["", "201901_0_0_0\t1\t", "201902_0_0_0\t1\t"] - ) + assert sorted( + node2.query( + "CHECK TABLE replicated_mt", + settings={"check_query_single_value_result": 0}, + ).split("\n") + ) == ["", "201901_0_0_0\t1\t", "201902_0_0_0\t1\t"] with pytest.raises(QueryRuntimeException) as exc: node2.query( @@ -291,13 +273,10 @@ def test_check_replicated_table_corruption( ) node1.query_with_retry("SYSTEM SYNC REPLICA replicated_mt_1") - assert ( - node1.query( - "CHECK TABLE replicated_mt_1 PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) - == "{}\t1\t\n".format(part_name) - ) + assert node1.query( + "CHECK TABLE replicated_mt_1 PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) == "{}\t1\t\n".format(part_name) assert node1.query("SELECT count() from replicated_mt_1") == "4\n" remove_part_from_disk(node2, "replicated_mt_1", part_name) @@ -309,13 +288,10 @@ def test_check_replicated_table_corruption( ) node1.query("SYSTEM SYNC REPLICA replicated_mt_1") - assert ( - node1.query( - "CHECK TABLE replicated_mt_1 PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) - == "{}\t1\t\n".format(part_name) - ) + assert node1.query( + "CHECK TABLE replicated_mt_1 PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) == "{}\t1\t\n".format(part_name) assert node1.query("SELECT count() from replicated_mt_1") == "4\n" diff --git a/tests/integration/test_cluster_discovery/test.py b/tests/integration/test_cluster_discovery/test.py index a2e7e15b956..ad3deb5b142 100644 --- a/tests/integration/test_cluster_discovery/test.py +++ b/tests/integration/test_cluster_discovery/test.py @@ -61,7 +61,7 @@ def check_on_cluster( print(f"Retry {retry}/{retries} unsuccessful, result: {node_results}") if retry != retries: - time.sleep(2 ** retry) + time.sleep(2**retry) else: 
msg = msg or f"Wrong '{what}' result" raise Exception( diff --git a/tests/integration/test_ldap_external_user_directory/test.py b/tests/integration/test_ldap_external_user_directory/test.py index c9642c293ee..39753794d63 100644 --- a/tests/integration/test_ldap_external_user_directory/test.py +++ b/tests/integration/test_ldap_external_user_directory/test.py @@ -76,14 +76,11 @@ def test_role_mapping(ldap_cluster): "select currentUser()", user="johndoe", password="qwertz" ) == TSV([["johndoe"]]) - assert ( - instance.query( - "select role_name from system.current_roles ORDER BY role_name", - user="johndoe", - password="qwertz", - ) - == TSV([["role_1"], ["role_2"]]) - ) + assert instance.query( + "select role_name from system.current_roles ORDER BY role_name", + user="johndoe", + password="qwertz", + ) == TSV([["role_1"], ["role_2"]]) instance.query("CREATE ROLE role_3") add_ldap_group(ldap_cluster, group_cn="clickhouse-role_3", member_cn="johndoe") @@ -91,11 +88,8 @@ def test_role_mapping(ldap_cluster): # See https://github.com/ClickHouse/ClickHouse/issues/54318 add_ldap_group(ldap_cluster, group_cn="clickhouse-role_4", member_cn="johndoe") - assert ( - instance.query( - "select role_name from system.current_roles ORDER BY role_name", - user="johndoe", - password="qwertz", - ) - == TSV([["role_1"], ["role_2"], ["role_3"]]) - ) + assert instance.query( + "select role_name from system.current_roles ORDER BY role_name", + user="johndoe", + password="qwertz", + ) == TSV([["role_1"], ["role_2"], ["role_3"]]) diff --git a/tests/integration/test_mysql_protocol/test.py b/tests/integration/test_mysql_protocol/test.py index 61e76c0dc97..7a69d07633c 100644 --- a/tests/integration/test_mysql_protocol/test.py +++ b/tests/integration/test_mysql_protocol/test.py @@ -854,14 +854,14 @@ def test_types(started_cluster): result = cursor.fetchall()[0] expected = [ - ("Int8_column", -(2 ** 7)), - ("UInt8_column", 2 ** 8 - 1), - ("Int16_column", -(2 ** 15)), - ("UInt16_column", 2 ** 16 - 1), - ("Int32_column", -(2 ** 31)), - ("UInt32_column", 2 ** 32 - 1), - ("Int64_column", -(2 ** 63)), - ("UInt64_column", 2 ** 64 - 1), + ("Int8_column", -(2**7)), + ("UInt8_column", 2**8 - 1), + ("Int16_column", -(2**15)), + ("UInt16_column", 2**16 - 1), + ("Int32_column", -(2**31)), + ("UInt32_column", 2**32 - 1), + ("Int64_column", -(2**63)), + ("UInt64_column", 2**64 - 1), ("String_column", "тест"), ("FixedString_column", "тест"), ("Float32_column", 1.5), diff --git a/tests/integration/test_partition/test.py b/tests/integration/test_partition/test.py index d39787f8924..054418a8ba9 100644 --- a/tests/integration/test_partition/test.py +++ b/tests/integration/test_partition/test.py @@ -561,7 +561,9 @@ def test_make_clone_in_detached(started_cluster): ["cp", "-r", path + "all_0_0_0", path + "detached/broken_all_0_0_0"] ) assert_eq_with_retry(instance, "select * from clone_in_detached", "\n") - assert ["broken_all_0_0_0",] == sorted( + assert [ + "broken_all_0_0_0", + ] == sorted( instance.exec_in_container(["ls", path + "detached/"]).strip().split("\n") ) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 16425c9bd9e..1fc3fe37044 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -506,9 +506,12 @@ def test_alters_from_different_replicas(started_cluster): dummy_node.stop_clickhouse(kill=True) settings = {"distributed_ddl_task_timeout": 5} - assert "There are 1 unfinished hosts (0 of them 
are currently executing the task" in competing_node.query_and_get_error( - "ALTER TABLE alters_from_different_replicas.concurrent_test ADD COLUMN Added0 UInt32;", - settings=settings, + assert ( + "There are 1 unfinished hosts (0 of them are currently executing the task" + in competing_node.query_and_get_error( + "ALTER TABLE alters_from_different_replicas.concurrent_test ADD COLUMN Added0 UInt32;", + settings=settings, + ) ) settings = { "distributed_ddl_task_timeout": 5, diff --git a/tests/integration/test_replicated_database_cluster_groups/test.py b/tests/integration/test_replicated_database_cluster_groups/test.py index 5a315707efb..647626d8014 100644 --- a/tests/integration/test_replicated_database_cluster_groups/test.py +++ b/tests/integration/test_replicated_database_cluster_groups/test.py @@ -95,9 +95,12 @@ def test_cluster_groups(started_cluster): # Exception main_node_2.stop_clickhouse() settings = {"distributed_ddl_task_timeout": 5} - assert "There are 1 unfinished hosts (0 of them are currently executing the task)" in main_node_1.query_and_get_error( - "CREATE TABLE cluster_groups.table_2 (d Date, k UInt64) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);", - settings=settings, + assert ( + "There are 1 unfinished hosts (0 of them are currently executing the task)" + in main_node_1.query_and_get_error( + "CREATE TABLE cluster_groups.table_2 (d Date, k UInt64) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);", + settings=settings, + ) ) # 3. After start both groups are synced diff --git a/tests/integration/test_storage_s3/s3_mocks/unstable_server.py b/tests/integration/test_storage_s3/s3_mocks/unstable_server.py index 3632fa15d8a..5ef781bdc9e 100644 --- a/tests/integration/test_storage_s3/s3_mocks/unstable_server.py +++ b/tests/integration/test_storage_s3/s3_mocks/unstable_server.py @@ -9,7 +9,7 @@ import time def gen_n_digit_number(n): assert 0 < n < 19 - return random.randint(10 ** (n - 1), 10 ** n - 1) + return random.randint(10 ** (n - 1), 10**n - 1) sum_in_4_column = 0 diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index e941356261a..2549cb0d473 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -553,13 +553,16 @@ def test_multipart(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) # select uploaded data from many threads - select_query = "select sum(column1), sum(column2), sum(column3) " "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( - host=started_cluster.minio_redirect_host, - port=started_cluster.minio_redirect_port, - bucket=bucket, - filename=filename, - auth=maybe_auth, - table_format=table_format, + select_query = ( + "select sum(column1), sum(column2), sum(column3) " + "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( + host=started_cluster.minio_redirect_host, + port=started_cluster.minio_redirect_port, + bucket=bucket, + filename=filename, + auth=maybe_auth, + table_format=table_format, + ) ) try: select_result = run_query( diff --git a/tests/integration/test_storage_url/test.py b/tests/integration/test_storage_url/test.py index 771df49cbac..7ff7a871413 100644 --- a/tests/integration/test_storage_url/test.py +++ b/tests/integration/test_storage_url/test.py @@ -79,21 +79,15 @@ def test_table_function_url_access_rights(): f"SELECT * FROM url('http://nginx:80/test_1', 'TSV')", 
user="u1" ) - assert ( - node1.query( - f"DESCRIBE TABLE url('http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", - user="u1", - ) - == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) - ) + assert node1.query( + f"DESCRIBE TABLE url('http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", + user="u1", + ) == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) - assert ( - node1.query( - f"DESCRIBE TABLE url('http://nginx:80/not-exist', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", - user="u1", - ) - == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) - ) + assert node1.query( + f"DESCRIBE TABLE url('http://nginx:80/not-exist', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", + user="u1", + ) == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) expected_error = "necessary to have the grant URL ON *.*" assert expected_error in node1.query_and_get_error( diff --git a/tests/integration/test_system_merges/test.py b/tests/integration/test_system_merges/test.py index bacb0eb500d..6dbe6c891f2 100644 --- a/tests/integration/test_system_merges/test.py +++ b/tests/integration/test_system_merges/test.py @@ -204,33 +204,36 @@ def test_mutation_simple(started_cluster, replicated): sleep_time=0.1, ) - assert split_tsv( - node_check.query( - """ + assert ( + split_tsv( + node_check.query( + """ SELECT database, table, num_parts, source_part_names, source_part_paths, result_part_name, result_part_path, partition_id, is_mutation FROM system.merges WHERE table = '{name}' """.format( - name=table_name + name=table_name + ) ) ) - ) == [ - [ - db_name, - table_name, - "1", - "['{}']".format(part), - "['{clickhouse}/{table_path}/{}/']".format( - part, clickhouse=clickhouse_path, table_path=table_path - ), - result_part, - "{clickhouse}/{table_path}/{}/".format( - result_part, clickhouse=clickhouse_path, table_path=table_path - ), - "all", - "1", - ], - ] + == [ + [ + db_name, + table_name, + "1", + "['{}']".format(part), + "['{clickhouse}/{table_path}/{}/']".format( + part, clickhouse=clickhouse_path, table_path=table_path + ), + result_part, + "{clickhouse}/{table_path}/{}/".format( + result_part, clickhouse=clickhouse_path, table_path=table_path + ), + "all", + "1", + ], + ] + ) t.join() assert ( diff --git a/utils/grpc-client/pb2/clickhouse_grpc_pb2.py b/utils/grpc-client/pb2/clickhouse_grpc_pb2.py index 9bf7817c7d3..6218047af3c 100644 --- a/utils/grpc-client/pb2/clickhouse_grpc_pb2.py +++ b/utils/grpc-client/pb2/clickhouse_grpc_pb2.py @@ -8,17 +8,16 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x15\x63lickhouse_grpc.proto\x12\x0f\x63lickhouse.grpc")\n\x0bNameAndType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t"\xf5\x01\n\rExternalTable\x12\x0c\n\x04name\x18\x01 \x01(\t\x12-\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\x12\x0e\n\x06\x66ormat\x18\x04 \x01(\t\x12\x18\n\x10\x63ompression_type\x18\x06 \x01(\t\x12>\n\x08settings\x18\x05 
\x03(\x0b\x32,.clickhouse.grpc.ExternalTable.SettingsEntry\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\x85\x03\n\x1cObsoleteTransportCompression\x12U\n\talgorithm\x18\x01 \x01(\x0e\x32\x42.clickhouse.grpc.ObsoleteTransportCompression.CompressionAlgorithm\x12M\n\x05level\x18\x02 \x01(\x0e\x32>.clickhouse.grpc.ObsoleteTransportCompression.CompressionLevel"R\n\x14\x43ompressionAlgorithm\x12\x12\n\x0eNO_COMPRESSION\x10\x00\x12\x0b\n\x07\x44\x45\x46LATE\x10\x01\x12\x08\n\x04GZIP\x10\x02\x12\x0f\n\x0bSTREAM_GZIP\x10\x03"k\n\x10\x43ompressionLevel\x12\x14\n\x10\x43OMPRESSION_NONE\x10\x00\x12\x13\n\x0f\x43OMPRESSION_LOW\x10\x01\x12\x16\n\x12\x43OMPRESSION_MEDIUM\x10\x02\x12\x14\n\x10\x43OMPRESSION_HIGH\x10\x03"\x8e\x06\n\tQueryInfo\x12\r\n\x05query\x18\x01 \x01(\t\x12\x10\n\x08query_id\x18\x02 \x01(\t\x12:\n\x08settings\x18\x03 \x03(\x0b\x32(.clickhouse.grpc.QueryInfo.SettingsEntry\x12\x10\n\x08\x64\x61tabase\x18\x04 \x01(\t\x12\x12\n\ninput_data\x18\x05 \x01(\x0c\x12\x1c\n\x14input_data_delimiter\x18\x06 \x01(\x0c\x12\x15\n\routput_format\x18\x07 \x01(\t\x12\x1b\n\x13send_output_columns\x18\x18 \x01(\x08\x12\x37\n\x0f\x65xternal_tables\x18\x08 \x03(\x0b\x32\x1e.clickhouse.grpc.ExternalTable\x12\x11\n\tuser_name\x18\t \x01(\t\x12\x10\n\x08password\x18\n \x01(\t\x12\r\n\x05quota\x18\x0b \x01(\t\x12\x12\n\nsession_id\x18\x0c \x01(\t\x12\x15\n\rsession_check\x18\r \x01(\x08\x12\x17\n\x0fsession_timeout\x18\x0e \x01(\r\x12\x0e\n\x06\x63\x61ncel\x18\x0f \x01(\x08\x12\x17\n\x0fnext_query_info\x18\x10 \x01(\x08\x12\x1e\n\x16input_compression_type\x18\x14 \x01(\t\x12\x1f\n\x17output_compression_type\x18\x15 \x01(\t\x12 \n\x18output_compression_level\x18\x13 \x01(\x05\x12"\n\x1atransport_compression_type\x18\x16 \x01(\t\x12#\n\x1btransport_compression_level\x18\x17 \x01(\x05\x12R\n\x1bobsolete_result_compression\x18\x11 \x01(\x0b\x32-.clickhouse.grpc.ObsoleteTransportCompression\x12!\n\x19obsolete_compression_type\x18\x12 \x01(\t\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\xa1\x01\n\x08LogEntry\x12\x0c\n\x04time\x18\x01 \x01(\r\x12\x19\n\x11time_microseconds\x18\x02 \x01(\r\x12\x11\n\tthread_id\x18\x03 \x01(\x04\x12\x10\n\x08query_id\x18\x04 \x01(\t\x12)\n\x05level\x18\x05 \x01(\x0e\x32\x1a.clickhouse.grpc.LogsLevel\x12\x0e\n\x06source\x18\x06 \x01(\t\x12\x0c\n\x04text\x18\x07 \x01(\t"z\n\x08Progress\x12\x11\n\tread_rows\x18\x01 \x01(\x04\x12\x12\n\nread_bytes\x18\x02 \x01(\x04\x12\x1a\n\x12total_rows_to_read\x18\x03 \x01(\x04\x12\x14\n\x0cwritten_rows\x18\x04 \x01(\x04\x12\x15\n\rwritten_bytes\x18\x05 \x01(\x04"p\n\x05Stats\x12\x0c\n\x04rows\x18\x01 \x01(\x04\x12\x0e\n\x06\x62locks\x18\x02 \x01(\x04\x12\x17\n\x0f\x61llocated_bytes\x18\x03 \x01(\x04\x12\x15\n\rapplied_limit\x18\x04 \x01(\x08\x12\x19\n\x11rows_before_limit\x18\x05 \x01(\x04"R\n\tException\x12\x0c\n\x04\x63ode\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x14\n\x0c\x64isplay_text\x18\x03 \x01(\t\x12\x13\n\x0bstack_trace\x18\x04 \x01(\t"\xeb\x02\n\x06Result\x12\x10\n\x08query_id\x18\t \x01(\t\x12\x11\n\ttime_zone\x18\n \x01(\t\x12\x15\n\routput_format\x18\x0b \x01(\t\x12\x34\n\x0eoutput_columns\x18\x0c \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0e\n\x06output\x18\x01 \x01(\x0c\x12\x0e\n\x06totals\x18\x02 \x01(\x0c\x12\x10\n\x08\x65xtremes\x18\x03 \x01(\x0c\x12\'\n\x04logs\x18\x04 \x03(\x0b\x32\x19.clickhouse.grpc.LogEntry\x12+\n\x08progress\x18\x05 
\x01(\x0b\x32\x19.clickhouse.grpc.Progress\x12%\n\x05stats\x18\x06 \x01(\x0b\x32\x16.clickhouse.grpc.Stats\x12-\n\texception\x18\x07 \x01(\x0b\x32\x1a.clickhouse.grpc.Exception\x12\x11\n\tcancelled\x18\x08 \x01(\x08*\x9d\x01\n\tLogsLevel\x12\x0c\n\x08LOG_NONE\x10\x00\x12\r\n\tLOG_FATAL\x10\x01\x12\x10\n\x0cLOG_CRITICAL\x10\x02\x12\r\n\tLOG_ERROR\x10\x03\x12\x0f\n\x0bLOG_WARNING\x10\x04\x12\x0e\n\nLOG_NOTICE\x10\x05\x12\x13\n\x0fLOG_INFORMATION\x10\x06\x12\r\n\tLOG_DEBUG\x10\x07\x12\r\n\tLOG_TRACE\x10\x08\x32\xdb\x02\n\nClickHouse\x12\x45\n\x0c\x45xecuteQuery\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00\x12V\n\x1b\x45xecuteQueryWithStreamInput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00(\x01\x12W\n\x1c\x45xecuteQueryWithStreamOutput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00\x30\x01\x12U\n\x18\x45xecuteQueryWithStreamIO\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00(\x01\x30\x01\x62\x06proto3' -) -_LOGSLEVEL = DESCRIPTOR.enum_types_by_name["LogsLevel"] + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x63lickhouse_grpc.proto\x12\x0f\x63lickhouse.grpc\")\n\x0bNameAndType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\"\xf5\x01\n\rExternalTable\x12\x0c\n\x04name\x18\x01 \x01(\t\x12-\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\x12\x0e\n\x06\x66ormat\x18\x04 \x01(\t\x12\x18\n\x10\x63ompression_type\x18\x06 \x01(\t\x12>\n\x08settings\x18\x05 \x03(\x0b\x32,.clickhouse.grpc.ExternalTable.SettingsEntry\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\x85\x03\n\x1cObsoleteTransportCompression\x12U\n\talgorithm\x18\x01 \x01(\x0e\x32\x42.clickhouse.grpc.ObsoleteTransportCompression.CompressionAlgorithm\x12M\n\x05level\x18\x02 \x01(\x0e\x32>.clickhouse.grpc.ObsoleteTransportCompression.CompressionLevel\"R\n\x14\x43ompressionAlgorithm\x12\x12\n\x0eNO_COMPRESSION\x10\x00\x12\x0b\n\x07\x44\x45\x46LATE\x10\x01\x12\x08\n\x04GZIP\x10\x02\x12\x0f\n\x0bSTREAM_GZIP\x10\x03\"k\n\x10\x43ompressionLevel\x12\x14\n\x10\x43OMPRESSION_NONE\x10\x00\x12\x13\n\x0f\x43OMPRESSION_LOW\x10\x01\x12\x16\n\x12\x43OMPRESSION_MEDIUM\x10\x02\x12\x14\n\x10\x43OMPRESSION_HIGH\x10\x03\"\x8e\x06\n\tQueryInfo\x12\r\n\x05query\x18\x01 \x01(\t\x12\x10\n\x08query_id\x18\x02 \x01(\t\x12:\n\x08settings\x18\x03 \x03(\x0b\x32(.clickhouse.grpc.QueryInfo.SettingsEntry\x12\x10\n\x08\x64\x61tabase\x18\x04 \x01(\t\x12\x12\n\ninput_data\x18\x05 \x01(\x0c\x12\x1c\n\x14input_data_delimiter\x18\x06 \x01(\x0c\x12\x15\n\routput_format\x18\x07 \x01(\t\x12\x1b\n\x13send_output_columns\x18\x18 \x01(\x08\x12\x37\n\x0f\x65xternal_tables\x18\x08 \x03(\x0b\x32\x1e.clickhouse.grpc.ExternalTable\x12\x11\n\tuser_name\x18\t \x01(\t\x12\x10\n\x08password\x18\n \x01(\t\x12\r\n\x05quota\x18\x0b \x01(\t\x12\x12\n\nsession_id\x18\x0c \x01(\t\x12\x15\n\rsession_check\x18\r \x01(\x08\x12\x17\n\x0fsession_timeout\x18\x0e \x01(\r\x12\x0e\n\x06\x63\x61ncel\x18\x0f \x01(\x08\x12\x17\n\x0fnext_query_info\x18\x10 \x01(\x08\x12\x1e\n\x16input_compression_type\x18\x14 \x01(\t\x12\x1f\n\x17output_compression_type\x18\x15 \x01(\t\x12 \n\x18output_compression_level\x18\x13 \x01(\x05\x12\"\n\x1atransport_compression_type\x18\x16 \x01(\t\x12#\n\x1btransport_compression_level\x18\x17 \x01(\x05\x12R\n\x1bobsolete_result_compression\x18\x11 
\x01(\x0b\x32-.clickhouse.grpc.ObsoleteTransportCompression\x12!\n\x19obsolete_compression_type\x18\x12 \x01(\t\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xa1\x01\n\x08LogEntry\x12\x0c\n\x04time\x18\x01 \x01(\r\x12\x19\n\x11time_microseconds\x18\x02 \x01(\r\x12\x11\n\tthread_id\x18\x03 \x01(\x04\x12\x10\n\x08query_id\x18\x04 \x01(\t\x12)\n\x05level\x18\x05 \x01(\x0e\x32\x1a.clickhouse.grpc.LogsLevel\x12\x0e\n\x06source\x18\x06 \x01(\t\x12\x0c\n\x04text\x18\x07 \x01(\t\"z\n\x08Progress\x12\x11\n\tread_rows\x18\x01 \x01(\x04\x12\x12\n\nread_bytes\x18\x02 \x01(\x04\x12\x1a\n\x12total_rows_to_read\x18\x03 \x01(\x04\x12\x14\n\x0cwritten_rows\x18\x04 \x01(\x04\x12\x15\n\rwritten_bytes\x18\x05 \x01(\x04\"p\n\x05Stats\x12\x0c\n\x04rows\x18\x01 \x01(\x04\x12\x0e\n\x06\x62locks\x18\x02 \x01(\x04\x12\x17\n\x0f\x61llocated_bytes\x18\x03 \x01(\x04\x12\x15\n\rapplied_limit\x18\x04 \x01(\x08\x12\x19\n\x11rows_before_limit\x18\x05 \x01(\x04\"R\n\tException\x12\x0c\n\x04\x63ode\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x14\n\x0c\x64isplay_text\x18\x03 \x01(\t\x12\x13\n\x0bstack_trace\x18\x04 \x01(\t\"\xeb\x02\n\x06Result\x12\x10\n\x08query_id\x18\t \x01(\t\x12\x11\n\ttime_zone\x18\n \x01(\t\x12\x15\n\routput_format\x18\x0b \x01(\t\x12\x34\n\x0eoutput_columns\x18\x0c \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0e\n\x06output\x18\x01 \x01(\x0c\x12\x0e\n\x06totals\x18\x02 \x01(\x0c\x12\x10\n\x08\x65xtremes\x18\x03 \x01(\x0c\x12\'\n\x04logs\x18\x04 \x03(\x0b\x32\x19.clickhouse.grpc.LogEntry\x12+\n\x08progress\x18\x05 \x01(\x0b\x32\x19.clickhouse.grpc.Progress\x12%\n\x05stats\x18\x06 \x01(\x0b\x32\x16.clickhouse.grpc.Stats\x12-\n\texception\x18\x07 \x01(\x0b\x32\x1a.clickhouse.grpc.Exception\x12\x11\n\tcancelled\x18\x08 \x01(\x08*\x9d\x01\n\tLogsLevel\x12\x0c\n\x08LOG_NONE\x10\x00\x12\r\n\tLOG_FATAL\x10\x01\x12\x10\n\x0cLOG_CRITICAL\x10\x02\x12\r\n\tLOG_ERROR\x10\x03\x12\x0f\n\x0bLOG_WARNING\x10\x04\x12\x0e\n\nLOG_NOTICE\x10\x05\x12\x13\n\x0fLOG_INFORMATION\x10\x06\x12\r\n\tLOG_DEBUG\x10\x07\x12\r\n\tLOG_TRACE\x10\x08\x32\xdb\x02\n\nClickHouse\x12\x45\n\x0c\x45xecuteQuery\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00\x12V\n\x1b\x45xecuteQueryWithStreamInput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00(\x01\x12W\n\x1c\x45xecuteQueryWithStreamOutput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00\x30\x01\x12U\n\x18\x45xecuteQueryWithStreamIO\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00(\x01\x30\x01\x62\x06proto3') + +_LOGSLEVEL = DESCRIPTOR.enum_types_by_name['LogsLevel'] LogsLevel = enum_type_wrapper.EnumTypeWrapper(_LOGSLEVEL) LOG_NONE = 0 LOG_FATAL = 1 @@ -31,180 +30,134 @@ LOG_DEBUG = 7 LOG_TRACE = 8 -_NAMEANDTYPE = DESCRIPTOR.message_types_by_name["NameAndType"] -_EXTERNALTABLE = DESCRIPTOR.message_types_by_name["ExternalTable"] -_EXTERNALTABLE_SETTINGSENTRY = _EXTERNALTABLE.nested_types_by_name["SettingsEntry"] -_OBSOLETETRANSPORTCOMPRESSION = DESCRIPTOR.message_types_by_name[ - "ObsoleteTransportCompression" -] -_QUERYINFO = DESCRIPTOR.message_types_by_name["QueryInfo"] -_QUERYINFO_SETTINGSENTRY = _QUERYINFO.nested_types_by_name["SettingsEntry"] -_LOGENTRY = DESCRIPTOR.message_types_by_name["LogEntry"] -_PROGRESS = DESCRIPTOR.message_types_by_name["Progress"] -_STATS = DESCRIPTOR.message_types_by_name["Stats"] -_EXCEPTION = DESCRIPTOR.message_types_by_name["Exception"] -_RESULT = DESCRIPTOR.message_types_by_name["Result"] 
-_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM = ( - _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name["CompressionAlgorithm"] -) -_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL = ( - _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name["CompressionLevel"] -) -NameAndType = _reflection.GeneratedProtocolMessageType( - "NameAndType", - (_message.Message,), - { - "DESCRIPTOR": _NAMEANDTYPE, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.NameAndType) - }, -) +_NAMEANDTYPE = DESCRIPTOR.message_types_by_name['NameAndType'] +_EXTERNALTABLE = DESCRIPTOR.message_types_by_name['ExternalTable'] +_EXTERNALTABLE_SETTINGSENTRY = _EXTERNALTABLE.nested_types_by_name['SettingsEntry'] +_OBSOLETETRANSPORTCOMPRESSION = DESCRIPTOR.message_types_by_name['ObsoleteTransportCompression'] +_QUERYINFO = DESCRIPTOR.message_types_by_name['QueryInfo'] +_QUERYINFO_SETTINGSENTRY = _QUERYINFO.nested_types_by_name['SettingsEntry'] +_LOGENTRY = DESCRIPTOR.message_types_by_name['LogEntry'] +_PROGRESS = DESCRIPTOR.message_types_by_name['Progress'] +_STATS = DESCRIPTOR.message_types_by_name['Stats'] +_EXCEPTION = DESCRIPTOR.message_types_by_name['Exception'] +_RESULT = DESCRIPTOR.message_types_by_name['Result'] +_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM = _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name['CompressionAlgorithm'] +_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL = _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name['CompressionLevel'] +NameAndType = _reflection.GeneratedProtocolMessageType('NameAndType', (_message.Message,), { + 'DESCRIPTOR' : _NAMEANDTYPE, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.NameAndType) + }) _sym_db.RegisterMessage(NameAndType) -ExternalTable = _reflection.GeneratedProtocolMessageType( - "ExternalTable", - (_message.Message,), - { - "SettingsEntry": _reflection.GeneratedProtocolMessageType( - "SettingsEntry", - (_message.Message,), - { - "DESCRIPTOR": _EXTERNALTABLE_SETTINGSENTRY, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable.SettingsEntry) - }, - ), - "DESCRIPTOR": _EXTERNALTABLE, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable) - }, -) +ExternalTable = _reflection.GeneratedProtocolMessageType('ExternalTable', (_message.Message,), { + + 'SettingsEntry' : _reflection.GeneratedProtocolMessageType('SettingsEntry', (_message.Message,), { + 'DESCRIPTOR' : _EXTERNALTABLE_SETTINGSENTRY, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable.SettingsEntry) + }) + , + 'DESCRIPTOR' : _EXTERNALTABLE, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable) + }) _sym_db.RegisterMessage(ExternalTable) _sym_db.RegisterMessage(ExternalTable.SettingsEntry) -ObsoleteTransportCompression = _reflection.GeneratedProtocolMessageType( - "ObsoleteTransportCompression", - (_message.Message,), - { - "DESCRIPTOR": _OBSOLETETRANSPORTCOMPRESSION, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ObsoleteTransportCompression) - }, -) +ObsoleteTransportCompression = _reflection.GeneratedProtocolMessageType('ObsoleteTransportCompression', (_message.Message,), { + 'DESCRIPTOR' : _OBSOLETETRANSPORTCOMPRESSION, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ObsoleteTransportCompression) + }) 
_sym_db.RegisterMessage(ObsoleteTransportCompression) -QueryInfo = _reflection.GeneratedProtocolMessageType( - "QueryInfo", - (_message.Message,), - { - "SettingsEntry": _reflection.GeneratedProtocolMessageType( - "SettingsEntry", - (_message.Message,), - { - "DESCRIPTOR": _QUERYINFO_SETTINGSENTRY, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo.SettingsEntry) - }, - ), - "DESCRIPTOR": _QUERYINFO, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo) - }, -) +QueryInfo = _reflection.GeneratedProtocolMessageType('QueryInfo', (_message.Message,), { + + 'SettingsEntry' : _reflection.GeneratedProtocolMessageType('SettingsEntry', (_message.Message,), { + 'DESCRIPTOR' : _QUERYINFO_SETTINGSENTRY, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo.SettingsEntry) + }) + , + 'DESCRIPTOR' : _QUERYINFO, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo) + }) _sym_db.RegisterMessage(QueryInfo) _sym_db.RegisterMessage(QueryInfo.SettingsEntry) -LogEntry = _reflection.GeneratedProtocolMessageType( - "LogEntry", - (_message.Message,), - { - "DESCRIPTOR": _LOGENTRY, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.LogEntry) - }, -) +LogEntry = _reflection.GeneratedProtocolMessageType('LogEntry', (_message.Message,), { + 'DESCRIPTOR' : _LOGENTRY, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.LogEntry) + }) _sym_db.RegisterMessage(LogEntry) -Progress = _reflection.GeneratedProtocolMessageType( - "Progress", - (_message.Message,), - { - "DESCRIPTOR": _PROGRESS, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Progress) - }, -) +Progress = _reflection.GeneratedProtocolMessageType('Progress', (_message.Message,), { + 'DESCRIPTOR' : _PROGRESS, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Progress) + }) _sym_db.RegisterMessage(Progress) -Stats = _reflection.GeneratedProtocolMessageType( - "Stats", - (_message.Message,), - { - "DESCRIPTOR": _STATS, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Stats) - }, -) +Stats = _reflection.GeneratedProtocolMessageType('Stats', (_message.Message,), { + 'DESCRIPTOR' : _STATS, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Stats) + }) _sym_db.RegisterMessage(Stats) -Exception = _reflection.GeneratedProtocolMessageType( - "Exception", - (_message.Message,), - { - "DESCRIPTOR": _EXCEPTION, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Exception) - }, -) +Exception = _reflection.GeneratedProtocolMessageType('Exception', (_message.Message,), { + 'DESCRIPTOR' : _EXCEPTION, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Exception) + }) _sym_db.RegisterMessage(Exception) -Result = _reflection.GeneratedProtocolMessageType( - "Result", - (_message.Message,), - { - "DESCRIPTOR": _RESULT, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Result) - }, -) +Result = _reflection.GeneratedProtocolMessageType('Result', (_message.Message,), { + 'DESCRIPTOR' : _RESULT, + '__module__' : 'clickhouse_grpc_pb2' + # 
@@protoc_insertion_point(class_scope:clickhouse.grpc.Result) + }) _sym_db.RegisterMessage(Result) -_CLICKHOUSE = DESCRIPTOR.services_by_name["ClickHouse"] +_CLICKHOUSE = DESCRIPTOR.services_by_name['ClickHouse'] if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _EXTERNALTABLE_SETTINGSENTRY._options = None - _EXTERNALTABLE_SETTINGSENTRY._serialized_options = b"8\001" - _QUERYINFO_SETTINGSENTRY._options = None - _QUERYINFO_SETTINGSENTRY._serialized_options = b"8\001" - _LOGSLEVEL._serialized_start = 2363 - _LOGSLEVEL._serialized_end = 2520 - _NAMEANDTYPE._serialized_start = 42 - _NAMEANDTYPE._serialized_end = 83 - _EXTERNALTABLE._serialized_start = 86 - _EXTERNALTABLE._serialized_end = 331 - _EXTERNALTABLE_SETTINGSENTRY._serialized_start = 284 - _EXTERNALTABLE_SETTINGSENTRY._serialized_end = 331 - _OBSOLETETRANSPORTCOMPRESSION._serialized_start = 334 - _OBSOLETETRANSPORTCOMPRESSION._serialized_end = 723 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_start = 532 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_end = 614 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_start = 616 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_end = 723 - _QUERYINFO._serialized_start = 726 - _QUERYINFO._serialized_end = 1508 - _QUERYINFO_SETTINGSENTRY._serialized_start = 284 - _QUERYINFO_SETTINGSENTRY._serialized_end = 331 - _LOGENTRY._serialized_start = 1511 - _LOGENTRY._serialized_end = 1672 - _PROGRESS._serialized_start = 1674 - _PROGRESS._serialized_end = 1796 - _STATS._serialized_start = 1798 - _STATS._serialized_end = 1910 - _EXCEPTION._serialized_start = 1912 - _EXCEPTION._serialized_end = 1994 - _RESULT._serialized_start = 1997 - _RESULT._serialized_end = 2360 - _CLICKHOUSE._serialized_start = 2523 - _CLICKHOUSE._serialized_end = 2870 + DESCRIPTOR._options = None + _EXTERNALTABLE_SETTINGSENTRY._options = None + _EXTERNALTABLE_SETTINGSENTRY._serialized_options = b'8\001' + _QUERYINFO_SETTINGSENTRY._options = None + _QUERYINFO_SETTINGSENTRY._serialized_options = b'8\001' + _LOGSLEVEL._serialized_start=2363 + _LOGSLEVEL._serialized_end=2520 + _NAMEANDTYPE._serialized_start=42 + _NAMEANDTYPE._serialized_end=83 + _EXTERNALTABLE._serialized_start=86 + _EXTERNALTABLE._serialized_end=331 + _EXTERNALTABLE_SETTINGSENTRY._serialized_start=284 + _EXTERNALTABLE_SETTINGSENTRY._serialized_end=331 + _OBSOLETETRANSPORTCOMPRESSION._serialized_start=334 + _OBSOLETETRANSPORTCOMPRESSION._serialized_end=723 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_start=532 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_end=614 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_start=616 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_end=723 + _QUERYINFO._serialized_start=726 + _QUERYINFO._serialized_end=1508 + _QUERYINFO_SETTINGSENTRY._serialized_start=284 + _QUERYINFO_SETTINGSENTRY._serialized_end=331 + _LOGENTRY._serialized_start=1511 + _LOGENTRY._serialized_end=1672 + _PROGRESS._serialized_start=1674 + _PROGRESS._serialized_end=1796 + _STATS._serialized_start=1798 + _STATS._serialized_end=1910 + _EXCEPTION._serialized_start=1912 + _EXCEPTION._serialized_end=1994 + _RESULT._serialized_start=1997 + _RESULT._serialized_end=2360 + _CLICKHOUSE._serialized_start=2523 + _CLICKHOUSE._serialized_end=2870 # @@protoc_insertion_point(module_scope) diff --git a/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py b/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py index 25643a243b3..1c71218bbe5 100644 
--- a/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py +++ b/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py @@ -15,25 +15,25 @@ class ClickHouseStub(object): channel: A grpc.Channel. """ self.ExecuteQuery = channel.unary_unary( - "/clickhouse.grpc.ClickHouse/ExecuteQuery", - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + '/clickhouse.grpc.ClickHouse/ExecuteQuery', + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamInput = channel.stream_unary( - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput", - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput', + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamOutput = channel.unary_stream( - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput", - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput', + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamIO = channel.stream_stream( - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO", - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO', + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) class ClickHouseServicer(object): @@ -42,173 +42,124 @@ class ClickHouseServicer(object): def ExecuteQuery(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def ExecuteQueryWithStreamInput(self, request_iterator, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def ExecuteQueryWithStreamOutput(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def ExecuteQueryWithStreamIO(self, request_iterator, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise 
NotImplementedError('Method not implemented!') def add_ClickHouseServicer_to_server(servicer, server): rpc_method_handlers = { - "ExecuteQuery": grpc.unary_unary_rpc_method_handler( - servicer.ExecuteQuery, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - "ExecuteQueryWithStreamInput": grpc.stream_unary_rpc_method_handler( - servicer.ExecuteQueryWithStreamInput, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - "ExecuteQueryWithStreamOutput": grpc.unary_stream_rpc_method_handler( - servicer.ExecuteQueryWithStreamOutput, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - "ExecuteQueryWithStreamIO": grpc.stream_stream_rpc_method_handler( - servicer.ExecuteQueryWithStreamIO, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), + 'ExecuteQuery': grpc.unary_unary_rpc_method_handler( + servicer.ExecuteQuery, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + 'ExecuteQueryWithStreamInput': grpc.stream_unary_rpc_method_handler( + servicer.ExecuteQueryWithStreamInput, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + 'ExecuteQueryWithStreamOutput': grpc.unary_stream_rpc_method_handler( + servicer.ExecuteQueryWithStreamOutput, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + 'ExecuteQueryWithStreamIO': grpc.stream_stream_rpc_method_handler( + servicer.ExecuteQueryWithStreamIO, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - "clickhouse.grpc.ClickHouse", rpc_method_handlers - ) + 'clickhouse.grpc.ClickHouse', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. 
class ClickHouse(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def ExecuteQuery( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def ExecuteQuery(request, target, - "/clickhouse.grpc.ClickHouse/ExecuteQuery", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/clickhouse.grpc.ClickHouse/ExecuteQuery', clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def ExecuteQueryWithStreamInput( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_unary( - request_iterator, + def ExecuteQueryWithStreamInput(request_iterator, target, - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.stream_unary(request_iterator, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput', clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def ExecuteQueryWithStreamOutput( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_stream( - request, + def ExecuteQueryWithStreamOutput(request, target, - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_stream(request, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput', clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def ExecuteQueryWithStreamIO( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, + def ExecuteQueryWithStreamIO(request_iterator, target, - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO", + options=(), + channel_credentials=None, + 
call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.stream_stream(request_iterator, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO', clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) From 5cda358e62c90a2345a60a249b6d7e8430f6454d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 06:26:04 +0100 Subject: [PATCH 0142/1081] Obey Python's quirky formatter --- tests/clickhouse-test | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index bd796dbfdf2..dd9047c293f 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -75,11 +75,13 @@ def stringhash(s): # only during process invocation https://stackoverflow.com/a/42089311 return zlib.crc32(s.encode("utf-8")) + def read_file_as_binary_string(file_path): - with open(file_path, 'rb') as file: + with open(file_path, "rb") as file: binary_data = file.read() return binary_data + # First and last lines of the log def trim_for_log(s): if not s: @@ -146,7 +148,7 @@ def clickhouse_execute_http( client.request( "POST", f"/?{base_args.client_options_query_str}{urllib.parse.urlencode(params)}", - body=body + body=body, ) res = client.getresponse() data = res.read() From 71bef27abfa9cd64a318306ddd11b21b907a37ac Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 06:26:54 +0100 Subject: [PATCH 0143/1081] Follow-up --- docker/test/base/setup_export_logs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 26fcd10d666..96a15c33674 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -194,7 +194,7 @@ function setup_logs_replication echo "Creating table system.${table}_sender" >&2 # Create Distributed table and materialized view to watch on the original table: - clickhouse-client --asterisk_include_materialized_columns 1 --query " + clickhouse-client --query " CREATE TABLE system.${table}_sender ENGINE = Distributed(${CLICKHOUSE_CI_LOGS_CLUSTER}, default, ${table}_${hash}) SETTINGS flush_on_detach=0 @@ -205,7 +205,7 @@ function setup_logs_replication echo "Creating materialized view system.${table}_watcher" >&2 - clickhouse-client --asterisk_include_materialized_columns 1 --query " + clickhouse-client --query " CREATE MATERIALIZED VIEW system.${table}_watcher TO system.${table}_sender AS SELECT ${EXTRA_COLUMNS_EXPRESSION_FOR_TABLE}, * FROM system.${table} From 3eba7678057df92e8a7f91912863843d377eecd4 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 15 Jan 2024 19:17:13 +0000 Subject: [PATCH 0144/1081] init --- src/Interpreters/InterpreterCreateQuery.cpp | 10 ++++++++-- .../02973_dictionary_table_exception_fix.reference | 0 .../02973_dictionary_table_exception_fix.sql | 6 ++++++ 3 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/02973_dictionary_table_exception_fix.reference create mode 100644 tests/queries/0_stateless/02973_dictionary_table_exception_fix.sql diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 
36e864ace26..c00f58de59a 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1405,8 +1405,14 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, interpreter.execute(); } else - throw Exception(storage_already_exists_error_code, - "{} {}.{} already exists", storage_name, backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(create.getTable())); + { + if (database->getTable(create.getTable(), getContext())->isDictionary()) + throw Exception(ErrorCodes::DICTIONARY_ALREADY_EXISTS, + "Dictionary {}.{} already exists", backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(create.getTable())); + else + throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, + "Table {}.{} already exists", backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(create.getTable())); + } } else if (!create.attach) { diff --git a/tests/queries/0_stateless/02973_dictionary_table_exception_fix.reference b/tests/queries/0_stateless/02973_dictionary_table_exception_fix.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02973_dictionary_table_exception_fix.sql b/tests/queries/0_stateless/02973_dictionary_table_exception_fix.sql new file mode 100644 index 00000000000..f8061b42670 --- /dev/null +++ b/tests/queries/0_stateless/02973_dictionary_table_exception_fix.sql @@ -0,0 +1,6 @@ +CREATE TABLE test_table (i Int64) engine=MergeTree order by i; +CREATE DICTIONARY test_dict (y String, value UInt64 DEFAULT 0) PRIMARY KEY y SOURCE(CLICKHOUSE(TABLE 'test_table')) LAYOUT(DIRECT()); +CREATE TABLE test_dict (y Int64) engine=MergeTree order by y; -- { serverError DICTIONARY_ALREADY_EXISTS } +CREATE DICTIONARY test_table (y String, value UInt64 DEFAULT 0) PRIMARY KEY y SOURCE(CLICKHOUSE(TABLE 'test_table')) LAYOUT(DIRECT()); -- { serverError TABLE_ALREADY_EXISTS } +CREATE DICTIONARY test_dict (y String, value UInt64 DEFAULT 0) PRIMARY KEY y SOURCE(CLICKHOUSE(TABLE 'test_table')) LAYOUT(DIRECT()); -- { serverError DICTIONARY_ALREADY_EXISTS } +CREATE TABLE test_table (y Int64) engine=MergeTree order by y; -- { serverError TABLE_ALREADY_EXISTS } From c5024a5f6d7f88f0fd8dc2af2c52eb1c1d57d2c2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 23:36:48 +0100 Subject: [PATCH 0145/1081] Fix typo --- docker/test/base/setup_export_logs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 96a15c33674..416281c2aa3 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -30,7 +30,7 @@ EXTRA_COLUMNS_EXPRESSION_COVERAGE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x - function __set_connection_args { - # It's impossible to use generous $CONNECTION_ARGS string, it's unsafe from word splitting perspective. + # It's impossible to use a generic $CONNECTION_ARGS string, it's unsafe from word splitting perspective. 
# That's why we must stick to the generated option CONNECTION_ARGS=( --receive_timeout=45 --send_timeout=45 --secure From 1caef191436fc05856be3b85b19cfcd97d0dc804 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 16 Jan 2024 09:44:52 +0100 Subject: [PATCH 0146/1081] Maybe better --- programs/main.cpp | 81 ++++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/programs/main.cpp b/programs/main.cpp index 8958d84e243..1ff7e5db560 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -394,6 +394,50 @@ void checkHarmfulEnvironmentVariables(char ** argv) } #endif + +#if defined(SANITIZE_COVERAGE) +__attribute__((no_sanitize("coverage"))) +void dumpCoverage() +{ + /// A user can request to dump the coverage information into files at exit. + /// This is useful for non-server applications such as clickhouse-format or clickhouse-client, + /// that cannot introspect it with SQL functions at runtime. + + /// The CLICKHOUSE_WRITE_COVERAGE environment variable defines a prefix for a filename 'prefix.pid' + /// containing the list of addresses of covered . + + /// The format is even simpler than Clang's "sancov": an array of 64-bit addresses, native byte order, no header. + + if (const char * coverage_filename_prefix = getenv("CLICKHOUSE_WRITE_COVERAGE")) // NOLINT(concurrency-mt-unsafe) + { + auto dump = [](const std::string & name, auto span) + { + /// Write only non-zeros. + std::vector data; + data.reserve(span.size()); + for (auto addr : span) + if (addr) + data.push_back(addr); + + int fd = ::open(name.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0400); + if (-1 == fd) + { + writeError("Cannot open a file to write the coverage data\n"); + } + else + { + if (!writeRetry(fd, reinterpret_cast(data.data()), data.size() * sizeof(data[0]))) + writeError("Cannot write the coverage data to a file\n"); + if (0 != ::close(fd)) + writeError("Cannot close the file with coverage data\n"); + } + }; + + dump(fmt::format("{}.{}", coverage_filename_prefix, getpid()), getCumulativeCoverage()); + } +} +#endif + } bool isClickhouseApp(std::string_view app_suffix, std::vector & argv) @@ -517,42 +561,7 @@ int main(int argc_, char ** argv_) int exit_code = main_func(static_cast(argv.size()), argv.data()); #if defined(SANITIZE_COVERAGE) - /// A user can request to dump the coverage information into files at exit. - /// This is useful for non-server applications such as clickhouse-format or clickhouse-client, - /// that cannot introspect it with SQL functions at runtime. - - /// The CLICKHOUSE_WRITE_COVERAGE environment variable defines a prefix for a filename 'prefix.pid' - /// containing the list of addresses of covered . - - /// The format is even simpler than Clang's "sancov": an array of 64-bit addresses, native byte order, no header. - - if (const char * coverage_filename_prefix = getenv("CLICKHOUSE_WRITE_COVERAGE")) // NOLINT(concurrency-mt-unsafe) - { - auto dumpCoverage = [](const std::string & name, auto span) - { - /// Write only non-zeros. 
- std::vector data; - data.reserve(span.size()); - for (auto addr : span) - if (addr) - data.push_back(addr); - - int fd = ::open(name.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0400); - if (-1 == fd) - { - writeError("Cannot open a file to write the coverage data\n"); - } - else - { - if (!writeRetry(fd, reinterpret_cast(data.data()), data.size() * sizeof(data[0]))) - writeError("Cannot write the coverage data to a file\n"); - if (0 != ::close(fd)) - writeError("Cannot close the file with coverage data\n"); - } - }; - - dumpCoverage(fmt::format("{}.{}", coverage_filename_prefix, getpid()), getCumulativeCoverage()); - } + dumpCoverage(); #endif return exit_code; From 21082be9a681166b5585445c8aed62e705063081 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 16 Jan 2024 10:41:13 +0100 Subject: [PATCH 0147/1081] Better test --- tests/clickhouse-test | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index dd9047c293f..6d398115d43 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1269,13 +1269,17 @@ class TestCase: file_pattern = "coverage.*" matching_files = glob.glob(file_pattern) for file_path in matching_files: - body = read_file_as_binary_string(file_path) - clickhouse_execute( - args, - f"INSERT INTO system.coverage_log SELECT now(), '{self.case}', groupArray(data) FROM input('data UInt64') FORMAT RowBinary", - body=body, - retry_error_codes=True, - ) + try: + body = read_file_as_binary_string(file_path) + clickhouse_execute( + args, + f"INSERT INTO system.coverage_log SELECT now(), '{self.case}', groupArray(data) FROM input('data UInt64') FORMAT RowBinary", + body=body, + retry_error_codes=True, + ) + except Exception as e: + print("Cannot insert coverage data: ", str(e)) + # Remove the file even in case of exception to avoid accumulation and quadratic complexity. 
os.remove(file_path) coverage = clickhouse_execute( From 6b8d53a9fa54e53c766c431201ea8dfd742630ea Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 16 Jan 2024 11:07:30 +0100 Subject: [PATCH 0148/1081] Remove obsolete comment --- tests/clickhouse-test | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 6d398115d43..02693b997b4 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -412,7 +412,6 @@ def get_stacktraces_from_gdb(server_pid): # collect server stacktraces from system.stack_trace table -# it does not work in Sandbox def get_stacktraces_from_clickhouse(args): settings_str = " ".join( [ From 605c76e66ea5bdd2644026a5c7425e87f24c3702 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 16 Jan 2024 11:22:27 +0100 Subject: [PATCH 0149/1081] Fix test fails --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 0dc3026afc0..b235918c438 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -6400,23 +6400,27 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, { String database_name = scope.context->getCurrentDatabase(); - String table_name = table_function_node->getOriginalAST()->as()->name; + String table_name = ""; - if (table_function_node->getOriginalAST()->as()->is_compound_name) + if (table_function_node->getOriginalAST() && table_function_node->getOriginalAST()->as()) { - std::vector parts; - splitInto<'.'>(parts, table_function_node->getOriginalAST()->as()->name); - - if (parts.size() == 2) + table_name = table_function_node->getOriginalAST()->as()->name; + if (table_function_node->getOriginalAST()->as()->is_compound_name) { - database_name = parts[0]; - table_name = parts[1]; + std::vector parts; + splitInto<'.'>(parts, table_function_node->getOriginalAST()->as()->name); + + if (parts.size() == 2) + { + database_name = parts[0]; + table_name = parts[1]; + } } } auto & table_function_node_typed = table_function_node->as(); - StoragePtr table = DatabaseCatalog::instance().tryGetTable({database_name, table_name}, scope.context->getQueryContext()); + StoragePtr table = table_name.empty() ? 
nullptr : DatabaseCatalog::instance().tryGetTable({database_name, table_name}, scope.context->getQueryContext()); if (table) { if (table.get()->isView() && table->as() && table->as()->isParameterizedView()) From 7bc6a858c7778911a51e4c2430125f9c3741a535 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 16 Jan 2024 19:44:55 +0100 Subject: [PATCH 0150/1081] Rewrite bash test to integration test --- .../test_broken_projections/__init__.py | 0 .../config.d/backups.xml | 13 + .../test_broken_projections/test.py | 492 +++++++++++++++++ .../02916_broken_projection.reference | 322 ----------- .../0_stateless/02916_broken_projection.sh | 515 ------------------ 5 files changed, 505 insertions(+), 837 deletions(-) create mode 100644 tests/integration/test_broken_projections/__init__.py create mode 100644 tests/integration/test_broken_projections/config.d/backups.xml create mode 100644 tests/integration/test_broken_projections/test.py delete mode 100644 tests/queries/0_stateless/02916_broken_projection.reference delete mode 100755 tests/queries/0_stateless/02916_broken_projection.sh diff --git a/tests/integration/test_broken_projections/__init__.py b/tests/integration/test_broken_projections/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_broken_projections/config.d/backups.xml b/tests/integration/test_broken_projections/config.d/backups.xml new file mode 100644 index 00000000000..4da8edffd67 --- /dev/null +++ b/tests/integration/test_broken_projections/config.d/backups.xml @@ -0,0 +1,13 @@ + + + + + local + /var/lib/clickhouse/disks/backups/ + + + + + backups + + diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py new file mode 100644 index 00000000000..ca1a29817a5 --- /dev/null +++ b/tests/integration/test_broken_projections/test.py @@ -0,0 +1,492 @@ +import time +import pytest +import logging +import string +import random +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node", + main_configs=["config.d/backups.xml"], + stay_alive=True, + with_zookeeper=True, + ) + + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def create_table(node, table, replica, data_prefix = ""): + if data_prefix == "": + data_prefix = table + + node.query( + f""" + DROP TABLE IF EXISTS {table} SYNC; + CREATE TABLE {table} + ( + a String, + b String, + c Int64, + d Int64, + e Int64, + + PROJECTION proj + ( + SELECT c ORDER BY d + ), + PROJECTION proj_2 + ( + SELECT d ORDER BY c + ) + ) + ENGINE = ReplicatedMergeTree('/test_broken_projection_{data_prefix}/data/', '{replica}') ORDER BY a + SETTINGS min_bytes_for_wide_part = 0, + max_parts_to_merge_at_once=3, + enable_vertical_merge_algorithm=1, + vertical_merge_algorithm_min_rows_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1, + compress_primary_key=0; + """ + ) + + +def insert(node, table, offset, size): + node.query( + f""" + INSERT INTO {table} + SELECT number, number, number, number, number%2 FROM numbers({offset}, {size}) + SETTINGS insert_keeper_fault_injection_probability=0.0; + """ + ) + + +def get_parts(node, table): + return ( + node.query( + f""" + SELECT name + FROM system.parts + WHERE table='{table}' AND 
database=currentDatabase() AND active = 1 + ORDER BY name;" + """ + ) + .strip() + .split("\n") + ) + + +def bash(node, command): + node.exec_in_container(["bash", "-c", command], privileged=True, user="root") + + +def break_projection(node, table, part, parent_part, break_type): + part_path = node.query( + f""" + SELECT path + FROM system.projection_parts + WHERE table='{table}' + AND database=currentDatabase() + AND active=1 + AND part_name='{part}' + AND parent_name='{parent_part}' + ORDER BY modification_time DESC + LIMIT 1; + """ + ).strip() + + node.query( + f"select throwIf(substring('{part_path}', 1, 1) != '/', 'Path is relative: {part_path}')" + ) + + if break_type == "data": + bash(node, f"rm '{part_path}/d.bin'") + bash(node, f"rm '{part_path}/c.bin'") + elif break_type == "metadata": + bash(node, f"rm '{part_path}/columns.txt'") + elif break_type == "part": + bash(node, f"rm -r '{part_path}'") + + +def break_part(node, table, part): + part_path = node.query( + f""" + SELECT path + FROM system.parts + WHERE table='{table}' + AND database=currentDatabase() + AND active=1 + AND part_name='{part}' + ORDER BY modification_time DESC + LIMIT 1; + """ + ).strip() + + node.query( + f"select throwIf(substring('{part_path}', 1, 1) != '/', 'Path is relative: {part_path}')" + ) + bash(node, f"rm '{part_path}/columns.txt'") + + +def get_broken_projections_info(node, table): + return node.query( + f""" + SELECT parent_name, name, errors.name FROM + ( + SELECT parent_name, name, exception_code + FROM system.projection_parts + WHERE table='{table}' + AND database=currentDatabase() + AND is_broken = 1 + ) AS parts_info + INNER JOIN system.errors AS errors + ON parts_info.exception_code = errors.code + ORDER BY parent_name, name + """ + ).strip() + + +def optimize(node, table, final, no_wait): + query = f"OPTIMIZE TABLE {table}" + if final: + query += " FINAL" + if no_wait: + query += " SETTINGS alter_sync=0" + node.query(query) + + +def reattach(node, table): + node.query( + f""" + DETACH TABLE {table}; + ATTACH TABLE {table}; + """ + ) + + +def materialize_projection(node, table, proj): + node.query( + f"ALTER TABLE {table} MATERIALIZE PROJECTION {proj} SETTINGS mutations_sync=2" + ) + + +def check_table_full(node, table): + return node.query( + f"CHECK TABLE {table} SETTINGS check_query_single_value_result = 0;" + ).strip() + + +def random_str(length=6): + alphabet = string.ascii_lowercase + string.digits + return "".join(random.SystemRandom().choice(alphabet) for _ in range(length)) + + +def check(node, table, check_result, expect_broken_part="", expected_error=""): + query_id = random_str() + + if expect_broken_part == "proj": + assert expected_error in node.query_and_get_error( + f"SELECT c FROM '{table}' WHERE d == 12 ORDER BY c" + ) + else: + node.query( + f"SELECT c FROM '{table}' WHERE d == 12 OR d == 16 ORDER BY c", + query_id=query_id, + ) + assert "proj" in node.query( + f""" + SYSTEM FLUSH LOGS; + SELECT query, splitByChar('.', arrayJoin(projections))[-1] + FROM system.query_log + WHERE current_database=currentDatabase() AND query_id='{query_id}' AND type='QueryFinish' + """ + ) + + query_id = random_str() + + if expect_broken_part == "proj_2": + assert expected_error in node.query_and_get_error( + f"SELECT d FROM '{table}' WHERE c == 12 ORDER BY d" + ) + else: + node.query( + f"SELECT d FROM '{table}' WHERE c == 12 OR c == 16 ORDER BY d", + query_id=query_id, + ) + assert "proj" in node.query( + f""" + SYSTEM FLUSH LOGS; + SELECT query, splitByChar('.', arrayJoin(projections))[-1] 
+ FROM system.query_log + WHERE current_database=currentDatabase() AND query_id='{query_id}' AND type='QueryFinish' + """ + ) + + assert check_result == int(node.query(f"CHECK TABLE {table}")) + + +def test_broken_ignored(cluster): + node = cluster.instances["node"] + + table_name = "test1" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + # Break metadata (columns.txt) file of projection 'proj' + break_projection(node, table_name, "proj", "all_2_2_0", "metadata") + + # Do select and after "check table" query. + # Select works because it does not read columns.txt. + # But expect check table result as 0. + check(node, table_name, 0) + + # Projection 'proj' from part all_2_2_0 will now appear in broken parts info + # because it was marked broken during "check table" query. + assert "all_2_2_0\tproj\tFILE_DOESNT_EXIST" in get_broken_projections_info( + node, table_name + ) + + # Check table query will also show a list of parts which have broken projections. + assert "all_2_2_0" in check_table_full(node, table_name) + + # Break data file of projection 'proj_2' for part all_2_2_0 + break_projection(node, table_name, "proj_2", "all_2_2_0", "data") + + # It will not yet appear in broken projections info. + assert "proj_2" not in get_broken_projections_info(node, table_name) + + # Select now fails with error "File doesn't exist" + check(node, table_name, 0, "proj_2", "FILE_DOESNT_EXIST") + + # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. + assert "all_2_2_0\tproj_2\tNO_FILE_IN_DATA_PART" in get_broken_projections_info( + node, table_name + ) + + # Second select works, because projection is now marked as broken. + check(node, table_name, 0) + + # Break data file of projection 'proj_2' for part all_3_3_0 + break_projection(node, table_name, "proj_2", "all_3_3_0", "data") + + # It will not yet appear in broken projections info. + assert "all_3_3_0" not in get_broken_projections_info(node, table_name) + + insert(node, table_name, 20, 5) + insert(node, table_name, 25, 5) + + # Part all_3_3_0 has 'proj' and 'proj_2' projections, but 'proj_2' is broken and server does NOT know it yet. + # Parts all_4_4_0 and all_5_5_0 have both non-broken projections. + # So a merge will be create for future part all_3_5_1. + # During merge it will fail to read from 'proj_2' of part all_3_3_0 and proj_2 will be marked broken. + # Merge will be retried and on second attempt it will succeed. + # The result part all_3_5_1 will have only 1 projection - 'proj', because + # it will skip 'proj_2' as it will see that one part does not have it anymore in the set of valid projections. 
+ optimize(node, table_name, 0, 1) + time.sleep(5) + + # table_uuid=node.query(f"SELECT uuid FROM system.tables WHERE table='{table_name}' and database=currentDatabase()").strip() + # assert 0 < int( + # node.query( + # f""" + # SYSTEM FLUSH LOGS; + # SELECT count() FROM system.text_log + # WHERE level='Error' + # AND logger_name='MergeTreeBackgroundExecutor' + # AND message like 'Exception while executing background task %{table_uuid}:all_3_5_1%%Cannot open file%proj_2.proj/c.bin%' + # """) + # ) + + assert "all_3_3_0" in get_broken_projections_info(node, table_name) + check(node, table_name, 0) + + +def test_materialize_broken_projection(cluster): + node = cluster.instances["node"] + + table_name = "test2" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + break_projection(node, table_name, "proj", "all_1_1_0", "metadata") + reattach(node, table_name) + + assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" in get_broken_projections_info( + node, table_name + ) + assert "Part all_1_1_0 has a broken projection proj" in check_table_full( + node, table_name + ) + + break_projection(node, table_name, "proj_2", "all_1_1_0", "data") + reattach(node, table_name) + + assert "all_1_1_0\tproj_2\tFILE_DOESNT_EXIST" in get_broken_projections_info( + node, table_name + ) + assert "Part all_1_1_0 has a broken projection proj_2" in check_table_full( + node, table_name + ) + + materialize_projection(node, table_name, "proj") + + assert "has a broken projection" not in check_table_full(node, table_name) + + +def test_broken_ignored_replicated(cluster): + node = cluster.instances["node"] + + table_name = "test3" + table_name2 = "test3_replica" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + check(node, table_name, 1) + + create_table(node, table_name2, 2, table_name) + check(node, table_name2, 1) + + break_projection(node, table_name, "proj", "all_0_0_0", "data") + assert "Part all_0_0_0 has a broken projection proj" in check_table_full( + node, table_name + ) + + break_part(node, table_name, "all_0_0_0") + node.query(f"SYSTEM SYNC REPLICA {table_name}") + assert "has a broken projection" not in check_table_full(node, table_name) + + +def test_broken_projections_in_backups(cluster): + node = cluster.instances["node"] + + table_name = "test4" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + check(node, table_name, 1) + + break_projection(node, table_name, "proj", "all_2_2_0", "data") + check(node, table_name, 0, "proj", "FILE_DOESNT_EXIST") + + assert "all_2_2_0\tproj\tNO_FILE_IN_DATA_PART" in get_broken_projections_info( + node, table_name + ) + + assert "BACKUP_CREATED" in node.query( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', 'b1') settings check_projection_parts=false; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', 'b1'); + """ + ) + + check(node, table_name, 1) + + assert "" == get_broken_projections_info(node, 
table_name) + # TODO: add a check for what projections are loaded + + break_projection(node, table_name, "proj", "all_2_2_0", "part") + + check(node, table_name, 0, "proj", "ErrnoException") + + assert "all_2_2_0\tproj\tFILE_DOESNT_EXIST" == get_broken_projections_info( + node, table_name + ) + + assert "FILE_DOESNT_EXIST" in node.query_and_get_error( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', 'b2') + """ + ) + + materialize_projection(node, table_name, "proj") + check(node, table_name, 1) + # TODO: + # assert "all_2_2_0\tproj\tFILE_DOESNT_EXIST" == get_broken_projections_info(node, table_name) + + assert "BACKUP_CREATED" in node.query( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', 'b3') settings check_projection_parts=false; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', 'b3'); + """ + ) + check(node, table_name, 1) + + break_projection(node, table_name, "proj", "all_1_1_0", "part") + # TODO: check(node, table_name, 0, "proj", "FILE_DOESNT_EXIST") + assert "Part all_1_1_0 has a broken projection proj" in check_table_full( + node, table_name + ) + assert "all_1_1_0\tproj\tFILE_DOESNT_EXIST" == get_broken_projections_info( + node, table_name + ) + + assert "BACKUP_CREATED" in node.query( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', 'b4') settings check_projection_parts=false; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', 'b4'); + """ + ) + check(node, table_name, 1) + assert "" == get_broken_projections_info(node, table_name) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference deleted file mode 100644 index 3967215e5de..00000000000 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ /dev/null @@ -1,322 +0,0 @@ -insert new part -insert new part -insert new part -insert new part -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -0 -broke metadata of part 'proj' (parent part: all_2_2_0) -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -check table full (test - all_2_2_0) -all_2_2_0 -0 -broke data of part 'proj_2' (parent part: all_2_2_0) -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: proj_2 -FILE_DOESNT_EXIST -check table -0 -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -all_2_2_0 proj_2 NO_FILE_IN_DATA_PART -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from 
projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -0 -broke data of part 'proj_2' (parent part: all_3_3_0) -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -all_2_2_0 proj_2 NO_FILE_IN_DATA_PART -insert new part -insert new part -optimize -OPTIMIZE TABLE test SETTINGS alter_sync=0 -0 -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -all_2_2_0 proj_2 NO_FILE_IN_DATA_PART -all_3_3_0 proj_2 NO_FILE_IN_DATA_PART -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -0 -broke metadata of part 'proj' (parent part: all_1_1_0) -Detach - Attach -broken projections info -all_1_1_0 proj NO_FILE_IN_DATA_PART -all_2_2_0 proj NO_FILE_IN_DATA_PART -all_2_2_0 proj_2 FILE_DOESNT_EXIST -all_3_3_0 proj_2 FILE_DOESNT_EXIST -0 -broke data of part 'proj_2' (parent part: all_1_1_0) -Detach - Attach -broken projections info -all_1_1_0 proj NO_FILE_IN_DATA_PART -all_1_1_0 proj_2 FILE_DOESNT_EXIST -all_2_2_0 proj NO_FILE_IN_DATA_PART -all_2_2_0 proj_2 FILE_DOESNT_EXIST -all_3_3_0 proj_2 FILE_DOESNT_EXIST -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -check table full (test - all_1_1_0) -all_1_1_0 -materialize projection proj -check table full (test - ) -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -materialize projection proj_2 -check table full (test - ) -0 -broke data of part 'proj' (parent part: all_3_5_1_7) -insert new part -optimize -OPTIMIZE TABLE test FINAL -insert new part -optimize -OPTIMIZE TABLE test FINAL -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj_2 -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -insert new part -insert new part -insert new part -insert new part -select from projection 'proj' -used projections -SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -used projections -SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -select from projection 'proj' -used projections -SELECT c FROM test2_replica WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -used projections -SELECT d FROM test2_replica WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -0 -broke data of part 'proj' (parent part: all_0_0_0) -check table full (test2 - all_0_0_0) -all_0_0_0 -select from projection 'proj' -used projections -SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -used projections -SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broke data of part 'all_0_0_0' -check table full (test2 - all_0_0_0) -all_0_0_0 -select from projection 'proj' -used projections -SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -used 
projections -SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -select from projection 'proj' -used projections -SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -used projections -SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -insert new part -insert new part -insert new part -insert new part -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -0 -broke data of part 'proj' (parent part: all_2_2_0) -select from projection 'proj', expect error: proj -FILE_DOESNT_EXIST -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broken projections info -all_2_2_0 proj NO_FILE_IN_DATA_PART -BACKUP_CREATED -RESTORED -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -broken projections info -0 -broke all data of part 'proj' (parent part: all_2_2_0) -select from projection 'proj', expect error: proj -Errno -Errno -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -FILE_DOESNT_EXIST -materialize projection proj -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -BACKUP_CREATED -RESTORED -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -0 -broke all data of part 'proj' (parent part: all_1_1_0) -select from projection 'proj', expect error: proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broken projections info -all_1_1_0 proj FILE_DOESNT_EXIST -BACKUP_CREATED -RESTORED -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broken projections info -all_1_1_0 proj NO_FILE_IN_DATA_PART diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh deleted file mode 100755 index fbd26e59f6f..00000000000 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ /dev/null @@ -1,515 +0,0 @@ -#!/usr/bin/env bash -# Tags: long, no-random-merge-tree-settings, no-random-settings, no-s3-storage, no-parallel -# shellcheck disable=SC2046 - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. 
"$CURDIR"/../shell_config.sh - -function create_table() -{ - test_id=$1 - name=$2 - replica=$3 - $CLICKHOUSE_CLIENT -nm -q " - DROP TABLE IF EXISTS $name SYNC; - CREATE TABLE $name - ( - a String, - b String, - c Int64, - d Int64, - e Int64, - - PROJECTION proj - ( - SELECT c ORDER BY d - ), - PROJECTION proj_2 - ( - SELECT d ORDER BY c - ) - ) - ENGINE = ReplicatedMergeTree('/test_broken_projection_32_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a - SETTINGS min_bytes_for_wide_part = 0, - max_parts_to_merge_at_once=3, - enable_vertical_merge_algorithm=1, - vertical_merge_algorithm_min_rows_to_activate = 1, - vertical_merge_algorithm_min_columns_to_activate = 1, - vertical_merge_algorithm_min_columns_to_activate = 1, - compress_primary_key=0; - " -} - -function random() -{ - cat /dev/urandom | LC_ALL=C tr -dc 'a-zA-Z' | fold -w ${1:-8} | head -n 1 -} - -function insert() -{ - table=$1 - offset=$2 - size=$3 - echo 'insert new part' - $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size) SETTINGS insert_keeper_fault_injection_probability=0.0;" -} - -function break_projection() -{ - table=$1 - part_name=$2 - parent_name=$3 - break_type=$4 - - read -r part_path <<< $($CLICKHOUSE_CLIENT -nm -q " - SELECT path - FROM system.projection_parts - WHERE table='$table' - AND database=currentDatabase() - AND active=1 - AND part_name='$part_name' - AND parent_name='$parent_name' - ORDER BY modification_time DESC - LIMIT 1; - ") - - $CLICKHOUSE_CLIENT -q "select throwIf(substring('$part_path', 1, 1) != '/', 'Path is relative: $part_path')" || exit - - if [ "$break_type" = "data" ] - then - rm "$part_path/d.bin" - rm "$part_path/c.bin" - echo "broke data of part '$part_name' (parent part: $parent_name)" - fi - if [ "$break_type" = "metadata" ] - then - rm "$part_path/columns.txt" - echo "broke metadata of part '$part_name' (parent part: $parent_name)" - fi - if [ "$break_type" = "part" ] - then - rm -r "$part_path" - echo "broke all data of part '$part_name' (parent part: $parent_name)" - fi -} - -function break_part() -{ - table=$1 - part_name=$2 - - read -r part_path <<< $($CLICKHOUSE_CLIENT -nm -q " - SELECT path - FROM system.parts - WHERE table='$table' - AND database=currentDatabase() - AND active=1 - AND part_name='$part_name' - ORDER BY modification_time DESC - LIMIT 1; - ") - - if [ "$part_path" = "" ] - then - echo "Part path is empty" - exit - fi - - rm $part_path/columns.txt - echo "broke data of part '$part_name'" -} - -function broken_projections_info() -{ - table=$1 - echo 'broken projections info' - $CLICKHOUSE_CLIENT -q " - SELECT parent_name, name, errors.name FROM - ( - SELECT parent_name, name, exception_code - FROM system.projection_parts - WHERE table='$table' - AND database=currentDatabase() - AND is_broken = 1 - ) AS parts_info - INNER JOIN system.errors AS errors - ON parts_info.exception_code = errors.code - ORDER BY parent_name, name -" -} - -function check() -{ - table=$1 - expect_broken_part="" - expected_error="" - if [ $# -gt 1 ]; then - expect_broken_part=$2 - expected_error=$3 - fi - - #echo 'system.parts' - #$CLICKHOUSE_CLIENT -q " - #SELECT name, active, projections - #FROM system.parts - #WHERE table='$table' AND database=currentDatabase() - #ORDER BY name;" - - query_id=$(random 8) - - if [ "$expect_broken_part" = "proj" ] - then - echo "select from projection 'proj', expect error: $expect_broken_part" - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " - 
SET send_logs_level='fatal'; - SELECT c FROM $table WHERE d == 12 ORDER BY c; - " 2>&1 | grep -oF "$expected_error" - else - echo "select from projection 'proj'" - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM $table WHERE d == 12 OR d == 16 ORDER BY c;" - echo 'used projections' - $CLICKHOUSE_CLIENT -nm -q " - SYSTEM FLUSH LOGS; - SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' - " - fi - - query_id=$(random 8) - - if [ "$expect_broken_part" = "proj_2" ] - then - echo "select from projection 'proj_2', expect error: $expect_broken_part" - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " - SET send_logs_level='fatal'; - SELECT d FROM $table WHERE c == 12 ORDER BY d; - " 2>&1 | grep -oF "$expected_error" - else - echo "select from projection 'proj_2'" - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM $table WHERE c == 12 OR c == 16 ORDER BY d;" - echo 'used projections' - $CLICKHOUSE_CLIENT -nm -q " - SYSTEM FLUSH LOGS; - SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' - " - fi - - echo 'check table' - $CLICKHOUSE_CLIENT -nm -q " - SET send_logs_level='fatal'; - CHECK TABLE $table;" -} - -function optimize() -{ - final=$1 - no_wait=$2 - - echo 'optimize' - query="OPTIMIZE TABLE test" - - if [ $final -eq 1 ]; then - query="$query FINAL" - fi - if [ $no_wait -eq 1 ]; then - query="$query SETTINGS alter_sync=0" - fi - - echo $query - - $CLICKHOUSE_CLIENT -q "$query" -} - -function reattach() -{ - echo 'Detach - Attach' - $CLICKHOUSE_CLIENT -nm -q " - SET send_logs_level='fatal'; - DETACH TABLE test; - ATTACH TABLE test; - " -} - -function materialize_projection -{ - table=$1 - projection=$2 - echo "materialize projection $projection" - $CLICKHOUSE_CLIENT -q "ALTER TABLE $table MATERIALIZE PROJECTION $projection SETTINGS mutations_sync=2" -} - -function check_table_full() -{ - table=$1 - expect_broken_part=$2 - echo "check table full ($1 - $2)" - if [ "$expect_broken_part" = "" ] - then - $CLICKHOUSE_CLIENT -nm -q " - SET send_logs_level='fatal'; - CHECK TABLE $table SETTINGS check_query_single_value_result = 0; - " | grep "broken" - else - $CLICKHOUSE_CLIENT -nm -q " - SET send_logs_level='fatal'; - CHECK TABLE $table SETTINGS check_query_single_value_result = 0; - " | grep "broken" | grep -o $expect_broken_part | head -n 1 - fi -} - -function test1() -{ - create_table test1 test 1 - - table_uuid=$($CLICKHOUSE_CLIENT -q "SELECT uuid FROM system.tables WHERE table='test' and database=currentDatabase()") - - insert test 0 5 - - insert test 5 5 - - insert test 10 5 - - insert test 15 5 - - check test - - # Break metadata file of projection 'proj' - break_projection test proj all_2_2_0 metadata - - # Do select and after "check table" query. - # Select works because it does not read columns.txt. - check test - - # Projection 'proj' from part all_2_2_0 will now appear in broken parts info - # because it was marked broken during "check table" query. - # TODO: try to mark it during select as well - broken_projections_info test - - # Check table query will also show a list of parts which have broken projections. 
- check_table_full test "all_2_2_0" - - # Break data file of projection 'proj_2' for part all_2_2_0 - break_projection test proj_2 all_2_2_0 data - - # It will not yet appear in broken projections info. - broken_projections_info test - - # Select now fails with error "File doesn't exist" - check test "proj_2" FILE_DOESNT_EXIST - - # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. - broken_projections_info test - - # Second select works, because projection is now marked as broken. - check test - - # Break data file of projection 'proj_2' for part all_3_3_0 - break_projection test proj_2 all_3_3_0 data - - # It will not yet appear in broken projections info. - broken_projections_info test - - insert test 20 5 - - insert test 25 5 - - # Part all_3_3_0 has 'proj' and 'proj_2' projections, but 'proj_2' is broken and server does NOT know it yet. - # Parts all_4_4_0 and all_5_5_0 have both non-broken projections. - # So a merge will be create for future part all_3_5_1. - # During merge it will fail to read from 'proj_2' of part all_3_3_0 and proj_2 will be marked broken. - # Merge will be retried and on second attempt it will succeed. - # The result part all_3_5_1 will have only 1 projection - 'proj', because - # it will skip 'proj_2' as it will see that one part does not have it anymore in the set of valid projections. - optimize 0 1 - sleep 2 - - $CLICKHOUSE_CLIENT -nm -q " - SYSTEM FLUSH LOGS; - SELECT count() FROM system.text_log - WHERE level='Error' - AND logger_name='MergeTreeBackgroundExecutor' - AND message like 'Exception while executing background task {$table_uuid:all_3_5_1}%Cannot open file%proj_2.proj/c.bin%' - " - - # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. - broken_projections_info test - - check test - - break_projection test proj all_1_1_0 metadata - - reattach - - broken_projections_info test - - break_projection test proj_2 all_1_1_0 data - - reattach - - broken_projections_info test - - check test - - check_table_full test all_1_1_0 - - materialize_projection test proj - - check_table_full test - - check test - - materialize_projection test proj_2 - - check_table_full test - - break_projection test proj all_3_5_1_7 data - - insert test 30 5 - - optimize 1 0 - - insert test 35 5 - - optimize 1 0 - - check test -} - -function test2() -{ - create_table test2 test2 1 - - insert test2 0 5 - - insert test2 5 5 - - insert test 10 5 - - insert test 15 5 - - check test2 - - create_table test2 test2_replica 2 - - check test2_replica - - break_projection test2 proj all_0_0_0 data - - check_table_full test2 all_0_0_0 - - check test2 - - break_part test2 all_0_0_0 - - check_table_full test2 all_0_0_0 - - check test2 - - $CLICKHOUSE_CLIENT -q "SYSTEM SYNC REPLICA test2;" - - check test2 -} - -function test3() -{ - create_table test3 test 1 - - insert test 0 5 - - insert test 5 5 - - insert test 10 5 - - insert test 15 5 - - check test - - break_projection test proj all_2_2_0 data - - check test proj FILE_DOESNT_EXIST - - broken_projections_info test - - ${CLICKHOUSE_CLIENT} -nm --query " - set backup_restore_keeper_fault_injection_probability=0.0; - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false; - " | grep -o "BACKUP_CREATED" - - ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " - drop table test sync; - set backup_restore_keeper_fault_injection_probability=0.0; - restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', 
'${CLICKHOUSE_TEST_UNIQUE_NAME}'); - " | grep -o "RESTORED" - - check test - - broken_projections_info test - - break_projection test proj all_2_2_0 part - - check test proj Errno - - broken_projections_info test - - ${CLICKHOUSE_CLIENT} -nm --query " - set send_logs_level='fatal'; - set backup_restore_keeper_fault_injection_probability=0.0; - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') - " 2>&1 | grep -o "FILE_DOESNT_EXIST" - - materialize_projection test proj - - check test - - broken_projections_info test - - ${CLICKHOUSE_CLIENT} -nm --query " - set send_logs_level='fatal'; - set backup_restore_keeper_fault_injection_probability=0.0; - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3') - " | grep -o "BACKUP_CREATED" - - ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " - drop table test sync; - set send_logs_level='fatal'; - set backup_restore_keeper_fault_injection_probability=0.0; - restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3'); - " | grep -o "RESTORED" - - check test - - break_projection test proj all_1_1_0 part - - check test proj FILE_DOESNT_EXIST - - broken_projections_info test - - ${CLICKHOUSE_CLIENT} -nm --query " - set send_logs_level='fatal'; - set backup_restore_keeper_fault_injection_probability=0.0; - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4') - settings check_projection_parts=false, allow_backup_broken_projections=true; - " | grep -o "BACKUP_CREATED" - - ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " - drop table test sync; - set send_logs_level='fatal'; - set backup_restore_keeper_fault_injection_probability=0.0; - restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4'); - " | grep -o "RESTORED" - - check test - - broken_projections_info test -} - -test1 -test2 -test3 - - -$CLICKHOUSE_CLIENT -nm -q " -DROP TABLE IF EXISTS test SYNC; -DROP TABLE IF EXISTS test2 SYNC; -DROP TABLE IF EXISTS test2_replica SYNC; -" From 216769f43ea536a38b9e7d5650cdd02fae972caf Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 16 Jan 2024 18:55:23 +0000 Subject: [PATCH 0151/1081] Automatic style fix --- tests/integration/test_broken_projections/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index ca1a29817a5..cc3e55402b3 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -28,7 +28,7 @@ def cluster(): cluster.shutdown() -def create_table(node, table, replica, data_prefix = ""): +def create_table(node, table, replica, data_prefix=""): if data_prefix == "": data_prefix = table From 8d88f4cf87d13c6760a5235abf4180102daf8b5c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 17 Jan 2024 09:42:53 +0100 Subject: [PATCH 0152/1081] Update setting is_parameterized_view & settings columns for view --- src/Interpreters/InterpreterCreateQuery.cpp | 5 ++++- src/Storages/StorageView.cpp | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 7985785aa9f..6031c8b4e46 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -780,8 +780,11 @@ InterpreterCreateQuery::TableProperties 
InterpreterCreateQuery::getTableProperti properties.constraints = as_storage_metadata->getConstraints(); } - else if (create.select && !create.isParameterizedView()) + else if (create.select) { + if (create.isParameterizedView()) + return properties; + Block as_select_sample; if (getContext()->getSettingsRef().allow_experimental_analyzer) diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index 1898e49de86..6b80e2450c4 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -112,7 +112,7 @@ StorageView::StorageView( : IStorage(table_id_) { StorageInMemoryMetadata storage_metadata; - if (is_parameterized_view_) + if (!is_parameterized_view_) { if (!query.isParameterizedView()) storage_metadata.setColumns(columns_); From dea8b10ae972b4fc2b20dbf90d90e362bf7e4207 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 17 Jan 2024 15:39:57 +0100 Subject: [PATCH 0153/1081] Fix test --- tests/integration/test_broken_projections/test.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index ca1a29817a5..90d82f9f010 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -425,9 +425,7 @@ def test_broken_projections_in_backups(cluster): ) check(node, table_name, 1) - assert "" == get_broken_projections_info(node, table_name) - # TODO: add a check for what projections are loaded break_projection(node, table_name, "proj", "all_2_2_0", "part") @@ -446,8 +444,6 @@ def test_broken_projections_in_backups(cluster): materialize_projection(node, table_name, "proj") check(node, table_name, 1) - # TODO: - # assert "all_2_2_0\tproj\tFILE_DOESNT_EXIST" == get_broken_projections_info(node, table_name) assert "BACKUP_CREATED" in node.query( f""" @@ -466,7 +462,6 @@ def test_broken_projections_in_backups(cluster): check(node, table_name, 1) break_projection(node, table_name, "proj", "all_1_1_0", "part") - # TODO: check(node, table_name, 0, "proj", "FILE_DOESNT_EXIST") assert "Part all_1_1_0 has a broken projection proj" in check_table_full( node, table_name ) @@ -477,7 +472,7 @@ def test_broken_projections_in_backups(cluster): assert "BACKUP_CREATED" in node.query( f""" set backup_restore_keeper_fault_injection_probability=0.0; - backup table {table_name} to Disk('backups', 'b4') settings check_projection_parts=false; + backup table {table_name} to Disk('backups', 'b4') settings check_projection_parts=false, allow_backup_broken_projections=true; """ ) @@ -488,5 +483,5 @@ def test_broken_projections_in_backups(cluster): restore table {table_name} from Disk('backups', 'b4'); """ ) - check(node, table_name, 1) - assert "" == get_broken_projections_info(node, table_name) + check(node, table_name, 0) + assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info(node, table_name) From 1e9de73bf57de1eb66007cba0fecb9f0459c973e Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 17 Jan 2024 15:40:46 +0100 Subject: [PATCH 0154/1081] Fix style check --- tests/integration/test_broken_projections/test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index eb8c452fff0..1b192e0df24 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -484,4 +484,6 @@ def 
test_broken_projections_in_backups(cluster): """ ) check(node, table_name, 0) - assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info(node, table_name) + assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info( + node, table_name + ) From d3b4dea8058e1cccb34bf39b3f26b4c0e5b2368a Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 17 Jan 2024 20:02:17 +0100 Subject: [PATCH 0155/1081] Fix clang tidy build --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index d2270ea9910..7322d53d831 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -6401,7 +6401,7 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, { String database_name = scope.context->getCurrentDatabase(); - String table_name = ""; + String table_name; if (table_function_node->getOriginalAST() && table_function_node->getOriginalAST()->as()) { diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 6031c8b4e46..e71946caafe 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -809,11 +809,7 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti * for example: LIMIT, OFFSET, functions parameters, functions constant only arguments. */ - SelectQueryOptions options; - if (create.isParameterizedView()) - options = options.createParameterizedView(); - - InterpreterSelectWithUnionQuery interpreter(create.select->clone(), getContext(), options); + InterpreterSelectWithUnionQuery interpreter(create.select->clone(), getContext(), SelectQueryOptions()); as_select_sample = interpreter.getSampleBlock(); } From e3f5058f6129badab2e0071e86f51ffb77e57ce5 Mon Sep 17 00:00:00 2001 From: MyroTk <44327070+MyroTk@users.noreply.github.com> Date: Wed, 17 Jan 2024 12:13:15 -0800 Subject: [PATCH 0156/1081] Update Dockerfile --- docker/test/integration/runner/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index c795fbf0672..2a81db78a3d 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -94,6 +94,7 @@ RUN python3 -m pip install --no-cache-dir \ pytest-repeat \ pytest-timeout \ pytest-xdist \ + pytest-reportlog==0.4.0 \ pytz \ pyyaml==5.3.1 \ redis \ From f89803ebf65d7590e73816052b7ac2de81e04864 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 17 Jan 2024 23:17:53 +0100 Subject: [PATCH 0157/1081] Slightly better --- docker/test/base/setup_export_logs.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 416281c2aa3..043adf99ffc 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -156,7 +156,8 @@ function setup_logs_replication # Do not try to resolve stack traces in case of debug/sanitizers # build, since it is too slow (flushing of trace_log can take ~1min # with such MV attached) - if [[ "$debug_or_sanitizer_build" = 1 ]]; then + if [[ "$debug_or_sanitizer_build" = 1 ]] + then EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION}" else 
EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION_TRACE_LOG}" @@ -180,7 +181,7 @@ function setup_logs_replication # Create the destination table with adapted name and structure: statement=$(clickhouse-client --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e ' s/^\($/('"$EXTRA_COLUMNS_FOR_TABLE"'/; - s/ORDER BY \(/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"'/; + s/ORDER BY \(?(.+?)\)?/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)'/; s/^CREATE TABLE system\.\w+_log$/CREATE TABLE IF NOT EXISTS '"$table"'_'"$hash"'/; /^TTL /d ') From f9ca4e3b8541d7db85effa3f9be286f7ad916965 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Jan 2024 00:52:05 +0100 Subject: [PATCH 0158/1081] Slightly better --- docker/test/base/setup_export_logs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 043adf99ffc..7033d4b52e2 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -181,7 +181,7 @@ function setup_logs_replication # Create the destination table with adapted name and structure: statement=$(clickhouse-client --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e ' s/^\($/('"$EXTRA_COLUMNS_FOR_TABLE"'/; - s/ORDER BY \(?(.+?)\)?/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)'/; + s/ORDER BY \(?(.+?)\)?/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)/; s/^CREATE TABLE system\.\w+_log$/CREATE TABLE IF NOT EXISTS '"$table"'_'"$hash"'/; /^TTL /d ') From 61101d1a577b441931ef74b24d449b085d0f0ec3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Jan 2024 01:16:50 +0100 Subject: [PATCH 0159/1081] Add a release build with coverage, turn off coverage in the debug build --- .github/workflows/master.yml | 8 ++++++++ .github/workflows/pull_request.yml | 8 ++++++++ tests/ci/ci_config.py | 17 +++++++++++++---- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index d2865eb737d..50d3eb4a062 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -98,6 +98,14 @@ jobs: build_name: package_release checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} + BuilderDebReleaseCoverage: + needs: [ RunConfig, BuildDockers ] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_release_coverage + checkout_depth: 0 + data: ${{ needs.RunConfig.outputs.data }} BuilderDebAarch64: needs: [RunConfig, BuildDockers] if: ${{ !failure() && !cancelled() }} diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index bd2b2b60904..7f843f82c01 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -146,6 +146,14 @@ jobs: build_name: package_release checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} + BuilderDebReleaseCoverage: + needs: [ RunConfig, FastTest ] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_release_coverage + checkout_depth: 0 + data: ${{ needs.RunConfig.outputs.data }} BuilderDebAarch64: needs: [RunConfig, FastTest] if: ${{ !failure() && !cancelled() }} diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index b8dff3f0a28..1ca4e06bc8c 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -474,6 +474,12 @@ CI_CONFIG = CiConfig( name="package_debug", compiler="clang-17", debug_build=True, + package_type="deb", + 
sparse_checkout=True, + ), + "package_release_coverage": BuildConfig( + name="package_release_coverage", + compiler="clang-17", coverage=True, package_type="deb", sparse_checkout=True, @@ -571,6 +577,7 @@ CI_CONFIG = CiConfig( "package_tsan", "package_msan", "package_debug", + "package_release_coverage", "binary_release", "fuzzers", ] @@ -660,16 +667,15 @@ CI_CONFIG = CiConfig( "Stateful tests (release)": TestConfig( "package_release", job_config=JobConfig(**stateful_test_common_params) # type: ignore ), + "Stateful tests (coverage)": TestConfig( + "package_release_coverage", job_config=JobConfig(**stateful_test_common_params) # type: ignore + ), "Stateful tests (aarch64)": TestConfig( "package_aarch64", job_config=JobConfig(**stateful_test_common_params) # type: ignore ), "Stateful tests (release, DatabaseOrdinary)": TestConfig( "package_release", job_config=JobConfig(**stateful_test_common_params) # type: ignore ), - # "Stateful tests (release, DatabaseReplicated)": TestConfig( - # "package_release", job_config=JobConfig(**stateful_test_common_params) # type: ignore - # ), - # Stateful tests for parallel replicas "Stateful tests (release, ParallelReplicas)": TestConfig( "package_release", job_config=JobConfig(**stateful_test_common_params) # type: ignore ), @@ -712,6 +718,9 @@ CI_CONFIG = CiConfig( "Stateless tests (release)": TestConfig( "package_release", job_config=JobConfig(**statless_test_common_params) # type: ignore ), + "Stateless tests (coverage)": TestConfig( + "package_release_coverage", job_config=JobConfig(**statless_test_common_params) # type: ignore + ), "Stateless tests (aarch64)": TestConfig( "package_aarch64", job_config=JobConfig(**statless_test_common_params) # type: ignore ), From b9f8fff623448e7013bbe604b39d0f72b81032f9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Jan 2024 01:36:48 +0100 Subject: [PATCH 0160/1081] Fix YAML --- .github/workflows/master.yml | 2 +- .github/workflows/pull_request.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 50d3eb4a062..1920f3a2a56 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -99,7 +99,7 @@ jobs: checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} BuilderDebReleaseCoverage: - needs: [ RunConfig, BuildDockers ] + needs: [RunConfig, BuildDockers] if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_build.yml with: diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 7f843f82c01..57199e6b9d9 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -147,7 +147,7 @@ jobs: checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} BuilderDebReleaseCoverage: - needs: [ RunConfig, FastTest ] + needs: [RunConfig, FastTest] if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_build.yml with: From cc5cc361ef561993bc7bbea6f1588562f7d3deae Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Jan 2024 12:35:49 +0100 Subject: [PATCH 0161/1081] Fix error --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index c310185b071..8efd3b8f302 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -289,7 +289,7 @@ def parse_env_variables( result.append("BUILD_TYPE=None") if coverage: - cmake_flags.append("-DSANITIZE_COVERAGE=1") + cmake_flags.append("-DSANITIZE_COVERAGE=1 
-DBUILD_STANDALONE_KEEPER=0") if not cache: cmake_flags.append("-DCOMPILER_CACHE=disabled") From 6c0445f36584a60724f7d616f47c7b953621997c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Jan 2024 12:47:18 +0100 Subject: [PATCH 0162/1081] Fix CMake --- cmake/sanitize.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 3882b51227e..23e9cc34fec 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -63,14 +63,14 @@ endif() option(WITH_COVERAGE "Instrumentation for code coverage with default implementation" OFF) if (WITH_COVERAGE) - message (INFORMATION "Enabled instrumentation for code coverage") + message (STATUS "Enabled instrumentation for code coverage") set(COVERAGE_FLAGS "-fprofile-instr-generate -fcoverage-mapping") endif() option (SANITIZE_COVERAGE "Instrumentation for code coverage with custom callbacks" OFF) if (SANITIZE_COVERAGE) - message (INFORMATION "Enabled instrumentation for code coverage") + message (STATUS "Enabled instrumentation for code coverage") # We set this define for whole build to indicate that at least some parts are compiled with coverage. # And to expose it in system.build_options. From 6d6b8fcf8e988d78fc983ed4043ed556e36b833b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Jan 2024 12:58:50 +0100 Subject: [PATCH 0163/1081] Add missing comments in code --- tests/ci/ci_config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 1ca4e06bc8c..45bdfbecb0c 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -55,6 +55,13 @@ class JobConfig: run_always: bool = False +# About the "sparse_checkout" option: +# +# Misha f. Shiryaev +# :facepalm: +# we have this feature, it's used by devs, we need to test it in CI +# It's not useful for the CI itself + @dataclass class BuildConfig: name: str From db3ffa5c86dba79ca7052abe8d53799ac3e4afb9 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 18 Jan 2024 12:11:03 +0000 Subject: [PATCH 0164/1081] Automatic style fix --- tests/ci/ci_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 45bdfbecb0c..ab37659e65b 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -62,6 +62,7 @@ class JobConfig: # we have this feature, it's used by devs, we need to test it in CI # It's not useful for the CI itself + @dataclass class BuildConfig: name: str From 0a6331f5f756d5d6465095edac89d2a03618d773 Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Thu, 18 Jan 2024 13:18:13 +0000 Subject: [PATCH 0165/1081] spell-check additions --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 73b7a081797..1f6b24597da 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 2657 +personal_ws-1.1 en 2697 AArch ACLs ALTERs @@ -2016,6 +2016,7 @@ pcre performant perl persistency +personal_ws-1.1 en 2657 phpclickhouse pipelining plaintext From 8b730811efd055f1b200f277202a34258a93722e Mon Sep 17 00:00:00 2001 From: Dale McDiarmid Date: Thu, 18 Jan 2024 13:24:40 +0000 Subject: [PATCH 0166/1081] Update aspell-dict.txt --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 73b7a081797..1f6b24597da 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 2657 +personal_ws-1.1 en 2697 AArch ACLs ALTERs @@ -2016,6 +2016,7 @@ pcre performant perl persistency +personal_ws-1.1 en 2657 phpclickhouse pipelining plaintext From 8ac04c6dd8a945e0f189aae572c54ee4458f75dd Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 18 Jan 2024 15:15:57 +0000 Subject: [PATCH 0167/1081] Address comments --- src/DataTypes/DataTypeVariant.cpp | 9 ++++ .../Serializations/SerializationVariant.cpp | 48 +++++++------------ .../Serializations/SerializationVariant.h | 23 +++++++++ .../SerializationVariantElement.cpp | 14 +++--- 4 files changed, 57 insertions(+), 37 deletions(-) diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index 2bc4dfa5a7a..e0510373960 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -105,6 +105,15 @@ ColumnPtr DataTypeVariant::createColumnConst(size_t size, const DB::Field & fiel } else { + /// We don't have exact mapping Field type -> Data type, so we cannot + /// always know in which variant we need to insert the field by it's type. + /// Examples: + /// Field(42) and Variant(UInt16, String). Type of the Field - UInt64, but we can insert it in UInt16 + /// Field(42) and Variant(Date, String). Type of the Field - UInt64, but we can insert it in Date + + /// Let's first apply FieldToDataType visitor to find best Data type for this field. + /// If we have variant with such type, we will insert this field into it. + /// Otherwise we will try to find the first variant that has default Field value with the same type. auto field_type = applyVisitor(FieldToDataType(), field); auto discr = tryGetVariantDiscriminator(field_type); if (!discr) diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 64fcb63d604..78ec0a5e2da 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -200,19 +200,12 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams( for (size_t i = 0; i != limit; ++i) writeBinaryLittleEndian(non_empty_global_discr, *discriminators_stream); - /// Second, serialize variants in global order. + /// Second, serialize non-empty variant (other variants are empty and we can skip their serialization). settings.path.push_back(Substream::VariantElements); - for (size_t i = 0; i != variants.size(); ++i) - { - addVariantElementToPath(settings.path, i); - /// For non empty variant use the same offset/limit as for whole Variant column - if (i == non_empty_global_discr) - variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), offset, limit, settings, variant_state->states[i]); - /// For empty variants, use just 0/0, they won't serialize anything. 
- else - variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->states[i]); - settings.path.pop_back(); - } + addVariantElementToPath(settings.path, non_empty_global_discr); + /// We can use the same offset/limit as for whole Variant column + variants[non_empty_global_discr]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(non_empty_global_discr), offset, limit, settings, variant_state->states[non_empty_global_discr]); + settings.path.pop_back(); settings.path.pop_back(); return; } @@ -237,26 +230,22 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams( } } - /// If limit for some variant is 0, it means that we don't have its discriminator in the range. - /// Set offset to the size of column for such variants, so we won't serialize values from them. - for (size_t i = 0; i != variant_offsets_and_limits.size(); ++i) - { - if (!variant_offsets_and_limits[i].second) - variant_offsets_and_limits[i].first = col.getVariantByGlobalDiscriminator(i).size(); - } - /// Serialize variants in global order. settings.path.push_back(Substream::VariantElements); for (size_t i = 0; i != variants.size(); ++i) { - addVariantElementToPath(settings.path, i); - variants[i]->serializeBinaryBulkWithMultipleStreams( - col.getVariantByGlobalDiscriminator(i), - variant_offsets_and_limits[i].first, - variant_offsets_and_limits[i].second, - settings, - variant_state->states[i]); - settings.path.pop_back(); + /// Serialize variant only if we have its discriminator in the range. + if (variant_offsets_and_limits[i].second) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkWithMultipleStreams( + col.getVariantByGlobalDiscriminator(i), + variant_offsets_and_limits[i].first, + variant_offsets_and_limits[i].second, + settings, + variant_state->states[i]); + settings.path.pop_back(); + } } settings.path.pop_back(); } @@ -564,9 +553,6 @@ std::vector SerializationVariant::getVariantsDeserializeTextOrder(const } std::sort(order.begin(), order.end(), [&](size_t left, size_t right) { return priorities[left] > priorities[right]; }); - String types_order; - for (auto i : order) - types_order += " " + variant_types[i]->getName(); return order; } diff --git a/src/DataTypes/Serializations/SerializationVariant.h b/src/DataTypes/Serializations/SerializationVariant.h index b6bee94c65f..3f53dcf1339 100644 --- a/src/DataTypes/Serializations/SerializationVariant.h +++ b/src/DataTypes/Serializations/SerializationVariant.h @@ -6,6 +6,29 @@ namespace DB { +/// Class for serializing/deserializing column with Variant type. +/// It supports both text and binary bulk serializations/deserializations. +/// +/// During text serialization it checks discriminator of the current row and +/// uses corresponding text serialization of this variant. +/// +/// During text deserialization it tries all variants deserializations +/// (using tryDeserializeText* methods of ISerialization) in predefined order +/// and inserts data in the first variant with succeeded deserialization. +/// +/// During binary bulk serialization it transforms local discriminators +/// to global and serializes them into a separate stream VariantDiscriminators. +/// Each variant is serialized into a separate stream with path VariantElements/VariantElement +/// (VariantElements stream is needed for correct sub-columns creation). 
We store and serialize +/// variants in a sparse form (the size of a variant column equals to the number of its discriminator +/// in the discriminators column), so during deserialization the limit for each variant is +/// calculated according to discriminators column. +/// Offsets column is not serialized and stored only in memory. +/// +/// During binary bulk deserialization we first deserialize discriminators from corresponding stream +/// and use them to calculate the limit for each variant. Each variant is deserialized from +/// corresponding stream using calculated limit. Offsets column is not deserialized and constructed +/// according to discriminators. class SerializationVariant : public ISerialization { public: diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index e06a20d2990..be91e0ba2ee 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -149,19 +149,21 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( assert_cast(*variant_element_state->variant->assumeMutable()).nestedRemoveNullable(); } + /// If nothing to deserialize, just insert defaults. + if (variant_limit == 0) + { + mutable_column->insertManyDefaults(limit); + return; + } + addVariantToPath(settings.path); nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, variant_limit, settings, variant_element_state->variant_element_state, cache); removeVariantFromPath(settings.path); size_t variant_offset = variant_element_state->variant->size() - variant_limit; - /// If don't have our discriminator in range, just insert defaults. - if (variant_limit == 0) - { - mutable_column->insertManyDefaults(limit); - } /// If we have only our discriminator in range, insert the whole range to result column. 
- else if (variant_limit == limit) + if (variant_limit == limit) { mutable_column->insertRangeFrom(*variant_element_state->variant, variant_offset, variant_limit); } From f202d713711857c083de5aaba1198198d2eaa3a4 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 18 Jan 2024 17:29:36 +0000 Subject: [PATCH 0168/1081] Make alter with variant espansion as no-op, add tests for alters --- src/DataTypes/DataTypeVariant.cpp | 22 ++ src/DataTypes/DataTypeVariant.h | 4 + .../Serializations/SerializationVariant.cpp | 2 +- .../SerializationVariantElement.cpp | 16 +- src/Storages/MergeTree/MutateTask.cpp | 20 +- .../02941_variant_type_alters.reference | 330 ++++++++++++++++++ .../0_stateless/02941_variant_type_alters.sh | 61 ++++ 7 files changed, 452 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02941_variant_type_alters.reference create mode 100755 tests/queries/0_stateless/02941_variant_type_alters.sh diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index e0510373960..3a39fdf9ea8 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -230,6 +230,28 @@ static DataTypePtr create(const ASTPtr & arguments) return std::make_shared(nested_types); } +bool isVariantExtension(const DataTypePtr & from_type, const DataTypePtr & to_type) +{ + const auto * from_variant = typeid_cast(from_type.get()); + const auto * to_variant = typeid_cast(to_type.get()); + if (!from_variant || !to_variant) + return false; + + const auto & to_variants = to_variant->getVariants(); + std::unordered_set to_variant_types; + to_variant_types.reserve(to_variants.size()); + for (const auto & variant : to_variants) + to_variant_types.insert(variant->getName()); + + for (const auto & variant : from_variant->getVariants()) + { + if (!to_variant_types.contains(variant->getName())) + return false; + } + + return true; +} + void registerDataTypeVariant(DataTypeFactory & factory) { diff --git a/src/DataTypes/DataTypeVariant.h b/src/DataTypes/DataTypeVariant.h index ca15dff1476..1a1cb6c12f2 100644 --- a/src/DataTypes/DataTypeVariant.h +++ b/src/DataTypes/DataTypeVariant.h @@ -61,5 +61,9 @@ private: SerializationPtr doGetDefaultSerialization() const override; }; +/// Check if conversion from from_type to to_type is Variant extension +/// (both types are Variants and to_type contains all variants from from_type). 
+bool isVariantExtension(const DataTypePtr & from_type, const DataTypePtr & to_type); + } diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 78ec0a5e2da..48a78dd54a9 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -275,7 +275,7 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( { auto * discriminators_stream = settings.getter(settings.path); if (!discriminators_stream) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariant::deserializeBinaryBulkWithMultipleStreams"); + return; SerializationNumber().deserializeBinaryBulk(*col.getLocalDiscriminatorsPtr()->assumeMutable(), *discriminators_stream, limit, 0); addToSubstreamsCache(cache, settings.path, col.getLocalDiscriminatorsPtr()); diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index be91e0ba2ee..80524cbd814 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -91,7 +91,7 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( { auto * discriminators_stream = settings.getter(settings.path); if (!discriminators_stream) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams"); + return; /// If we started to read a new column, reinitialize discriminators column in deserialization state. if (!variant_element_state->discriminators || result_column->empty()) @@ -156,10 +156,24 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( return; } + size_t prev_variant_size = variant_element_state->variant->size(); addVariantToPath(settings.path); nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, variant_limit, settings, variant_element_state->variant_element_state, cache); removeVariantFromPath(settings.path); + /// If nothing was deserialized when variant_limit > 0 + /// it means that we don't have a stream for such sub-column. + /// It may happen during ALTER MODIFY column with Variant extension. + /// In this case we should just insert default values. + if (variant_element_state->variant->empty()) + { + mutable_column->insertManyDefaults(limit); + return; + } + + if (variant_element_state->variant->size() != prev_variant_size + variant_limit) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected variant column size after deserialization. Expected {}, got {}", prev_variant_size + variant_limit, variant_element_state->variant->size()); + size_t variant_offset = variant_element_state->variant->size() - variant_limit; /// If we have only our discriminator in range, insert the whole range to result column. 
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index e4070aa8262..44734ec98c0 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -1921,7 +1922,7 @@ static bool canSkipConversionToNullable(const MergeTreeDataPartPtr & part, const if (!part_column) return false; - /// For ALTER MODIFY COLUMN from 'Type' to 'Nullable(Type)' we can skip mutatation and + /// For ALTER MODIFY COLUMN from 'Type' to 'Nullable(Type)' we can skip mutation and /// apply only metadata conversion. But it doesn't work for custom serialization. const auto * to_nullable = typeid_cast(command.data_type.get()); if (!to_nullable) @@ -1937,6 +1938,20 @@ static bool canSkipConversionToNullable(const MergeTreeDataPartPtr & part, const return true; } +static bool canSkipConversionToVariant(const MergeTreeDataPartPtr & part, const MutationCommand & command) +{ + if (command.type != MutationCommand::READ_COLUMN) + return false; + + auto part_column = part->tryGetColumn(command.column_name); + if (!part_column) + return false; + + /// For ALTER MODIFY COLUMN with Variant extension (like 'Variant(T1, T2)' to 'Variant(T1, T2, T3, ...)') + /// we can skip mutation and apply only metadata conversion. + return isVariantExtension(part_column->type, command.data_type); +} + static bool canSkipMutationCommandForPart(const MergeTreeDataPartPtr & part, const MutationCommand & command, const ContextPtr & context) { if (command.partition) @@ -1952,6 +1967,9 @@ static bool canSkipMutationCommandForPart(const MergeTreeDataPartPtr & part, con if (canSkipConversionToNullable(part, command)) return true; + if (canSkipConversionToVariant(part, command)) + return true; + return false; } diff --git a/tests/queries/0_stateless/02941_variant_type_alters.reference b/tests/queries/0_stateless/02941_variant_type_alters.reference new file mode 100644 index 00000000000..52c834e455b --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_alters.reference @@ -0,0 +1,330 @@ +Memory +initial insert +alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +insert after alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +3 3 3 \N 3 +4 4 4 \N 4 +5 5 5 \N 5 +6 6 str_6 str_6 \N +7 7 str_7 str_7 \N +8 8 str_8 str_8 \N +9 9 \N \N \N +10 10 \N \N \N +11 11 \N \N \N +12 12 12 \N 12 +13 13 str_13 str_13 \N +14 14 \N \N \N +alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +insert after alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +15 15 1970-01-16 \N \N 1970-01-16 +16 16 1970-01-17 \N \N 1970-01-17 +17 17 1970-01-18 \N \N 1970-01-18 +18 18 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N +20 20 20 \N 20 \N +21 21 str_21 str_21 \N \N +alter modify column 2 +0 0 \N \N \N \N \N \N +1 1 \N \N \N \N \N \N +2 2 \N \N \N \N \N \N +3 3 \N \N 3 \N 3 \N +4 4 \N \N 4 \N 4 \N +5 5 \N \N 5 \N 5 \N +6 6 \N \N str_6 str_6 \N \N +7 7 \N \N str_7 str_7 \N \N +8 8 \N \N str_8 str_8 \N \N +9 9 \N \N 
\N \N \N \N +10 10 \N \N \N \N \N \N +11 11 \N \N \N \N \N \N +12 12 \N \N 12 \N 12 \N +13 13 \N \N str_13 str_13 \N \N +14 14 \N \N \N \N \N \N +15 15 \N \N 1970-01-16 \N \N 1970-01-16 +16 16 \N \N 1970-01-17 \N \N 1970-01-17 +17 17 \N \N 1970-01-18 \N \N 1970-01-18 +18 18 \N \N 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N \N \N +20 20 \N \N 20 \N 20 \N +21 21 \N \N str_21 str_21 \N \N +insert after alter modify column 2 +0 0 \N \N \N \N \N \N +1 1 \N \N \N \N \N \N +2 2 \N \N \N \N \N \N +3 3 \N \N 3 \N 3 \N +4 4 \N \N 4 \N 4 \N +5 5 \N \N 5 \N 5 \N +6 6 \N \N str_6 str_6 \N \N +7 7 \N \N str_7 str_7 \N \N +8 8 \N \N str_8 str_8 \N \N +9 9 \N \N \N \N \N \N +10 10 \N \N \N \N \N \N +11 11 \N \N \N \N \N \N +12 12 \N \N 12 \N 12 \N +13 13 \N \N str_13 str_13 \N \N +14 14 \N \N \N \N \N \N +15 15 \N \N 1970-01-16 \N \N 1970-01-16 +16 16 \N \N 1970-01-17 \N \N 1970-01-17 +17 17 \N \N 1970-01-18 \N \N 1970-01-18 +18 18 \N \N 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N \N \N +20 20 \N \N 20 \N 20 \N +21 21 \N \N str_21 str_21 \N \N +22 str_22 \N str_22 \N \N \N \N +23 \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N +MergeTree compact +initial insert +alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +insert after alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +3 3 3 \N 3 +4 4 4 \N 4 +5 5 5 \N 5 +6 6 str_6 str_6 \N +7 7 str_7 str_7 \N +8 8 str_8 str_8 \N +9 9 \N \N \N +10 10 \N \N \N +11 11 \N \N \N +12 12 12 \N 12 +13 13 str_13 str_13 \N +14 14 \N \N \N +alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +insert after alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +15 15 1970-01-16 \N \N 1970-01-16 +16 16 1970-01-17 \N \N 1970-01-17 +17 17 1970-01-18 \N \N 1970-01-18 +18 18 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N +20 20 20 \N 20 \N +21 21 str_21 str_21 \N \N +alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N \N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +insert after alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N 
\N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +22 str_22 \N str_22 \N \N \N \N +23 \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N +MergeTree wide +initial insert +alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +insert after alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +3 3 3 \N 3 +4 4 4 \N 4 +5 5 5 \N 5 +6 6 str_6 str_6 \N +7 7 str_7 str_7 \N +8 8 str_8 str_8 \N +9 9 \N \N \N +10 10 \N \N \N +11 11 \N \N \N +12 12 12 \N 12 +13 13 str_13 str_13 \N +14 14 \N \N \N +alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +insert after alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +15 15 1970-01-16 \N \N 1970-01-16 +16 16 1970-01-17 \N \N 1970-01-17 +17 17 1970-01-18 \N \N 1970-01-18 +18 18 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N +20 20 20 \N 20 \N +21 21 str_21 str_21 \N \N +alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N \N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +insert after alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N \N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +22 str_22 \N str_22 \N \N \N \N +23 \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N diff --git a/tests/queries/0_stateless/02941_variant_type_alters.sh b/tests/queries/0_stateless/02941_variant_type_alters.sh new file mode 100755 index 00000000000..9b0d4febd65 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_alters.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_when_no_common_type_in_if=1 " + +function run() +{ + echo "initial insert" + $CH_CLIENT -q "insert into test select number, number from numbers(3)" + + echo "alter add column 1" + $CH_CLIENT -q "alter table test add column v Variant(UInt64, String) settings mutations_sync=1" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64 from test order by x" + + echo "insert after alter add column 1" + $CH_CLIENT -q "insert into test select number, number, number from numbers(3, 3)" + $CH_CLIENT -q "insert into test select number, number, 'str_' || toString(number) from numbers(6, 3)" + $CH_CLIENT -q "insert into test select number, number, NULL from numbers(9, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL) from numbers(12, 3)" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64 from test order by x" + + echo "alter modify column 1" + $CH_CLIENT -q "alter table test modify column v Variant(UInt64, String, Date) settings mutations_sync=1" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64, v.Date from test order by x" + + echo "insert after alter modify column 1" + $CH_CLIENT -q "insert into test select number, number, toDate(number) from numbers(15, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, toDate(number), NULL) from numbers(18, 4)" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64, v.Date from test order by x" + + echo "alter modify column 2" + $CH_CLIENT -q "alter table test modify column y Variant(UInt64, String) settings mutations_sync=1" + $CH_CLIENT -q "select x, y, y.UInt64, y.String, v, v.String, v.UInt64, v.Date from test order by x" + + echo "insert after alter modify column 2" + $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL), NULL from numbers(22, 3)" + $CH_CLIENT -q "select x, y, y.UInt64, y.String, v, v.String, v.UInt64, v.Date from test order by x" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=Memory" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=MergeTree order by x settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (x UInt64, y UInt64 ) engine=MergeTree order by x settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run +$CH_CLIENT -q "drop table test;" From 4109b6608186b1b9d9dce60f1821313294b7e7c4 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 18 Jan 2024 17:30:32 +0000 Subject: [PATCH 0169/1081] Remove unneded tag from test --- tests/queries/0_stateless/02941_variant_type_alters.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/02941_variant_type_alters.sh b/tests/queries/0_stateless/02941_variant_type_alters.sh index 9b0d4febd65..7e2ecbd67aa 100755 --- a/tests/queries/0_stateless/02941_variant_type_alters.sh +++ b/tests/queries/0_stateless/02941_variant_type_alters.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash -# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment From f1749217ee41b3b721fb8a185a929eb18db89b2f Mon Sep 17 
00:00:00 2001 From: Blargian Date: Thu, 18 Jan 2024 21:53:56 +0200 Subject: [PATCH 0170/1081] added format_schema_rows_template setting --- docs/en/operations/settings/settings-formats.md | 4 ++++ src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + 4 files changed, 7 insertions(+) diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index eb09af44efd..5dedaa2f6ab 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -1668,6 +1668,10 @@ Path to file which contains format string for rows (for Template format). Delimiter between rows (for Template format). +### format_schema_rows_template {#format_schema_rows_template} + +Format string for rows (for Template format) + ## CustomSeparated format settings {custom-separated-format-settings} ### format_custom_escaping_rule {#format_custom_escaping_rule} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 292e945a29c..4de739ec405 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1079,6 +1079,7 @@ class IColumn; M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ M(String, format_template_row, "", "Path to file which contains format string for rows (for Template format)", 0) \ M(String, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)", 0) \ + M(String, format_schema_rows_template, "\n", "Format string for rows (for Template format)", 0) \ \ M(EscapingRule, format_custom_escaping_rule, "Escaped", "Field escaping rule (for CustomSeparated format)", 0) \ M(String, format_custom_field_delimiter, "\t", "Delimiter between fields (for CustomSeparated format)", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 608f9433d6f..6f414c5a69f 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -166,6 +166,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.template_settings.resultset_format = settings.format_template_resultset; format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; format_settings.template_settings.row_format = settings.format_template_row; + format_settings.template_settings.row_format_schema_string = settings.format_schema_rows_template; format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line; format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; format_settings.tsv.enum_as_number = settings.input_format_tsv_enum_as_number; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 30e4dd04513..70d33a1edcd 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -338,6 +338,7 @@ struct FormatSettings String resultset_format; String row_format; String row_between_delimiter; + String row_format_schema_string; } template_settings; struct From 3dcc2056a59f9b374b4de3b72c30107dd7825d47 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 18 Jan 2024 20:39:36 +0000 Subject: [PATCH 0171/1081] Fix conflicts --- src/DataTypes/Serializations/ISerialization.cpp | 1 + src/DataTypes/Serializations/ISerialization.h | 1 + src/DataTypes/Serializations/SerializationVariant.cpp | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/ISerialization.cpp 
b/src/DataTypes/Serializations/ISerialization.cpp index 2f1eb1887af..7d57d72090b 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -54,6 +54,7 @@ const std::set ISerialization::Substream::named_types TupleElement, NamedOffsets, NamedNullMap, + NamedVariantDiscriminators, }; String ISerialization::Substream::toString() const diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 64a7a889640..7fba9db4acf 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -155,6 +155,7 @@ public: ObjectData, VariantDiscriminators, + NamedVariantDiscriminators, VariantOffsets, VariantElements, VariantElement, diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 48a78dd54a9..5af94364167 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -36,7 +36,7 @@ void SerializationVariant::enumerateStreams( const auto * type_variant = data.type ? &assert_cast(*data.type) : nullptr; const auto * column_variant = data.column ? &assert_cast(*data.column) : nullptr; - auto discriminators_serialization = std::make_shared(std::make_shared>(), "discr", false); + auto discriminators_serialization = std::make_shared(std::make_shared>(), "discr", SubstreamType::NamedVariantDiscriminators); auto local_discriminators = column_variant ? column_variant->getLocalDiscriminatorsPtr() : nullptr; settings.path.push_back(Substream::VariantDiscriminators); From cfc8c60aa70917e48281e3583adc922967326d50 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 18 Jan 2024 21:26:55 +0000 Subject: [PATCH 0172/1081] Fix build --- src/DataTypes/Serializations/SerializationVariantElement.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index 80524cbd814..8d0acee1c2b 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -208,8 +208,8 @@ void SerializationVariantElement::removeVariantFromPath(DB::ISerialization::Subs } SerializationVariantElement::VariantSubcolumnCreator::VariantSubcolumnCreator( - const DB::ColumnPtr & local_discriminators_, - const DB::String & variant_element_name_, + const ColumnPtr & local_discriminators_, + const String & variant_element_name_, const ColumnVariant::Discriminator global_variant_discriminator_, const ColumnVariant::Discriminator local_variant_discriminator_) : local_discriminators(local_discriminators_) From c966674c242552584540dc2e28026894c39f9b16 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jan 2024 03:59:05 +0100 Subject: [PATCH 0173/1081] Disable LTO with Coverage --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 063cfc77302..6e984ddd864 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -348,7 +348,7 @@ if (COMPILER_CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-absolute-paths") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdiagnostics-absolute-paths") - if (NOT ENABLE_TESTS AND NOT SANITIZE AND OS_LINUX) + if (NOT ENABLE_TESTS AND NOT SANITIZE AND NOT SANITIZE_COVERAGE AND OS_LINUX) # https://clang.llvm.org/docs/ThinLTO.html # Applies to clang and 
linux only. # Disabled when building with tests or sanitizers. From c6afbe522cae20ee6041534bf7ee7e31e3acb51c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jan 2024 05:26:14 +0100 Subject: [PATCH 0174/1081] Do not check for large translation units with coverage --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e984ddd864..d0f44f6f3ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -546,7 +546,7 @@ if (ENABLE_RUST) endif() endif() -if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" AND NOT SANITIZE AND OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64)) +if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" AND NOT SANITIZE AND NOT SANITIZE_COVERAGE AND OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64)) set(CHECK_LARGE_OBJECT_SIZES_DEFAULT ON) else () set(CHECK_LARGE_OBJECT_SIZES_DEFAULT OFF) From 482229cd27c7ddf4218af2ea5d9b087e51876ab0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jan 2024 08:04:34 +0100 Subject: [PATCH 0175/1081] Add tests with coverage --- .github/workflows/master.yml | 16 ++++++++++++++++ .github/workflows/pull_request.yml | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 1920f3a2a56..5f683fa6c59 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -371,6 +371,14 @@ jobs: test_name: Stateless tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatelessTestCoverage: + needs: [RunConfig, BuilderDebReleaseCoverage] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateless tests (coverage) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} FunctionalStatelessTestReleaseDatabaseOrdinary: needs: [RunConfig, BuilderDebRelease] if: ${{ !failure() && !cancelled() }} @@ -463,6 +471,14 @@ jobs: test_name: Stateful tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatefulTestCoverage: + needs: [RunConfig, BuilderDebReleaseCoverage] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateful tests (coverage) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} FunctionalStatefulTestAarch64: needs: [RunConfig, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 28617695ad5..235c8042657 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -382,6 +382,14 @@ jobs: test_name: Stateless tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatelessTestCoverage: + needs: [RunConfig, BuilderDebReleaseCoverage] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateless tests (coverage) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} FunctionalStatelessTestReleaseDatabaseReplicated: needs: [RunConfig, BuilderDebRelease] if: ${{ !failure() && !cancelled() }} @@ -509,6 +517,14 @@ jobs: test_name: Stateful tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatefulTestCoverage: + needs: [RunConfig, BuilderDebReleaseCoverage] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateful tests (coverage) + 
runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} FunctionalStatefulTestAarch64: needs: [RunConfig, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} From 7ae631de1ed1ff4bcb8bac5e06c2026db3ff972c Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 19 Jan 2024 10:23:40 +0000 Subject: [PATCH 0176/1081] Remove wron check, remove duplicate tests --- .../SerializationVariantElement.cpp | 4 -- ...> 02943_variant_read_subcolumns.reference} | 0 ..._1.sh => 02943_variant_read_subcolumns.sh} | 0 .../02943_variant_read_subcolumns_2.reference | 6 --- .../02943_variant_read_subcolumns_2.sh | 38 ------------------- 5 files changed, 48 deletions(-) rename tests/queries/0_stateless/{02943_variant_read_subcolumns_1.reference => 02943_variant_read_subcolumns.reference} (100%) rename tests/queries/0_stateless/{02943_variant_read_subcolumns_1.sh => 02943_variant_read_subcolumns.sh} (100%) delete mode 100644 tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference delete mode 100755 tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index 8d0acee1c2b..56f0e5d77be 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -156,7 +156,6 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( return; } - size_t prev_variant_size = variant_element_state->variant->size(); addVariantToPath(settings.path); nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, variant_limit, settings, variant_element_state->variant_element_state, cache); removeVariantFromPath(settings.path); @@ -171,9 +170,6 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( return; } - if (variant_element_state->variant->size() != prev_variant_size + variant_limit) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected variant column size after deserialization. Expected {}, got {}", prev_variant_size + variant_limit, variant_element_state->variant->size()); - size_t variant_offset = variant_element_state->variant->size() - variant_limit; /// If we have only our discriminator in range, insert the whole range to result column. 
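For context on the Variant subcolumn reads this serialization path serves, a minimal illustrative sketch (not part of the patch), assuming a table named test with a column v Variant(UInt64, String) as in the stateless tests elsewhere in this series:

    SET allow_experimental_variant_type = 1;
    CREATE TABLE test (id UInt64, v Variant(UInt64, String)) ENGINE = MergeTree ORDER BY id;
    INSERT INTO test VALUES (1, 42), (2, 'str_2'), (3, NULL);
    -- Each requested subcolumn is deserialized through SerializationVariantElement;
    -- rows that hold a different variant come back as NULL for that subcolumn.
    SELECT v, v.UInt64, v.String FROM test ORDER BY id;

The renames and deletions below consolidate the duplicated read-subcolumns tests into a single 02943_variant_read_subcolumns test.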
diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference b/tests/queries/0_stateless/02943_variant_read_subcolumns.reference similarity index 100% rename from tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference rename to tests/queries/0_stateless/02943_variant_read_subcolumns.reference diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh b/tests/queries/0_stateless/02943_variant_read_subcolumns.sh similarity index 100% rename from tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh rename to tests/queries/0_stateless/02943_variant_read_subcolumns.sh diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference deleted file mode 100644 index 4b93782cddf..00000000000 --- a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference +++ /dev/null @@ -1,6 +0,0 @@ -Memory -test -MergeTree compact -test -MergeTree wide -test diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh deleted file mode 100755 index 9ccad55191f..00000000000 --- a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash -# Tags: long - -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# reset --log_comment -CLICKHOUSE_LOG_COMMENT= -# shellcheck source=../shell_config.sh -. "$CUR_DIR"/../shell_config.sh - -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_when_no_common_type_in_if=1 " - - -function test() -{ - echo "test" - $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 2, NULL, number % 3 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10))) from numbers(1000000) settings min_insert_block_size_rows=100000" - $CH_CLIENT -q "select v, v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" - $CH_CLIENT -q "select v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" - $CH_CLIENT -q "select v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64, v.\`Array(Variant(String, UInt64))\`.String from test order by id format Null" -} - -$CH_CLIENT -q "drop table if exists test;" - -echo "Memory" -$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=Memory" -test -$CH_CLIENT -q "drop table test;" - -echo "MergeTree compact" -$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" -test -$CH_CLIENT -q "drop table test;" - -echo "MergeTree wide" -$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" -test -$CH_CLIENT -q "drop table test;" - From 0c85339ddb26e00ac64d6c763a0f5019b7ee2619 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 19 Jan 2024 10:50:13 +0000 Subject: [PATCH 0177/1081] Fix style --- src/DataTypes/Serializations/SerializationVariantElement.cpp | 1 - 1 file 
changed, 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index 56f0e5d77be..053f8d22d5a 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -9,7 +9,6 @@ namespace DB namespace ErrorCodes { extern const int NOT_IMPLEMENTED; - extern const int LOGICAL_ERROR; } void SerializationVariantElement::enumerateStreams( From a196d04a1c2d5f36ec43c2b0947916be7321037c Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 19 Jan 2024 12:04:16 +0100 Subject: [PATCH 0178/1081] Update test --- .../test_broken_projections/test.py | 94 +++++++++++++++++-- 1 file changed, 84 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index 1b192e0df24..48ed10d0f87 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -159,6 +159,19 @@ def get_broken_projections_info(node, table): ).strip() +def get_projections_info(node, table): + return node.query( + f""" + SELECT parent_name, name, is_broken + FROM system.projection_parts + WHERE table='{table}' + AND active = 1 + AND database=currentDatabase() + ORDER BY parent_name, name + """ + ).strip() + + def optimize(node, table, final, no_wait): query = f"OPTIMIZE TABLE {table}" if final: @@ -389,6 +402,11 @@ def test_broken_ignored_replicated(cluster): assert "has a broken projection" not in check_table_full(node, table_name) +def get_random_string(string_length=8): + alphabet = string.ascii_letters + string.digits + return "".join((random.choice(alphabet) for _ in range(string_length))) + + def test_broken_projections_in_backups(cluster): node = cluster.instances["node"] @@ -400,6 +418,10 @@ def test_broken_projections_in_backups(cluster): insert(node, table_name, 10, 5) insert(node, table_name, 15, 5) + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + check(node, table_name, 1) break_projection(node, table_name, "proj", "all_2_2_0", "data") @@ -409,10 +431,23 @@ def test_broken_projections_in_backups(cluster): node, table_name ) + assert ( + "all_0_0_0\tproj\t0\n" + "all_0_0_0\tproj_2\t0\n" + "all_1_1_0\tproj\t0\n" + "all_1_1_0\tproj_2\t0\n" + "all_2_2_0\tproj\t1\n" + "all_2_2_0\tproj_2\t0\n" + "all_3_3_0\tproj\t0\n" + "all_3_3_0\tproj_2\t0" + == get_projections_info(node, table_name) + ) + + backup_name = f"b1-{get_random_string()}" assert "BACKUP_CREATED" in node.query( f""" set backup_restore_keeper_fault_injection_probability=0.0; - backup table {table_name} to Disk('backups', 'b1') settings check_projection_parts=false; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false; """ ) @@ -420,18 +455,30 @@ def test_broken_projections_in_backups(cluster): f""" drop table {table_name} sync; set backup_restore_keeper_fault_injection_probability=0.0; - restore table {table_name} from Disk('backups', 'b1'); + restore table {table_name} from Disk('backups', '{backup_name}'); """ ) + assert ( + "all_0_0_0\tproj\t0\n" + "all_0_0_0\tproj_2\t0\n" + "all_1_1_0\tproj\t0\n" + "all_1_1_0\tproj_2\t0\n" + "all_2_2_0\tproj\t0\n" + "all_2_2_0\tproj_2\t0\n" + "all_3_3_0\tproj\t0\n" + "all_3_3_0\tproj_2\t0" + == get_projections_info(node, table_name) + ) + check(node, table_name, 1) assert "" == get_broken_projections_info(node, table_name) - 
break_projection(node, table_name, "proj", "all_2_2_0", "part") + break_projection(node, table_name, "proj_2", "all_2_2_0", "part") - check(node, table_name, 0, "proj", "ErrnoException") + check(node, table_name, 0, "proj_2", "ErrnoException") - assert "all_2_2_0\tproj\tFILE_DOESNT_EXIST" == get_broken_projections_info( + assert "all_2_2_0\tproj_2\tFILE_DOESNT_EXIST" == get_broken_projections_info( node, table_name ) @@ -442,13 +489,14 @@ def test_broken_projections_in_backups(cluster): """ ) - materialize_projection(node, table_name, "proj") + materialize_projection(node, table_name, "proj_2") check(node, table_name, 1) + backup_name = f"b3-{get_random_string()}" assert "BACKUP_CREATED" in node.query( f""" set backup_restore_keeper_fault_injection_probability=0.0; - backup table {table_name} to Disk('backups', 'b3') settings check_projection_parts=false; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false; """ ) @@ -456,11 +504,23 @@ def test_broken_projections_in_backups(cluster): f""" drop table {table_name} sync; set backup_restore_keeper_fault_injection_probability=0.0; - restore table {table_name} from Disk('backups', 'b3'); + restore table {table_name} from Disk('backups', '{backup_name}'); """ ) check(node, table_name, 1) + assert ( + "all_0_0_0\tproj\t0\n" + "all_0_0_0\tproj_2\t0\n" + "all_1_1_0\tproj\t0\n" + "all_1_1_0\tproj_2\t0\n" + "all_2_2_0\tproj\t0\n" + "all_2_2_0\tproj_2\t0\n" + "all_3_3_0\tproj\t0\n" + "all_3_3_0\tproj_2\t0" + == get_projections_info(node, table_name) + ) + break_projection(node, table_name, "proj", "all_1_1_0", "part") assert "Part all_1_1_0 has a broken projection proj" in check_table_full( node, table_name @@ -469,10 +529,11 @@ def test_broken_projections_in_backups(cluster): node, table_name ) + backup_name = f"b4-{get_random_string()}" assert "BACKUP_CREATED" in node.query( f""" set backup_restore_keeper_fault_injection_probability=0.0; - backup table {table_name} to Disk('backups', 'b4') settings check_projection_parts=false, allow_backup_broken_projections=true; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false, allow_backup_broken_projections=true; """ ) @@ -480,9 +541,22 @@ def test_broken_projections_in_backups(cluster): f""" drop table {table_name} sync; set backup_restore_keeper_fault_injection_probability=0.0; - restore table {table_name} from Disk('backups', 'b4'); + restore table {table_name} from Disk('backups', '{backup_name}'); """ ) + + assert ( + "all_0_0_0\tproj\t0\n" + "all_0_0_0\tproj_2\t0\n" + "all_1_1_0\tproj\t1\n" + "all_1_1_0\tproj_2\t0\n" + "all_2_2_0\tproj\t0\n" + "all_2_2_0\tproj_2\t0\n" + "all_3_3_0\tproj\t0\n" + "all_3_3_0\tproj_2\t0" + == get_projections_info(node, table_name) + ) + check(node, table_name, 0) assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info( node, table_name From 580501c2b42231eacc4e843968aeb876ff784297 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 19 Jan 2024 11:08:56 +0000 Subject: [PATCH 0179/1081] Add new settings to settings changes history --- src/Core/SettingsChangesHistory.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 62ffd837a33..af213983b66 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -83,7 +83,9 @@ static std::map sett { {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, 
{"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, - {"output_format_arrow_use_signed_indexes_for_dictionary", false, true, "Use signed indexes type for Arrow dictionaries by default as it's recommended"}}}, + {"output_format_arrow_use_signed_indexes_for_dictionary", false, true, "Use signed indexes type for Arrow dictionaries by default as it's recommended"}, + {"allow_experimental_variant_type", false, false, "Add new experimental Variant type"}, + {"use_variant_when_no_common_type_in_if", false, false, "Allow to use Variant in if/multiIf if there is no common type"}}}, {"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."}, {"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"}, {"input_format_orc_allow_missing_columns", false, true, "Allow missing columns in ORC files by default"}, From 7c66141e08ec203dbff908d69d929ea3bfc0995f Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 19 Jan 2024 11:11:13 +0000 Subject: [PATCH 0180/1081] Automatic style fix --- tests/integration/test_broken_projections/test.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index 48ed10d0f87..8e3978a078e 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -439,8 +439,7 @@ def test_broken_projections_in_backups(cluster): "all_2_2_0\tproj\t1\n" "all_2_2_0\tproj_2\t0\n" "all_3_3_0\tproj\t0\n" - "all_3_3_0\tproj_2\t0" - == get_projections_info(node, table_name) + "all_3_3_0\tproj_2\t0" == get_projections_info(node, table_name) ) backup_name = f"b1-{get_random_string()}" @@ -467,8 +466,7 @@ def test_broken_projections_in_backups(cluster): "all_2_2_0\tproj\t0\n" "all_2_2_0\tproj_2\t0\n" "all_3_3_0\tproj\t0\n" - "all_3_3_0\tproj_2\t0" - == get_projections_info(node, table_name) + "all_3_3_0\tproj_2\t0" == get_projections_info(node, table_name) ) check(node, table_name, 1) @@ -517,8 +515,7 @@ def test_broken_projections_in_backups(cluster): "all_2_2_0\tproj\t0\n" "all_2_2_0\tproj_2\t0\n" "all_3_3_0\tproj\t0\n" - "all_3_3_0\tproj_2\t0" - == get_projections_info(node, table_name) + "all_3_3_0\tproj_2\t0" == get_projections_info(node, table_name) ) break_projection(node, table_name, "proj", "all_1_1_0", "part") @@ -553,8 +550,7 @@ def test_broken_projections_in_backups(cluster): "all_2_2_0\tproj\t0\n" "all_2_2_0\tproj_2\t0\n" "all_3_3_0\tproj\t0\n" - "all_3_3_0\tproj_2\t0" - == get_projections_info(node, table_name) + "all_3_3_0\tproj_2\t0" == get_projections_info(node, table_name) ) check(node, table_name, 0) From c51d1f04f6d135c63f5123d4aaef47cef5474525 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 19 Jan 2024 15:57:20 +0100 Subject: [PATCH 0181/1081] Add settings max_unexpected_write_error_retries for Azure Blob Storage --- src/Core/Settings.h | 1 + src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp | 10 +++++----- src/Disks/IO/WriteBufferFromAzureBlobStorage.h | 2 ++ .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 3 ++- .../AzureBlobStorage/AzureObjectStorage.cpp | 1 + .../AzureBlobStorage/AzureObjectStorage.h | 5 ++++- 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 292e945a29c..59f32c60f63 100644 --- 
a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -86,6 +86,7 @@ class IColumn; M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ M(UInt64, azure_max_single_read_retries, 4, "The maximum number of retries during single Azure blob storage read.", 0) \ + M(UInt64, azure_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write", 0) \ M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp index 60bc04f5f95..b4665eb7346 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp @@ -18,17 +18,17 @@ namespace ProfileEvents namespace DB { -static constexpr auto DEFAULT_RETRY_NUM = 3; - WriteBufferFromAzureBlobStorage::WriteBufferFromAzureBlobStorage( std::shared_ptr blob_container_client_, const String & blob_path_, size_t max_single_part_upload_size_, + size_t max_unexpected_write_error_retries_, size_t buf_size_, const WriteSettings & write_settings_) : WriteBufferFromFileBase(buf_size_, nullptr, 0) , log(&Poco::Logger::get("WriteBufferFromAzureBlobStorage")) , max_single_part_upload_size(max_single_part_upload_size_) + , max_unexpected_write_error_retries(max_unexpected_write_error_retries_) , blob_path(blob_path_) , write_settings(write_settings_) , blob_container_client(blob_container_client_) @@ -77,13 +77,13 @@ void WriteBufferFromAzureBlobStorage::execWithRetry(std::function func, void WriteBufferFromAzureBlobStorage::finalizeImpl() { - execWithRetry([this](){ next(); }, DEFAULT_RETRY_NUM); + execWithRetry([this](){ next(); }, max_unexpected_write_error_retries); if (tmp_buffer_write_offset > 0) uploadBlock(tmp_buffer->data(), tmp_buffer_write_offset); auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path); - execWithRetry([&](){ block_blob_client.CommitBlockList(block_ids); }, DEFAULT_RETRY_NUM); + execWithRetry([&](){ block_blob_client.CommitBlockList(block_ids); }, max_unexpected_write_error_retries); LOG_TRACE(log, "Committed {} blocks for blob `{}`", block_ids.size(), blob_path); } @@ -94,7 +94,7 @@ void WriteBufferFromAzureBlobStorage::uploadBlock(const char * data, size_t size const std::string & block_id = block_ids.emplace_back(getRandomASCIIString(64)); Azure::Core::IO::MemoryBodyStream memory_stream(reinterpret_cast(data), size); - execWithRetry([&](){ block_blob_client.StageBlock(block_id, memory_stream); }, DEFAULT_RETRY_NUM, size); + execWithRetry([&](){ block_blob_client.StageBlock(block_id, memory_stream); }, max_unexpected_write_error_retries, size); tmp_buffer_write_offset = 0; LOG_TRACE(log, "Staged block (id: {}) of size {} (blob path: {}).", block_id, size, blob_path); diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h index f1be81922e1..7494130134b 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h @@ -30,6 +30,7 @@ public: AzureClientPtr 
blob_container_client_, const String & blob_path_, size_t max_single_part_upload_size_, + size_t max_unexpected_write_error_retries_, size_t buf_size_, const WriteSettings & write_settings_); @@ -48,6 +49,7 @@ private: Poco::Logger * log; const size_t max_single_part_upload_size; + const size_t max_unexpected_write_error_retries; const std::string blob_path; const WriteSettings write_settings; diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index a5c8afe0264..a209049ceee 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -164,7 +164,8 @@ std::unique_ptr getAzureBlobStorageSettings(const Po config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), config.getInt(config_prefix + ".max_single_read_retries", 3), config.getInt(config_prefix + ".max_single_download_retries", 3), - config.getInt(config_prefix + ".list_object_keys_size", 1000) + config.getInt(config_prefix + ".list_object_keys_size", 1000), + config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", 4) ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 068e2aebab1..683bfeb74a7 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -268,6 +268,7 @@ std::unique_ptr AzureObjectStorage::writeObject( /// NO client.get(), object.remote_path, settings.get()->max_single_part_upload_size, + settings.get()->max_unexpected_write_error_retries, buf_size, patchSettings(write_settings)); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 4718dce9bf9..2d505c6a022 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -23,12 +23,14 @@ struct AzureObjectStorageSettings uint64_t min_bytes_for_seek_, int max_single_read_retries_, int max_single_download_retries_, - int list_object_keys_size_) + int list_object_keys_size_, + size_t max_unexpected_write_error_retries_) : max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , max_single_read_retries(max_single_read_retries_) , max_single_download_retries(max_single_download_retries_) , list_object_keys_size(list_object_keys_size_) + , max_unexpected_write_error_retries (max_unexpected_write_error_retries_) { } @@ -39,6 +41,7 @@ struct AzureObjectStorageSettings size_t max_single_read_retries = 3; size_t max_single_download_retries = 3; int list_object_keys_size = 1000; + size_t max_unexpected_write_error_retries = 4; }; using AzureClient = Azure::Storage::Blobs::BlobContainerClient; From d22fc3a224ac29857f3dc3eb60ff872221829006 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 19 Jan 2024 16:13:58 +0100 Subject: [PATCH 0182/1081] Updated to fetch default from settings --- .../ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index a209049ceee..e0199fde98b 100644 --- 
a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -7,6 +7,7 @@ #include #include #include +#include using namespace Azure::Storage::Blobs; @@ -157,7 +158,7 @@ std::unique_ptr getAzureBlobContainerClient( } } -std::unique_ptr getAzureBlobStorageSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr /*context*/) +std::unique_ptr getAzureBlobStorageSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) { return std::make_unique( config.getUInt64(config_prefix + ".max_single_part_upload_size", 100 * 1024 * 1024), @@ -165,7 +166,7 @@ std::unique_ptr getAzureBlobStorageSettings(const Po config.getInt(config_prefix + ".max_single_read_retries", 3), config.getInt(config_prefix + ".max_single_download_retries", 3), config.getInt(config_prefix + ".list_object_keys_size", 1000), - config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", 4) + config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", context->getSettings().azure_max_unexpected_write_error_retries) ); } From 05609cf75d5048fbd62508fcf6454cec1855943d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jan 2024 17:02:17 +0100 Subject: [PATCH 0183/1081] Ci to CI --- tests/ci/ci_config.py | 6 +++--- tests/ci/test_ci_config.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 1d94f4fc1cc..611767be2e4 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -270,9 +270,9 @@ sql_test_params = { @dataclass -class CiConfig: +class CIConfig: """ - Contains configs for ALL jobs in CI pipeline + Contains configs for all jobs in the CI pipeline each config item in the below dicts should be an instance of JobConfig class or inherited from it """ @@ -435,7 +435,7 @@ class CiConfig: raise KeyError("config contains errors", errors) -CI_CONFIG = CiConfig( +CI_CONFIG = CIConfig( label_configs={ Labels.DO_NOT_TEST_LABEL.value: LabelConfig(run_jobs=["Style check"]), }, diff --git a/tests/ci/test_ci_config.py b/tests/ci/test_ci_config.py index d22ed16748e..49d49d9c328 100644 --- a/tests/ci/test_ci_config.py +++ b/tests/ci/test_ci_config.py @@ -3,7 +3,7 @@ import unittest -class TestCiConfig(unittest.TestCase): +class TestCIConfig(unittest.TestCase): def test_no_errors_in_ci_config(self): raised = None try: From 639d7745d450073234405d0725cbd64884d4f8c5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jan 2024 17:02:23 +0100 Subject: [PATCH 0184/1081] Fix error --- docker/test/base/setup_export_logs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 7033d4b52e2..d3721108426 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -181,7 +181,7 @@ function setup_logs_replication # Create the destination table with adapted name and structure: statement=$(clickhouse-client --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e ' s/^\($/('"$EXTRA_COLUMNS_FOR_TABLE"'/; - s/ORDER BY \(?(.+?)\)?/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)/; + s/^ORDER BY \(?(.+?)\)?$/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)/; s/^CREATE TABLE system\.\w+_log$/CREATE TABLE IF NOT EXISTS '"$table"'_'"$hash"'/; /^TTL /d ') From 8c54a09e6652b491764abeddf3a0e8e6800374ef Mon Sep 17 00:00:00 2001 From: Alexey 
Milovidov Date: Sat, 20 Jan 2024 08:59:29 +0100 Subject: [PATCH 0185/1081] Fix error --- docker/test/base/setup_export_logs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index d3721108426..156adb1d1e4 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -17,7 +17,7 @@ CLICKHOUSE_CI_LOGS_CLUSTER=${CLICKHOUSE_CI_LOGS_CLUSTER:-system_logs_export} EXTRA_COLUMNS=${EXTRA_COLUMNS:-"pull_request_number UInt32, commit_sha String, check_start_time DateTime('UTC'), check_name LowCardinality(String), instance_type LowCardinality(String), instance_id String, INDEX ix_pr (pull_request_number) TYPE set(100), INDEX ix_commit (commit_sha) TYPE set(100), INDEX ix_check_time (check_start_time) TYPE minmax, "} EXTRA_COLUMNS_EXPRESSION=${EXTRA_COLUMNS_EXPRESSION:-"CAST(0 AS UInt32) AS pull_request_number, '' AS commit_sha, now() AS check_start_time, toLowCardinality('') AS check_name, toLowCardinality('') AS instance_type, '' AS instance_id"} -EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name, "} +EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name"} # trace_log needs more columns for symbolization EXTRA_COLUMNS_TRACE_LOG="${EXTRA_COLUMNS} symbols Array(LowCardinality(String)), lines Array(LowCardinality(String)), " @@ -181,7 +181,7 @@ function setup_logs_replication # Create the destination table with adapted name and structure: statement=$(clickhouse-client --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e ' s/^\($/('"$EXTRA_COLUMNS_FOR_TABLE"'/; - s/^ORDER BY \(?(.+?)\)?$/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)/; + s/^ORDER BY (([^\(].+?)|\((.+?)\))$/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \2\3)/; s/^CREATE TABLE system\.\w+_log$/CREATE TABLE IF NOT EXISTS '"$table"'_'"$hash"'/; /^TTL /d ') From caf9d8df6d789203a4e408341c9494952eb14ad2 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Sat, 20 Jan 2024 14:59:13 +0100 Subject: [PATCH 0186/1081] Update test.py --- tests/integration/test_broken_projections/test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index 8e3978a078e..d750bb5827d 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -413,6 +413,8 @@ def test_broken_projections_in_backups(cluster): table_name = "test4" create_table(node, table_name, 1) + node.qeury("SYSTEM STOP MERGES") + insert(node, table_name, 0, 5) insert(node, table_name, 5, 5) insert(node, table_name, 10, 5) @@ -557,3 +559,4 @@ def test_broken_projections_in_backups(cluster): assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info( node, table_name ) + node.qeury("SYSTEM START MERGES") From e5c3b67f379efdd6d403be08f8bce164348663a1 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Sat, 20 Jan 2024 16:10:09 +0100 Subject: [PATCH 0187/1081] Update test.py --- tests/integration/test_broken_projections/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index d750bb5827d..acf0160cf1b 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -413,7 +413,7 @@ def 
test_broken_projections_in_backups(cluster): table_name = "test4" create_table(node, table_name, 1) - node.qeury("SYSTEM STOP MERGES") + node.query("SYSTEM STOP MERGES") insert(node, table_name, 0, 5) insert(node, table_name, 5, 5) @@ -559,4 +559,4 @@ def test_broken_projections_in_backups(cluster): assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info( node, table_name ) - node.qeury("SYSTEM START MERGES") + node.query("SYSTEM START MERGES") From cfe60586c007a230df68771b3f914d9a66414b7d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 20 Jan 2024 21:45:11 +0100 Subject: [PATCH 0188/1081] Reset coverage after each test --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index b62bd5975ea..49c517852a6 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -2894,7 +2894,7 @@ def parse_args(): parser.add_argument( "--reset-coverage-before-every-test", action="store_true", - default=False, + default=True, help="Collect isolated test coverage for every test instead of a cumulative. Useful only when tests are run sequentially.", ) parser.add_argument( From 51cc01f8be8fea1fcaea0af9c85ca2930536e593 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 21 Jan 2024 14:36:03 +0100 Subject: [PATCH 0189/1081] Minor change --- base/base/coverage.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index 499e384d21f..05bef21049b 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -62,6 +62,7 @@ namespace uintptr_t * allocate(size_t size) { + /// Note: mmap return zero-initialized memory, and we count on that. void * map = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (MAP_FAILED == map) return nullptr; @@ -91,8 +92,6 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t * start, uint32_t * stop) /// Note: we will leak this. current_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); cumulative_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); - - resetCoverage(); } /// This is called at least once for every DSO for initialization From b967cc6af9deac20eff318e3433fc5b09fd6314a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 21 Jan 2024 15:30:50 +0100 Subject: [PATCH 0190/1081] Fix error --- base/base/coverage.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index 05bef21049b..b85f1a16d32 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -92,6 +92,8 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t * start, uint32_t * stop) /// Note: we will leak this. current_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); cumulative_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); + + resetCoverage(); } /// This is called at least once for every DSO for initialization @@ -102,8 +104,8 @@ void __sanitizer_cov_pcs_init(const uintptr_t * pcs_begin, const uintptr_t * pcs return; pc_table_initialized = true; - all_addresses_array = allocate(sizeof(uintptr_t) * coverage_array_size); all_addresses_array_size = pcs_end - pcs_begin; + all_addresses_array = allocate(sizeof(uintptr_t) * all_addresses_array_size); /// They are not a real pointers, but also contain a flag in the most significant bit, /// in which we are not interested for now. Reset it. 
@@ -125,10 +127,10 @@ void __sanitizer_cov_trace_pc_guard(uint32_t * guard) /// The values of `*guard` are as you set them in /// __sanitizer_cov_trace_pc_guard_init and so you can make them consecutive /// and use them to dereference an array or a bit vector. - void * pc = __builtin_return_address(0); + intptr_t pc = reinterpret_cast(__builtin_return_address(0)); - current_coverage_array[guard - guards_start] = reinterpret_cast(pc); - cumulative_coverage_array[guard - guards_start] = reinterpret_cast(pc); + current_coverage_array[guard - guards_start] = pc; + cumulative_coverage_array[guard - guards_start] = pc; } } From eae39ff545978386a8a57bca7c68b1ff97cf6d6d Mon Sep 17 00:00:00 2001 From: Blargian Date: Sun, 21 Jan 2024 21:51:06 +0200 Subject: [PATCH 0191/1081] #31363 - modified TemplateBlockOutputFormat to work with added format_schema_rows_template setting --- src/Core/Settings.h | 2 +- src/Formats/FormatFactory.cpp | 3 +- src/Formats/FormatSettings.h | 2 +- .../Impl/TemplateBlockOutputFormat.cpp | 33 +++++++++++++++---- 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 4de739ec405..3143ada7d65 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1078,8 +1078,8 @@ class IColumn; M(String, format_schema, "", "Schema identifier (used by schema-based formats)", 0) \ M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ M(String, format_template_row, "", "Path to file which contains format string for rows (for Template format)", 0) \ - M(String, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)", 0) \ M(String, format_schema_rows_template, "\n", "Format string for rows (for Template format)", 0) \ + M(String, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)", 0) \ \ M(EscapingRule, format_custom_escaping_rule, "Escaped", "Field escaping rule (for CustomSeparated format)", 0) \ M(String, format_custom_field_delimiter, "\t", "Delimiter between fields (for CustomSeparated format)", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 6f414c5a69f..6f7f758621c 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -166,7 +166,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.template_settings.resultset_format = settings.format_template_resultset; format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; format_settings.template_settings.row_format = settings.format_template_row; - format_settings.template_settings.row_format_schema_string = settings.format_schema_rows_template; + format_settings.template_settings.row_format_schema = settings.format_schema_rows_template; + format_settings.template_settings.row_between_delimiter_schema = settings.format_schema_rows_between_delimiter; format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line; format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; format_settings.tsv.enum_as_number = settings.input_format_tsv_enum_as_number; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 70d33a1edcd..28a2076af84 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -338,7 +338,7 @@ struct FormatSettings String resultset_format; String row_format; String 
row_between_delimiter; - String row_format_schema_string; + String row_format_schema; } template_settings; struct diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 6d8fe1e5a2c..495cc0e541e 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -11,6 +11,7 @@ namespace DB namespace ErrorCodes { extern const int SYNTAX_ERROR; + extern const int INVALID_TEMPLATE_FORMAT; } TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_, @@ -213,14 +214,34 @@ void registerOutputFormatTemplate(FormatFactory & factory) }); } - ParsedTemplateFormatString row_format = ParsedTemplateFormatString( + ParsedTemplateFormatString row_format; + auto idx_by_name = [&](const String & colName) + { + return sample.getPositionByName(colName); + }; + if (settings.template_settings.row_format.empty()) + { + if (settings.template_settings.row_format_schema.empty()) + { + throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template"); + } + else + { + row_format = ParsedTemplateFormatString(); + row_format.parse(settings.template_settings.row_format_schema,idx_by_name); + } + } + else + { + if (settings.template_settings.row_format_schema.empty()) + { + throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template"); + } + row_format = ParsedTemplateFormatString( FormatSchemaInfo(settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), - [&](const String & colName) - { - return sample.getPositionByName(colName); - }); - + idx_by_name); + } return std::make_shared(sample, buf, settings, resultset_format, row_format, settings.template_settings.row_between_delimiter); }); From 02b178cc9c9b38a30344e2ddd896c1eaf429c3de Mon Sep 17 00:00:00 2001 From: MochiXu Date: Mon, 22 Jan 2024 11:08:03 +0800 Subject: [PATCH 0192/1081] fix drop inverted index --- src/Storages/MergeTree/MutateTask.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index e4070aa8262..8ed8b8bba4c 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -674,15 +674,21 @@ static NameToNameVector collectFilesForRenames( { if (command.type == MutationCommand::Type::DROP_INDEX) { - if (source_part->checksums.has(INDEX_FILE_PREFIX + command.column_name + ".idx2")) + const std::vector suffixes = {".idx2", ".idx", ".gin_dict", ".gin_post", ".gin_seg", ".gin_sid"}; + + for (const auto& suffix : suffixes) { - add_rename(INDEX_FILE_PREFIX + command.column_name + ".idx2", ""); - add_rename(INDEX_FILE_PREFIX + command.column_name + mrk_extension, ""); - } - else if (source_part->checksums.has(INDEX_FILE_PREFIX + command.column_name + ".idx")) - { - add_rename(INDEX_FILE_PREFIX + command.column_name + ".idx", ""); - add_rename(INDEX_FILE_PREFIX + command.column_name + mrk_extension, ""); + String filename = INDEX_FILE_PREFIX + command.column_name + suffix; + + if ((suffix == ".idx2" || suffix == ".idx") && source_part->checksums.has(filename)) + { + add_rename(filename, ""); + add_rename(INDEX_FILE_PREFIX + command.column_name + mrk_extension, ""); + } + else if 
(source_part->checksums.has(filename)) + { + add_rename(filename, ""); + } } } else if (command.type == MutationCommand::Type::DROP_PROJECTION) From d2c671c17eb4a85583b30d81033f7180ea93f627 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 19 Jan 2024 20:38:08 +0000 Subject: [PATCH 0193/1081] 1st attempt at vectorization 80 mio arrays with 150 elements each, runtimes in sec WITH (SELECT vec FROM vectors limit 1) AS const_vec SELECT sum(dist) FROM (SELECT (const_vec, vec) AS dist FROM vectors) auto-vectorized hand-vectorized L2 Float32 0.61 0.57 L2 Float64 1.15 0.99 cos Float32 0.78 0.65 cos Float64 1.35 1.05 --- src/Functions/array/arrayDistance.cpp | 145 +++++++++++++++++- .../02282_array_distance.reference | 4 + .../0_stateless/02282_array_distance.sql | 40 +++-- 3 files changed, 172 insertions(+), 17 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index c68c89ee0d5..670442c0c79 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -9,6 +10,10 @@ #include #include +#if USE_MULTITARGET_CODE +#include +#endif + namespace DB { namespace ErrorCodes @@ -75,6 +80,49 @@ struct L2Distance state.sum += other_state.sum; } +#if USE_MULTITARGET_CODE + template + AVX512_FUNCTION_SPECIFIC_ATTRIBUTE static void accumulateCombine( + const ResultType * __restrict data_x, + const ResultType * __restrict data_y, + size_t i_max, + size_t & i_x, + size_t & i_y, + State & state) + { + __m512 sums; + if constexpr (std::is_same_v) + sums = _mm512_setzero_ps(); + else + sums = _mm512_setzero_pd(); + + const size_t n = (std::is_same_v) ? 16 : 8; + + for (; i_x + n < i_max; i_x += n, i_y += n) + { + if constexpr (std::is_same_v) + { + __m512 x = _mm512_loadu_ps(data_x + i_x); + __m512 y = _mm512_loadu_ps(data_y + i_y); + __m512 differences = _mm512_sub_ps(x, y); + sums = _mm512_fmadd_ps(differences, differences, sums); + } + else + { + __m512 x = _mm512_loadu_pd(data_x + i_x); + __m512 y = _mm512_loadu_pd(data_y + i_y); + __m512 differences = _mm512_sub_pd(x, y); + sums = _mm512_fmadd_pd(differences, differences, sums); + } + } + + if constexpr (std::is_same_v) + state.sum = _mm512_reduce_add_ps(sums); + else + state.sum = _mm512_reduce_add_pd(sums); + } +#endif + template static ResultType finalize(const State & state, const ConstParams &) { @@ -189,6 +237,70 @@ struct CosineDistance state.y_squared += other_state.y_squared; } +#if USE_MULTITARGET_CODE + template + AVX512_FUNCTION_SPECIFIC_ATTRIBUTE static void accumulateCombine( + const ResultType * __restrict data_x, + const ResultType * __restrict data_y, + size_t i_max, + size_t & i_x, + size_t & i_y, + State & state) + { + __m512 dot_products; + __m512 x_squareds; + __m512 y_squareds; + + if constexpr (std::is_same_v) + { + dot_products = _mm512_setzero_ps(); + x_squareds = _mm512_setzero_ps(); + y_squareds = _mm512_setzero_ps(); + } + else + { + dot_products = _mm512_setzero_pd(); + x_squareds = _mm512_setzero_pd(); + y_squareds = _mm512_setzero_pd(); + } + + const size_t n = (std::is_same_v) ? 
16 : 8; + + for (; i_x + n < i_max; i_x += n, i_y += n) + { + if constexpr (std::is_same_v) + { + __m512 x = _mm512_loadu_ps(data_x + i_x); + __m512 y = _mm512_loadu_ps(data_y + i_y); + dot_products = _mm512_fmadd_ps(x, y, dot_products); + x_squareds = _mm512_fmadd_ps(x, x, x_squareds); + y_squareds = _mm512_fmadd_ps(y, y, y_squareds); + } + else + { + __m512 x = _mm512_loadu_pd(data_x + i_x); + __m512 y = _mm512_loadu_pd(data_y + i_y); + dot_products = _mm512_fmadd_pd(x, y, dot_products); + x_squareds = _mm512_fmadd_pd(x, x, x_squareds); + y_squareds = _mm512_fmadd_pd(y, y, y_squareds); + } + } + + if constexpr (std::is_same_v) + { + state.dot_prod = _mm512_reduce_add_ps(dot_products); + state.x_squared = _mm512_reduce_add_ps(x_squareds); + state.y_squared = _mm512_reduce_add_ps(y_squareds); + } + else + { + state.dot_prod = _mm512_reduce_add_pd(dot_products); + state.x_squared = _mm512_reduce_add_pd(x_squareds); + state.y_squared = _mm512_reduce_add_pd(y_squareds); + } + } +#endif + template static ResultType finalize(const State & state, const ConstParams &) { @@ -352,7 +464,7 @@ private: /// Check that arrays in both columns are the sames size for (size_t row = 0; row < offsets_x.size(); ++row) { - if (unlikely(offsets_x[row] != offsets_y[row])) + if (offsets_x[row] != offsets_y[row]) [[unlikely]] { ColumnArray::Offset prev_offset = row > 0 ? offsets_x[row] : 0; throw Exception( @@ -420,7 +532,7 @@ private: ColumnArray::Offset prev_offset = 0; for (size_t row : collections::range(0, offsets_y.size())) { - if (unlikely(offsets_x[0] != offsets_y[row] - prev_offset)) + if (offsets_x[0] != offsets_y[row] - prev_offset) [[unlikely]] { throw Exception( ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, @@ -438,14 +550,35 @@ private: auto & result_data = result->getData(); /// Do the actual computation - ColumnArray::Offset prev = 0; + size_t prev = 0; size_t row = 0; + for (auto off : offsets_y) { + size_t i = 0; + typename Kernel::template State state; + + /// SIMD optimization: process multiple elements in both input arrays at once. + /// To avoid combinatorial explosion of SIMD kernels, focus on + /// - the two most common input/output types (Float32 x Float32) --> Float32 and (Float64 x Float64) --> Float64 instead of 10 x + /// 10 input types x 2 output types, + /// - const/non-const inputs instead of non-const/non-const inputs + /// - the two most common metrics L2 and cosine distance, + /// - the most powerful SIMD instruction set (AVX-512F). 
+#if USE_MULTITARGET_CODE + if constexpr (std::is_same_v && std::is_same_v) /// ResultType is Float32 or Float64 + { + if constexpr (std::is_same_v + || std::is_same_v) + { + if (isArchSupported(TargetArch::AVX512F)) + Kernel::template accumulateCombine(data_x.data(), data_y.data(), i + offsets_x[0], i, prev, state); + } + } +#else /// Process chunks in vectorized manner static constexpr size_t VEC_SIZE = 4; typename Kernel::template State states[VEC_SIZE]; - size_t i = 0; for (; prev + VEC_SIZE < off; i += VEC_SIZE, prev += VEC_SIZE) { for (size_t s = 0; s < VEC_SIZE; ++s) @@ -453,10 +586,9 @@ private: states[s], static_cast(data_x[i + s]), static_cast(data_y[prev + s]), kernel_params); } - typename Kernel::template State state; for (const auto & other_state : states) Kernel::template combine(state, other_state, kernel_params); - +#endif /// Process the tail for (; prev < off; ++i, ++prev) { @@ -466,6 +598,7 @@ private: result_data[row] = Kernel::finalize(state, kernel_params); row++; } + return result; } diff --git a/tests/queries/0_stateless/02282_array_distance.reference b/tests/queries/0_stateless/02282_array_distance.reference index 9758da9a833..c21e294cb62 100644 --- a/tests/queries/0_stateless/02282_array_distance.reference +++ b/tests/queries/0_stateless/02282_array_distance.reference @@ -80,3 +80,7 @@ nan 5 6 268 2 10.234459893824097 23.15167380558045 536 0.00007815428961455151 6 5 268 2 10.234459893824097 23.15167380558045 536 0.00007815428961455151 6 6 0 0 0 0 0 0 +5.8309517 +0.0003244877 +5.830951894845301 +0.0003245172890904424 diff --git a/tests/queries/0_stateless/02282_array_distance.sql b/tests/queries/0_stateless/02282_array_distance.sql index 9c16071dc1f..2cca853fd67 100644 --- a/tests/queries/0_stateless/02282_array_distance.sql +++ b/tests/queries/0_stateless/02282_array_distance.sql @@ -12,10 +12,10 @@ SELECT cosineDistance([1, 2, 3], [0, 0, 0]); -- Overflows WITH CAST([-547274980, 1790553898, 1981517754, 1908431500, 1352428565, -573412550, -552499284, 2096941042], 'Array(Int32)') AS a SELECT - L1Distance(a,a), - L2Distance(a,a), - L2SquaredDistance(a,a), - LinfDistance(a,a), + L1Distance(a, a), + L2Distance(a, a), + L2SquaredDistance(a, a), + LinfDistance(a, a), cosineDistance(a, a); DROP TABLE IF EXISTS vec1; @@ -88,15 +88,33 @@ SELECT FROM vec2f v1, vec2d v2 WHERE length(v1.v) == length(v2.v); -SELECT L1Distance([0, 0], [1]); -- { serverError 190 } -SELECT L2Distance([1, 2], (3,4)); -- { serverError 43 } -SELECT L2SquaredDistance([1, 2], (3,4)); -- { serverError 43 } -SELECT LpDistance([1, 2], [3,4]); -- { serverError 42 } -SELECT LpDistance([1, 2], [3,4], -1.); -- { serverError 69 } -SELECT LpDistance([1, 2], [3,4], 'aaa'); -- { serverError 43 } -SELECT LpDistance([1, 2], [3,4], materialize(2.7)); -- { serverError 44 } +SELECT L1Distance([0, 0], [1]); -- { serverError SIZES_OF_ARRAYS_DONT_MATCH } +SELECT L2Distance([1, 2], (3,4)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT L2SquaredDistance([1, 2], (3,4)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT LpDistance([1, 2], [3,4]); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT LpDistance([1, 2], [3,4], -1.); -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT LpDistance([1, 2], [3,4], 'aaa'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT LpDistance([1, 2], [3,4], materialize(2.7)); -- { serverError ILLEGAL_COLUMN } DROP TABLE vec1; DROP TABLE vec2; DROP TABLE vec2f; DROP TABLE vec2d; + +-- Queries which trigger manually vectorized implementation + +SELECT L2Distance( + [toFloat32(0.0), 
toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), toFloat32(31.0), toFloat32(32.0), toFloat32(33.0)], + materialize([toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), toFloat32(31.0), toFloat32(32.0), toFloat32(33.0), toFloat32(34.0)])); + +SELECT cosineDistance( + [toFloat32(0.0), toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), toFloat32(31.0), toFloat32(32.0), toFloat32(33.0)], + materialize([toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), toFloat32(31.0), toFloat32(32.0), toFloat32(33.0), toFloat32(34.0)])); + +SELECT L2Distance( + [toFloat64(0.0), toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), toFloat64(31.0), toFloat64(32.0), toFloat64(33.0)], + materialize([toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), toFloat64(31.0), toFloat64(32.0), toFloat64(33.0), toFloat64(34.0)])); + +SELECT cosineDistance( + [toFloat64(0.0), 
toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), toFloat64(31.0), toFloat64(32.0), toFloat64(33.0)], + materialize([toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), toFloat64(31.0), toFloat64(32.0), toFloat64(33.0), toFloat64(34.0)])); From 68d0f4e42161713f3b54de2069d894b1f84ed833 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 20 Jan 2024 21:36:25 +0000 Subject: [PATCH 0194/1081] (Futile) unrolling attempt at vectorization --- src/Functions/array/arrayDistance.cpp | 88 ++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 16 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index 670442c0c79..aa13ee01d9a 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -90,36 +90,92 @@ struct L2Distance size_t & i_y, State & state) { - __m512 sums; - if constexpr (std::is_same_v) - sums = _mm512_setzero_ps(); - else - sums = _mm512_setzero_pd(); + __m512 sums1; + __m512 sums2; + __m512 sums3; + __m512 sums4; - const size_t n = (std::is_same_v) ? 16 : 8; + if constexpr (std::is_same_v) + { + sums1 = _mm512_setzero_ps(); + sums2 = _mm512_setzero_ps(); + sums3 = _mm512_setzero_ps(); + sums4 = _mm512_setzero_ps(); + } + else + { + sums1 = _mm512_setzero_pd(); + sums2 = _mm512_setzero_pd(); + sums3 = _mm512_setzero_pd(); + sums4 = _mm512_setzero_pd(); + } + + const size_t n = (std::is_same_v) ? 
64 : 32; for (; i_x + n < i_max; i_x += n, i_y += n) { if constexpr (std::is_same_v) { - __m512 x = _mm512_loadu_ps(data_x + i_x); - __m512 y = _mm512_loadu_ps(data_y + i_y); - __m512 differences = _mm512_sub_ps(x, y); - sums = _mm512_fmadd_ps(differences, differences, sums); + __m512 x1 = _mm512_loadu_ps(data_x + i_x); + __m512 y1 = _mm512_loadu_ps(data_y + i_y); + __m512 diff1 = _mm512_sub_ps(x1, y1); + sums1 = _mm512_fmadd_ps(diff1, diff1, sums1); + + __m512 x2 = _mm512_loadu_ps(data_x + i_x + 16); + __m512 y2 = _mm512_loadu_ps(data_y + i_y + 16); + __m512 diff2 = _mm512_sub_ps(x2, y2); + sums2 = _mm512_fmadd_ps(diff2, diff2, sums2); + + __m512 x3 = _mm512_loadu_ps(data_x + i_x + 32); + __m512 y3 = _mm512_loadu_ps(data_y + i_y + 32); + __m512 diff3 = _mm512_sub_ps(x3, y3); + sums3 = _mm512_fmadd_ps(diff3, diff3, sums3); + + __m512 x4 = _mm512_loadu_ps(data_x + i_x + 48); + __m512 y4 = _mm512_loadu_ps(data_y + i_y + 48); + __m512 diff4 = _mm512_sub_ps(x4, y4); + sums4 = _mm512_fmadd_ps(diff4, diff4, sums4); } else { - __m512 x = _mm512_loadu_pd(data_x + i_x); - __m512 y = _mm512_loadu_pd(data_y + i_y); - __m512 differences = _mm512_sub_pd(x, y); - sums = _mm512_fmadd_pd(differences, differences, sums); + __m512 x1 = _mm512_loadu_pd(data_x + i_x); + __m512 y1 = _mm512_loadu_pd(data_y + i_y); + __m512 diff1 = _mm512_sub_pd(x1, y1); + sums1 = _mm512_fmadd_pd(diff1, diff1, sums1); + + __m512 x2 = _mm512_loadu_pd(data_x + i_x + 8); + __m512 y2 = _mm512_loadu_pd(data_y + i_y + 8); + __m512 diff2 = _mm512_sub_pd(x2, y2); + sums2 = _mm512_fmadd_pd(diff2, diff2, sums2); + + __m512 x3 = _mm512_loadu_pd(data_x + i_x + 16); + __m512 y3 = _mm512_loadu_pd(data_y + i_y + 16); + __m512 diff3 = _mm512_sub_pd(x3, y3); + sums3 = _mm512_fmadd_pd(diff3, diff3, sums3); + + __m512 x4 = _mm512_loadu_pd(data_x + i_x + 24); + __m512 y4 = _mm512_loadu_pd(data_y + i_y + 24); + __m512 diff4 = _mm512_sub_pd(x4, y4); + sums4 = _mm512_fmadd_pd(diff4, diff4, sums4); } } if constexpr (std::is_same_v) - state.sum = _mm512_reduce_add_ps(sums); + { + Float32 sum1 = _mm512_reduce_add_ps(sums1); + Float32 sum2 = _mm512_reduce_add_ps(sums2); + Float32 sum3 = _mm512_reduce_add_ps(sums3); + Float32 sum4 = _mm512_reduce_add_ps(sums4); + state.sum = sum1 + sum2 + sum3 + sum4; + } else - state.sum = _mm512_reduce_add_pd(sums); + { + Float64 sum1 = _mm512_reduce_add_pd(sums1); + Float64 sum2 = _mm512_reduce_add_pd(sums2); + Float64 sum3 = _mm512_reduce_add_pd(sums3); + Float64 sum4 = _mm512_reduce_add_pd(sums4); + state.sum = sum1 + sum2 + sum3 + sum4; + } } #endif From 68fc97089ec22d29b5d25df4e3865a22cf9701db Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 20 Jan 2024 21:50:13 +0000 Subject: [PATCH 0195/1081] Revert "(Futile) unrolling attempt at vectorization" This reverts commit df30a990545eafdf5e6a09034d81a97fb0188ba0. 
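For readers following these three vectorization patches: the computation that the AVX-512 kernels (and the reverted unrolled variant) accelerate reduces to the scalar form below. This is only an illustrative sketch with free-standing functions and made-up names — the real code lives in the Kernel::accumulate/accumulateCombine/finalize interfaces of arrayDistance.cpp — but it shows what is being summed per element and how the final value is produced.

#include <cmath>
#include <cstddef>

/// Scalar reference for L2Distance: accumulate squared differences, then take the square root.
float l2DistanceScalar(const float * x, const float * y, size_t n)
{
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i)
    {
        float diff = x[i] - y[i];
        sum += diff * diff;        /// the _mm512_fmadd_ps loop does the same for 16 floats per iteration
    }
    return std::sqrt(sum);
}

/// Scalar reference for cosineDistance: accumulate the dot product and both squared norms,
/// then finalize as 1 - dot / sqrt(|x|^2 * |y|^2).
float cosineDistanceScalar(const float * x, const float * y, size_t n)
{
    float dot = 0.0f, x_squared = 0.0f, y_squared = 0.0f;
    for (size_t i = 0; i < n; ++i)
    {
        dot += x[i] * y[i];
        x_squared += x[i] * x[i];
        y_squared += y[i] * y[i];
    }
    return 1.0f - dot / std::sqrt(x_squared * y_squared);
}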
--- src/Functions/array/arrayDistance.cpp | 84 +++++---------------------- 1 file changed, 14 insertions(+), 70 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index aa13ee01d9a..670442c0c79 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -90,92 +90,36 @@ struct L2Distance size_t & i_y, State & state) { - __m512 sums1; - __m512 sums2; - __m512 sums3; - __m512 sums4; - + __m512 sums; if constexpr (std::is_same_v) - { - sums1 = _mm512_setzero_ps(); - sums2 = _mm512_setzero_ps(); - sums3 = _mm512_setzero_ps(); - sums4 = _mm512_setzero_ps(); - } + sums = _mm512_setzero_ps(); else - { - sums1 = _mm512_setzero_pd(); - sums2 = _mm512_setzero_pd(); - sums3 = _mm512_setzero_pd(); - sums4 = _mm512_setzero_pd(); - } + sums = _mm512_setzero_pd(); - const size_t n = (std::is_same_v) ? 64 : 32; + const size_t n = (std::is_same_v) ? 16 : 8; for (; i_x + n < i_max; i_x += n, i_y += n) { if constexpr (std::is_same_v) { - __m512 x1 = _mm512_loadu_ps(data_x + i_x); - __m512 y1 = _mm512_loadu_ps(data_y + i_y); - __m512 diff1 = _mm512_sub_ps(x1, y1); - sums1 = _mm512_fmadd_ps(diff1, diff1, sums1); - - __m512 x2 = _mm512_loadu_ps(data_x + i_x + 16); - __m512 y2 = _mm512_loadu_ps(data_y + i_y + 16); - __m512 diff2 = _mm512_sub_ps(x2, y2); - sums2 = _mm512_fmadd_ps(diff2, diff2, sums2); - - __m512 x3 = _mm512_loadu_ps(data_x + i_x + 32); - __m512 y3 = _mm512_loadu_ps(data_y + i_y + 32); - __m512 diff3 = _mm512_sub_ps(x3, y3); - sums3 = _mm512_fmadd_ps(diff3, diff3, sums3); - - __m512 x4 = _mm512_loadu_ps(data_x + i_x + 48); - __m512 y4 = _mm512_loadu_ps(data_y + i_y + 48); - __m512 diff4 = _mm512_sub_ps(x4, y4); - sums4 = _mm512_fmadd_ps(diff4, diff4, sums4); + __m512 x = _mm512_loadu_ps(data_x + i_x); + __m512 y = _mm512_loadu_ps(data_y + i_y); + __m512 differences = _mm512_sub_ps(x, y); + sums = _mm512_fmadd_ps(differences, differences, sums); } else { - __m512 x1 = _mm512_loadu_pd(data_x + i_x); - __m512 y1 = _mm512_loadu_pd(data_y + i_y); - __m512 diff1 = _mm512_sub_pd(x1, y1); - sums1 = _mm512_fmadd_pd(diff1, diff1, sums1); - - __m512 x2 = _mm512_loadu_pd(data_x + i_x + 8); - __m512 y2 = _mm512_loadu_pd(data_y + i_y + 8); - __m512 diff2 = _mm512_sub_pd(x2, y2); - sums2 = _mm512_fmadd_pd(diff2, diff2, sums2); - - __m512 x3 = _mm512_loadu_pd(data_x + i_x + 16); - __m512 y3 = _mm512_loadu_pd(data_y + i_y + 16); - __m512 diff3 = _mm512_sub_pd(x3, y3); - sums3 = _mm512_fmadd_pd(diff3, diff3, sums3); - - __m512 x4 = _mm512_loadu_pd(data_x + i_x + 24); - __m512 y4 = _mm512_loadu_pd(data_y + i_y + 24); - __m512 diff4 = _mm512_sub_pd(x4, y4); - sums4 = _mm512_fmadd_pd(diff4, diff4, sums4); + __m512 x = _mm512_loadu_pd(data_x + i_x); + __m512 y = _mm512_loadu_pd(data_y + i_y); + __m512 differences = _mm512_sub_pd(x, y); + sums = _mm512_fmadd_pd(differences, differences, sums); } } if constexpr (std::is_same_v) - { - Float32 sum1 = _mm512_reduce_add_ps(sums1); - Float32 sum2 = _mm512_reduce_add_ps(sums2); - Float32 sum3 = _mm512_reduce_add_ps(sums3); - Float32 sum4 = _mm512_reduce_add_ps(sums4); - state.sum = sum1 + sum2 + sum3 + sum4; - } + state.sum = _mm512_reduce_add_ps(sums); else - { - Float64 sum1 = _mm512_reduce_add_pd(sums1); - Float64 sum2 = _mm512_reduce_add_pd(sums2); - Float64 sum3 = _mm512_reduce_add_pd(sums3); - Float64 sum4 = _mm512_reduce_add_pd(sums4); - state.sum = sum1 + sum2 + sum3 + sum4; - } + state.sum = _mm512_reduce_add_pd(sums); } #endif From df0c018a9be06e9ccbfb40460f29b155aa86b57f Mon 
Sep 17 00:00:00 2001 From: Hongbin Ma Date: Fri, 12 Jan 2024 16:09:09 +0800 Subject: [PATCH 0196/1081] support T64 for date32 type --- src/Compression/CompressionCodecT64.cpp | 6 +++++ .../00873_t64_codec_date.reference | 4 +++ .../0_stateless/00873_t64_codec_date.sql | 26 +++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 tests/queries/0_stateless/00873_t64_codec_date.reference create mode 100644 tests/queries/0_stateless/00873_t64_codec_date.sql diff --git a/src/Compression/CompressionCodecT64.cpp b/src/Compression/CompressionCodecT64.cpp index bf9a9414bc1..42c6a18aa77 100644 --- a/src/Compression/CompressionCodecT64.cpp +++ b/src/Compression/CompressionCodecT64.cpp @@ -91,6 +91,7 @@ enum class MagicNumber : uint8_t Decimal32 = 19, Decimal64 = 20, IPv4 = 21, + Date32 = 22, }; MagicNumber serializeTypeId(std::optional type_id) @@ -109,6 +110,7 @@ MagicNumber serializeTypeId(std::optional type_id) case TypeIndex::Int32: return MagicNumber::Int32; case TypeIndex::Int64: return MagicNumber::Int64; case TypeIndex::Date: return MagicNumber::Date; + case TypeIndex::Date32: return MagicNumber::Date32; case TypeIndex::DateTime: return MagicNumber::DateTime; case TypeIndex::DateTime64: return MagicNumber::DateTime64; case TypeIndex::Enum8: return MagicNumber::Enum8; @@ -137,6 +139,7 @@ TypeIndex deserializeTypeId(uint8_t serialized_type_id) case MagicNumber::Int32: return TypeIndex::Int32; case MagicNumber::Int64: return TypeIndex::Int64; case MagicNumber::Date: return TypeIndex::Date; + case MagicNumber::Date32: return TypeIndex::Date32; case MagicNumber::DateTime: return TypeIndex::DateTime; case MagicNumber::DateTime64: return TypeIndex::DateTime64; case MagicNumber::Enum8: return TypeIndex::Enum8; @@ -177,6 +180,8 @@ TypeIndex baseType(TypeIndex type_idx) case TypeIndex::Enum16: case TypeIndex::Date: return TypeIndex::UInt16; + case TypeIndex::Date32: + return TypeIndex::Int32; case TypeIndex::UInt32: case TypeIndex::DateTime: case TypeIndex::IPv4: @@ -205,6 +210,7 @@ TypeIndex typeIdx(const IDataType * data_type) case TypeIndex::UInt16: case TypeIndex::Enum16: case TypeIndex::Date: + case TypeIndex::Date32: case TypeIndex::Int32: case TypeIndex::UInt32: case TypeIndex::IPv4: diff --git a/tests/queries/0_stateless/00873_t64_codec_date.reference b/tests/queries/0_stateless/00873_t64_codec_date.reference new file mode 100644 index 00000000000..1568c3122e6 --- /dev/null +++ b/tests/queries/0_stateless/00873_t64_codec_date.reference @@ -0,0 +1,4 @@ +1970-01-01 1970-01-01 1950-01-01 1950-01-01 +1970-01-01 1970-01-01 1970-01-01 1970-01-01 +2149-06-06 2149-06-06 2149-06-08 2149-06-08 +2149-06-06 2149-06-06 2149-06-06 2149-06-06 diff --git a/tests/queries/0_stateless/00873_t64_codec_date.sql b/tests/queries/0_stateless/00873_t64_codec_date.sql new file mode 100644 index 00000000000..e9230c75665 --- /dev/null +++ b/tests/queries/0_stateless/00873_t64_codec_date.sql @@ -0,0 +1,26 @@ +DROP TABLE IF EXISTS t64; + +CREATE TABLE t64 +( + date16 Date, + t_date16 Date Codec(T64, ZSTD), + date_32 Date32, + t_date32 Date32 Codec(T64, ZSTD) +) ENGINE MergeTree() ORDER BY tuple(); + +INSERT INTO t64 values ('1970-01-01', '1970-01-01', '1970-01-01', '1970-01-01'); +INSERT INTO t64 values ('2149-06-06', '2149-06-06', '2149-06-06', '2149-06-06'); +INSERT INTO t64 values ('2149-06-08', '2149-06-08', '2149-06-08', '2149-06-08'); +INSERT INTO t64 values ('1950-01-01', '1950-01-01', '1950-01-01', '1950-01-01'); + +SELECT * FROM t64 ORDER BY date16; + +SELECT * FROM t64 WHERE date16 != t_date16; 
+SELECT * FROM t64 WHERE date_32 != t_date32; + +OPTIMIZE TABLE t64 FINAL; + +SELECT * FROM t64 WHERE date16 != t_date16; +SELECT * FROM t64 WHERE date_32 != t_date32; + +DROP TABLE t64; From b4dcd6755a8a2384e5937991e0656058aed4f95a Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 22 Jan 2024 13:41:58 +0000 Subject: [PATCH 0197/1081] Use ColumnConst instead of Field in IColumn::createWithOffsets --- src/Columns/ColumnConst.cpp | 22 +++++++++++++++++++ src/Columns/ColumnConst.h | 5 +++++ src/Columns/ColumnNullable.cpp | 16 ++++++-------- src/Columns/ColumnNullable.h | 2 +- src/Columns/ColumnObject.cpp | 3 ++- src/Columns/ColumnSparse.cpp | 3 ++- src/Columns/ColumnVector.cpp | 5 +++-- src/Columns/ColumnVector.h | 2 +- src/Columns/IColumn.cpp | 6 ++--- src/Columns/IColumn.h | 5 +++-- src/Functions/IFunction.cpp | 4 ++-- .../0_stateless/02941_variant_type_alters.sh | 1 + 12 files changed, 52 insertions(+), 22 deletions(-) diff --git a/src/Columns/ColumnConst.cpp b/src/Columns/ColumnConst.cpp index 9aa0f5cfa49..6e5a3c45c4e 100644 --- a/src/Columns/ColumnConst.cpp +++ b/src/Columns/ColumnConst.cpp @@ -159,4 +159,26 @@ void ColumnConst::compareColumn( std::fill(compare_results.begin(), compare_results.end(), res); } +ColumnConst::Ptr createColumnConst(const ColumnPtr & column, Field value) +{ + auto data = column->cloneEmpty(); + data->insert(value); + return ColumnConst::create(std::move(data), 1); +} + +ColumnConst::Ptr createColumnConst(const ColumnPtr & column, size_t const_value_index) +{ + auto data = column->cloneEmpty(); + data->insertFrom(*column, const_value_index); + return ColumnConst::create(std::move(data), 1); +} + +ColumnConst::Ptr createColumnConstWithDefaultValue(const ColumnPtr & column) +{ + auto data = column->cloneEmpty(); + data->insertDefault(); + return ColumnConst::create(std::move(data), 1); +} + + } diff --git a/src/Columns/ColumnConst.h b/src/Columns/ColumnConst.h index dc84e0c2402..3c646a62795 100644 --- a/src/Columns/ColumnConst.h +++ b/src/Columns/ColumnConst.h @@ -292,4 +292,9 @@ public: bool isCollationSupported() const override { return data->isCollationSupported(); } }; +ColumnConst::Ptr createColumnConst(const ColumnPtr & column, Field value); +ColumnConst::Ptr createColumnConst(const ColumnPtr & column, size_t const_value_index); +ColumnConst::Ptr createColumnConstWithDefaultValue(const ColumnPtr &column); + + } diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index 25b0e35e15e..c0b13204b8e 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -827,24 +827,22 @@ void ColumnNullable::checkConsistency() const "Logical error: Sizes of nested column and null map of Nullable column are not equal"); } -ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const +ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const { ColumnPtr new_values; ColumnPtr new_null_map; - if (default_field.getType() == Field::Types::Null) + const ColumnNullable & nullable_column_with_default_value = assert_cast(column_with_default_value.getDataColumn()); + if (nullable_column_with_default_value.isNullAt(0)) { - auto default_column = nested_column->cloneEmpty(); - default_column->insertDefault(); - /// Value in main column, when null map is 1 is implementation defined. So, take any value. 
- new_values = nested_column->createWithOffsets(offsets, (*default_column)[0], total_rows, shift); - new_null_map = null_map->createWithOffsets(offsets, Field(1u), total_rows, shift); + new_values = nested_column->createWithOffsets(offsets, *createColumnConstWithDefaultValue(nested_column), total_rows, shift); + new_null_map = null_map->createWithOffsets(offsets, *createColumnConst(null_map, Field(1u)), total_rows, shift); } else { - new_values = nested_column->createWithOffsets(offsets, default_field, total_rows, shift); - new_null_map = null_map->createWithOffsets(offsets, Field(0u), total_rows, shift); + new_values = nested_column->createWithOffsets(offsets, *ColumnConst::create(nullable_column_with_default_value.getNestedColumnPtr(), 1), total_rows, shift); + new_null_map = null_map->createWithOffsets(offsets, *createColumnConst(null_map, Field(0u)), total_rows, shift); } return ColumnNullable::create(new_values, new_null_map); diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index 60c7750f8fc..3e04ba8a180 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -167,7 +167,7 @@ public: getIndicesOfNonDefaultRowsImpl(indices, from, limit); } - ColumnPtr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override; + ColumnPtr createWithOffsets(const Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const override; bool isNullable() const override { return true; } bool isFixedAndContiguous() const override { return false; } diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp index f7176568a1b..0ec9c616bab 100644 --- a/src/Columns/ColumnObject.cpp +++ b/src/Columns/ColumnObject.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -475,7 +476,7 @@ void ColumnObject::Subcolumn::finalize() { auto values = part->index(*offsets, offsets->size()); values = castColumn({values, from_type, ""}, to_type); - part = values->createWithOffsets(offsets_data, to_type->getDefault(), part_size, /*shift=*/ 0); + part = values->createWithOffsets(offsets_data, *createColumnConstWithDefaultValue(result_column->getPtr()), part_size, /*shift=*/ 0); } } diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index 02e6e9e56b4..eeeec912ce8 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -130,7 +131,7 @@ StringRef ColumnSparse::getDataAt(size_t n) const ColumnPtr ColumnSparse::convertToFullColumnIfSparse() const { - return values->createWithOffsets(getOffsetsData(), (*values)[0], _size, /*shift=*/ 1); + return values->createWithOffsets(getOffsetsData(), *createColumnConst(values, 0), _size, /*shift=*/ 1); } void ColumnSparse::insertSingleValue(const Inserter & inserter) diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index b1cf449dfde..3aadc530878 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -940,7 +941,7 @@ ColumnPtr ColumnVector::compress() const } template -ColumnPtr ColumnVector::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const +ColumnPtr ColumnVector::createWithOffsets(const IColumn::Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t 
shift) const { if (offsets.size() + shift != size()) throw Exception(ErrorCodes::LOGICAL_ERROR, @@ -949,7 +950,7 @@ ColumnPtr ColumnVector::createWithOffsets(const IColumn::Offsets & offsets, c auto res = this->create(); auto & res_data = res->getData(); - T default_value = static_cast(default_field.safeGet()); + T default_value = assert_cast &>(column_with_default_value.getDataColumn()).getElement(0); res_data.resize_fill(total_rows, default_value); for (size_t i = 0; i < offsets.size(); ++i) res_data[offsets[i]] = data[i + shift]; diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h index fab2d5f06aa..652cc1f5ff9 100644 --- a/src/Columns/ColumnVector.h +++ b/src/Columns/ColumnVector.h @@ -379,7 +379,7 @@ public: return this->template getIndicesOfNonDefaultRowsImpl(indices, from, limit); } - ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override; + ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const override; ColumnPtr compress() const override; diff --git a/src/Columns/IColumn.cpp b/src/Columns/IColumn.cpp index 82dc82e0bd9..d7f83b822d2 100644 --- a/src/Columns/IColumn.cpp +++ b/src/Columns/IColumn.cpp @@ -35,7 +35,7 @@ void IColumn::insertFrom(const IColumn & src, size_t n) insert(src[n]); } -ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const +ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const { if (offsets.size() + shift != size()) throw Exception(ErrorCodes::LOGICAL_ERROR, @@ -51,14 +51,14 @@ ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & defa current_offset = offsets[i]; if (offsets_diff > 1) - res->insertMany(default_field, offsets_diff - 1); + res->insertManyFrom(column_with_default_value.getDataColumn(), 0, offsets_diff - 1); res->insertFrom(*this, i + shift); } ssize_t offsets_diff = static_cast(total_rows) - current_offset; if (offsets_diff > 1) - res->insertMany(default_field, offsets_diff - 1); + res->insertManyFrom(column_with_default_value.getDataColumn(), 0, offsets_diff - 1); return res; } diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 3f866e6213d..1dcd3acdd19 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -34,6 +34,7 @@ class Arena; class ColumnGathererStream; class Field; class WeakHash32; +class ColumnConst; /* * Represents a set of equal ranges in previous column to perform sorting in current column. @@ -462,10 +463,10 @@ public: /// Returns column with @total_size elements. /// In result column values from current column are at positions from @offsets. - /// Other values are filled by @default_value. + /// Other values are filled by value from @column_with_default_value. /// @shift means how much rows to skip from the beginning of current column. /// Used to create full column from sparse. - [[nodiscard]] virtual Ptr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const; + [[nodiscard]] virtual Ptr createWithOffsets(const Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const; /// Compress column in memory to some representation that allows to decompress it back. 
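To make the changed createWithOffsets contract easier to follow, here is a minimal stand-in using plain vectors. It mirrors the logic of ColumnVector<T>::createWithOffsets above (resize-fill with the default value, then scatter the stored values by offsets); the function and parameter names are illustrative only and are not part of the IColumn interface.

#include <cstddef>
#include <cstdint>
#include <vector>

/// Expand a sparse representation into a full array of total_rows elements:
/// values[i + shift] lands at row offsets[i], every other row gets default_value.
template <typename T>
std::vector<T> createWithOffsetsSketch(
    const std::vector<T> & values,
    const std::vector<uint64_t> & offsets,
    const T & default_value,
    size_t total_rows,
    size_t shift)
{
    std::vector<T> result(total_rows, default_value);
    for (size_t i = 0; i < offsets.size(); ++i)
        result[offsets[i]] = values[i + shift];
    return result;
}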
/// Return itself if compression is not applicable for this column type. diff --git a/src/Functions/IFunction.cpp b/src/Functions/IFunction.cpp index a46f4d2a11d..d4c6b8f4ba6 100644 --- a/src/Functions/IFunction.cpp +++ b/src/Functions/IFunction.cpp @@ -313,7 +313,7 @@ ColumnPtr IExecutableFunction::execute(const ColumnsWithTypeAndName & arguments, { bool use_default_implementation_for_sparse_columns = useDefaultImplementationForSparseColumns(); /// DataTypeFunction does not support obtaining default (isDefaultAt()) - /// ColumnFunction does not support getting specific values + /// ColumnFunction does not support getting specific values. if (result_type->getTypeId() != TypeIndex::Function && use_default_implementation_for_sparse_columns) { size_t num_sparse_columns = 0; @@ -368,7 +368,7 @@ ColumnPtr IExecutableFunction::execute(const ColumnsWithTypeAndName & arguments, if (!result_type->canBeInsideSparseColumns() || !res->isDefaultAt(0) || res->getNumberOfDefaultRows() != 1) { const auto & offsets_data = assert_cast &>(*sparse_offsets).getData(); - return res->createWithOffsets(offsets_data, (*res)[0], input_rows_count, /*shift=*/ 1); + return res->createWithOffsets(offsets_data, *createColumnConst(res, 0), input_rows_count, /*shift=*/ 1); } return ColumnSparse::create(res, sparse_offsets, input_rows_count); diff --git a/tests/queries/0_stateless/02941_variant_type_alters.sh b/tests/queries/0_stateless/02941_variant_type_alters.sh index 7e2ecbd67aa..9b0d4febd65 100755 --- a/tests/queries/0_stateless/02941_variant_type_alters.sh +++ b/tests/queries/0_stateless/02941_variant_type_alters.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment From 15e3a5b3961ac304a30ef211594f57bda3a2f584 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 22 Jan 2024 14:57:10 +0100 Subject: [PATCH 0198/1081] Try fix flaky test --- .../integration/test_broken_projections/test.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index acf0160cf1b..87d910b9c77 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -28,10 +28,17 @@ def cluster(): cluster.shutdown() -def create_table(node, table, replica, data_prefix=""): +def create_table(node, table, replica, data_prefix="", aggressive_merge=True): if data_prefix == "": data_prefix = table + if aggressive_merge: + vertical_merge_algorithm_min_rows_to_activate = 1 + vertical_merge_algorithm_min_columns_to_activate = 1 + else: + vertical_merge_algorithm_min_rows_to_activate = 100000 + vertical_merge_algorithm_min_columns_to_activate = 100 + node.query( f""" DROP TABLE IF EXISTS {table} SYNC; @@ -56,9 +63,8 @@ def create_table(node, table, replica, data_prefix=""): SETTINGS min_bytes_for_wide_part = 0, max_parts_to_merge_at_once=3, enable_vertical_merge_algorithm=1, - vertical_merge_algorithm_min_rows_to_activate = 1, - vertical_merge_algorithm_min_columns_to_activate = 1, - vertical_merge_algorithm_min_columns_to_activate = 1, + vertical_merge_algorithm_min_rows_to_activate = {vertical_merge_algorithm_min_rows_to_activate}, + vertical_merge_algorithm_min_columns_to_activate = {vertical_merge_algorithm_min_columns_to_activate}, compress_primary_key=0; """ ) @@ -411,7 +417,7 @@ def test_broken_projections_in_backups(cluster): node = cluster.instances["node"] table_name = "test4" - 
create_table(node, table_name, 1) + create_table(node, table_name, 1, aggressive_merge=False) node.query("SYSTEM STOP MERGES") From 0606a772674fdecf08a9a904ef46293e8bba9acc Mon Sep 17 00:00:00 2001 From: mochi Date: Mon, 22 Jan 2024 22:02:50 +0800 Subject: [PATCH 0199/1081] Update src/Storages/MergeTree/MutateTask.cpp Co-authored-by: Dmitry Novik --- src/Storages/MergeTree/MutateTask.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 8ed8b8bba4c..fccee6bd887 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -676,7 +676,7 @@ static NameToNameVector collectFilesForRenames( { const std::vector suffixes = {".idx2", ".idx", ".gin_dict", ".gin_post", ".gin_seg", ".gin_sid"}; - for (const auto& suffix : suffixes) + for (const auto & suffix : suffixes) { String filename = INDEX_FILE_PREFIX + command.column_name + suffix; From 2e7ce5b0e208c91874d44eb0c828a1e01544a387 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 22 Jan 2024 16:24:43 +0100 Subject: [PATCH 0200/1081] Updated settings ptr and fetching of client from Disk & ObjectStorage --- src/Backups/BackupIO_AzureBlobStorage.cpp | 32 ++++++++----------- src/Backups/BackupIO_AzureBlobStorage.h | 4 +-- .../AzureBlobStorage/AzureObjectStorage.h | 7 +++- .../Cached/CachedObjectStorage.h | 8 +++++ src/Disks/ObjectStorages/IObjectStorage.h | 13 ++++++++ .../copyAzureBlobStorageFile.cpp | 22 ++++++------- .../copyAzureBlobStorageFile.h | 4 +-- src/Storages/StorageAzureBlob.cpp | 2 +- 8 files changed, 57 insertions(+), 35 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 8c6c1040eec..fca324869ae 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -37,13 +37,12 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( , configuration(configuration_) { auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); - settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); - auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupReaderAzureBlobStorage", std::move(client_ptr), - std::move(settings_as_unique_ptr), + StorageAzureBlob::createSettings(context_), configuration_.container); - client = object_storage->getClient(); + client = object_storage->getAzureBlobStorageClient(); + settings = object_storage->getSettings(); } BackupReaderAzureBlobStorage::~BackupReaderAzureBlobStorage() = default; @@ -89,8 +88,8 @@ std::unique_ptr BackupReaderAzureBlobStorage::readFile(const key = file_name; } return std::make_unique( - client.get(), key, read_settings, settings->max_single_read_retries, - settings->max_single_download_retries); + client.get(), key, read_settings, settings.get()->max_single_read_retries, + settings.get()->max_single_download_retries); } void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, @@ -98,10 +97,8 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, { LOG_INFO(&Poco::Logger::get("BackupReaderAzureBlobStorage"), "Enter copyFileToDisk"); - /// Use the native copy as a more optimal way to copy a file from AzureBlobStorage to AzureBlobStorage if it's possible. - /// We don't check for `has_throttling` here because the native copy almost doesn't use network. 
auto destination_data_source_description = destination_disk->getDataSourceDescription(); - if (destination_data_source_description.sameKind(data_source_description) + if ((destination_data_source_description.type == DataSourceType::AzureBlobStorage) && (destination_data_source_description.is_encrypted == encrypted_in_backup)) { LOG_TRACE(log, "Copying {} from AzureBlobStorage to disk {}", path_in_backup, destination_disk->getName()); @@ -115,7 +112,7 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, copyAzureBlobStorageFile( client, - reinterpret_cast(destination_disk->getObjectStorage().get())->getClient(), + destination_disk->getObjectStorage()->getAzureBlobStorageClient(), configuration.container, fs::path(configuration.blob_path) / path_in_backup, 0, @@ -150,13 +147,12 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( , configuration(configuration_) { auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); - settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); - auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupWriterAzureBlobStorage", std::move(client_ptr), - std::move(settings_as_unique_ptr), + StorageAzureBlob::createSettings(context_), configuration_.container); - client = object_storage->getClient(); + client = object_storage->getAzureBlobStorageClient(); + settings = object_storage->getSettings(); } void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, @@ -172,7 +168,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu { LOG_TRACE(log, "Copying file {} from disk {} to AzureBlobStorag", src_path, src_disk->getName()); copyAzureBlobStorageFile( - reinterpret_cast(src_disk->getObjectStorage().get())->getClient(), + src_disk->getObjectStorage()->getAzureBlobStorageClient(), client, /* src_container */ blob_path[1], /* src_path */ blob_path[0], @@ -267,8 +263,8 @@ std::unique_ptr BackupWriterAzureBlobStorage::readFile(const String } return std::make_unique( - client.get(), key, read_settings, settings->max_single_read_retries, - settings->max_single_download_retries); + client.get(), key, read_settings, settings.get()->max_single_read_retries, + settings.get()->max_single_download_retries); } std::unique_ptr BackupWriterAzureBlobStorage::writeFile(const String & file_name) @@ -285,7 +281,7 @@ std::unique_ptr BackupWriterAzureBlobStorage::writeFile(const Strin return std::make_unique( client.get(), key, - settings->max_single_part_upload_size, + settings.get()->max_single_part_upload_size, DBMS_DEFAULT_BUFFER_SIZE, write_settings); } diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 12bf073cd08..87dc470cdb3 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -31,7 +31,7 @@ private: MultiVersion client; StorageAzureBlob::Configuration configuration; std::unique_ptr object_storage; - std::shared_ptr settings; + MultiVersion settings; }; class BackupWriterAzureBlobStorage : public BackupWriterDefault @@ -60,7 +60,7 @@ private: MultiVersion client; StorageAzureBlob::Configuration configuration; std::unique_ptr object_storage; - std::shared_ptr settings; + MultiVersion settings; }; } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 
52d535054ff..a9d082539e6 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -139,7 +139,12 @@ public: bool isRemote() const override { return true; } - MultiVersion & getClient() { return client; } + MultiVersion & getSettings() { return settings; } + + MultiVersion & getAzureBlobStorageClient() override + { + return client; + } private: const String name; diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index 4c185db051d..6b0ff8be58a 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -3,6 +3,7 @@ #include #include #include +#include "config.h" namespace Poco { @@ -118,6 +119,13 @@ public: static bool canUseReadThroughCache(const ReadSettings & settings); +#if USE_AZURE_BLOB_STORAGE + MultiVersion & getAzureBlobStorageClient() override + { + return object_storage->getAzureBlobStorageClient(); + } +#endif + private: FileCache::Key getCacheKey(const std::string & path) const; diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index f405be72287..cf113586ddf 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -23,7 +23,12 @@ #include #include #include +#include "config.h" +#if USE_AZURE_BLOB_STORAGE +#include +#include +#endif namespace DB { @@ -212,6 +217,14 @@ public: virtual WriteSettings patchSettings(const WriteSettings & write_settings) const; +#if USE_AZURE_BLOB_STORAGE + virtual MultiVersion & getAzureBlobStorageClient() + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "This function is only implemented for AzureBlobStorage"); + } +#endif + + private: mutable std::mutex throttlers_mutex; ThrottlerPtr remote_read_throttler; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 272be914cc1..bb8702e9b41 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -49,7 +49,7 @@ namespace size_t total_size_, const String & dest_container_, const String & dest_blob_, - std::shared_ptr settings_, + MultiVersion settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunner schedule_, bool for_disk_azure_blob_storage_, @@ -65,7 +65,7 @@ namespace , schedule(schedule_) , for_disk_azure_blob_storage(for_disk_azure_blob_storage_) , log(log_) - , max_single_part_upload_size(settings_->max_single_part_upload_size) + , max_single_part_upload_size(settings_.get()->max_single_part_upload_size) { } @@ -78,7 +78,7 @@ namespace size_t total_size; const String & dest_container; const String & dest_blob; - std::shared_ptr settings; + MultiVersion settings; const std::optional> & object_metadata; ThreadPoolCallbackRunner schedule; bool for_disk_azure_blob_storage; @@ -114,9 +114,9 @@ namespace if (!total_size) throw Exception(ErrorCodes::LOGICAL_ERROR, "Chosen multipart upload for an empty file. 
This must not happen"); - auto max_part_number = settings->max_part_number; - auto min_upload_part_size = settings->min_upload_part_size; - auto max_upload_part_size = settings->max_upload_part_size; + auto max_part_number = settings.get()->max_part_number; + auto min_upload_part_size = settings.get()->min_upload_part_size; + auto max_upload_part_size = settings.get()->max_upload_part_size; if (!max_part_number) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_part_number must not be 0"); @@ -333,7 +333,7 @@ void copyDataToAzureBlobStorageFile( MultiVersion & dest_client, const String & dest_container, const String & dest_blob, - std::shared_ptr settings, + MultiVersion settings, const std::optional> & object_metadata, ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) @@ -352,14 +352,14 @@ void copyAzureBlobStorageFile( size_t size, const String & dest_container, const String & dest_blob, - std::shared_ptr settings, + MultiVersion settings, const ReadSettings & read_settings, const std::optional> & object_metadata, ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) { - if (settings->use_native_copy) + if (settings.get()->use_native_copy) { ProfileEvents::increment(ProfileEvents::AzureCopyObject); if (for_disk_azure_blob_storage) @@ -393,8 +393,8 @@ void copyAzureBlobStorageFile( LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Container: {}, Blob: {}", src_container, src_blob); auto create_read_buffer = [&] { - return std::make_unique(src_client.get(), src_blob, read_settings, settings->max_single_read_retries, - settings->max_single_download_retries); + return std::make_unique(src_client.get(), src_blob, read_settings, settings.get()->max_single_read_retries, + settings.get()->max_single_download_retries); }; UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container, dest_blob, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyAzureBlobStorageFile")}; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index b022151d32d..491f7cd7176 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -29,7 +29,7 @@ void copyAzureBlobStorageFile( size_t src_size, const String & dest_container, const String & dest_blob, - std::shared_ptr settings, + MultiVersion settings, const ReadSettings & read_settings, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunner schedule_ = {}, @@ -48,7 +48,7 @@ void copyDataToAzureBlobStorageFile( MultiVersion & client, const String & dest_container, const String & dest_blob, - std::shared_ptr settings, + MultiVersion settings, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunner schedule_ = {}, bool for_disk_azure_blob_storage = false); diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 7a40d2dcb73..e54838c7a61 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -1214,7 +1214,7 @@ StorageAzureBlobSource::ReaderHolder StorageAzureBlobSource::createReader() QueryPipelineBuilder builder; std::shared_ptr source; std::unique_ptr read_buf; - std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files + std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files ? 
tryGetNumRowsFromCache(path_with_metadata) : std::nullopt; if (num_rows_from_cache) { From 4e5249275ed67c52d958007978c66619db22a1a5 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 22 Jan 2024 16:45:25 +0100 Subject: [PATCH 0201/1081] Do not resolve remote table id on initiator --- src/Storages/StorageDistributed.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 987ea4a4957..9972517bbac 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -779,18 +779,11 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, } else { - auto resolved_remote_storage_id = remote_storage_id; - // In case of cross-replication we don't know what database is used for the table. - // `storage_id.hasDatabase()` can return false only on the initiator node. - // Each shard will use the default database (in the case of cross-replication shards may have different defaults). - if (remote_storage_id.hasDatabase()) - resolved_remote_storage_id = query_context->resolveStorageID(remote_storage_id); - auto get_column_options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects().withVirtuals(); auto column_names_and_types = distributed_storage_snapshot->getColumns(get_column_options); - auto storage = std::make_shared(resolved_remote_storage_id, ColumnsDescription{column_names_and_types}); + auto storage = std::make_shared(remote_storage_id, ColumnsDescription{column_names_and_types}); auto table_node = std::make_shared(std::move(storage), query_context); if (table_expression_modifiers) From 7b235fe643e744b643be6e4d0788de63cae4a07c Mon Sep 17 00:00:00 2001 From: Blargian Date: Mon, 22 Jan 2024 22:59:59 +0200 Subject: [PATCH 0202/1081] #31363 - remove schema delimiter setting and add test 00937_format_schema_rows_template.sh and reference --- src/Formats/FormatFactory.cpp | 1 - .../Impl/TemplateBlockOutputFormat.cpp | 15 +++------ ...0937_format_schema_rows_template.reference | 4 +++ .../00937_format_schema_rows_template.sh | 32 +++++++++++++++++++ 4 files changed, 40 insertions(+), 12 deletions(-) create mode 100644 tests/queries/0_stateless/00937_format_schema_rows_template.reference create mode 100755 tests/queries/0_stateless/00937_format_schema_rows_template.sh diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 6f7f758621c..184778a9fa9 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -167,7 +167,6 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; format_settings.template_settings.row_format = settings.format_template_row; format_settings.template_settings.row_format_schema = settings.format_schema_rows_template; - format_settings.template_settings.row_between_delimiter_schema = settings.format_schema_rows_between_delimiter; format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line; format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; format_settings.tsv.enum_as_number = settings.input_format_tsv_enum_as_number; diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 495cc0e541e..99a7f59c09e 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ 
b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -221,21 +221,14 @@ void registerOutputFormatTemplate(FormatFactory & factory) }; if (settings.template_settings.row_format.empty()) { - if (settings.template_settings.row_format_schema.empty()) - { - throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template"); - } - else - { - row_format = ParsedTemplateFormatString(); - row_format.parse(settings.template_settings.row_format_schema,idx_by_name); - } + row_format = ParsedTemplateFormatString(); + row_format.parse(settings.template_settings.row_format_schema,idx_by_name); } else { - if (settings.template_settings.row_format_schema.empty()) + if (!settings.template_settings.row_format_schema.empty()) { - throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template"); + throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template, but not both"); } row_format = ParsedTemplateFormatString( FormatSchemaInfo(settings.template_settings.row_format, "Template", false, diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.reference b/tests/queries/0_stateless/00937_format_schema_rows_template.reference new file mode 100644 index 00000000000..167f16ec55f --- /dev/null +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.reference @@ -0,0 +1,4 @@ +Question: 'How awesome is clickhouse?', Answer: 'unbelievably awesome!', Number of Likes: 456, Date: 2016-01-02; +Question: 'How fast is clickhouse?', Answer: 'Lightning fast!', Number of Likes: 9876543210, Date: 2016-01-03; +Question: 'Is it opensource', Answer: 'of course it is!', Number of Likes: 789, Date: 2016-01-04 + diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.sh b/tests/queries/0_stateless/00937_format_schema_rows_template.sh new file mode 100755 index 00000000000..651e3618f83 --- /dev/null +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# shellcheck disable=SC2016 + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +# Test format_schema_rows_template setting + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS template"; +$CLICKHOUSE_CLIENT --query="CREATE TABLE template (question String, answer String, likes UInt64, date Date) ENGINE = Memory"; +$CLICKHOUSE_CLIENT --query="INSERT INTO template VALUES +('How awesome is clickhouse?', 'unbelievably awesome!', 456, '2016-01-02'),\ +('How fast is clickhouse?', 'Lightning fast!', 9876543210, '2016-01-03'),\ +('Is it opensource', 'of course it is!', 789, '2016-01-04')"; + +$CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ +format_schema_rows_template = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_rows_between_delimiter = ';\n'"; + +echo -e "\n" + +# Test that if both format_schema_rows_template setting and format_template_row are provided, error is thrown + +echo -ne 'Question: ${question:Quoted}, Answer: ${answer:Quoted}, Number of Likes: ${likes:Raw}, Date: ${date:Raw}' > "$CURDIR"/00937_template_output_format_row.tmp +$CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ +format_template_row = '$CURDIR/00937_template_output_format_row.tmp', \ +format_schema_rows_template = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_rows_between_delimiter = ';\n'"; -- { serverError 474 } + +$CLICKHOUSE_CLIENT --query="DROP TABLE template"; +rm "$CURDIR"/00937_template_output_format_row.tmp \ No newline at end of file From 3832a8261a19004e88a32b4bab39f6b46b14daa6 Mon Sep 17 00:00:00 2001 From: Blargian Date: Mon, 22 Jan 2024 23:20:02 +0200 Subject: [PATCH 0203/1081] #31363 - update documentation for En and Ru --- docs/en/interfaces/formats.md | 4 +++- docs/ru/interfaces/formats.md | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index a11c3e5ef19..fd44fbf4462 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -253,7 +253,7 @@ This format is also available under the name `TSVRawWithNamesAndNames`. This format allows specifying a custom format string with placeholders for values with a specified escaping rule. -It uses settings `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) +It uses settings `format_template_resultset`, `format_template_row` (`format_schema_rows_template`), `format_template_rows_between_delimiter` and some settings of other formats (e.g. 
`output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) Setting `format_template_row` specifies the path to the file containing format strings for rows with the following syntax: @@ -279,6 +279,8 @@ the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quo `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` +In cases where it is challenging or not possible to deploy format output configuration for the template format to a directory on all nodes in a cluster, or if the format is trivial, then `format_schema_rows_template` can be used to pass the template string directly in the query, rather than a path to the file which contains it. + The `format_template_rows_between_delimiter` setting specifies the delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) Setting `format_template_resultset` specifies the path to the file, which contains a format string for resultset. Format string for resultset has the same syntax as a format string for row and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index b4794b02743..8f8197e2221 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -201,7 +201,7 @@ SELECT * FROM nestedt FORMAT TSV Этот формат позволяет указать произвольную форматную строку, в которую подставляются значения, сериализованные выбранным способом. -Для этого используются настройки `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) +Для этого используются настройки `format_template_resultset`, `format_template_row` (`format_schema_rows_template`), `format_template_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) Настройка `format_template_row` задаёт путь к файлу, содержащему форматную строку для строк таблицы, которая должна иметь вид: @@ -227,6 +227,8 @@ SELECT * FROM nestedt FORMAT TSV `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` +В тех случаях, когда неудобно или невозможно указать произвольную форматную строку в файле, можно использовать `format_schema_rows_template`, чтобы указать произвольную форматную строку прямо в запросе. + Настройка `format_template_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожмдается при вводе) после каждой строки, кроме последней. По умолчанию `\n`. Настройка `format_template_resultset` задаёт путь к файлу, содержащему форматную строку для результата. Форматная строка для результата имеет синтаксис аналогичный форматной строке для строк таблицы и позволяет указать префикс, суффикс и способ вывода дополнительной информации.
Вместо имён столбцов в ней указываются следующие имена подстановок: From c59f3e164ffedf1a7561b0fd0a65c2555685ca91 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Thu, 18 Jan 2024 12:08:29 -0800 Subject: [PATCH 0204/1081] Return baseline component as fourth array --- .../sql-reference/functions/time-series-functions.md | 8 ++++++-- src/Functions/seriesDecomposeSTL.cpp | 10 +++++++++- .../0_stateless/02813_seriesDecomposeSTL.reference | 8 ++++---- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index 144d832b36a..2e42aa884b4 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -77,8 +77,8 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** -- An array of three arrays where the first array include seasonal components, the second array - trend, -and the third array - residue component. +- An array of four arrays where the first array include seasonal components, the second array - trend, +the third array - residue component, and the fourth array - baseline component. Type: [Array](../../sql-reference/data-types/array.md). @@ -107,6 +107,10 @@ Result: [ 0, 0.0000019073486, -0.0000019073486, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.0000019073486, 0, 0 + ], + [ + 10.1, 20.449999, 40.340004, 10.100001, 20.45, 40.34, 10.100001, 20.45, 40.34, 10.1, 20.45, 40.34, + 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.100002, 20.45, 40.34 ]] │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` diff --git a/src/Functions/seriesDecomposeSTL.cpp b/src/Functions/seriesDecomposeSTL.cpp index 21e36761213..0c44afa32a6 100644 --- a/src/Functions/seriesDecomposeSTL.cpp +++ b/src/Functions/seriesDecomposeSTL.cpp @@ -128,6 +128,10 @@ public: res_data.insert(residue.begin(), residue.end()); res_col_offsets_data.push_back(res_data.size()); + // Create Baseline = seasonal + trend + std::transform(seasonal.begin(), seasonal.end(), trend.begin(), std::back_inserter(res_data), std::plus()); + res_col_offsets_data.push_back(res_data.size()); + root_offsets_data.push_back(res_col_offsets->size()); prev_src_offset = curr_offset; @@ -201,7 +205,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** -- An array of three arrays where the first array include seasonal components, the second array - trend, and the third array - residue component. +- An array of four arrays where the first array include seasonal components, the second array - trend, the third array - residue component, and the fourth array - baseline component. Type: [Array](../../sql-reference/data-types/array.md). 
@@ -230,6 +234,10 @@ Result: [ 0, 0.0000019073486, -0.0000019073486, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.0000019073486, 0, 0 + ], + [ + 10.1, 20.449999, 40.340004, 10.100001, 20.45, 40.34, 10.100001, 20.45, 40.34, 10.1, 20.45, 40.34, + 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.100002, 20.45, 40.34 ]] │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ```)", diff --git a/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference b/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference index dc30e7f8371..28dae705335 100644 --- a/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference +++ b/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference @@ -1,4 +1,4 @@ -[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0]] -[[4.04452e-8,-1.7846537e-8,-5.9488454e-9,0,0,0,0,0,0,-1.9868216e-8,-9.5297715e-8,2.2540547e-9,3.4229203e-8,8.573613e-8],[1.9999999,2,2,2,2,2,2,2,2,2,2,2,1.9999996,1.9999996],[1.1920929e-7,0,0,0,0,0,0,0,0,0,0,0,3.5762787e-7,2.3841858e-7]] -[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0]] 
-[[53.946846,-4.8119445,43.525013,-23.71359,-42.472305,-51.636955,-50.458298,-51.982674,37.62072,-15.9006605,56.65076,-5.809669,57.143845,-2.0370207,54.050922,-4.897961,43.954018,-23.808758,-42.651337,-51.86827,-50.709732,-52.18156,37.734905,-15.853402,56.91643,-5.8815174,57.253094,-2.012879,54.157806,-4.9817176,44.384747,-23.902956,-42.830154,-52.10025,-50.96271,-52.3829,37.84573,-15.81032,57.177113,-5.958963,57.356136,-1.9952412,54.27533,-5.066312,44.878296,-23.956438,-42.993656,-52.337124,-51.208073,-52.615646,37.91102,-15.8062525,57.49891,-6.056076,57.45604,-1.9797823,54.39525,-5.1483474,45.374573],[88.028534,88.95315,89.87776,90.802376,91.64913,92.49588,93.342636,94.19737,95.0521,95.90684,96.712975,97.51912,98.32526,98.36342,98.40158,98.43974,98.36777,98.29579,98.223816,98.536446,98.849075,99.161705,99.7552,100.348694,100.94219,101.53184,102.12149,102.711136,103.79921,104.88729,105.975365,107.50462,109.033875,110.56313,111.79767,113.032196,114.26673,115.02128,115.775826,116.53037,117.15541,117.78044,118.40548,118.86489,119.3243,119.783714,120.04031,120.29691,120.55351,120.78621,121.01891,121.25161,121.533585,121.81555,122.09753,122.41821,122.7389,123.059586,123.39267],[-2.97538,2.8587952,-23.402771,0.91121674,4.8231735,9.141075,8.115662,10.785301,0.32717896,5.99382,-12.363731,5.29055,0.53089905,-2.3264008,-3.4524994,1.4582214,-2.321785,2.51297,5.4275208,3.3318253,5.8606567,0.019859314,-4.4901123,-12.495293,-5.8586197,-1.650322,-11.374588,4.3017426,4.042984,1.094429,9.639885,3.3983307,-3.20372,-5.462883,-5.834961,-6.649292,-1.1124649,3.7890396,16.047066,-2.5714111,8.488449,-2.785202,2.319191,-0.79857635,13.797401,-5.827278,-6.0466614,-5.9597855,-7.3454437,-3.1705627,6.0700684,3.5546417,1.9675064,-0.7594757,2.446434,0.5615692,0.86585236,-3.9112396,1.2327576]] +[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0],[10.1,20.449999,40.340004,10.100001,20.45,40.34,10.100001,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.100002,20.45,40.34]] +[[4.04452e-8,-1.7846537e-8,-5.9488454e-9,0,0,0,0,0,0,-1.9868216e-8,-9.5297715e-8,2.2540547e-9,3.4229203e-8,8.573613e-8],[1.9999999,2,2,2,2,2,2,2,2,2,2,2,1.9999996,1.9999996],[1.1920929e-7,0,0,0,0,0,0,0,0,0,0,0,3.5762787e-7,2.3841858e-7],[1.9999999,2,2,2,2,2,2,2,2,2,1.9999999,2,1.9999996,1.9999998]] +[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0],[10.1,20.449999,40.340004,10.100001,20.45,40.34,10.100001,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.100002,20.45,40.34]] 
+[[53.946846,-4.8119445,43.525013,-23.71359,-42.472305,-51.636955,-50.458298,-51.982674,37.62072,-15.9006605,56.65076,-5.809669,57.143845,-2.0370207,54.050922,-4.897961,43.954018,-23.808758,-42.651337,-51.86827,-50.709732,-52.18156,37.734905,-15.853402,56.91643,-5.8815174,57.253094,-2.012879,54.157806,-4.9817176,44.384747,-23.902956,-42.830154,-52.10025,-50.96271,-52.3829,37.84573,-15.81032,57.177113,-5.958963,57.356136,-1.9952412,54.27533,-5.066312,44.878296,-23.956438,-42.993656,-52.337124,-51.208073,-52.615646,37.91102,-15.8062525,57.49891,-6.056076,57.45604,-1.9797823,54.39525,-5.1483474,45.374573],[88.028534,88.95315,89.87776,90.802376,91.64913,92.49588,93.342636,94.19737,95.0521,95.90684,96.712975,97.51912,98.32526,98.36342,98.40158,98.43974,98.36777,98.29579,98.223816,98.536446,98.849075,99.161705,99.7552,100.348694,100.94219,101.53184,102.12149,102.711136,103.79921,104.88729,105.975365,107.50462,109.033875,110.56313,111.79767,113.032196,114.26673,115.02128,115.775826,116.53037,117.15541,117.78044,118.40548,118.86489,119.3243,119.783714,120.04031,120.29691,120.55351,120.78621,121.01891,121.25161,121.533585,121.81555,122.09753,122.41821,122.7389,123.059586,123.39267],[-2.97538,2.8587952,-23.402771,0.91121674,4.8231735,9.141075,8.115662,10.785301,0.32717896,5.99382,-12.363731,5.29055,0.53089905,-2.3264008,-3.4524994,1.4582214,-2.321785,2.51297,5.4275208,3.3318253,5.8606567,0.019859314,-4.4901123,-12.495293,-5.8586197,-1.650322,-11.374588,4.3017426,4.042984,1.094429,9.639885,3.3983307,-3.20372,-5.462883,-5.834961,-6.649292,-1.1124649,3.7890396,16.047066,-2.5714111,8.488449,-2.785202,2.319191,-0.79857635,13.797401,-5.827278,-6.0466614,-5.9597855,-7.3454437,-3.1705627,6.0700684,3.5546417,1.9675064,-0.7594757,2.446434,0.5615692,0.86585236,-3.9112396,1.2327576],[141.97537,84.141205,133.40277,67.08878,49.176826,40.858925,42.88434,42.2147,132.67282,80.00618,153.36374,91.70945,155.4691,96.3264,152.4525,93.54178,142.32178,74.48703,55.57248,46.668175,48.139343,46.980145,137.49011,84.49529,157.85863,95.65032,159.37459,100.69826,157.95702,99.90557,150.3601,83.60167,66.20372,58.462883,60.834957,60.649296,152.11246,99.21096,172.95294,110.57141,174.51155,115.7852,172.68082,113.79858,164.2026,95.82728,77.04666,67.95979,69.34544,68.17056,158.92993,105.44536,179.0325,115.759476,179.55356,120.43843,177.13416,117.91124,168.76724]] From f935493f284e1acd94eacdf0c50f91de688d817e Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Fri, 19 Jan 2024 09:05:04 -0800 Subject: [PATCH 0205/1081] fix trailing whitespaces --- docs/en/sql-reference/functions/time-series-functions.md | 2 +- src/Functions/seriesDecomposeSTL.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index 2e42aa884b4..21e66302ad2 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -78,7 +78,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** - An array of four arrays where the first array include seasonal components, the second array - trend, -the third array - residue component, and the fourth array - baseline component. +the third array - residue component, and the fourth array - baseline(seasoanl + trend) component. Type: [Array](../../sql-reference/data-types/array.md). 
diff --git a/src/Functions/seriesDecomposeSTL.cpp b/src/Functions/seriesDecomposeSTL.cpp index 0c44afa32a6..9a6a229e282 100644 --- a/src/Functions/seriesDecomposeSTL.cpp +++ b/src/Functions/seriesDecomposeSTL.cpp @@ -128,7 +128,7 @@ public: res_data.insert(residue.begin(), residue.end()); res_col_offsets_data.push_back(res_data.size()); - // Create Baseline = seasonal + trend + // Create Baseline = seasonal + trend std::transform(seasonal.begin(), seasonal.end(), trend.begin(), std::back_inserter(res_data), std::plus()); res_col_offsets_data.push_back(res_data.size()); @@ -205,7 +205,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** -- An array of four arrays where the first array include seasonal components, the second array - trend, the third array - residue component, and the fourth array - baseline component. +- An array of four arrays where the first array include seasonal components, the second array - trend, the third array - residue component, and the fourth array - baseline(seasoanl + trend) component. Type: [Array](../../sql-reference/data-types/array.md). From d9edd5a7f36491a8d86705e6c7221c1a74cd6ef5 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Fri, 19 Jan 2024 10:20:01 -0800 Subject: [PATCH 0206/1081] fix spell check --- docs/en/sql-reference/functions/time-series-functions.md | 2 +- src/Functions/seriesDecomposeSTL.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index 21e66302ad2..016c3410944 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -78,7 +78,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** - An array of four arrays where the first array include seasonal components, the second array - trend, -the third array - residue component, and the fourth array - baseline(seasoanl + trend) component. +the third array - residue component, and the fourth array - baseline(seasonal + trend) component. Type: [Array](../../sql-reference/data-types/array.md). diff --git a/src/Functions/seriesDecomposeSTL.cpp b/src/Functions/seriesDecomposeSTL.cpp index 9a6a229e282..4376691868b 100644 --- a/src/Functions/seriesDecomposeSTL.cpp +++ b/src/Functions/seriesDecomposeSTL.cpp @@ -205,7 +205,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** -- An array of four arrays where the first array include seasonal components, the second array - trend, the third array - residue component, and the fourth array - baseline(seasoanl + trend) component. +- An array of four arrays where the first array include seasonal components, the second array - trend, the third array - residue component, and the fourth array - baseline(seasonal + trend) component. Type: [Array](../../sql-reference/data-types/array.md). 
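As a usage sketch (not part of the patch itself), the query below feeds a repeating period-3 series, consistent with the documented output above, into `seriesDecomposeSTL` and reads the new fourth array; per the `std::transform` line added to `seriesDecomposeSTL.cpp`, the baseline is expected to equal the element-wise sum of the seasonal and trend arrays.

```sql
-- Sketch only: decompose a period-3 series and compare the new fourth
-- (baseline) array against seasonal + trend computed with arrayMap.
WITH
    seriesDecomposeSTL(
        [10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34,
         10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34],
        3) AS decomposed
SELECT
    decomposed[1] AS seasonal,
    decomposed[2] AS trend,
    decomposed[4] AS baseline,
    arrayMap((s, t) -> s + t, decomposed[1], decomposed[2]) AS seasonal_plus_trend;
```

Up to floating-point rounding, `baseline` and `seasonal_plus_trend` should match.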
From 084ee74b6898214024feafcfa292ff8419bc0050 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Mon, 22 Jan 2024 07:31:33 -0800 Subject: [PATCH 0207/1081] minor fix --- src/Functions/seriesDecomposeSTL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/seriesDecomposeSTL.cpp b/src/Functions/seriesDecomposeSTL.cpp index 4376691868b..e9276c4aefb 100644 --- a/src/Functions/seriesDecomposeSTL.cpp +++ b/src/Functions/seriesDecomposeSTL.cpp @@ -129,7 +129,7 @@ public: res_col_offsets_data.push_back(res_data.size()); // Create Baseline = seasonal + trend - std::transform(seasonal.begin(), seasonal.end(), trend.begin(), std::back_inserter(res_data), std::plus()); + std::transform(seasonal.begin(), seasonal.end(), trend.begin(), std::back_inserter(res_data), std::plus<>()); res_col_offsets_data.push_back(res_data.size()); root_offsets_data.push_back(res_col_offsets->size()); From e78eb41264ebb37d3fd813850a3e55ce7690ecea Mon Sep 17 00:00:00 2001 From: MyroTk <44327070+MyroTk@users.noreply.github.com> Date: Mon, 22 Jan 2024 15:19:31 -0800 Subject: [PATCH 0208/1081] Update Dockerfile --- docker/test/integration/runner/Dockerfile | 57 +++++++++++------------ 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index 2a81db78a3d..dbf90f9b810 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -63,47 +63,46 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - \ RUN python3 -m pip install --no-cache-dir \ - PyMySQL \ - aerospike==11.1.0 \ - asyncio \ + PyMySQL==1.1.0 \ + asyncio==3.4.3 \ avro==1.10.2 \ - azure-storage-blob \ - boto3 \ - cassandra-driver \ + azure-storage-blob==12.19.0 \ + boto3==1.34.24 \ + cassandra-driver==3.29.0 \ confluent-kafka==1.9.2 \ delta-spark==2.3.0 \ - dict2xml \ - dicttoxml \ + dict2xml==1.7.4 \ + dicttoxml==1.7.16 \ docker==6.1.3 \ docker-compose==1.29.2 \ - grpcio \ - grpcio-tools \ - kafka-python \ - kazoo \ - lz4 \ - minio \ - nats-py \ - protobuf \ + grpcio==1.60.0 \ + grpcio-tools==1.60.0 \ + kafka-python==2.0.2 \ + kazoo==2.9.0 \ + lz4==4.3.3 \ + minio==7.2.3 \ + nats-py==2.6.0 \ + protobuf==4.25.2 \ psycopg2-binary==2.9.6 \ - pyhdfs \ + pyhdfs==0.3.1 \ pymongo==3.11.0 \ pyspark==3.3.2 \ - pytest \ + pytest==7.4.4 \ pytest-order==1.0.0 \ - pytest-random \ - pytest-repeat \ - pytest-timeout \ - pytest-xdist \ + pytest-random==0.2 \ + pytest-repeat==0.9.3 \ + pytest-timeout==2.2.0 \ + pytest-xdist==3.5.0 \ pytest-reportlog==0.4.0 \ - pytz \ + pytz==2023.3.post1 \ pyyaml==5.3.1 \ - redis \ - requests-kerberos \ + redis==5.0.1 \ + requests-kerberos==0.14.0 \ tzlocal==2.1 \ - retry \ - bs4 \ - lxml \ - urllib3 + retry==0.9.2 \ + bs4==0.0.2 \ + lxml==5.1.0 \ + urllib3==2.0.7 # bs4, lxml are for cloud tests, do not delete # Hudi supports only spark 3.3.*, not 3.4 From 276ccd3d47be40b79abbaf7734f557d578501b19 Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 23 Jan 2024 07:18:14 +0200 Subject: [PATCH 0209/1081] empty commit to restart CI checks From 78df07199bc57c8dac9a56fb8092eb1256ad8b56 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 23 Jan 2024 07:16:38 +0000 Subject: [PATCH 0210/1081] More consistent tests for inverted index --- ...> 02346_inverted_index_bug47393.reference} | 0 .../02346_inverted_index_bug47393.sql | 25 +++++++++++++++++++ ...> 02346_inverted_index_bug52019.reference} | 0 ....sql => 02346_inverted_index_bug52019.sql} | 17 ++++++++++--- 
...46_inverted_index_detach_attach.reference} | 0 ...=> 02346_inverted_index_detach_attach.sql} | 6 ++--- ...nverted_index_experimental_flag.reference} | 0 ...2346_inverted_index_experimental_flag.sql} | 3 +++ ..._inverted_index_match_predicate.reference} | 0 ... 02346_inverted_index_match_predicate.sql} | 2 ++ .../02346_inverted_index_mutation.sql | 25 ------------------- ... => 02346_inverted_index_search.reference} | 0 ...ch.sql => 02346_inverted_index_search.sql} | 0 13 files changed, 47 insertions(+), 31 deletions(-) rename tests/queries/0_stateless/{02346_inverted_index_mutation.reference => 02346_inverted_index_bug47393.reference} (100%) create mode 100644 tests/queries/0_stateless/02346_inverted_index_bug47393.sql rename tests/queries/0_stateless/{02696_inverted_idx_checksums.reference => 02346_inverted_index_bug52019.reference} (100%) rename tests/queries/0_stateless/{02862_index_inverted_incorrect_args.sql => 02346_inverted_index_bug52019.sql} (62%) rename tests/queries/0_stateless/{02862_index_inverted_incorrect_args.reference => 02346_inverted_index_detach_attach.reference} (100%) rename tests/queries/0_stateless/{02696_inverted_idx_checksums.sql => 02346_inverted_index_detach_attach.sql} (75%) rename tests/queries/0_stateless/{02895_forbid_create_inverted_index.reference => 02346_inverted_index_experimental_flag.reference} (100%) rename tests/queries/0_stateless/{02895_forbid_create_inverted_index.sql => 02346_inverted_index_experimental_flag.sql} (72%) rename tests/queries/0_stateless/{02951_inverted_index_support_match.reference => 02346_inverted_index_match_predicate.reference} (100%) rename tests/queries/0_stateless/{02951_inverted_index_support_match.sql => 02346_inverted_index_match_predicate.sql} (97%) delete mode 100644 tests/queries/0_stateless/02346_inverted_index_mutation.sql rename tests/queries/0_stateless/{02346_full_text_search.reference => 02346_inverted_index_search.reference} (100%) rename tests/queries/0_stateless/{02346_full_text_search.sql => 02346_inverted_index_search.sql} (100%) diff --git a/tests/queries/0_stateless/02346_inverted_index_mutation.reference b/tests/queries/0_stateless/02346_inverted_index_bug47393.reference similarity index 100% rename from tests/queries/0_stateless/02346_inverted_index_mutation.reference rename to tests/queries/0_stateless/02346_inverted_index_bug47393.reference diff --git a/tests/queries/0_stateless/02346_inverted_index_bug47393.sql b/tests/queries/0_stateless/02346_inverted_index_bug47393.sql new file mode 100644 index 00000000000..166e051b120 --- /dev/null +++ b/tests/queries/0_stateless/02346_inverted_index_bug47393.sql @@ -0,0 +1,25 @@ +SET allow_experimental_inverted_index = 1; + +DROP TABLE IF EXISTS tab; +CREATE TABLE tab +( + id UInt64, + str String, + INDEX idx str TYPE inverted(3) GRANULARITY 1 +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS min_rows_for_wide_part = 1, min_bytes_for_wide_part = 1; + +INSERT INTO tab (str) VALUES ('I am inverted'); + +SELECT data_version FROM system.parts WHERE database = currentDatabase() AND table = 'tab' AND active = 1; + +-- update column synchronously +ALTER TABLE tab UPDATE str = 'I am not inverted' WHERE 1 SETTINGS mutations_sync=1; + +SELECT data_version FROM system.parts WHERE database = currentDatabase() AND table = 'tab' AND active = 1; + +SELECT str FROM tab WHERE str LIKE '%inverted%' SETTINGS force_data_skipping_indices = 'idx'; + +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02696_inverted_idx_checksums.reference 
b/tests/queries/0_stateless/02346_inverted_index_bug52019.reference similarity index 100% rename from tests/queries/0_stateless/02696_inverted_idx_checksums.reference rename to tests/queries/0_stateless/02346_inverted_index_bug52019.reference diff --git a/tests/queries/0_stateless/02862_index_inverted_incorrect_args.sql b/tests/queries/0_stateless/02346_inverted_index_bug52019.sql similarity index 62% rename from tests/queries/0_stateless/02862_index_inverted_incorrect_args.sql rename to tests/queries/0_stateless/02346_inverted_index_bug52019.sql index 7ba122a7155..c61e17d9cea 100644 --- a/tests/queries/0_stateless/02862_index_inverted_incorrect_args.sql +++ b/tests/queries/0_stateless/02346_inverted_index_bug52019.sql @@ -1,9 +1,20 @@ --- https://github.com/ClickHouse/ClickHouse/issues/52019 -DROP TABLE IF EXISTS tab; +-- Test for Bug 52019: Undefined behavior + SET allow_experimental_inverted_index=1; -CREATE TABLE tab (`k` UInt64, `s` Map(String, String), INDEX af mapKeys(s) TYPE inverted(2) GRANULARITY 1) ENGINE = MergeTree ORDER BY k SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi'; + +DROP TABLE IF EXISTS tab; + +CREATE TABLE tab ( + k UInt64, + s Map(String, String), + INDEX idx mapKeys(s) TYPE inverted(2) GRANULARITY 1) +ENGINE = MergeTree +ORDER BY k +SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi'; + INSERT INTO tab (k) VALUES (0); SELECT * FROM tab PREWHERE (s[NULL]) = 'Click a03' SETTINGS allow_experimental_analyzer=1; SELECT * FROM tab PREWHERE (s[1]) = 'Click a03' SETTINGS allow_experimental_analyzer=1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT * FROM tab PREWHERE (s['foo']) = 'Click a03' SETTINGS allow_experimental_analyzer=1; + DROP TABLE tab; diff --git a/tests/queries/0_stateless/02862_index_inverted_incorrect_args.reference b/tests/queries/0_stateless/02346_inverted_index_detach_attach.reference similarity index 100% rename from tests/queries/0_stateless/02862_index_inverted_incorrect_args.reference rename to tests/queries/0_stateless/02346_inverted_index_detach_attach.reference diff --git a/tests/queries/0_stateless/02696_inverted_idx_checksums.sql b/tests/queries/0_stateless/02346_inverted_index_detach_attach.sql similarity index 75% rename from tests/queries/0_stateless/02696_inverted_idx_checksums.sql rename to tests/queries/0_stateless/02346_inverted_index_detach_attach.sql index 92ffa7a6196..762d78922fe 100644 --- a/tests/queries/0_stateless/02696_inverted_idx_checksums.sql +++ b/tests/queries/0_stateless/02346_inverted_index_detach_attach.sql @@ -2,8 +2,8 @@ SET allow_experimental_inverted_index = 1; CREATE TABLE t ( - `key` UInt64, - `str` String, + key UInt64, + str String, INDEX inv_idx str TYPE inverted(0) GRANULARITY 1 ) ENGINE = MergeTree @@ -13,4 +13,4 @@ INSERT INTO t VALUES (1, 'Hello World'); ALTER TABLE t DETACH PART 'all_1_1_0'; -ALTER TABLE t ATTACH PART 'all_1_1_0'; \ No newline at end of file +ALTER TABLE t ATTACH PART 'all_1_1_0'; diff --git a/tests/queries/0_stateless/02895_forbid_create_inverted_index.reference b/tests/queries/0_stateless/02346_inverted_index_experimental_flag.reference similarity index 100% rename from tests/queries/0_stateless/02895_forbid_create_inverted_index.reference rename to tests/queries/0_stateless/02346_inverted_index_experimental_flag.reference diff --git a/tests/queries/0_stateless/02895_forbid_create_inverted_index.sql b/tests/queries/0_stateless/02346_inverted_index_experimental_flag.sql similarity index 72% rename from 
tests/queries/0_stateless/02895_forbid_create_inverted_index.sql rename to tests/queries/0_stateless/02346_inverted_index_experimental_flag.sql index dc92d9198fb..bf89265372e 100644 --- a/tests/queries/0_stateless/02895_forbid_create_inverted_index.sql +++ b/tests/queries/0_stateless/02346_inverted_index_experimental_flag.sql @@ -1,4 +1,7 @@ +-- Tests that the inverted index can only be supported when allow_experimental_inverted_index = 1. + SET allow_experimental_inverted_index = 0; + DROP TABLE IF EXISTS tab; CREATE TABLE tab ( diff --git a/tests/queries/0_stateless/02951_inverted_index_support_match.reference b/tests/queries/0_stateless/02346_inverted_index_match_predicate.reference similarity index 100% rename from tests/queries/0_stateless/02951_inverted_index_support_match.reference rename to tests/queries/0_stateless/02346_inverted_index_match_predicate.reference diff --git a/tests/queries/0_stateless/02951_inverted_index_support_match.sql b/tests/queries/0_stateless/02346_inverted_index_match_predicate.sql similarity index 97% rename from tests/queries/0_stateless/02951_inverted_index_support_match.sql rename to tests/queries/0_stateless/02346_inverted_index_match_predicate.sql index 9ebf10412d9..99405c0acf2 100644 --- a/tests/queries/0_stateless/02951_inverted_index_support_match.sql +++ b/tests/queries/0_stateless/02346_inverted_index_match_predicate.sql @@ -1,3 +1,5 @@ +-- Tests that match() utilizes the inverted index + SET allow_experimental_inverted_index = true; DROP TABLE IF EXISTS tab; diff --git a/tests/queries/0_stateless/02346_inverted_index_mutation.sql b/tests/queries/0_stateless/02346_inverted_index_mutation.sql deleted file mode 100644 index 83b73807cd7..00000000000 --- a/tests/queries/0_stateless/02346_inverted_index_mutation.sql +++ /dev/null @@ -1,25 +0,0 @@ -SET allow_experimental_inverted_index=1; - -DROP TABLE IF EXISTS t; -CREATE TABLE t -( - `timestamp` UInt64, - `s` String, - INDEX idx s TYPE inverted(3) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY tuple() -SETTINGS min_rows_for_wide_part = 1, min_bytes_for_wide_part = 1; - -INSERT INTO t (s) VALUES ('I am inverted'); - -SELECT data_version FROM system.parts WHERE database=currentDatabase() AND table='t' AND active=1; - --- do update column synchronously -ALTER TABLE t UPDATE s='I am not inverted' WHERE 1 SETTINGS mutations_sync=1; - -SELECT data_version FROM system.parts WHERE database=currentDatabase() AND table='t' AND active=1; - -SELECT s FROM t WHERE s LIKE '%inverted%' SETTINGS force_data_skipping_indices='idx'; - -DROP TABLE t; diff --git a/tests/queries/0_stateless/02346_full_text_search.reference b/tests/queries/0_stateless/02346_inverted_index_search.reference similarity index 100% rename from tests/queries/0_stateless/02346_full_text_search.reference rename to tests/queries/0_stateless/02346_inverted_index_search.reference diff --git a/tests/queries/0_stateless/02346_full_text_search.sql b/tests/queries/0_stateless/02346_inverted_index_search.sql similarity index 100% rename from tests/queries/0_stateless/02346_full_text_search.sql rename to tests/queries/0_stateless/02346_inverted_index_search.sql From bfec324b2818a3764c09347508125051273dac25 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 23 Jan 2024 10:06:25 +0000 Subject: [PATCH 0211/1081] Some fixups + test --- src/Storages/MergeTree/MutateTask.cpp | 18 ++++++++++------- .../02346_inverted_index_bug59039.reference | 0 .../02346_inverted_index_bug59039.sql | 20 +++++++++++++++++++ 3 files changed, 31 insertions(+), 7 deletions(-) 
create mode 100644 tests/queries/0_stateless/02346_inverted_index_bug59039.reference create mode 100644 tests/queries/0_stateless/02346_inverted_index_bug59039.sql diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index fccee6bd887..48aad368dd4 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -674,21 +674,25 @@ static NameToNameVector collectFilesForRenames( { if (command.type == MutationCommand::Type::DROP_INDEX) { - const std::vector suffixes = {".idx2", ".idx", ".gin_dict", ".gin_post", ".gin_seg", ".gin_sid"}; + static const std::array suffixes = {".idx2", ".idx"}; + static const std::array gin_suffixes = {".gin_dict", ".gin_post", ".gin_seg", ".gin_sid"}; /// .gin_* is inverted index for (const auto & suffix : suffixes) { - String filename = INDEX_FILE_PREFIX + command.column_name + suffix; + const String filename = INDEX_FILE_PREFIX + command.column_name + suffix; + const String filename_mrk = INDEX_FILE_PREFIX + command.column_name + mrk_extension; - if ((suffix == ".idx2" || suffix == ".idx") && source_part->checksums.has(filename)) + if (source_part->checksums.has(filename)) { add_rename(filename, ""); - add_rename(INDEX_FILE_PREFIX + command.column_name + mrk_extension, ""); + add_rename(filename_mrk, ""); } - else if (source_part->checksums.has(filename)) - { + } + for (const auto & gin_suffix : gin_suffixes) + { + const String filename = INDEX_FILE_PREFIX + command.column_name + gin_suffix; + if (source_part->checksums.has(filename)) add_rename(filename, ""); - } } } else if (command.type == MutationCommand::Type::DROP_PROJECTION) diff --git a/tests/queries/0_stateless/02346_inverted_index_bug59039.reference b/tests/queries/0_stateless/02346_inverted_index_bug59039.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02346_inverted_index_bug59039.sql b/tests/queries/0_stateless/02346_inverted_index_bug59039.sql new file mode 100644 index 00000000000..0ef0cb0c733 --- /dev/null +++ b/tests/queries/0_stateless/02346_inverted_index_bug59039.sql @@ -0,0 +1,20 @@ +-- This is supposed to test that DROP INDEX removes all index related files. Can't test this directly but at least run the statement and +-- check that no bad things happen. 
+ +SET allow_experimental_inverted_index = 1; + +DROP TABLE IF EXISTS tab; + +CREATE TABLE tab +( + id UInt64, + doc String, + INDEX text_idx doc TYPE inverted +) +ENGINE = MergeTree +ORDER BY id +SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi', min_bytes_for_wide_part = 0, min_rows_for_wide_part = 0; + +ALTER TABLE tab DROP INDEX text_idx; + +DROP TABLE tab; From 992d859e726895dadc9fbab1ebf99acd4b29881c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 23 Jan 2024 14:16:14 +0100 Subject: [PATCH 0212/1081] Fix style check --- src/Disks/ObjectStorages/IObjectStorage.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index cf113586ddf..b7db353fb6a 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "config.h" #if USE_AZURE_BLOB_STORAGE @@ -33,6 +34,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + class ReadBufferFromFileBase; class WriteBufferFromFileBase; From bef0fcb482c4b8782a1d2e485be9f6d8ffc2dfe9 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 23 Jan 2024 14:56:24 +0100 Subject: [PATCH 0213/1081] Fix is_order_by_all flag in QueryNode --- src/Analyzer/QueryNode.cpp | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/Analyzer/QueryNode.cpp b/src/Analyzer/QueryNode.cpp index 738b1ac62e8..a82fb4489b5 100644 --- a/src/Analyzer/QueryNode.cpp +++ b/src/Analyzer/QueryNode.cpp @@ -119,6 +119,9 @@ void QueryNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, s if (is_group_by_all) buffer << ", is_group_by_all: " << is_group_by_all; + if (is_order_by_all) + buffer << ", is_order_by_all: " << is_order_by_all; + std::string group_by_type; if (is_group_by_with_rollup) group_by_type = "rollup"; @@ -257,6 +260,7 @@ bool QueryNode::isEqualImpl(const IQueryTreeNode & rhs) const is_group_by_with_cube == rhs_typed.is_group_by_with_cube && is_group_by_with_grouping_sets == rhs_typed.is_group_by_with_grouping_sets && is_group_by_all == rhs_typed.is_group_by_all && + is_order_by_all == rhs_typed.is_order_by_all && cte_name == rhs_typed.cte_name && projection_columns == rhs_typed.projection_columns && settings_changes == rhs_typed.settings_changes; @@ -288,6 +292,7 @@ void QueryNode::updateTreeHashImpl(HashState & state) const state.update(is_group_by_with_cube); state.update(is_group_by_with_grouping_sets); state.update(is_group_by_all); + state.update(is_order_by_all); state.update(settings_changes.size()); @@ -306,18 +311,19 @@ QueryTreeNodePtr QueryNode::cloneImpl() const { auto result_query_node = std::make_shared(context); - result_query_node->is_subquery = is_subquery; - result_query_node->is_cte = is_cte; - result_query_node->is_distinct = is_distinct; - result_query_node->is_limit_with_ties = is_limit_with_ties; - result_query_node->is_group_by_with_totals = is_group_by_with_totals; - result_query_node->is_group_by_with_rollup = is_group_by_with_rollup; - result_query_node->is_group_by_with_cube = is_group_by_with_cube; + result_query_node->is_subquery = is_subquery; + result_query_node->is_cte = is_cte; + result_query_node->is_distinct = is_distinct; + result_query_node->is_limit_with_ties = is_limit_with_ties; + result_query_node->is_group_by_with_totals = is_group_by_with_totals; + result_query_node->is_group_by_with_rollup = is_group_by_with_rollup; + 
result_query_node->is_group_by_with_cube = is_group_by_with_cube; result_query_node->is_group_by_with_grouping_sets = is_group_by_with_grouping_sets; - result_query_node->is_group_by_all = is_group_by_all; - result_query_node->cte_name = cte_name; - result_query_node->projection_columns = projection_columns; - result_query_node->settings_changes = settings_changes; + result_query_node->is_group_by_all = is_group_by_all; + result_query_node->is_order_by_all = is_order_by_all; + result_query_node->cte_name = cte_name; + result_query_node->projection_columns = projection_columns; + result_query_node->settings_changes = settings_changes; return result_query_node; } From 8e0aea301ee4b416d6bb4bcfdf664756ebff55ec Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 23 Jan 2024 14:29:26 +0000 Subject: [PATCH 0214/1081] Analyzer: Add cast for ConstantNode from constant folding --- src/Analyzer/ConstantNode.cpp | 5 ++++- tests/analyzer_tech_debt.txt | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Analyzer/ConstantNode.cpp b/src/Analyzer/ConstantNode.cpp index cb05e6ed4e3..69bed3dbe90 100644 --- a/src/Analyzer/ConstantNode.cpp +++ b/src/Analyzer/ConstantNode.cpp @@ -128,7 +128,10 @@ ASTPtr ConstantNode::toASTImpl(const ConvertToASTOptions & options) const } } - if (need_to_add_cast_function) + // Add cast if constant was created as a result of constant folding. + // Constant folding may lead to type transformation and literal on shard + // may have a different type. + if (need_to_add_cast_function || source_expression != nullptr) { auto constant_type_name_ast = std::make_shared(constant_value->getType()->getName()); return makeASTFunction("_CAST", std::move(constant_value_ast), std::move(constant_type_name_ast)); diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt index 4643d109c3d..dd747fff7df 100644 --- a/tests/analyzer_tech_debt.txt +++ b/tests/analyzer_tech_debt.txt @@ -6,7 +6,6 @@ 01155_rename_move_materialized_view 01214_test_storage_merge_aliases_with_where 01244_optimize_distributed_group_by_sharding_key -01268_shard_avgweighted 01495_subqueries_in_with_statement 01560_merge_distributed_join 01584_distributed_buffer_cannot_find_column From 799a94081ba7587ec47d85554bdbb458ffb1436d Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 23 Jan 2024 11:40:42 -0300 Subject: [PATCH 0215/1081] Revert "Revert "Allow to attach partition from table with different partition expression when destination partition expression doesn't re-partition"" This reverts commit a1c83e2f51117a69d484c7ae7884c3bc5dd98129. 
--- .../statements/alter/partition.md | 2 +- src/Interpreters/MonotonicityCheckVisitor.h | 102 +++- src/Interpreters/applyFunction.cpp | 43 ++ src/Interpreters/applyFunction.h | 16 + src/Parsers/queryToString.cpp | 5 + src/Parsers/queryToString.h | 1 + src/Storages/MergeTree/IMergeTreeDataPart.cpp | 37 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 1 + src/Storages/MergeTree/KeyCondition.cpp | 83 +-- src/Storages/MergeTree/MergeTreeData.cpp | 350 +++++-------- src/Storages/MergeTree/MergeTreeData.h | 18 + .../MergeTree/MergeTreeDataPartCloner.cpp | 320 ++++++++++++ .../MergeTree/MergeTreeDataPartCloner.h | 43 ++ src/Storages/MergeTree/MergeTreePartition.cpp | 39 ++ src/Storages/MergeTree/MergeTreePartition.h | 10 +- ...ergeTreePartitionCompatibilityVerifier.cpp | 91 ++++ .../MergeTreePartitionCompatibilityVerifier.h | 30 ++ ...TreePartitionGlobalMinMaxIdxCalculator.cpp | 25 + ...geTreePartitionGlobalMinMaxIdxCalculator.h | 24 + src/Storages/StorageMergeTree.cpp | 93 +++- src/Storages/StorageReplicatedMergeTree.cpp | 135 ++++- .../__init__.py | 0 .../configs/remote_servers.xml | 17 + .../test.py | 214 ++++++++ ...artition_different_partition_exp.reference | 467 +++++++++++++++++ ...tach_partition_different_partition_exp.sql | 485 ++++++++++++++++++ 26 files changed, 2310 insertions(+), 341 deletions(-) create mode 100644 src/Interpreters/applyFunction.cpp create mode 100644 src/Interpreters/applyFunction.h create mode 100644 src/Storages/MergeTree/MergeTreeDataPartCloner.cpp create mode 100644 src/Storages/MergeTree/MergeTreeDataPartCloner.h create mode 100644 src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.cpp create mode 100644 src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.h create mode 100644 src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.cpp create mode 100644 src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.h create mode 100644 tests/integration/test_attach_partition_distinct_expression_replicated/__init__.py create mode 100644 tests/integration/test_attach_partition_distinct_expression_replicated/configs/remote_servers.xml create mode 100644 tests/integration/test_attach_partition_distinct_expression_replicated/test.py create mode 100644 tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.reference create mode 100644 tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.sql diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 114b8d5ffe3..5659a0565c5 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -112,7 +112,7 @@ Note that: For the query to run successfully, the following conditions must be met: - Both tables must have the same structure. -- Both tables must have the same partition key, the same order by key and the same primary key. +- Both tables must have the same order by key and the same primary key. - Both tables must have the same indices and projections. - Both tables must have the same storage policy. 
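To make the relaxed requirement concrete, here is a hedged sketch of the kind of statement this change enables (table names and the day-to-month layout are illustrative, loosely modeled on the new test 02456_test_attach_partition_different_partition_exp.sql, whose contents are not reproduced here): the destination partition expression is coarser than the source one, so each source partition maps into exactly one destination partition and no re-partitioning is needed.

```sql
DROP TABLE IF EXISTS source;
DROP TABLE IF EXISTS destination;

-- Source partitions by day, destination by month: a daily partition always
-- lands in exactly one monthly partition, so the attach does not re-partition.
CREATE TABLE source (timestamp DateTime)
ENGINE = MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp);

CREATE TABLE destination (timestamp DateTime)
ENGINE = MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp);

INSERT INTO source VALUES ('2010-03-02 02:01:01');

-- Previously rejected because the partition expressions differ; now accepted.
ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source;

SELECT count() FROM destination;  -- expected: 1

DROP TABLE source;
DROP TABLE destination;
```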
diff --git a/src/Interpreters/MonotonicityCheckVisitor.h b/src/Interpreters/MonotonicityCheckVisitor.h index cc386825024..4e71bd56851 100644 --- a/src/Interpreters/MonotonicityCheckVisitor.h +++ b/src/Interpreters/MonotonicityCheckVisitor.h @@ -1,13 +1,17 @@ #pragma once #include +#include #include +#include #include #include -#include #include +#include +#include #include #include +#include #include #include #include @@ -33,6 +37,8 @@ public: ASTIdentifier * identifier = nullptr; DataTypePtr arg_data_type = {}; + Range range = Range::createWholeUniverse(); + void reject() { monotonicity.is_monotonic = false; } bool isRejected() const { return !monotonicity.is_monotonic; } @@ -97,13 +103,30 @@ public: if (data.isRejected()) return; - /// TODO: monotonicity for functions of several arguments - if (!ast_function.arguments || ast_function.arguments->children.size() != 1) + /// Monotonicity check only works for functions that contain at most two arguments and one of them must be a constant. + if (!ast_function.arguments) { data.reject(); return; } + auto arguments_size = ast_function.arguments->children.size(); + + if (arguments_size == 0 || arguments_size > 2) + { + data.reject(); + return; + } + else if (arguments_size == 2) + { + /// If the function has two arguments, then one of them must be a constant. + if (!ast_function.arguments->children[0]->as() && !ast_function.arguments->children[1]->as()) + { + data.reject(); + return; + } + } + if (!data.canOptimize(ast_function)) { data.reject(); @@ -124,14 +147,33 @@ public: return; } - ColumnsWithTypeAndName args; - args.emplace_back(data.arg_data_type, "tmp"); - auto function_base = function->build(args); + auto function_arguments = getFunctionArguments(ast_function, data); + + auto function_base = function->build(function_arguments); if (function_base && function_base->hasInformationAboutMonotonicity()) { bool is_positive = data.monotonicity.is_positive; - data.monotonicity = function_base->getMonotonicityForRange(*data.arg_data_type, Field(), Field()); + data.monotonicity = function_base->getMonotonicityForRange(*data.arg_data_type, data.range.left, data.range.right); + + auto & key_range = data.range; + + /// If we apply function to open interval, we can get empty intervals in result. + /// E.g. for ('2020-01-03', '2020-01-20') after applying 'toYYYYMM' we will get ('202001', '202001'). + /// To avoid this we make range left and right included. + /// Any function that treats NULL specially is not monotonic. + /// Thus we can safely use isNull() as an -Inf/+Inf indicator here. + if (!key_range.left.isNull()) + { + key_range.left = applyFunction(function_base, data.arg_data_type, key_range.left); + key_range.left_included = true; + } + + if (!key_range.right.isNull()) + { + key_range.right = applyFunction(function_base, data.arg_data_type, key_range.right); + key_range.right_included = true; + } if (!is_positive) data.monotonicity.is_positive = !data.monotonicity.is_positive; @@ -143,13 +185,53 @@ public: static bool needChildVisit(const ASTPtr & parent, const ASTPtr &) { - /// Currently we check monotonicity only for single-argument functions. - /// Although, multi-argument functions with all but one constant arguments can also be monotonic. + /// Multi-argument functions with all but one constant arguments can be monotonic. 
if (const auto * func = typeid_cast(parent.get())) - return func->arguments->children.size() < 2; + return func->arguments->children.size() <= 2; return true; } + + static ColumnWithTypeAndName extractLiteralColumnAndTypeFromAstLiteral(const ASTLiteral * literal) + { + ColumnWithTypeAndName result; + + result.type = applyVisitor(FieldToDataType(), literal->value); + result.column = result.type->createColumnConst(0, literal->value); + + return result; + } + + static ColumnsWithTypeAndName getFunctionArguments(const ASTFunction & ast_function, const Data & data) + { + ColumnsWithTypeAndName args; + + auto arguments_size = ast_function.arguments->children.size(); + + chassert(arguments_size == 1 || arguments_size == 2); + + if (arguments_size == 2) + { + if (ast_function.arguments->children[0]->as()) + { + const auto * literal = ast_function.arguments->children[0]->as(); + args.push_back(extractLiteralColumnAndTypeFromAstLiteral(literal)); + args.emplace_back(data.arg_data_type, "tmp"); + } + else + { + const auto * literal = ast_function.arguments->children[1]->as(); + args.emplace_back(data.arg_data_type, "tmp"); + args.push_back(extractLiteralColumnAndTypeFromAstLiteral(literal)); + } + } + else + { + args.emplace_back(data.arg_data_type, "tmp"); + } + + return args; + } }; using MonotonicityCheckVisitor = ConstInDepthNodeVisitor; diff --git a/src/Interpreters/applyFunction.cpp b/src/Interpreters/applyFunction.cpp new file mode 100644 index 00000000000..a53f14f0381 --- /dev/null +++ b/src/Interpreters/applyFunction.cpp @@ -0,0 +1,43 @@ +#include + +#include +#include + +namespace DB +{ + +static Field applyFunctionForField(const FunctionBasePtr & func, const DataTypePtr & arg_type, const Field & arg_value) +{ + ColumnsWithTypeAndName columns{ + {arg_type->createColumnConst(1, arg_value), arg_type, "x"}, + }; + + auto col = func->execute(columns, func->getResultType(), 1); + return (*col)[0]; +} + +FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & current_type, const FieldRef & field) +{ + /// Fallback for fields without block reference. 
+ if (field.isExplicit()) + return applyFunctionForField(func, current_type, field); + + String result_name = "_" + func->getName() + "_" + toString(field.column_idx); + const auto & columns = field.columns; + size_t result_idx = columns->size(); + + for (size_t i = 0; i < result_idx; ++i) + if ((*columns)[i].name == result_name) + result_idx = i; + + if (result_idx == columns->size()) + { + ColumnsWithTypeAndName args{(*columns)[field.column_idx]}; + field.columns->emplace_back(ColumnWithTypeAndName{nullptr, func->getResultType(), result_name}); + (*columns)[result_idx].column = func->execute(args, (*columns)[result_idx].type, columns->front().column->size()); + } + + return {field.columns, field.row_idx, result_idx}; +} + +} diff --git a/src/Interpreters/applyFunction.h b/src/Interpreters/applyFunction.h new file mode 100644 index 00000000000..9b8ae43a53c --- /dev/null +++ b/src/Interpreters/applyFunction.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +namespace DB +{ +struct FieldRef; + +class IFunctionBase; +class IDataType; + +using DataTypePtr = std::shared_ptr; +using FunctionBasePtr = std::shared_ptr; + +FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & current_type, const FieldRef & field); +} diff --git a/src/Parsers/queryToString.cpp b/src/Parsers/queryToString.cpp index 9721aa1f128..4a1903393f6 100644 --- a/src/Parsers/queryToString.cpp +++ b/src/Parsers/queryToString.cpp @@ -3,6 +3,11 @@ namespace DB { + String queryToStringNullable(const ASTPtr & query) + { + return query ? queryToString(query) : ""; + } + String queryToString(const ASTPtr & query) { return queryToString(*query); diff --git a/src/Parsers/queryToString.h b/src/Parsers/queryToString.h index 873de218293..3acd560b1e2 100644 --- a/src/Parsers/queryToString.h +++ b/src/Parsers/queryToString.h @@ -6,4 +6,5 @@ namespace DB { String queryToString(const ASTPtr & query); String queryToString(const IAST & query); + String queryToStringNullable(const ASTPtr & query); } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 87f23b0da2a..f3057a8254f 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -81,6 +81,7 @@ void IMergeTreeDataPart::MinMaxIndex::load(const MergeTreeData & data, const Par auto minmax_column_types = data.getMinMaxColumnsTypes(partition_key); size_t minmax_idx_size = minmax_column_types.size(); + hyperrectangle.clear(); hyperrectangle.reserve(minmax_idx_size); for (size_t i = 0; i < minmax_idx_size; ++i) { @@ -104,6 +105,39 @@ void IMergeTreeDataPart::MinMaxIndex::load(const MergeTreeData & data, const Par initialized = true; } +Block IMergeTreeDataPart::MinMaxIndex::getBlock(const MergeTreeData & data) const +{ + if (!initialized) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to get block from uninitialized MinMax index."); + + Block block; + + const auto metadata_snapshot = data.getInMemoryMetadataPtr(); + const auto & partition_key = metadata_snapshot->getPartitionKey(); + + const auto minmax_column_names = data.getMinMaxColumnsNames(partition_key); + const auto minmax_column_types = data.getMinMaxColumnsTypes(partition_key); + const auto minmax_idx_size = minmax_column_types.size(); + + for (size_t i = 0; i < minmax_idx_size; ++i) + { + const auto & data_type = minmax_column_types[i]; + const auto & column_name = minmax_column_names[i]; + + const auto column = data_type->createColumn(); + + const auto min_val = hyperrectangle.at(i).left; + const auto 
max_val = hyperrectangle.at(i).right; + + column->insert(min_val); + column->insert(max_val); + + block.insert(ColumnWithTypeAndName(column->getPtr(), data_type, column_name)); + } + + return block; +} + IMergeTreeDataPart::MinMaxIndex::WrittenFiles IMergeTreeDataPart::MinMaxIndex::store( const MergeTreeData & data, IDataPartStorage & part_storage, Checksums & out_checksums) const { @@ -185,8 +219,7 @@ void IMergeTreeDataPart::MinMaxIndex::merge(const MinMaxIndex & other) if (!initialized) { - hyperrectangle = other.hyperrectangle; - initialized = true; + *this = other; } else { diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 640a1f1d0a3..29f0f54d419 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -336,6 +336,7 @@ public: } void load(const MergeTreeData & data, const PartMetadataManagerPtr & manager); + Block getBlock(const MergeTreeData & data) const; using WrittenFiles = std::vector>; diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index d5922ae1bc2..e5bcb11091f 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -1,36 +1,37 @@ -#include -#include -#include +#include +#include #include #include #include #include +#include #include -#include #include -#include -#include -#include -#include -#include -#include -#include +#include #include +#include #include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include -#include +#include +#include +#include +#include +#include #include #include #include -#include -#include +#include +#include +#include #include +#include +#include +#include #include #include @@ -836,21 +837,6 @@ bool KeyCondition::getConstant(const ASTPtr & expr, Block & block_with_constants return node.tryGetConstant(out_value, out_type); } - -static Field applyFunctionForField( - const FunctionBasePtr & func, - const DataTypePtr & arg_type, - const Field & arg_value) -{ - ColumnsWithTypeAndName columns - { - { arg_type->createColumnConst(1, arg_value), arg_type, "x" }, - }; - - auto col = func->execute(columns, func->getResultType(), 1); - return (*col)[0]; -} - /// The case when arguments may have types different than in the primary key. static std::pair applyFunctionForFieldOfUnknownType( const FunctionBasePtr & func, @@ -890,33 +876,6 @@ static std::pair applyBinaryFunctionForFieldOfUnknownType( return {std::move(result), std::move(return_type)}; } - -static FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & current_type, const FieldRef & field) -{ - /// Fallback for fields without block reference. 
- if (field.isExplicit()) - return applyFunctionForField(func, current_type, field); - - String result_name = "_" + func->getName() + "_" + toString(field.column_idx); - const auto & columns = field.columns; - size_t result_idx = columns->size(); - - for (size_t i = 0; i < result_idx; ++i) - { - if ((*columns)[i].name == result_name) - result_idx = i; - } - - if (result_idx == columns->size()) - { - ColumnsWithTypeAndName args{(*columns)[field.column_idx]}; - field.columns->emplace_back(ColumnWithTypeAndName {nullptr, func->getResultType(), result_name}); - (*columns)[result_idx].column = func->execute(args, (*columns)[result_idx].type, columns->front().column->size()); - } - - return {field.columns, field.row_idx, result_idx}; -} - /** When table's key has expression with these functions from a column, * and when a column in a query is compared with a constant, such as: * CREATE TABLE (x String) ORDER BY toDate(x) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 61332a4ff38..c3e348a549a 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -8,21 +8,6 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include @@ -43,19 +28,20 @@ #include #include #include -#include -#include #include #include #include #include #include #include +#include +#include #include +#include #include -#include -#include #include +#include +#include #include #include #include @@ -64,26 +50,41 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include +#include #include #include #include #include #include +#include #include #include -#include #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -197,6 +198,50 @@ namespace ErrorCodes extern const int LIMIT_EXCEEDED; } +static size_t getPartitionAstFieldsCount(const ASTPartition & partition_ast, ASTPtr partition_value_ast) +{ + if (partition_ast.fields_count.has_value()) + return *partition_ast.fields_count; + + if (partition_value_ast->as()) + return 1; + + const auto * tuple_ast = partition_value_ast->as(); + + if (!tuple_ast) + { + throw Exception( + ErrorCodes::INVALID_PARTITION_VALUE, "Expected literal or tuple for partition key, got {}", partition_value_ast->getID()); + } + + if (tuple_ast->name != "tuple") + { + if (!isFunctionCast(tuple_ast)) + throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); + + if (tuple_ast->arguments->as()->children.empty()) + throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); + + auto first_arg = tuple_ast->arguments->as()->children.at(0); + if (const auto * inner_tuple = first_arg->as(); inner_tuple && inner_tuple->name == "tuple") + { + const auto * arguments_ast = tuple_ast->arguments->as(); + return arguments_ast ? arguments_ast->children.size() : 0; + } + else if (const auto * inner_literal_tuple = first_arg->as(); inner_literal_tuple) + { + return inner_literal_tuple->value.getType() == Field::Types::Tuple ? 
inner_literal_tuple->value.safeGet().size() : 1; + } + + throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); + } + else + { + const auto * arguments_ast = tuple_ast->arguments->as(); + return arguments_ast ? arguments_ast->children.size() : 0; + } +} + static void checkSuspiciousIndices(const ASTFunction * index_function) { std::unordered_set unique_index_expression_hashes; @@ -4854,7 +4899,7 @@ void MergeTreeData::removePartContributionToColumnAndSecondaryIndexSizes(const D } void MergeTreeData::checkAlterPartitionIsPossible( - const PartitionCommands & commands, const StorageMetadataPtr & /*metadata_snapshot*/, const Settings & settings, ContextPtr local_context) const + const PartitionCommands & commands, const StorageMetadataPtr & /*metadata_snapshot*/, const Settings & settings, ContextPtr) const { for (const auto & command : commands) { @@ -4882,7 +4927,15 @@ void MergeTreeData::checkAlterPartitionIsPossible( throw DB::Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Only support DROP/DETACH PARTITION ALL currently"); } else - getPartitionIDFromQuery(command.partition, local_context); + { + // The below `getPartitionIDFromQuery` call will not work for attach / replace because it assumes the partition expressions + // are the same and deliberately uses this storage. Later on, `MergeTreeData::replaceFrom` is called, and it makes the right + // call to `getPartitionIDFromQuery` using source storage. + // Note: `PartitionCommand::REPLACE_PARTITION` is used both for `REPLACE PARTITION` and `ATTACH PARTITION FROM` queries. + // But not for `ATTACH PARTITION` queries. + if (command.type != PartitionCommand::REPLACE_PARTITION) + getPartitionIDFromQuery(command.partition, getContext()); + } } } } @@ -5616,69 +5669,8 @@ String MergeTreeData::getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr loc MergeTreePartInfo::validatePartitionID(partition_ast.id->clone(), format_version); return partition_ast.id->as()->value.safeGet(); } - size_t partition_ast_fields_count = 0; ASTPtr partition_value_ast = partition_ast.value->clone(); - if (!partition_ast.fields_count.has_value()) - { - if (partition_value_ast->as()) - { - partition_ast_fields_count = 1; - } - else if (const auto * tuple_ast = partition_value_ast->as()) - { - if (tuple_ast->name != "tuple") - { - if (isFunctionCast(tuple_ast)) - { - if (tuple_ast->arguments->as()->children.empty()) - { - throw Exception( - ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); - } - auto first_arg = tuple_ast->arguments->as()->children.at(0); - if (const auto * inner_tuple = first_arg->as(); inner_tuple && inner_tuple->name == "tuple") - { - const auto * arguments_ast = tuple_ast->arguments->as(); - if (arguments_ast) - partition_ast_fields_count = arguments_ast->children.size(); - else - partition_ast_fields_count = 0; - } - else if (const auto * inner_literal_tuple = first_arg->as(); inner_literal_tuple) - { - if (inner_literal_tuple->value.getType() == Field::Types::Tuple) - partition_ast_fields_count = inner_literal_tuple->value.safeGet().size(); - else - partition_ast_fields_count = 1; - } - else - { - throw Exception( - ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); - } - } - else - throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); - } - else - { - const auto * arguments_ast = 
tuple_ast->arguments->as(); - if (arguments_ast) - partition_ast_fields_count = arguments_ast->children.size(); - else - partition_ast_fields_count = 0; - } - } - else - { - throw Exception( - ErrorCodes::INVALID_PARTITION_VALUE, "Expected literal or tuple for partition key, got {}", partition_value_ast->getID()); - } - } - else - { - partition_ast_fields_count = *partition_ast.fields_count; - } + auto partition_ast_fields_count = getPartitionAstFieldsCount(partition_ast, partition_value_ast); if (format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) { @@ -7014,23 +7006,35 @@ MergeTreeData & MergeTreeData::checkStructureAndGetMergeTreeData(IStorage & sour if (my_snapshot->getColumns().getAllPhysical().sizeOfDifference(src_snapshot->getColumns().getAllPhysical())) throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure"); - auto query_to_string = [] (const ASTPtr & ast) - { - return ast ? queryToString(ast) : ""; - }; - - if (query_to_string(my_snapshot->getSortingKeyAST()) != query_to_string(src_snapshot->getSortingKeyAST())) + if (queryToStringNullable(my_snapshot->getSortingKeyAST()) != queryToStringNullable(src_snapshot->getSortingKeyAST())) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different ordering"); - if (query_to_string(my_snapshot->getPartitionKeyAST()) != query_to_string(src_snapshot->getPartitionKeyAST())) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key"); - if (format_version != src_data->format_version) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different format_version"); - if (query_to_string(my_snapshot->getPrimaryKeyAST()) != query_to_string(src_snapshot->getPrimaryKeyAST())) + if (queryToStringNullable(my_snapshot->getPrimaryKeyAST()) != queryToStringNullable(src_snapshot->getPrimaryKeyAST())) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different primary key"); + const auto is_a_subset_of = [](const auto & lhs, const auto & rhs) + { + if (lhs.size() > rhs.size()) + return false; + + const auto rhs_set = NameSet(rhs.begin(), rhs.end()); + for (const auto & lhs_element : lhs) + if (!rhs_set.contains(lhs_element)) + return false; + + return true; + }; + + if (!is_a_subset_of(my_snapshot->getColumnsRequiredForPartitionKey(), src_snapshot->getColumnsRequiredForPartitionKey())) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Destination table partition expression columns must be a subset of source table partition expression columns"); + } + const auto check_definitions = [](const auto & my_descriptions, const auto & src_descriptions) { if (my_descriptions.size() != src_descriptions.size()) @@ -7071,128 +7075,56 @@ std::pair MergeTreeData::cloneAn const ReadSettings & read_settings, const WriteSettings & write_settings) { - /// Check that the storage policy contains the disk where the src_part is located. 
- bool does_storage_policy_allow_same_disk = false; - for (const DiskPtr & disk : getStoragePolicy()->getDisks()) - { - if (disk->getName() == src_part->getDataPartStorage().getDiskName()) - { - does_storage_policy_allow_same_disk = true; - break; - } - } - if (!does_storage_policy_allow_same_disk) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Could not clone and load part {} because disk does not belong to storage policy", - quoteString(src_part->getDataPartStorage().getFullPath())); + return MergeTreeDataPartCloner::clone( + this, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, require_part_metadata, params, read_settings, write_settings); +} - String dst_part_name = src_part->getNewName(dst_part_info); - String tmp_dst_part_name = tmp_part_prefix + dst_part_name; - auto temporary_directory_lock = getTemporaryPartDirectoryHolder(tmp_dst_part_name); +std::pair MergeTreeData::cloneAndLoadPartOnSameDiskWithDifferentPartitionKey( + const MergeTreeData::DataPartPtr & src_part, + const MergeTreePartition & new_partition, + const String & partition_id, + const IMergeTreeDataPart::MinMaxIndex & min_max_index, + const String & tmp_part_prefix, + const StorageMetadataPtr & my_metadata_snapshot, + const IDataPartStorage::ClonePartParams & clone_params, + ContextPtr local_context, + Int64 min_block, + Int64 max_block +) +{ + MergeTreePartInfo dst_part_info(partition_id, min_block, max_block, src_part->info.level); - /// Why it is needed if we only hardlink files? - auto reservation = src_part->getDataPartStorage().reserve(src_part->getBytesOnDisk()); - auto src_part_storage = src_part->getDataPartStoragePtr(); + return MergeTreeDataPartCloner::cloneWithDistinctPartitionExpression( + this, + src_part, + my_metadata_snapshot, + dst_part_info, + tmp_part_prefix, + local_context->getReadSettings(), + local_context->getWriteSettings(), + new_partition, + min_max_index, + false, + clone_params); +} - scope_guard src_flushed_tmp_dir_lock; - MergeTreeData::MutableDataPartPtr src_flushed_tmp_part; +std::pair MergeTreeData::createPartitionAndMinMaxIndexFromSourcePart( + const MergeTreeData::DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr local_context) +{ + const auto & src_data = src_part->storage; - /// If source part is in memory, flush it to disk and clone it already in on-disk format - /// Protect tmp dir from removing by cleanup thread with src_flushed_tmp_dir_lock - /// Construct src_flushed_tmp_part in order to delete part with its directory at destructor - if (auto src_part_in_memory = asInMemoryPart(src_part)) - { - auto flushed_part_path = *src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix); + auto metadata_manager = std::make_shared(src_part.get()); + IMergeTreeDataPart::MinMaxIndex min_max_index; - auto tmp_src_part_file_name = fs::path(tmp_dst_part_name).filename(); - src_flushed_tmp_dir_lock = src_part->storage.getTemporaryPartDirectoryHolder(tmp_src_part_file_name); + min_max_index.load(src_data, metadata_manager); - auto flushed_part_storage = src_part_in_memory->flushToDisk(flushed_part_path, metadata_snapshot); + MergeTreePartition new_partition; - src_flushed_tmp_part = MergeTreeDataPartBuilder(*this, src_part->name, flushed_part_storage) - .withPartInfo(src_part->info) - .withPartFormatFromDisk() - .build(); + new_partition.create(metadata_snapshot, min_max_index.getBlock(src_data), 0u, local_context); - src_flushed_tmp_part->is_temp = true; - src_part_storage = flushed_part_storage; - } - - String with_copy; - if 
(params.copy_instead_of_hardlink) - with_copy = " (copying data)"; - - auto dst_part_storage = src_part_storage->freeze( - relative_data_path, - tmp_dst_part_name, - read_settings, - write_settings, - /* save_metadata_callback= */ {}, - params); - - if (params.metadata_version_to_write.has_value()) - { - chassert(!params.keep_metadata_version); - auto out_metadata = dst_part_storage->writeFile(IMergeTreeDataPart::METADATA_VERSION_FILE_NAME, 4096, getContext()->getWriteSettings()); - writeText(metadata_snapshot->getMetadataVersion(), *out_metadata); - out_metadata->finalize(); - if (getSettings()->fsync_after_insert) - out_metadata->sync(); - } - - LOG_DEBUG(log, "Clone{} part {} to {}{}", - src_flushed_tmp_part ? " flushed" : "", - src_part_storage->getFullPath(), - std::string(fs::path(dst_part_storage->getFullRootPath()) / tmp_dst_part_name), - with_copy); - - auto dst_data_part = MergeTreeDataPartBuilder(*this, dst_part_name, dst_part_storage) - .withPartFormatFromDisk() - .build(); - - if (!params.copy_instead_of_hardlink && params.hardlinked_files) - { - params.hardlinked_files->source_part_name = src_part->name; - params.hardlinked_files->source_table_shared_id = src_part->storage.getTableSharedID(); - - for (auto it = src_part->getDataPartStorage().iterate(); it->isValid(); it->next()) - { - if (!params.files_to_copy_instead_of_hardlinks.contains(it->name()) - && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED - && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) - { - params.hardlinked_files->hardlinks_from_source_part.insert(it->name()); - } - } - - auto projections = src_part->getProjectionParts(); - for (const auto & [name, projection_part] : projections) - { - const auto & projection_storage = projection_part->getDataPartStorage(); - for (auto it = projection_storage.iterate(); it->isValid(); it->next()) - { - auto file_name_with_projection_prefix = fs::path(projection_storage.getPartDirectory()) / it->name(); - if (!params.files_to_copy_instead_of_hardlinks.contains(file_name_with_projection_prefix) - && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED - && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) - { - params.hardlinked_files->hardlinks_from_source_part.insert(file_name_with_projection_prefix); - } - } - } - } - - /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. - TransactionID tid = params.txn ? 
params.txn->tid : Tx::PrehistoricTID; - dst_data_part->version.setCreationTID(tid, nullptr); - dst_data_part->storeVersionMetadata(); - - dst_data_part->is_temp = true; - - dst_data_part->loadColumnsChecksumsIndexes(require_part_metadata, true); - dst_data_part->modification_time = dst_part_storage->getLastModified().epochTime(); - return std::make_pair(dst_data_part, std::move(temporary_directory_lock)); + return {new_partition, min_max_index}; } String MergeTreeData::getFullPathOnDisk(const DiskPtr & disk) const diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index f0dbaf0e307..9c433e11b84 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -231,6 +231,7 @@ public: } }; + using DataParts = std::set; using MutableDataParts = std::set; using DataPartsVector = std::vector; @@ -848,6 +849,23 @@ public: const ReadSettings & read_settings, const WriteSettings & write_settings); + std::pair cloneAndLoadPartOnSameDiskWithDifferentPartitionKey( + const MergeTreeData::DataPartPtr & src_part, + const MergeTreePartition & new_partition, + const String & partition_id, + const IMergeTreeDataPart::MinMaxIndex & min_max_index, + const String & tmp_part_prefix, + const StorageMetadataPtr & my_metadata_snapshot, + const IDataPartStorage::ClonePartParams & clone_params, + ContextPtr local_context, + Int64 min_block, + Int64 max_block); + + static std::pair createPartitionAndMinMaxIndexFromSourcePart( + const MergeTreeData::DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr local_context); + virtual std::vector getMutationsStatus() const = 0; /// Returns true if table can create new parts with adaptive granularity diff --git a/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp b/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp new file mode 100644 index 00000000000..78cb9aa0624 --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp @@ -0,0 +1,320 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +} + +static Poco::Logger * log = &Poco::Logger::get("MergeTreeDataPartCloner"); + +namespace DistinctPartitionExpression +{ +std::unique_ptr updatePartitionFile( + const MergeTreeData & merge_tree_data, + const MergeTreePartition & partition, + const MergeTreeData::MutableDataPartPtr & dst_part, + IDataPartStorage & storage) +{ + storage.removeFile("partition.dat"); + // Leverage already implemented MergeTreePartition::store to create & store partition.dat. + // Checksum is re-calculated later. 
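+    // The `partition` argument received here already holds the value computed under the destination table's partition +    // expression (built in createPartitionAndMinMaxIndexFromSourcePart from the source part's min/max index block), +    // so the re-written partition.dat describes the part under the destination key.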
+ return partition.store(merge_tree_data, storage, dst_part->checksums); +} + +IMergeTreeDataPart::MinMaxIndex::WrittenFiles updateMinMaxFiles( + const MergeTreeData & merge_tree_data, + const MergeTreeData::MutableDataPartPtr & dst_part, + IDataPartStorage & storage, + const StorageMetadataPtr & metadata_snapshot) +{ + for (const auto & column_name : MergeTreeData::getMinMaxColumnsNames(metadata_snapshot->partition_key)) + { + auto file = "minmax_" + escapeForFileName(column_name) + ".idx"; + storage.removeFile(file); + } + + return dst_part->minmax_idx->store(merge_tree_data, storage, dst_part->checksums); +} + +void finalizeNewFiles(const std::vector> & files, bool sync_new_files) +{ + for (const auto & file : files) + { + file->finalize(); + if (sync_new_files) + file->sync(); + } +} + +void updateNewPartFiles( + const MergeTreeData & merge_tree_data, + const MergeTreeData::MutableDataPartPtr & dst_part, + const MergeTreePartition & new_partition, + const IMergeTreeDataPart::MinMaxIndex & new_min_max_index, + const StorageMetadataPtr & src_metadata_snapshot, + bool sync_new_files) +{ + auto & storage = dst_part->getDataPartStorage(); + + *dst_part->minmax_idx = new_min_max_index; + + auto partition_file = updatePartitionFile(merge_tree_data, new_partition, dst_part, storage); + + auto min_max_files = updateMinMaxFiles(merge_tree_data, dst_part, storage, src_metadata_snapshot); + + IMergeTreeDataPart::MinMaxIndex::WrittenFiles written_files; + + if (partition_file) + written_files.emplace_back(std::move(partition_file)); + + written_files.insert(written_files.end(), std::make_move_iterator(min_max_files.begin()), std::make_move_iterator(min_max_files.end())); + + finalizeNewFiles(written_files, sync_new_files); + + // MergeTreeDataPartCloner::finalize_part calls IMergeTreeDataPart::loadColumnsChecksumsIndexes, which will re-create + // the checksum file if it doesn't exist. Relying on that is cumbersome, but this refactoring is simply a code extraction + // with small improvements. It can be further improved in the future. 
+ storage.removeFile("checksums.txt"); +} +} + +namespace +{ +bool doesStoragePolicyAllowSameDisk(MergeTreeData * merge_tree_data, const MergeTreeData::DataPartPtr & src_part) +{ + for (const DiskPtr & disk : merge_tree_data->getStoragePolicy()->getDisks()) + if (disk->getName() == src_part->getDataPartStorage().getDiskName()) + return true; + return false; +} + +DataPartStoragePtr flushPartStorageToDiskIfInMemory( + MergeTreeData * merge_tree_data, + const MergeTreeData::DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const String & tmp_part_prefix, + const String & tmp_dst_part_name, + scope_guard & src_flushed_tmp_dir_lock, + MergeTreeData::MutableDataPartPtr src_flushed_tmp_part) +{ + if (auto src_part_in_memory = asInMemoryPart(src_part)) + { + auto flushed_part_path = src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix); + auto tmp_src_part_file_name = fs::path(tmp_dst_part_name).filename(); + + src_flushed_tmp_dir_lock = src_part->storage.getTemporaryPartDirectoryHolder(tmp_src_part_file_name); + + auto flushed_part_storage = src_part_in_memory->flushToDisk(*flushed_part_path, metadata_snapshot); + + src_flushed_tmp_part = MergeTreeDataPartBuilder(*merge_tree_data, src_part->name, flushed_part_storage) + .withPartInfo(src_part->info) + .withPartFormatFromDisk() + .build(); + + src_flushed_tmp_part->is_temp = true; + + return flushed_part_storage; + } + + return src_part->getDataPartStoragePtr(); +} + +std::shared_ptr hardlinkAllFiles( + MergeTreeData * merge_tree_data, + const DB::ReadSettings & read_settings, + const DB::WriteSettings & write_settings, + const DataPartStoragePtr & storage, + const String & path, + const DB::IDataPartStorage::ClonePartParams & params) +{ + return storage->freeze( + merge_tree_data->getRelativeDataPath(), + path, + read_settings, + write_settings, + /*save_metadata_callback=*/{}, + params); +} + +std::pair cloneSourcePart( + MergeTreeData * merge_tree_data, + const MergeTreeData::DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + const ReadSettings & read_settings, + const WriteSettings & write_settings, + const DB::IDataPartStorage::ClonePartParams & params) +{ + const auto dst_part_name = src_part->getNewName(dst_part_info); + + const auto tmp_dst_part_name = tmp_part_prefix + dst_part_name; + + auto temporary_directory_lock = merge_tree_data->getTemporaryPartDirectoryHolder(tmp_dst_part_name); + + src_part->getDataPartStorage().reserve(src_part->getBytesOnDisk()); + + scope_guard src_flushed_tmp_dir_lock; + MergeTreeData::MutableDataPartPtr src_flushed_tmp_part; + + auto src_part_storage = flushPartStorageToDiskIfInMemory( + merge_tree_data, src_part, metadata_snapshot, tmp_part_prefix, tmp_dst_part_name, src_flushed_tmp_dir_lock, src_flushed_tmp_part); + + auto dst_part_storage = hardlinkAllFiles(merge_tree_data, read_settings, write_settings, src_part_storage, tmp_dst_part_name, params); + + if (params.metadata_version_to_write.has_value()) + { + chassert(!params.keep_metadata_version); + auto out_metadata = dst_part_storage->writeFile( + IMergeTreeDataPart::METADATA_VERSION_FILE_NAME, 4096, merge_tree_data->getContext()->getWriteSettings()); + writeText(metadata_snapshot->getMetadataVersion(), *out_metadata); + out_metadata->finalize(); + if (merge_tree_data->getSettings()->fsync_after_insert) + out_metadata->sync(); + } + + LOG_DEBUG( + log, + "Clone {} part {} to {}{}", + src_flushed_tmp_part ? 
"flushed" : "", + src_part_storage->getFullPath(), + std::string(fs::path(dst_part_storage->getFullRootPath()) / tmp_dst_part_name), + false); + + + auto part = MergeTreeDataPartBuilder(*merge_tree_data, dst_part_name, dst_part_storage).withPartFormatFromDisk().build(); + + return std::make_pair(part, std::move(temporary_directory_lock)); +} + +void handleHardLinkedParameterFiles(const MergeTreeData::DataPartPtr & src_part, const DB::IDataPartStorage::ClonePartParams & params) +{ + const auto & hardlinked_files = params.hardlinked_files; + + hardlinked_files->source_part_name = src_part->name; + hardlinked_files->source_table_shared_id = src_part->storage.getTableSharedID(); + + for (auto it = src_part->getDataPartStorage().iterate(); it->isValid(); it->next()) + { + if (!params.files_to_copy_instead_of_hardlinks.contains(it->name()) + && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED + && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) + { + hardlinked_files->hardlinks_from_source_part.insert(it->name()); + } + } +} + +void handleProjections(const MergeTreeData::DataPartPtr & src_part, const DB::IDataPartStorage::ClonePartParams & params) +{ + auto projections = src_part->getProjectionParts(); + for (const auto & [name, projection_part] : projections) + { + const auto & projection_storage = projection_part->getDataPartStorage(); + for (auto it = projection_storage.iterate(); it->isValid(); it->next()) + { + auto file_name_with_projection_prefix = fs::path(projection_storage.getPartDirectory()) / it->name(); + if (!params.files_to_copy_instead_of_hardlinks.contains(file_name_with_projection_prefix) + && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED + && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) + { + params.hardlinked_files->hardlinks_from_source_part.insert(file_name_with_projection_prefix); + } + } + } +} + +MergeTreeData::MutableDataPartPtr finalizePart( + const MergeTreeData::MutableDataPartPtr & dst_part, const DB::IDataPartStorage::ClonePartParams & params, bool require_part_metadata) +{ + /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. + TransactionID tid = params.txn ? 
params.txn->tid : Tx::PrehistoricTID; + dst_part->version.setCreationTID(tid, nullptr); + dst_part->storeVersionMetadata(); + + dst_part->is_temp = true; + + dst_part->loadColumnsChecksumsIndexes(require_part_metadata, true); + + dst_part->modification_time = dst_part->getDataPartStorage().getLastModified().epochTime(); + + return dst_part; +} + +std::pair cloneAndHandleHardlinksAndProjections( + MergeTreeData * merge_tree_data, + const DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + const ReadSettings & read_settings, + const WriteSettings & write_settings, + const IDataPartStorage::ClonePartParams & params) +{ + if (!doesStoragePolicyAllowSameDisk(merge_tree_data, src_part)) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Could not clone and load part {} because disk does not belong to storage policy", + quoteString(src_part->getDataPartStorage().getFullPath())); + + auto [destination_part, temporary_directory_lock] = cloneSourcePart( + merge_tree_data, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, read_settings, write_settings, params); + + if (!params.copy_instead_of_hardlink && params.hardlinked_files) + { + handleHardLinkedParameterFiles(src_part, params); + handleProjections(src_part, params); + } + + return std::make_pair(destination_part, std::move(temporary_directory_lock)); +} +} + +std::pair MergeTreeDataPartCloner::clone( + MergeTreeData * merge_tree_data, + const DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + bool require_part_metadata, + const IDataPartStorage::ClonePartParams & params, + const ReadSettings & read_settings, + const WriteSettings & write_settings) +{ + auto [destination_part, temporary_directory_lock] = cloneAndHandleHardlinksAndProjections( + merge_tree_data, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, read_settings, write_settings, params); + + return std::make_pair(finalizePart(destination_part, params, require_part_metadata), std::move(temporary_directory_lock)); +} + +std::pair MergeTreeDataPartCloner::cloneWithDistinctPartitionExpression( + MergeTreeData * merge_tree_data, + const DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + const ReadSettings & read_settings, + const WriteSettings & write_settings, + const MergeTreePartition & new_partition, + const IMergeTreeDataPart::MinMaxIndex & new_min_max_index, + bool sync_new_files, + const IDataPartStorage::ClonePartParams & params) +{ + auto [destination_part, temporary_directory_lock] = cloneAndHandleHardlinksAndProjections( + merge_tree_data, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, read_settings, write_settings, params); + + DistinctPartitionExpression::updateNewPartFiles( + *merge_tree_data, destination_part, new_partition, new_min_max_index, src_part->storage.getInMemoryMetadataPtr(), sync_new_files); + + return std::make_pair(finalizePart(destination_part, params, false), std::move(temporary_directory_lock)); +} + +} diff --git a/src/Storages/MergeTree/MergeTreeDataPartCloner.h b/src/Storages/MergeTree/MergeTreeDataPartCloner.h new file mode 100644 index 00000000000..53585f20b7f --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeDataPartCloner.h @@ -0,0 +1,43 @@ +#pragma once + +namespace DB +{ + +struct StorageInMemoryMetadata; +using 
StorageMetadataPtr = std::shared_ptr; +struct MergeTreePartition; +class IMergeTreeDataPart; + +class MergeTreeDataPartCloner +{ +public: + using DataPart = IMergeTreeDataPart; + using MutableDataPartPtr = std::shared_ptr; + using DataPartPtr = std::shared_ptr; + + static std::pair clone( + MergeTreeData * merge_tree_data, + const DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + bool require_part_metadata, + const IDataPartStorage::ClonePartParams & params, + const ReadSettings & read_settings, + const WriteSettings & write_settings); + + static std::pair cloneWithDistinctPartitionExpression( + MergeTreeData * merge_tree_data, + const DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + const ReadSettings & read_settings, + const WriteSettings & write_settings, + const MergeTreePartition & new_partition, + const IMergeTreeDataPart::MinMaxIndex & new_min_max_index, + bool sync_new_files, + const IDataPartStorage::ClonePartParams & params); +}; + +} diff --git a/src/Storages/MergeTree/MergeTreePartition.cpp b/src/Storages/MergeTree/MergeTreePartition.cpp index ddeaf69136a..76ef3be25b3 100644 --- a/src/Storages/MergeTree/MergeTreePartition.cpp +++ b/src/Storages/MergeTree/MergeTreePartition.cpp @@ -467,6 +467,45 @@ void MergeTreePartition::create(const StorageMetadataPtr & metadata_snapshot, Bl } } +void MergeTreePartition::createAndValidateMinMaxPartitionIds( + const StorageMetadataPtr & metadata_snapshot, Block block_with_min_max_partition_ids, ContextPtr context) +{ + if (!metadata_snapshot->hasPartitionKey()) + return; + + auto partition_key_names_and_types = executePartitionByExpression(metadata_snapshot, block_with_min_max_partition_ids, context); + value.resize(partition_key_names_and_types.size()); + + /// Executing partition_by expression adds new columns to passed block according to partition functions. + /// The block is passed by reference and is used afterwards. `moduloLegacy` needs to be substituted back + /// with just `modulo`, because it was a temporary substitution. + static constexpr std::string_view modulo_legacy_function_name = "moduloLegacy"; + + size_t i = 0; + for (const auto & element : partition_key_names_and_types) + { + auto & partition_column = block_with_min_max_partition_ids.getByName(element.name); + + if (element.name.starts_with(modulo_legacy_function_name)) + partition_column.name.replace(0, modulo_legacy_function_name.size(), "modulo"); + + Field extracted_min_partition_id_field; + Field extracted_max_partition_id_field; + + partition_column.column->get(0, extracted_min_partition_id_field); + partition_column.column->get(1, extracted_max_partition_id_field); + + if (extracted_min_partition_id_field != extracted_max_partition_id_field) + { + throw Exception( + ErrorCodes::INVALID_PARTITION_VALUE, + "Can not create the partition. 
A partition can not contain values that have different partition ids"); + } + + partition_column.column->get(0u, value[i++]); + } +} + NamesAndTypesList MergeTreePartition::executePartitionByExpression(const StorageMetadataPtr & metadata_snapshot, Block & block, ContextPtr context) { auto adjusted_partition_key = adjustPartitionKey(metadata_snapshot, context); diff --git a/src/Storages/MergeTree/MergeTreePartition.h b/src/Storages/MergeTree/MergeTreePartition.h index 78b141f26ec..fd7ae02cde4 100644 --- a/src/Storages/MergeTree/MergeTreePartition.h +++ b/src/Storages/MergeTree/MergeTreePartition.h @@ -1,11 +1,12 @@ #pragma once -#include +#include #include #include #include #include -#include +#include +#include namespace DB { @@ -51,6 +52,11 @@ public: void create(const StorageMetadataPtr & metadata_snapshot, Block block, size_t row, ContextPtr context); + /// Copy of MergeTreePartition::create, but also validates if min max partition keys are equal. If they are different, + /// it means the partition can't be created because the data doesn't belong to the same partition. + void createAndValidateMinMaxPartitionIds( + const StorageMetadataPtr & metadata_snapshot, Block block_with_min_max_partition_ids, ContextPtr context); + static void appendFiles(const MergeTreeData & storage, Strings & files); /// Adjust partition key and execute its expression on block. Return sample block according to used expression. diff --git a/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.cpp b/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.cpp new file mode 100644 index 00000000000..21bcdb84a96 --- /dev/null +++ b/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.cpp @@ -0,0 +1,91 @@ +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +} + +namespace +{ +bool isDestinationPartitionExpressionMonotonicallyIncreasing( + const std::vector & hyperrectangle, const MergeTreeData & destination_storage) +{ + auto destination_table_metadata = destination_storage.getInMemoryMetadataPtr(); + + auto key_description = destination_table_metadata->getPartitionKey(); + auto definition_ast = key_description.definition_ast->clone(); + + auto table_identifier = std::make_shared(destination_storage.getStorageID().getTableName()); + auto table_with_columns + = TableWithColumnNamesAndTypes{DatabaseAndTableWithAlias(table_identifier), destination_table_metadata->getColumns().getOrdinary()}; + + auto expression_list = extractKeyExpressionList(definition_ast); + + MonotonicityCheckVisitor::Data data{{table_with_columns}, destination_storage.getContext(), /*group_by_function_hashes*/ {}}; + + for (auto i = 0u; i < expression_list->children.size(); i++) + { + data.range = hyperrectangle[i]; + + MonotonicityCheckVisitor(data).visit(expression_list->children[i]); + + if (!data.monotonicity.is_monotonic || !data.monotonicity.is_positive) + return false; + } + + return true; +} + +bool isExpressionDirectSubsetOf(const ASTPtr source, const ASTPtr destination) +{ + auto source_expression_list = extractKeyExpressionList(source); + auto destination_expression_list = extractKeyExpressionList(destination); + + std::unordered_set source_columns; + + for (auto i = 0u; i < source_expression_list->children.size(); ++i) + source_columns.insert(source_expression_list->children[i]->getColumnName()); + + for (auto i = 0u; i < destination_expression_list->children.size(); ++i) + if 
(!source_columns.contains(destination_expression_list->children[i]->getColumnName())) + return false; + + return true; +} +} + +void MergeTreePartitionCompatibilityVerifier::verify( + const MergeTreeData & source_storage, const MergeTreeData & destination_storage, const DataPartsVector & source_parts) +{ + const auto source_metadata = source_storage.getInMemoryMetadataPtr(); + const auto destination_metadata = destination_storage.getInMemoryMetadataPtr(); + + const auto source_partition_key_ast = source_metadata->getPartitionKeyAST(); + const auto destination_partition_key_ast = destination_metadata->getPartitionKeyAST(); + + // If destination partition expression columns are a subset of source partition expression columns, + // there is no need to check for monotonicity. + if (isExpressionDirectSubsetOf(source_partition_key_ast, destination_partition_key_ast)) + return; + + const auto src_global_min_max_indexes = MergeTreePartitionGlobalMinMaxIdxCalculator::calculate(source_parts, destination_storage); + + assert(!src_global_min_max_indexes.hyperrectangle.empty()); + + if (!isDestinationPartitionExpressionMonotonicallyIncreasing(src_global_min_max_indexes.hyperrectangle, destination_storage)) + throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, "Destination table partition expression is not monotonically increasing"); + + MergeTreePartition().createAndValidateMinMaxPartitionIds( + destination_storage.getInMemoryMetadataPtr(), + src_global_min_max_indexes.getBlock(destination_storage), + destination_storage.getContext()); +} + +} diff --git a/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.h b/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.h new file mode 100644 index 00000000000..1682add3ebd --- /dev/null +++ b/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/* + * Verifies that source and destination partitions are compatible. + * To be compatible, one of the following criteria must be met: + * 1. Destination partition expression columns are a subset of source partition columns; or + * 2. Destination partition expression is monotonic on the source global min_max idx range AND the computed partition id for + * the source global min_max idx range is the same. + * + * If not, an exception is thrown. 
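+ * + * Illustrative example (mirroring the tests added in this patch): if the source table is partitioned by + * toYYYYMMDD(timestamp) and the destination by toYYYYMM(timestamp), toYYYYMM is monotonically increasing over the + * source part's [min, max] timestamp range and both bounds map to the same destination partition id, so + * ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source is accepted. A non-monotonic destination + * expression, or one whose min/max bounds map to different destination partition ids, is rejected.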
+ * */ + +class MergeTreePartitionCompatibilityVerifier +{ +public: + using DataPart = IMergeTreeDataPart; + using DataPartPtr = std::shared_ptr; + using DataPartsVector = std::vector; + + static void + verify(const MergeTreeData & source_storage, const MergeTreeData & destination_storage, const DataPartsVector & source_parts); +}; + +} diff --git a/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.cpp b/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.cpp new file mode 100644 index 00000000000..0871efadf0c --- /dev/null +++ b/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.cpp @@ -0,0 +1,25 @@ +#include + +namespace DB +{ + +IMergeTreeDataPart::MinMaxIndex +MergeTreePartitionGlobalMinMaxIdxCalculator::calculate(const DataPartsVector & parts, const MergeTreeData & storage) +{ + IMergeTreeDataPart::MinMaxIndex global_min_max_indexes; + + for (const auto & part : parts) + { + auto metadata_manager = std::make_shared(part.get()); + + auto local_min_max_index = MergeTreeData::DataPart::MinMaxIndex(); + + local_min_max_index.load(storage, metadata_manager); + + global_min_max_indexes.merge(local_min_max_index); + } + + return global_min_max_indexes; +} + +} diff --git a/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.h b/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.h new file mode 100644 index 00000000000..4f271177246 --- /dev/null +++ b/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.h @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include +#include + +namespace DB +{ + +/* + * Calculates global min max indexes for a given set of parts on given storage. + * */ +class MergeTreePartitionGlobalMinMaxIdxCalculator +{ + using DataPart = IMergeTreeDataPart; + using DataPartPtr = std::shared_ptr; + using DataPartsVector = std::vector; + +public: + static IMergeTreeDataPart::MinMaxIndex calculate(const DataPartsVector & parts, const MergeTreeData & storage); +}; + +} diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 4761ccd8b58..fd5354a00a9 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -5,9 +5,9 @@ #include #include -#include #include #include +#include #include "Common/Exception.h" #include #include @@ -20,25 +20,30 @@ #include #include #include +#include #include -#include #include #include #include #include #include -#include #include +#include +#include #include #include #include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include #include #include #include @@ -2039,41 +2044,73 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con ProfileEventsScope profile_events_scope; MergeTreeData & src_data = checkStructureAndGetMergeTreeData(source_table, source_metadata_snapshot, my_metadata_snapshot); - String partition_id = getPartitionIDFromQuery(partition, local_context); + String partition_id = src_data.getPartitionIDFromQuery(partition, local_context); DataPartsVector src_parts = src_data.getVisibleDataPartsVectorInPartition(local_context, partition_id); + + bool attach_empty_partition = !replace && src_parts.empty(); + if (attach_empty_partition) + return; + MutableDataPartsVector dst_parts; std::vector dst_parts_locks; static const String TMP_PREFIX = "tmp_replace_from_"; - for (const DataPartPtr & src_part : src_parts) + const auto my_partition_expression = 
my_metadata_snapshot->getPartitionKeyAST(); + const auto src_partition_expression = source_metadata_snapshot->getPartitionKeyAST(); + const auto is_partition_exp_different = queryToStringNullable(my_partition_expression) != queryToStringNullable(src_partition_expression); + + if (is_partition_exp_different && !src_parts.empty()) + MergeTreePartitionCompatibilityVerifier::verify(src_data, /* destination_storage */ *this, src_parts); + + for (DataPartPtr & src_part : src_parts) { if (!canReplacePartition(src_part)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot replace partition '{}' because part '{}' has inconsistent granularity with table", partition_id, src_part->name); - /// This will generate unique name in scope of current server process. - Int64 temp_index = insert_increment.get(); - MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); - IDataPartStorage::ClonePartParams clone_params{.txn = local_context->getCurrentTransaction()}; - auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( - src_part, - TMP_PREFIX, - dst_part_info, - my_metadata_snapshot, - clone_params, - local_context->getReadSettings(), - local_context->getWriteSettings()); - dst_parts.emplace_back(std::move(dst_part)); - dst_parts_locks.emplace_back(std::move(part_lock)); - } + /// This will generate unique name in scope of current server process. + auto index = insert_increment.get(); - /// ATTACH empty part set - if (!replace && dst_parts.empty()) - return; + if (is_partition_exp_different) + { + auto [new_partition, new_min_max_index] = createPartitionAndMinMaxIndexFromSourcePart( + src_part, my_metadata_snapshot, local_context); + + auto [dst_part, part_lock] = cloneAndLoadPartOnSameDiskWithDifferentPartitionKey( + src_part, + new_partition, + new_partition.getID(*this), + new_min_max_index, + TMP_PREFIX, + my_metadata_snapshot, + clone_params, + local_context, + index, + index); + + dst_parts.emplace_back(std::move(dst_part)); + dst_parts_locks.emplace_back(std::move(part_lock)); + } + else + { + MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); + + auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( + src_part, + TMP_PREFIX, + dst_part_info, + my_metadata_snapshot, + clone_params, + local_context->getReadSettings(), + local_context->getWriteSettings()); + dst_parts.emplace_back(std::move(dst_part)); + dst_parts_locks.emplace_back(std::move(part_lock)); + } + } MergeTreePartInfo drop_range; if (replace) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index f7e6783dbc2..512811e39d7 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -26,22 +26,21 @@ #include -#include #include #include #include #include #include -#include #include #include -#include #include #include #include #include #include +#include #include +#include #include #include #include @@ -53,9 +52,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -2713,16 +2714,48 @@ bool StorageReplicatedMergeTree::executeReplaceRange(LogEntry & entry) .copy_instead_of_hardlink = storage_settings_ptr->always_use_copy_instead_of_hardlinks || ((our_zero_copy_enabled || source_zero_copy_enabled) && part_desc->src_table_part->isStoredOnRemoteDiskWithZeroCopySupport()), .metadata_version_to_write = metadata_snapshot->getMetadataVersion() }; - auto [res_part, temporary_part_lock] = cloneAndLoadDataPartOnSameDisk( - 
part_desc->src_table_part, - TMP_PREFIX + "clone_", - part_desc->new_part_info, - metadata_snapshot, - clone_params, - getContext()->getReadSettings(), - getContext()->getWriteSettings()); - part_desc->res_part = std::move(res_part); - part_desc->temporary_part_lock = std::move(temporary_part_lock); + + const auto my_partition_expression = metadata_snapshot->getPartitionKeyAST(); + const auto src_partition_expression = source_table->getInMemoryMetadataPtr()->getPartitionKeyAST(); + + const auto is_partition_exp_different = queryToStringNullable(my_partition_expression) != queryToStringNullable(src_partition_expression); + + if (is_partition_exp_different) + { + auto [new_partition, new_min_max_index] = createPartitionAndMinMaxIndexFromSourcePart( + part_desc->src_table_part, metadata_snapshot, getContext()); + + auto partition_id = new_partition.getID(*this); + + auto [res_part, temporary_part_lock] = cloneAndLoadPartOnSameDiskWithDifferentPartitionKey( + part_desc->src_table_part, + new_partition, + partition_id, + new_min_max_index, + TMP_PREFIX + "clone_", + metadata_snapshot, + clone_params, + getContext(), + part_desc->new_part_info.min_block, + part_desc->new_part_info.max_block); + + part_desc->res_part = std::move(res_part); + part_desc->temporary_part_lock = std::move(temporary_part_lock); + } + else + { + auto [res_part, temporary_part_lock] = cloneAndLoadDataPartOnSameDisk( + part_desc->src_table_part, + TMP_PREFIX + "clone_", + part_desc->new_part_info, + metadata_snapshot, + clone_params, + getContext()->getReadSettings(), + getContext()->getWriteSettings()); + + part_desc->res_part = std::move(res_part); + part_desc->temporary_part_lock = std::move(temporary_part_lock); + } } else if (!part_desc->replica.empty()) { @@ -7852,11 +7885,22 @@ void StorageReplicatedMergeTree::replacePartitionFrom( ProfileEventsScope profile_events_scope; MergeTreeData & src_data = checkStructureAndGetMergeTreeData(source_table, source_metadata_snapshot, metadata_snapshot); - String partition_id = getPartitionIDFromQuery(partition, query_context); + String partition_id = src_data.getPartitionIDFromQuery(partition, query_context); /// NOTE: Some covered parts may be missing in src_all_parts if corresponding log entries are not executed yet. 
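+    /// The partition id above is resolved against the source table, because source and destination partition expressions +    /// may differ; when they do, each part's destination partition and min/max index are recomputed before cloning (see below).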
DataPartsVector src_all_parts = src_data.getVisibleDataPartsVectorInPartition(query_context, partition_id); + bool attach_empty_partition = !replace && src_all_parts.empty(); + if (attach_empty_partition) + return; + + const auto my_partition_expression = metadata_snapshot->getPartitionKeyAST(); + const auto src_partition_expression = source_metadata_snapshot->getPartitionKeyAST(); + const auto is_partition_exp_different = queryToStringNullable(my_partition_expression) != queryToStringNullable(src_partition_expression); + + if (is_partition_exp_different && !src_all_parts.empty()) + MergeTreePartitionCompatibilityVerifier::verify(src_data, /* destination_storage */ *this, src_all_parts); + LOG_DEBUG(log, "Cloning {} parts", src_all_parts.size()); static const String TMP_PREFIX = "tmp_replace_from_"; @@ -7911,6 +7955,18 @@ void StorageReplicatedMergeTree::replacePartitionFrom( "Cannot replace partition '{}' because part '{}" "' has inconsistent granularity with table", partition_id, src_part->name); + IMergeTreeDataPart::MinMaxIndex min_max_index = *src_part->minmax_idx; + MergeTreePartition merge_tree_partition = src_part->partition; + + if (is_partition_exp_different) + { + auto [new_partition, new_min_max_index] = createPartitionAndMinMaxIndexFromSourcePart(src_part, metadata_snapshot, query_context); + + merge_tree_partition = new_partition; + min_max_index = new_min_max_index; + partition_id = merge_tree_partition.getID(*this); + } + String hash_hex = src_part->checksums.getTotalChecksumHex(); const bool is_duplicated_part = replaced_parts.contains(hash_hex); replaced_parts.insert(hash_hex); @@ -7929,27 +7985,52 @@ void StorageReplicatedMergeTree::replacePartitionFrom( continue; } - UInt64 index = lock->getNumber(); - MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); - bool zero_copy_enabled = storage_settings_ptr->allow_remote_fs_zero_copy_replication || dynamic_cast(source_table.get())->getSettings()->allow_remote_fs_zero_copy_replication; + + UInt64 index = lock->getNumber(); + IDataPartStorage::ClonePartParams clone_params { .copy_instead_of_hardlink = storage_settings_ptr->always_use_copy_instead_of_hardlinks || (zero_copy_enabled && src_part->isStoredOnRemoteDiskWithZeroCopySupport()), .metadata_version_to_write = metadata_snapshot->getMetadataVersion() }; - auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( - src_part, - TMP_PREFIX, - dst_part_info, - metadata_snapshot, - clone_params, - query_context->getReadSettings(), - query_context->getWriteSettings()); + + if (is_partition_exp_different) + { + auto [dst_part, part_lock] = cloneAndLoadPartOnSameDiskWithDifferentPartitionKey( + src_part, + merge_tree_partition, + partition_id, + min_max_index, + TMP_PREFIX, + metadata_snapshot, + clone_params, + query_context, + index, + index); + + dst_parts.emplace_back(dst_part); + dst_parts_locks.emplace_back(std::move(part_lock)); + } + else + { + MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); + + auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( + src_part, + TMP_PREFIX, + dst_part_info, + metadata_snapshot, + clone_params, + query_context->getReadSettings(), + query_context->getWriteSettings()); + + dst_parts.emplace_back(dst_part); + dst_parts_locks.emplace_back(std::move(part_lock)); + } + src_parts.emplace_back(src_part); - dst_parts.emplace_back(dst_part); - dst_parts_locks.emplace_back(std::move(part_lock)); ephemeral_locks.emplace_back(std::move(*lock)); 
block_id_paths.emplace_back(block_id_path); part_checksums.emplace_back(hash_hex); diff --git a/tests/integration/test_attach_partition_distinct_expression_replicated/__init__.py b/tests/integration/test_attach_partition_distinct_expression_replicated/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_attach_partition_distinct_expression_replicated/configs/remote_servers.xml b/tests/integration/test_attach_partition_distinct_expression_replicated/configs/remote_servers.xml new file mode 100644 index 00000000000..b40730e9f7d --- /dev/null +++ b/tests/integration/test_attach_partition_distinct_expression_replicated/configs/remote_servers.xml @@ -0,0 +1,17 @@ + + + + + true + + replica1 + 9000 + + + replica2 + 9000 + + + + + diff --git a/tests/integration/test_attach_partition_distinct_expression_replicated/test.py b/tests/integration/test_attach_partition_distinct_expression_replicated/test.py new file mode 100644 index 00000000000..1d8ac4e9e37 --- /dev/null +++ b/tests/integration/test_attach_partition_distinct_expression_replicated/test.py @@ -0,0 +1,214 @@ +import pytest +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import assert_eq_with_retry + +cluster = ClickHouseCluster(__file__) + +replica1 = cluster.add_instance( + "replica1", with_zookeeper=True, main_configs=["configs/remote_servers.xml"] +) +replica2 = cluster.add_instance( + "replica2", with_zookeeper=True, main_configs=["configs/remote_servers.xml"] +) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + except Exception as ex: + print(ex) + finally: + cluster.shutdown() + + +def cleanup(nodes): + for node in nodes: + node.query("DROP TABLE IF EXISTS source SYNC") + node.query("DROP TABLE IF EXISTS destination SYNC") + + +def create_table(node, table_name, replicated): + replica = node.name + engine = ( + f"ReplicatedMergeTree('/clickhouse/tables/1/{table_name}', '{replica}')" + if replicated + else "MergeTree()" + ) + partition_expression = ( + "toYYYYMMDD(timestamp)" if table_name == "source" else "toYYYYMM(timestamp)" + ) + node.query_with_retry( + """ + CREATE TABLE {table_name}(timestamp DateTime) + ENGINE = {engine} + ORDER BY tuple() PARTITION BY {partition_expression} + SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_cleanup_delay_period=1; + """.format( + table_name=table_name, + engine=engine, + partition_expression=partition_expression, + ) + ) + + +def test_both_replicated(start_cluster): + for node in [replica1, replica2]: + create_table(node, "source", True) + create_table(node, "destination", True) + + replica1.query("INSERT INTO source VALUES ('2010-03-02 02:01:01')") + replica1.query("SYSTEM SYNC REPLICA source") + replica1.query("SYSTEM SYNC REPLICA destination") + replica1.query( + f"ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source" + ) + + assert_eq_with_retry( + replica1, f"SELECT * FROM destination", "2010-03-02 02:01:01\n" + ) + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination", + replica2.query(f"SELECT * FROM destination"), + ) + + cleanup([replica1, replica2]) + + +def test_only_destination_replicated(start_cluster): + create_table(replica1, "source", False) + create_table(replica1, "destination", True) + create_table(replica2, "destination", True) + + replica1.query("INSERT INTO source VALUES ('2010-03-02 02:01:01')") + replica1.query("SYSTEM SYNC REPLICA destination") + replica1.query( + f"ALTER TABLE destination ATTACH 
PARTITION ID '20100302' FROM source" + ) + + assert_eq_with_retry( + replica1, f"SELECT * FROM destination", "2010-03-02 02:01:01\n" + ) + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination", + replica2.query(f"SELECT * FROM destination"), + ) + + cleanup([replica1, replica2]) + + +def test_both_replicated_partitioned_to_unpartitioned(start_cluster): + def create_tables(nodes): + for node in nodes: + source_engine = ( + f"ReplicatedMergeTree('/clickhouse/tables/1/source', '{node.name}')" + ) + node.query( + """ + CREATE TABLE source(timestamp DateTime) + ENGINE = {engine} + ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp) + SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_cleanup_delay_period=1; + """.format( + engine=source_engine, + ) + ) + + destination_engine = f"ReplicatedMergeTree('/clickhouse/tables/1/destination', '{node.name}')" + node.query( + """ + CREATE TABLE destination(timestamp DateTime) + ENGINE = {engine} + ORDER BY tuple() PARTITION BY tuple() + SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_cleanup_delay_period=1; + """.format( + engine=destination_engine, + ) + ) + + create_tables([replica1, replica2]) + + replica1.query("INSERT INTO source VALUES ('2010-03-02 02:01:01')") + replica1.query("INSERT INTO source VALUES ('2010-03-03 02:01:01')") + replica1.query("SYSTEM SYNC REPLICA source") + replica1.query("SYSTEM SYNC REPLICA destination") + + replica1.query( + f"ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source" + ) + replica1.query( + f"ALTER TABLE destination ATTACH PARTITION ID '20100303' FROM source" + ) + + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination ORDER BY timestamp", + "2010-03-02 02:01:01\n2010-03-03 02:01:01\n", + ) + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination ORDER BY timestamp", + replica2.query(f"SELECT * FROM destination ORDER BY timestamp"), + ) + + cleanup([replica1, replica2]) + + +def test_both_replicated_different_exp_same_id(start_cluster): + def create_tables(nodes): + for node in nodes: + source_engine = ( + f"ReplicatedMergeTree('/clickhouse/tables/1/source', '{node.name}')" + ) + node.query( + """ + CREATE TABLE source(a UInt16,b UInt16,c UInt16,extra UInt64,Path String,Time DateTime,Value Float64,Timestamp Int64,sign Int8) + ENGINE = {engine} + ORDER BY tuple() PARTITION BY a % 3 + SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_cleanup_delay_period=1; + """.format( + engine=source_engine, + ) + ) + + destination_engine = f"ReplicatedMergeTree('/clickhouse/tables/1/destination', '{node.name}')" + node.query( + """ + CREATE TABLE destination(a UInt16,b UInt16,c UInt16,extra UInt64,Path String,Time DateTime,Value Float64,Timestamp Int64,sign Int8) + ENGINE = {engine} + ORDER BY tuple() PARTITION BY a + SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_cleanup_delay_period=1; + """.format( + engine=destination_engine, + ) + ) + + create_tables([replica1, replica2]) + + replica1.query( + "INSERT INTO source (a, b, c, extra, sign) VALUES (1, 5, 9, 1000, 1)" + ) + replica1.query( + "INSERT INTO source (a, b, c, extra, sign) VALUES (2, 6, 10, 1000, 1)" + ) + replica1.query("SYSTEM SYNC REPLICA source") + replica1.query("SYSTEM SYNC REPLICA destination") + + replica1.query(f"ALTER TABLE destination ATTACH PARTITION 1 FROM source") + replica1.query(f"ALTER TABLE destination ATTACH PARTITION 2 FROM source") + + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination ORDER BY a", + 
"1\t5\t9\t1000\t\t1970-01-01 00:00:00\t0\t0\t1\n2\t6\t10\t1000\t\t1970-01-01 00:00:00\t0\t0\t1\n", + ) + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination ORDER BY a", + replica2.query(f"SELECT * FROM destination ORDER BY a"), + ) + + cleanup([replica1, replica2]) diff --git a/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.reference b/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.reference new file mode 100644 index 00000000000..f1d036b08bf --- /dev/null +++ b/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.reference @@ -0,0 +1,467 @@ +-- { echoOn } +-- Should be allowed since destination partition expr is monotonically increasing and compatible +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '20100302' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +-- Should be allowed since destination partition expr is monotonically increasing and compatible. Note that even though +-- the destination partition expression is more granular, the data would still fall in the same partition. 
Thus, it is valid +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +20100302 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '201003' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +20100302 +-- Should be allowed since destination partition expr is monotonically increasing and compatible for those specific values +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY intDiv(A, 6); +CREATE TABLE destination (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY A; +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 1), ('2010-03-02 02:01:03', 1); +ALTER TABLE destination ATTACH PARTITION ID '0' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 1 +2010-03-02 02:01:03 1 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 1 +2010-03-02 02:01:03 1 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION 0 FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 1 +2010-03-02 02:01:03 1 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 1 +2010-03-02 02:01:03 1 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1 +-- Should be allowed because dst partition exp is monot inc and data is not split +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY cityHash64(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); +ALTER TABLE destination ATTACH PARTITION ID '17908065610379824077' from source; +SELECT * FROM source ORDER BY productName; +mop general +rice food +spaghetti food +SELECT * FROM destination ORDER BY productName; +rice food +spaghetti food +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +59532f3c39a412a413f0f014c7750a9d +59532f3c39a412a413f0f014c7750a9d +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '17908065610379824077' from source; +SELECT * FROM source ORDER BY productName; +mop general +rice food +spaghetti food +SELECT * FROM 
destination ORDER BY productName; +rice food +spaghetti food +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +59532f3c39a412a413f0f014c7750a9d +59532f3c39a412a413f0f014c7750a9d +-- Should be allowed, extra test case to validate https://github.com/ClickHouse/ClickHouse/pull/39507#issuecomment-1747574133 + +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp Int64) engine=MergeTree ORDER BY (timestamp) PARTITION BY intDiv(timestamp, 86400000); +CREATE TABLE destination (timestamp Int64) engine=MergeTree ORDER BY (timestamp) PARTITION BY toYear(toDateTime(intDiv(timestamp, 1000))); +INSERT INTO TABLE source VALUES (1267495261123); +ALTER TABLE destination ATTACH PARTITION ID '14670' FROM source; +SELECT * FROM source ORDER BY timestamp; +1267495261123 +SELECT * FROM destination ORDER BY timestamp; +1267495261123 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +2010 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '14670' from source; +SELECT * FROM source ORDER BY timestamp; +1267495261123 +SELECT * FROM destination ORDER BY timestamp; +1267495261123 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +2010 +-- Should be allowed, extra test case to validate https://github.com/ClickHouse/ClickHouse/pull/39507#issuecomment-1747511726 + +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime('UTC'), key Int64, f Float64) engine=MergeTree ORDER BY (key, timestamp) PARTITION BY toYear(timestamp); +CREATE TABLE destination (timestamp DateTime('UTC'), key Int64, f Float64) engine=MergeTree ORDER BY (key, timestamp) PARTITION BY (intDiv(toUInt32(timestamp),86400)); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01',1,1),('2010-03-02 02:01:01',1,1),('2011-02-02 02:01:03',1,1); +ALTER TABLE destination ATTACH PARTITION ID '2010' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 1 1 +2010-03-02 02:01:01 1 1 +2011-02-02 02:01:03 1 1 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 1 1 +2010-03-02 02:01:01 1 1 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +14670 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '2010' from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 1 1 +2010-03-02 02:01:01 1 1 +2011-02-02 02:01:03 1 1 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 1 1 +2010-03-02 02:01:01 1 1 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +14670 +-- Should be allowed, partitioned table to unpartitioned. Since the destination is unpartitioned, parts would ultimately +-- fall into the same partition. +-- Destination partition by expression is omitted, which causes StorageMetadata::getPartitionKeyAST() to be nullptr. 
+DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple(); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +all +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '201003' from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +all +-- Same as above, but destination partition by expression is explicitly defined. Test case required to validate that +-- partition by tuple() is accepted. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY tuple(); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +all +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '201003' from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +all +-- Should be allowed because the destination partition expression columns are a subset of the source partition expression columns +-- Columns in this case refer to the expression elements, not to the actual table columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b, c); +CREATE TABLE destination (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b); +INSERT INTO TABLE source VALUES (1, 2, 3), (1, 2, 4); +ALTER TABLE destination ATTACH PARTITION ID '1-2-3' FROM source; +SELECT * FROM source ORDER BY (a, b, c); +1 2 3 +1 2 4 +SELECT * FROM destination ORDER BY (a, b, c); +1 2 3 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1-2 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION (1, 2, 3) from source; +SELECT * FROM source ORDER BY (a, b, c); +1 2 3 +1 2 4 +SELECT * FROM destination ORDER BY (a, b, c); +1 2 3 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1-2 +-- Should be allowed because the destination 
partition expression columns are a subset of the source partition expression columns +-- Columns in this case refer to the expression elements, not to the actual table columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b, c); +CREATE TABLE destination (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY a; +INSERT INTO TABLE source VALUES (1, 2, 3), (1, 2, 4); +ALTER TABLE destination ATTACH PARTITION ID '1-2-3' FROM source; +SELECT * FROM source ORDER BY (a, b, c); +1 2 3 +1 2 4 +SELECT * FROM destination ORDER BY (a, b, c); +1 2 3 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION (1, 2, 3) from source; +SELECT * FROM source ORDER BY (a, b, c); +1 2 3 +1 2 4 +SELECT * FROM destination ORDER BY (a, b, c); +1 2 3 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1 +-- Should be allowed. Special test case, tricky to explain. First column of source partition expression is +-- timestamp, while first column of destination partition expression is `A`. One of the previous implementations +-- would not match the columns, which could lead to `timestamp` min max being used to calculate monotonicity of `A`. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (`timestamp` DateTime, `A` Int64) ENGINE = MergeTree PARTITION BY tuple(toYYYYMM(timestamp), intDiv(A, 6)) ORDER BY timestamp; +CREATE TABLE destination (`timestamp` DateTime, `A` Int64) ENGINE = MergeTree PARTITION BY A ORDER BY timestamp; +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 5); +ALTER TABLE destination ATTACH PARTITION ID '201003-0' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 5 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 5 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +5 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION (201003, 0) from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 5 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 5 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +5 +-- Should be allowed. Destination partition expression contains multiple expressions, but all of them are monotonically +-- increasing in the source partition min max indexes. 
+DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(A, B) ORDER BY tuple(); +CREATE TABLE destination (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(intDiv(A, 2), intDiv(B, 2)) ORDER BY tuple(); +INSERT INTO TABLE source VALUES (6, 12); +ALTER TABLE destination ATTACH PARTITION ID '6-12' FROM source; +SELECT * FROM source ORDER BY A; +6 12 +SELECT * FROM destination ORDER BY A; +6 12 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +3-6 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION (6, 12) from source; +SELECT * FROM source ORDER BY A; +6 12 +SELECT * FROM destination ORDER BY A; +6 12 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +3-6 +-- Should be allowed. The same scenario as above, but partition expressions inverted. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(intDiv(A, 2), intDiv(B, 2)) ORDER BY tuple(); +CREATE TABLE destination (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(A, B) ORDER BY tuple(); +INSERT INTO TABLE source VALUES (6, 12); +ALTER TABLE destination ATTACH PARTITION ID '3-6' FROM source; +SELECT * FROM source ORDER BY A; +6 12 +SELECT * FROM destination ORDER BY A; +6 12 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +6-12 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION (3, 6) from source; +SELECT * FROM source ORDER BY A; +6 12 +SELECT * FROM destination ORDER BY A; +6 12 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +6-12 +-- Should be allowed, it is a local operation, no different than regular attach. Replicated to replicated. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE + source(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/source_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMMDD(timestamp) + ORDER BY tuple(); +CREATE TABLE + destination(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/destination_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMM(timestamp) + ORDER BY tuple(); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '20100302' from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +-- Should be allowed, it is a local operation, no different than regular attach. 
Non replicated to replicated +DROP TABLE IF EXISTS source SYNC; +DROP TABLE IF EXISTS destination SYNC; +CREATE TABLE source(timestamp DateTime) ENGINE = MergeTree() PARTITION BY toYYYYMMDD(timestamp) ORDER BY tuple(); +CREATE TABLE + destination(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/destination_non_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMM(timestamp) + ORDER BY tuple(); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '20100302' from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +-- Should not be allowed because data would be split into two different partitions +DROP TABLE IF EXISTS source SYNC; +DROP TABLE IF EXISTS destination SYNC; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-03 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; -- { serverError 248 } +ALTER TABLE destination ATTACH PARTITION '201003' from source; -- { serverError 248 } +-- Should not be allowed because data would be split into two different partitions +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY intDiv(A, 6); +CREATE TABLE destination (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY A; +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 1), ('2010-03-02 02:01:03', 2); +ALTER TABLE destination ATTACH PARTITION ID '0' FROM source; -- { serverError 248 } +ALTER TABLE destination ATTACH PARTITION 0 FROM source; -- { serverError 248 } +-- Should not be allowed because dst partition exp takes more than two arguments, so it's not considered monotonically inc +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY substring(category, 1, 2); +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); +ALTER TABLE destination ATTACH PARTITION ID '4590ba78048910b74a47d5bfb308abed' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'food' from source; -- { serverError 36 } +-- Should not be allowed because dst partition exp depends on a different set of columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (productName String, category 
String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(productName); +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); +ALTER TABLE destination ATTACH PARTITION ID '4590ba78048910b74a47d5bfb308abed' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'food' from source; -- { serverError 36 } +-- Should not be allowed because dst partition exp is not monotonically increasing +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (productName String) engine=MergeTree ORDER BY tuple() PARTITION BY left(productName, 2); +CREATE TABLE destination (productName String) engine=MergeTree ORDER BY tuple() PARTITION BY cityHash64(productName); +INSERT INTO TABLE source VALUES ('bread'), ('mop'); +INSERT INTO TABLE source VALUES ('broccoli'); +ALTER TABLE destination ATTACH PARTITION ID '4589453b7ee96ce9de1265bd57674496' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'br' from source; -- { serverError 36 } +-- Empty/ non-existent partition, same partition expression. Nothing should happen +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +ALTER TABLE destination ATTACH PARTITION ID '1' FROM source; +ALTER TABLE destination ATTACH PARTITION 1 FROM source; +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +-- Empty/ non-existent partition, different partition expression. Nothing should happen +-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +ALTER TABLE destination ATTACH PARTITION ID '1' FROM source; +ALTER TABLE destination ATTACH PARTITION 1 FROM source; +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +-- Replace instead of attach. Empty/ non-existent partition, same partition expression. Nothing should happen +-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +ALTER TABLE destination REPLACE PARTITION '1' FROM source; +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +-- Replace instead of attach. Empty/ non-existent partition to non-empty partition, same partition id. 
+-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (A Int) engine=MergeTree ORDER BY tuple() PARTITION BY A; +CREATE TABLE destination (A Int) engine=MergeTree ORDER BY tuple() PARTITION BY A; +INSERT INTO TABLE destination VALUES (1); +ALTER TABLE destination REPLACE PARTITION '1' FROM source; +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; diff --git a/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.sql b/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.sql new file mode 100644 index 00000000000..9547d6ae249 --- /dev/null +++ b/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.sql @@ -0,0 +1,485 @@ +-- { echoOn } +-- Should be allowed since destination partition expr is monotonically increasing and compatible +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '20100302' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed since destination partition expr is monotonically increasing and compatible. Note that even though +-- the destination partition expression is more granular, the data would still fall in the same partition. 
Thus, it is valid +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '201003' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed since destination partition expr is monotonically increasing and compatible for those specific values +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY intDiv(A, 6); + +CREATE TABLE destination (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY A; + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 1), ('2010-03-02 02:01:03', 1); + +ALTER TABLE destination ATTACH PARTITION ID '0' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION 0 FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed because dst partition exp is monot inc and data is not split +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY cityHash64(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); + +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); + +ALTER TABLE destination ATTACH PARTITION ID '17908065610379824077' from source; + +SELECT * FROM source ORDER BY productName; +SELECT * FROM destination ORDER BY productName; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '17908065610379824077' from source; + +SELECT * FROM source ORDER BY productName; +SELECT * FROM destination ORDER BY productName; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed, extra test case to validate https://github.com/ClickHouse/ClickHouse/pull/39507#issuecomment-1747574133 + +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp Int64) engine=MergeTree ORDER BY (timestamp) PARTITION BY intDiv(timestamp, 86400000); +CREATE TABLE destination (timestamp 
Int64) engine=MergeTree ORDER BY (timestamp) PARTITION BY toYear(toDateTime(intDiv(timestamp, 1000))); + +INSERT INTO TABLE source VALUES (1267495261123); + +ALTER TABLE destination ATTACH PARTITION ID '14670' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '14670' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed, extra test case to validate https://github.com/ClickHouse/ClickHouse/pull/39507#issuecomment-1747511726 + +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime('UTC'), key Int64, f Float64) engine=MergeTree ORDER BY (key, timestamp) PARTITION BY toYear(timestamp); +CREATE TABLE destination (timestamp DateTime('UTC'), key Int64, f Float64) engine=MergeTree ORDER BY (key, timestamp) PARTITION BY (intDiv(toUInt32(timestamp),86400)); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01',1,1),('2010-03-02 02:01:01',1,1),('2011-02-02 02:01:03',1,1); + +ALTER TABLE destination ATTACH PARTITION ID '2010' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '2010' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed, partitioned table to unpartitioned. Since the destination is unpartitioned, parts would ultimately +-- fall into the same partition. +-- Destination partition by expression is omitted, which causes StorageMetadata::getPartitionKeyAST() to be nullptr. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple(); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '201003' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Same as above, but destination partition by expression is explicitly defined. Test case required to validate that +-- partition by tuple() is accepted. 
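+-- As with the omitted expression above, the attached part is expected to end up in the single partition
+-- with id 'all', which the partition_id checks below verify for the explicit tuple() case as well.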
+DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY tuple(); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '201003' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed because the destination partition expression columns are a subset of the source partition expression columns +-- Columns in this case refer to the expression elements, not to the actual table columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b, c); +CREATE TABLE destination (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b); + +INSERT INTO TABLE source VALUES (1, 2, 3), (1, 2, 4); + +ALTER TABLE destination ATTACH PARTITION ID '1-2-3' FROM source; + +SELECT * FROM source ORDER BY (a, b, c); +SELECT * FROM destination ORDER BY (a, b, c); +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION (1, 2, 3) from source; + +SELECT * FROM source ORDER BY (a, b, c); +SELECT * FROM destination ORDER BY (a, b, c); +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed because the destination partition expression columns are a subset of the source partition expression columns +-- Columns in this case refer to the expression elements, not to the actual table columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b, c); +CREATE TABLE destination (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY a; + +INSERT INTO TABLE source VALUES (1, 2, 3), (1, 2, 4); + +ALTER TABLE destination ATTACH PARTITION ID '1-2-3' FROM source; + +SELECT * FROM source ORDER BY (a, b, c); +SELECT * FROM destination ORDER BY (a, b, c); +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION (1, 2, 3) from source; + +SELECT * FROM source ORDER BY (a, b, c); +SELECT * FROM destination ORDER BY (a, b, c); +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed. Special test case, tricky to explain. First column of source partition expression is +-- timestamp, while first column of destination partition expression is `A`. One of the previous implementations +-- would not match the columns, which could lead to `timestamp` min max being used to calculate monotonicity of `A`. 
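+-- Illustration with the single row inserted below: the source part belongs to partition (201003, 0),
+-- since toYYYYMM('2010-03-02') = 201003 and intDiv(5, 6) = 0, while the destination partitions by A alone,
+-- so the attached part is expected to land in partition 5. A column mix-up would instead evaluate the
+-- monotonicity of A over the timestamp min/max values.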
+DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (`timestamp` DateTime, `A` Int64) ENGINE = MergeTree PARTITION BY tuple(toYYYYMM(timestamp), intDiv(A, 6)) ORDER BY timestamp; +CREATE TABLE destination (`timestamp` DateTime, `A` Int64) ENGINE = MergeTree PARTITION BY A ORDER BY timestamp; + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 5); + +ALTER TABLE destination ATTACH PARTITION ID '201003-0' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION (201003, 0) from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed. Destination partition expression contains multiple expressions, but all of them are monotonically +-- increasing in the source partition min max indexes. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(A, B) ORDER BY tuple(); +CREATE TABLE destination (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(intDiv(A, 2), intDiv(B, 2)) ORDER BY tuple(); + +INSERT INTO TABLE source VALUES (6, 12); + +ALTER TABLE destination ATTACH PARTITION ID '6-12' FROM source; + +SELECT * FROM source ORDER BY A; +SELECT * FROM destination ORDER BY A; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION (6, 12) from source; + +SELECT * FROM source ORDER BY A; +SELECT * FROM destination ORDER BY A; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed. The same scenario as above, but partition expressions inverted. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(intDiv(A, 2), intDiv(B, 2)) ORDER BY tuple(); +CREATE TABLE destination (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(A, B) ORDER BY tuple(); + +INSERT INTO TABLE source VALUES (6, 12); + +ALTER TABLE destination ATTACH PARTITION ID '3-6' FROM source; + +SELECT * FROM source ORDER BY A; +SELECT * FROM destination ORDER BY A; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION (3, 6) from source; + +SELECT * FROM source ORDER BY A; +SELECT * FROM destination ORDER BY A; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed, it is a local operation, no different than regular attach. Replicated to replicated. 
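+-- Here the source is partitioned by day (partition id 20100302) and the destination by month, so the
+-- attached part is expected to show up under partition id 201003, mirroring the non-replicated cases above.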
+DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE + source(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/source_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMMDD(timestamp) + ORDER BY tuple(); + +CREATE TABLE + destination(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/destination_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMM(timestamp) + ORDER BY tuple(); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '20100302' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed, it is a local operation, no different than regular attach. Non replicated to replicated +DROP TABLE IF EXISTS source SYNC; +DROP TABLE IF EXISTS destination SYNC; +CREATE TABLE source(timestamp DateTime) ENGINE = MergeTree() PARTITION BY toYYYYMMDD(timestamp) ORDER BY tuple(); + +CREATE TABLE + destination(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/destination_non_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMM(timestamp) + ORDER BY tuple(); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '20100302' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should not be allowed because data would be split into two different partitions +DROP TABLE IF EXISTS source SYNC; +DROP TABLE IF EXISTS destination SYNC; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-03 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; -- { serverError 248 } +ALTER TABLE destination ATTACH PARTITION '201003' from source; -- { serverError 248 } + +-- Should not be allowed because data would be split into two different partitions +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY intDiv(A, 6); + +CREATE TABLE destination (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY A; + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 1), ('2010-03-02 02:01:03', 2); + +ALTER TABLE destination 
ATTACH PARTITION ID '0' FROM source; -- { serverError 248 } +ALTER TABLE destination ATTACH PARTITION 0 FROM source; -- { serverError 248 } + +-- Should not be allowed because dst partition exp takes more than two arguments, so it's not considered monotonically inc +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY substring(category, 1, 2); + +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); + +ALTER TABLE destination ATTACH PARTITION ID '4590ba78048910b74a47d5bfb308abed' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'food' from source; -- { serverError 36 } + +-- Should not be allowed because dst partition exp depends on a different set of columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(productName); + +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); + +ALTER TABLE destination ATTACH PARTITION ID '4590ba78048910b74a47d5bfb308abed' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'food' from source; -- { serverError 36 } + +-- Should not be allowed because dst partition exp is not monotonically increasing +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (productName String) engine=MergeTree ORDER BY tuple() PARTITION BY left(productName, 2); +CREATE TABLE destination (productName String) engine=MergeTree ORDER BY tuple() PARTITION BY cityHash64(productName); + +INSERT INTO TABLE source VALUES ('bread'), ('mop'); +INSERT INTO TABLE source VALUES ('broccoli'); + +ALTER TABLE destination ATTACH PARTITION ID '4589453b7ee96ce9de1265bd57674496' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'br' from source; -- { serverError 36 } + +-- Empty/ non-existent partition, same partition expression. Nothing should happen +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); + +ALTER TABLE destination ATTACH PARTITION ID '1' FROM source; +ALTER TABLE destination ATTACH PARTITION 1 FROM source; + +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Empty/ non-existent partition, different partition expression. 
Nothing should happen +-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); + +ALTER TABLE destination ATTACH PARTITION ID '1' FROM source; +ALTER TABLE destination ATTACH PARTITION 1 FROM source; + +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Replace instead of attach. Empty/ non-existent partition, same partition expression. Nothing should happen +-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); + +ALTER TABLE destination REPLACE PARTITION '1' FROM source; + +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Replace instead of attach. Empty/ non-existent partition to non-empty partition, same partition id. +-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (A Int) engine=MergeTree ORDER BY tuple() PARTITION BY A; +CREATE TABLE destination (A Int) engine=MergeTree ORDER BY tuple() PARTITION BY A; + +INSERT INTO TABLE destination VALUES (1); + +ALTER TABLE destination REPLACE PARTITION '1' FROM source; + +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; From 5179891aef9792366d948efd9f1a2454dfe8da69 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 23 Jan 2024 11:43:08 -0300 Subject: [PATCH 0216/1081] remove static log --- src/Storages/MergeTree/MergeTreeDataPartCloner.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp b/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp index 78cb9aa0624..e384e1b7066 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp @@ -13,8 +13,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -static Poco::Logger * log = &Poco::Logger::get("MergeTreeDataPartCloner"); - namespace DistinctPartitionExpression { std::unique_ptr updatePartitionFile( @@ -182,7 +180,7 @@ std::pair cloneSourcePart( } LOG_DEBUG( - log, + &Poco::Logger::get("MergeTreeDataPartCloner"), "Clone {} part {} to {}{}", src_flushed_tmp_part ? 
"flushed" : "", src_part_storage->getFullPath(), From 7e86c0e9280bb6e46183c2c358474bfd283e2554 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Tue, 23 Jan 2024 23:03:15 +0800 Subject: [PATCH 0217/1081] Compress state of dashboard --- programs/server/dashboard.html | 6 +++--- programs/server/js/lz-string.js | 1 + src/Server/WebUIRequestHandler.cpp | 9 +++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 programs/server/js/lz-string.js diff --git a/programs/server/dashboard.html b/programs/server/dashboard.html index 04fdfb2d3ca..1f32048da79 100644 --- a/programs/server/dashboard.html +++ b/programs/server/dashboard.html @@ -5,6 +5,7 @@ ClickHouse Dashboard +