From 30f48e18938bbc5683d781f1cbfe7bfcf3fec8d9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Jan 2024 23:54:35 +0100 Subject: [PATCH 001/112] Use MergeTree as a default table engine --- src/Core/Settings.h | 2 +- src/Core/SettingsChangesHistory.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index e0b3ca39899..4460a365846 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -558,7 +558,7 @@ class IColumn; M(UInt64, min_free_disk_space_for_temporary_data, 0, "The minimum disk space to keep while writing temporary data used in external sorting and aggregation.", 0) \ \ M(DefaultTableEngine, default_temporary_table_engine, DefaultTableEngine::Memory, "Default table engine used when ENGINE is not set in CREATE TEMPORARY statement.",0) \ - M(DefaultTableEngine, default_table_engine, DefaultTableEngine::None, "Default table engine used when ENGINE is not set in CREATE statement.",0) \ + M(DefaultTableEngine, default_table_engine, DefaultTableEngine::MergeTree, "Default table engine used when ENGINE is not set in CREATE statement.",0) \ M(Bool, show_table_uuid_in_table_create_query_if_not_nil, false, "For tables in databases with Engine=Atomic show UUID of the table in its CREATE query.", 0) \ M(Bool, database_atomic_wait_for_drop_and_detach_synchronously, false, "When executing DROP or DETACH TABLE in Atomic database, wait for table data to be finally dropped or detached.", 0) \ M(Bool, enable_scalar_subquery_optimization, true, "If it is set to true, prevent scalar subqueries from (de)serializing large scalar values and possibly avoid running the same subquery more than once.", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index dff0ebb759c..7bdab886934 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -100,6 +100,7 @@ static std::map sett {"function_visible_width_behavior", 0, 1, "We changed the default behavior of `visibleWidth` to be more precise"}, {"max_estimated_execution_time", 0, 0, "Separate max_execution_time and max_estimated_execution_time"}, {"iceberg_engine_ignore_schema_evolution", false, false, "Allow to ignore schema evolution in Iceberg table engine"}, + {"default_table_engine", DefaultTableEngine::None, DefaultTableEngine::MergeTree, "Set default table engine to MergeTree for better usability"}, {"optimize_injective_functions_in_group_by", false, true, "Replace injective functions by it's arguments in GROUP BY section in analyzer"}}}, {"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."}, {"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"}, From 3f1ec9a9881949c7e676cf11f35fb75df3b95f78 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jan 2024 04:23:16 +0100 Subject: [PATCH 002/112] Fix error --- src/Core/SettingsChangesHistory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 692d8fc6360..53b14ddc385 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -102,7 +102,7 @@ static std::map sett {"function_visible_width_behavior", 0, 1, "We changed the default behavior of `visibleWidth` to be more precise"}, {"max_estimated_execution_time", 0, 0, "Separate max_execution_time and max_estimated_execution_time"}, {"iceberg_engine_ignore_schema_evolution", false, false, "Allow to ignore schema evolution in Iceberg table engine"}, - {"default_table_engine", DefaultTableEngine::None, DefaultTableEngine::MergeTree, "Set default table engine to MergeTree for better usability"}, + {"default_table_engine", "None", "MergeTree", "Set default table engine to MergeTree for better usability"}, {"optimize_injective_functions_in_group_by", false, true, "Replace injective functions by it's arguments in GROUP BY section in analyzer"}, {"update_insert_deduplication_token_in_dependent_materialized_views", false, false, "Allow to update insert deduplication token with table identifier during insert in dependent materialized views"}, {"azure_max_unexpected_write_error_retries", 4, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write"}}}, From 299c390d2b17e118a0fc87a21bc8859d135e006b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 30 Jan 2024 15:56:41 +0100 Subject: [PATCH 003/112] Add some fuzzing to ASTLiterals --- src/Client/QueryFuzzer.cpp | 57 ++++++++++++++++++++++++++++++++++---- src/Client/QueryFuzzer.h | 2 ++ 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/src/Client/QueryFuzzer.cpp b/src/Client/QueryFuzzer.cpp index 629d36e7960..786d5af0cb3 100644 --- a/src/Client/QueryFuzzer.cpp +++ b/src/Client/QueryFuzzer.cpp @@ -903,6 +903,54 @@ void QueryFuzzer::notifyQueryFailed(ASTPtr ast) remove_fuzzed_table(insert->getTable()); } +ASTPtr QueryFuzzer::fuzzLiteralUnderExpressionList(ASTPtr child) +{ + auto * l = child->as(); + chassert(l); + auto type = l->value.getType(); + if (type == Field::Types::Which::String && fuzz_rand() % 7 == 0) + { + String value = l->value.get(); + child = makeASTFunction( + "toFixedString", std::make_shared(value), std::make_shared(static_cast(value.size()))); + } + + if (fuzz_rand() % 11 == 0) + { + String value = l->value.get(); + child = makeASTFunction("toNullable", child); + } + + if (fuzz_rand() % 11 == 0) + { + String value = l->value.get(); + child = makeASTFunction("toLowCardinality", child); + } + + if (fuzz_rand() % 11 == 0) + { + String value = l->value.get(); + child = makeASTFunction("materialize", child); + } + + return child; +} + + +void QueryFuzzer::fuzzExpressionList(ASTExpressionList & expr_list) +{ + for (size_t i = 0; i < expr_list.children.size(); i++) + { + if (auto * literal = typeid_cast(expr_list.children[i].get())) + { + if (fuzz_rand() % 13 == 0) + expr_list.children[i] = fuzzLiteralUnderExpressionList(expr_list.children[i]); + } + else + fuzz(expr_list.children[i]); + } +} + void QueryFuzzer::fuzz(ASTs & asts) { for (auto & ast : asts) @@ -989,7 +1037,7 @@ void QueryFuzzer::fuzz(ASTPtr & ast) } else if (auto * expr_list = typeid_cast(ast.get())) { - fuzz(expr_list->children); + fuzzExpressionList(*expr_list); } else if (auto * order_by_element = typeid_cast(ast.get())) { @@ -1108,7 +1156,7 @@ void QueryFuzzer::fuzz(ASTPtr & ast) } /* * The time to fuzz the settings has not yet come. - * Apparently we don't have any infractructure to validate the values of + * Apparently we don't have any infrastructure to validate the values of * the settings, and the first query with max_block_size = -1 breaks * because of overflows here and there. *//* @@ -1131,9 +1179,8 @@ void QueryFuzzer::fuzz(ASTPtr & ast) // are ASTPtr -- this is redundant ownership, but hides the error if the // child field is replaced. Others can be ASTLiteral * or the like, which // leads to segfault if the pointed-to AST is replaced. - // Replacing children is safe in case of ASTExpressionList. In a more - // general case, we can change the value of ASTLiteral, which is what we - // do here. + // Replacing children is safe in case of ASTExpressionList (done in fuzzExpressionList). In a more + // general case, we can change the value of ASTLiteral, which is what we do here if (fuzz_rand() % 11 == 0) { literal->value = fuzzField(literal->value); diff --git a/src/Client/QueryFuzzer.h b/src/Client/QueryFuzzer.h index 18c7b8a9241..cdeba2b76fd 100644 --- a/src/Client/QueryFuzzer.h +++ b/src/Client/QueryFuzzer.h @@ -95,6 +95,8 @@ struct QueryFuzzer void fuzzExplainSettings(ASTSetQuery & settings_ast, ASTExplainQuery::ExplainKind kind); void fuzzColumnDeclaration(ASTColumnDeclaration & column); void fuzzTableName(ASTTableExpression & table); + ASTPtr fuzzLiteralUnderExpressionList(ASTPtr child); + void fuzzExpressionList(ASTExpressionList & expr_list); void fuzz(ASTs & asts); void fuzz(ASTPtr & ast); void collectFuzzInfoMain(ASTPtr ast); From 4b5e992565b060cc002495f8c58cceb79c75d53a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 30 Jan 2024 19:33:31 +0100 Subject: [PATCH 004/112] Fix problems --- src/Client/QueryFuzzer.cpp | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/Client/QueryFuzzer.cpp b/src/Client/QueryFuzzer.cpp index 786d5af0cb3..bb551fcb11e 100644 --- a/src/Client/QueryFuzzer.cpp +++ b/src/Client/QueryFuzzer.cpp @@ -916,22 +916,13 @@ ASTPtr QueryFuzzer::fuzzLiteralUnderExpressionList(ASTPtr child) } if (fuzz_rand() % 11 == 0) - { - String value = l->value.get(); child = makeASTFunction("toNullable", child); - } if (fuzz_rand() % 11 == 0) - { - String value = l->value.get(); child = makeASTFunction("toLowCardinality", child); - } if (fuzz_rand() % 11 == 0) - { - String value = l->value.get(); child = makeASTFunction("materialize", child); - } return child; } @@ -939,15 +930,15 @@ ASTPtr QueryFuzzer::fuzzLiteralUnderExpressionList(ASTPtr child) void QueryFuzzer::fuzzExpressionList(ASTExpressionList & expr_list) { - for (size_t i = 0; i < expr_list.children.size(); i++) + for (auto & child : expr_list.children) { - if (auto * literal = typeid_cast(expr_list.children[i].get())) + if (auto * literal = typeid_cast(child.get())) { if (fuzz_rand() % 13 == 0) - expr_list.children[i] = fuzzLiteralUnderExpressionList(expr_list.children[i]); + child = fuzzLiteralUnderExpressionList(child); } else - fuzz(expr_list.children[i]); + fuzz(child); } } From a3f0546f48af77d7c120a7e71d94b992a4446e2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 30 Jan 2024 19:44:55 +0100 Subject: [PATCH 005/112] Handle both fuzzer.log and fuzzer.log.ztd --- tests/ci/ast_fuzzer_check.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/ci/ast_fuzzer_check.py b/tests/ci/ast_fuzzer_check.py index 41e4ef19361..95a887484f2 100644 --- a/tests/ci/ast_fuzzer_check.py +++ b/tests/ci/ast_fuzzer_check.py @@ -113,7 +113,6 @@ def main(): paths = { "run.log": run_log_path, "main.log": main_log_path, - "fuzzer.log": workspace_path / "fuzzer.log", "report.html": workspace_path / "report.html", "core.zst": workspace_path / "core.zst", "dmesg.log": workspace_path / "dmesg.log", @@ -129,6 +128,14 @@ def main(): if not_compressed_server_log_path.exists(): paths["server.log"] = not_compressed_server_log_path + # Same idea but with the fuzzer log + compressed_fuzzer_log_path = workspace_path / "fuzzer.log.zst" + if compressed_fuzzer_log_path.exists(): + paths["fuzzer.log.zst"] = compressed_fuzzer_log_path + not_compressed_fuzzer_log_path = workspace_path / "fuzzer.log" + if not_compressed_fuzzer_log_path.exists(): + paths["fuzzer.log"] = not_compressed_fuzzer_log_path + # Try to get status message saved by the fuzzer try: with open(workspace_path / "status.txt", "r", encoding="utf-8") as status_f: From 4f0c78d66557bd74d21796ce2ea661132c26abc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 30 Jan 2024 20:25:26 +0100 Subject: [PATCH 006/112] Upload one file. Save the planet --- docker/test/fuzzer/run-fuzzer.sh | 4 ++-- tests/ci/ast_fuzzer_check.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 050d4b68628..ca6bff9c6be 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -389,8 +389,8 @@ fi rg --text -F '' server.log > fatal.log ||: dmesg -T > dmesg.log ||: -zstd --threads=0 server.log -zstd --threads=0 fuzzer.log +zstd --threads=0 --rm server.log +zstd --threads=0 --rm fuzzer.log cat > report.html < diff --git a/tests/ci/ast_fuzzer_check.py b/tests/ci/ast_fuzzer_check.py index 95a887484f2..26ce7f5140b 100644 --- a/tests/ci/ast_fuzzer_check.py +++ b/tests/ci/ast_fuzzer_check.py @@ -121,20 +121,20 @@ def main(): compressed_server_log_path = workspace_path / "server.log.zst" if compressed_server_log_path.exists(): paths["server.log.zst"] = compressed_server_log_path - - # The script can fail before the invocation of `zstd`, but we are still interested in its log: - - not_compressed_server_log_path = workspace_path / "server.log" - if not_compressed_server_log_path.exists(): - paths["server.log"] = not_compressed_server_log_path + else: + # The script can fail before the invocation of `zstd`, but we are still interested in its log: + not_compressed_server_log_path = workspace_path / "server.log" + if not_compressed_server_log_path.exists(): + paths["server.log"] = not_compressed_server_log_path # Same idea but with the fuzzer log compressed_fuzzer_log_path = workspace_path / "fuzzer.log.zst" if compressed_fuzzer_log_path.exists(): paths["fuzzer.log.zst"] = compressed_fuzzer_log_path - not_compressed_fuzzer_log_path = workspace_path / "fuzzer.log" - if not_compressed_fuzzer_log_path.exists(): - paths["fuzzer.log"] = not_compressed_fuzzer_log_path + else: + not_compressed_fuzzer_log_path = workspace_path / "fuzzer.log" + if not_compressed_fuzzer_log_path.exists(): + paths["fuzzer.log"] = not_compressed_fuzzer_log_path # Try to get status message saved by the fuzzer try: From 17ab2674f4c8ad7a09194659e0a0c86d4440f203 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 30 Jan 2024 20:35:10 +0100 Subject: [PATCH 007/112] impl --- src/Common/ElapsedTimeProfileEventIncrement.h | 3 +- src/Common/ProfileEvents.cpp | 7 +++ .../MergeTreeDataPartWriterOnDisk.cpp | 15 ++++++ .../MergeTree/MergeTreeDataPartWriterOnDisk.h | 2 + .../MergeTree/MergeTreeDataWriter.cpp | 47 ++++++++++++++----- 5 files changed, 60 insertions(+), 14 deletions(-) diff --git a/src/Common/ElapsedTimeProfileEventIncrement.h b/src/Common/ElapsedTimeProfileEventIncrement.h index b30afd24a4c..731295a4cfd 100644 --- a/src/Common/ElapsedTimeProfileEventIncrement.h +++ b/src/Common/ElapsedTimeProfileEventIncrement.h @@ -14,12 +14,13 @@ enum Time Seconds, }; -template