#include "QueryFuzzer.h"

// NOTE(review): the header list below was rebuilt from the names this file
// uses; confirm the exact paths against the project tree before merging.
#include <cassert>
#include <cfloat>
#include <climits>
#include <cmath>
#include <cstring>
#include <iostream>
#include <iterator>
#include <unordered_set>
#include <utility>

#include <fmt/format.h>
#include <magic_enum.hpp>
#include <pcg_random.hpp>

#include <Common/SipHash.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/assert_cast.h>
#include <Common/typeid_cast.h>
#include <Core/Defines.h>
#include <Core/Field.h>
#include <Core/Types.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/IDataType.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromOStream.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/StorageID.h>
#include <Parsers/ASTColumnDeclaration.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTInsertQuery.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTOrderByElement.h>
#include <Parsers/ASTSelectIntersectExceptQuery.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSelectWithUnionQuery.h>
#include <Parsers/ASTSubquery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/ASTWindowDefinition.h>
#include <Parsers/ParserDataType.h>
#include <Parsers/ParserInsertQuery.h>
#include <Parsers/formatAST.h>
#include <Parsers/parseQuery.h>


namespace DB
{

namespace ErrorCodes
{
    extern const int TOO_DEEP_RECURSION;
}

/// Returns a random "interesting" constant of the requested kind.
///
/// @param type 0 -- an integer from a fixed table of boundary values,
///             1 -- a float from a fixed table (NaN, infinities, limits, ...),
///             2 -- a decimal built from the integer table and a random scale.
///             Any other value trips an assert and yields Null.
Field QueryFuzzer::getRandomField(int type)
{
    static constexpr Int64 bad_int64_values[]
        = {-2, -1, 0, 1, 2, 3, 7, 10, 100, 255, 256, 257, 1023, 1024,
           1025, 65535, 65536, 65537, 1024 * 1024 - 1, 1024 * 1024,
           1024 * 1024 + 1, INT_MIN - 1ll, INT_MIN, INT_MIN + 1,
           INT_MAX - 1, INT_MAX, INT_MAX + 1ll, INT64_MIN, INT64_MIN + 1,
           INT64_MAX - 1, INT64_MAX};

    switch (type)
    {
        case 0:
        {
            return bad_int64_values[fuzz_rand() % std::size(bad_int64_values)];
        }
        case 1:
        {
            static constexpr float values[]
                = {NAN, INFINITY, -INFINITY, 0., -0., 0.0001, 0.5, 0.9999,
                   1., 1.0001, 2., 10.0001, 100.0001, 1000.0001, 1e10, 1e20,
                   FLT_MIN, FLT_MIN + FLT_EPSILON, FLT_MAX, FLT_MAX + FLT_EPSILON};
            return values[fuzz_rand() % std::size(values)];
        }
        case 2:
        {
            static constexpr UInt64 scales[] = {0, 1, 2, 10};
            // Draw the two random numbers in separate statements: as function
            // arguments their evaluation order would be unspecified, and a
            // given seed could then produce different output across compilers.
            const Int64 value = bad_int64_values[fuzz_rand() % std::size(bad_int64_values)];
            const UInt64 scale = scales[fuzz_rand() % std::size(scales)];
            return DecimalField<Decimal64>(value, scale);
        }
        default:
            assert(false);
            return Null{};
    }
}

/// Returns a randomly mutated copy of `field`.
///
/// With probability 1/20 the result is Null regardless of the input type.
/// Integer, float and decimal fields are replaced by a value from
/// getRandomField() (occasionally of a different numeric kind). Strings are
/// emptied, repeated, or get a zero byte injected. Arrays and tuples may lose
/// or gain an element, and every element is fuzzed recursively.
Field QueryFuzzer::fuzzField(Field field)
{
    const auto type = field.getType();

    int type_index = -1;

    if (type == Field::Types::Int64
        || type == Field::Types::UInt64)
    {
        type_index = 0;
    }
    else if (type == Field::Types::Float64)
    {
        type_index = 1;
    }
    else if (type == Field::Types::Decimal32
        || type == Field::Types::Decimal64
        || type == Field::Types::Decimal128)
    {
        type_index = 2;
    }

    if (fuzz_rand() % 20 == 0)
    {
        return Null{};
    }

    if (type_index >= 0)
    {
        if (fuzz_rand() % 20 == 0)
        {
            // Change type sometimes, but not often, because it mostly leads to
            // boring errors.
            type_index = fuzz_rand() % 3;
        }
        return getRandomField(type_index);
    }

    if (type == Field::Types::String)
    {
        auto & str = field.get<String>();
        UInt64 action = fuzz_rand() % 10;
        switch (action)
        {
            case 0:
                str = "";
                break;
            case 1:
                str = str + str;
                break;
            case 2:
                str = str + str + str + str;
                break;
            case 4:
                if (!str.empty())
                {
                    str[fuzz_rand() % str.size()] = '\0';
                }
                break;
            default:
                // Do nothing
                break;
        }
    }
    else if (type == Field::Types::Array || type == Field::Types::Tuple)
    {
        auto & arr = field.reinterpret<FieldVector>();

        if (fuzz_rand() % 5 == 0 && !arr.empty())
        {
            size_t pos = fuzz_rand() % arr.size();
            arr.erase(arr.begin() + pos);
            std::cerr << "erased\n";
        }

        if (fuzz_rand() % 5 == 0)
        {
            if (!arr.empty())
            {
                size_t pos = fuzz_rand() % arr.size();
                // fuzzField takes its argument by value, so the element is
                // copied before the insertion shifts the array.
                arr.insert(arr.begin() + pos, fuzzField(arr[pos]));
                std::cerr << fmt::format("inserted (pos {})\n", pos);
            }
            else
            {
                arr.insert(arr.begin(), getRandomField(0));
                std::cerr << "inserted (0)\n";
            }
        }

        for (auto & element : arr)
        {
            element = fuzzField(element);
        }
    }

    return field;
}

/// Returns a clone of a random collected column-like expression with its
/// alias cleared, or nullptr when nothing has been collected.
ASTPtr QueryFuzzer::getRandomColumnLike()
{
    if (column_like.empty())
    {
        return nullptr;
    }

    ASTPtr new_ast = column_like[fuzz_rand() % column_like.size()]->clone();
    new_ast->setAlias("");

    return new_ast;
}

/// Replaces `ast` with a random collected column-like expression, keeping the
/// alias of the replaced node. Does nothing when nothing has been collected.
void QueryFuzzer::replaceWithColumnLike(ASTPtr & ast)
{
    if (column_like.empty())
    {
        return;
    }

    std::string old_alias = ast->tryGetAlias();
    ast = getRandomColumnLike();
    ast->setAlias(old_alias);
}

/// Replaces `ast` with a clone of a random collected table-like expression,
/// keeping the alias of the replaced node. Does nothing when nothing has been
/// collected.
void QueryFuzzer::replaceWithTableLike(ASTPtr & ast)
{
    if (table_like.empty())
    {
        return;
    }

    ASTPtr new_ast = table_like[fuzz_rand() % table_like.size()]->clone();

    std::string old_alias = ast->tryGetAlias();
    new_ast->setAlias(old_alias);

    ast = new_ast;
}

/// Randomly changes the sort direction or the NULLS direction of one
/// ORDER BY element; half of the time it leaves the element untouched.
void QueryFuzzer::fuzzOrderByElement(ASTOrderByElement * elem)
{
    switch (fuzz_rand() % 10)
    {
        case 0:
            elem->direction = -1;
            break;
        case 1:
            elem->direction = 1;
            break;
        case 2:
            elem->nulls_direction = -1;
            elem->nulls_direction_was_explicitly_specified = true;
            break;
        case 3:
            elem->nulls_direction = 1;
            elem->nulls_direction_was_explicitly_specified = true;
            break;
        case 4:
            elem->nulls_direction = elem->direction;
            elem->nulls_direction_was_explicitly_specified = false;
            break;
        default:
            // do nothing
            break;
    }
}

/// Occasionally removes one element from, and/or adds one random column-like
/// element to, an ORDER BY list. A null `ast` is ignored.
void QueryFuzzer::fuzzOrderByList(IAST * ast)
{
    if (!ast)
    {
        return;
    }

    auto * list = assert_cast<ASTExpressionList *>(ast);

    // Remove element
    if (fuzz_rand() % 50 == 0 && list->children.size() > 1)
    {
        // Don't remove last element -- this leads to questionable
        // constructs such as empty select.
        list->children.erase(list->children.begin()
                             + fuzz_rand() % list->children.size());
    }

    // Add element
    if (fuzz_rand() % 50 == 0)
    {
        auto pos = list->children.empty()
            ? list->children.begin()
            : list->children.begin() + fuzz_rand() % list->children.size();
        auto col = getRandomColumnLike();
        if (col)
        {
            auto elem = std::make_shared<ASTOrderByElement>();
            elem->children.push_back(col);
            elem->direction = 1;
            elem->nulls_direction = 1;
            elem->nulls_direction_was_explicitly_specified = false;
            elem->with_fill = false;

            list->children.insert(pos, elem);
        }
        else
        {
            std::cerr << "No random column.\n";
        }
    }

    // We don't have to recurse here to fuzz the children, this is handled by
    // the generic recursion into IAST.children.
}

/// Occasionally removes one element from, and/or adds one random column-like
/// expression to, an expression list. A null `ast` is ignored.
void QueryFuzzer::fuzzColumnLikeExpressionList(IAST * ast)
{
    if (!ast)
    {
        return;
    }

    auto * impl = assert_cast<ASTExpressionList *>(ast);

    // Remove element
    if (fuzz_rand() % 50 == 0 && impl->children.size() > 1)
    {
        // Don't remove last element -- this leads to questionable
        // constructs such as empty select.
        impl->children.erase(impl->children.begin()
                             + fuzz_rand() % impl->children.size());
    }

    // Add element
    if (fuzz_rand() % 50 == 0)
    {
        auto pos = impl->children.empty()
            ? impl->children.begin()
            : impl->children.begin() + fuzz_rand() % impl->children.size();
        auto col = getRandomColumnLike();
        if (col)
            impl->children.insert(pos, col);
        else
            std::cerr << "No random column.\n";
    }

    // We don't have to recurse here to fuzz the children, this is handled by
    // the generic recursion into IAST.children.
}

/// Randomly changes one property of a window frame (frame type, either
/// boundary type with its offset, or either "preceding" flag) and then
/// recomputes `frame_is_default` from the resulting frame.
void QueryFuzzer::fuzzWindowFrame(ASTWindowDefinition & def)
{
    switch (fuzz_rand() % 40)
    {
        case 0:
        {
            const auto r = fuzz_rand() % 3;
            def.frame_type = r == 0 ? WindowFrame::FrameType::ROWS
                : r == 1 ? WindowFrame::FrameType::RANGE
                    : WindowFrame::FrameType::GROUPS;
            break;
        }
        case 1:
        {
            const auto r = fuzz_rand() % 3;
            def.frame_begin_type = r == 0 ? WindowFrame::BoundaryType::Unbounded
                : r == 1 ? WindowFrame::BoundaryType::Current
                    : WindowFrame::BoundaryType::Offset;

            if (def.frame_begin_type == WindowFrame::BoundaryType::Offset)
            {
                // The offsets are fuzzed normally through 'children'.
                def.frame_begin_offset
                    = std::make_shared<ASTLiteral>(getRandomField(0));
            }
            else
            {
                def.frame_begin_offset = nullptr;
            }
            break;
        }
        case 2:
        {
            const auto r = fuzz_rand() % 3;
            def.frame_end_type = r == 0 ? WindowFrame::BoundaryType::Unbounded
                : r == 1 ? WindowFrame::BoundaryType::Current
                    : WindowFrame::BoundaryType::Offset;

            if (def.frame_end_type == WindowFrame::BoundaryType::Offset)
            {
                def.frame_end_offset
                    = std::make_shared<ASTLiteral>(getRandomField(0));
            }
            else
            {
                def.frame_end_offset = nullptr;
            }
            break;
        }
        case 5:
        {
            def.frame_begin_preceding = fuzz_rand() % 2;
            break;
        }
        case 6:
        {
            def.frame_end_preceding = fuzz_rand() % 2;
            break;
        }
        default:
            break;
    }

    // The frame counts as default only for
    // RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW.
    def.frame_is_default
        = def.frame_type == WindowFrame::FrameType::RANGE
        && def.frame_begin_type == WindowFrame::BoundaryType::Unbounded
        && def.frame_begin_preceding
        && def.frame_end_type == WindowFrame::BoundaryType::Current;
}

/// Fuzzes a CREATE query: mutates column types, strips the "Replicated"
/// prefix from the engine name, and renames the table to
/// `<original>__fuzz_<index>`. The new name is remembered for the original
/// table only if this (name, columns, storage) combination is new.
void QueryFuzzer::fuzzCreateQuery(ASTCreateQuery & create)
{
    if (create.columns_list && create.columns_list->columns)
    {
        for (auto & ast : create.columns_list->columns->children)
        {
            if (auto * column = ast->as<ASTColumnDeclaration>())
            {
                fuzzColumnDeclaration(*column);
            }
        }
    }

    if (create.storage && create.storage->engine)
    {
        auto & engine_name = create.storage->engine->name;
        if (startsWith(engine_name, "Replicated"))
            engine_name = engine_name.substr(strlen("Replicated"));
    }

    // A table that is itself a fuzzed copy maps back to its original name.
    auto full_name = create.getTable();
    auto original_name = full_name.substr(0, full_name.find("__fuzz_"));

    size_t index = index_of_fuzzed_table[original_name]++;
    auto new_name = original_name + "__fuzz_" + toString(index);

    create.setTable(new_name);

    SipHash sip_hash;
    sip_hash.update(original_name);
    if (create.columns_list)
        create.columns_list->updateTreeHash(sip_hash);
    if (create.storage)
        create.storage->updateTreeHash(sip_hash);

    IAST::Hash hash;
    sip_hash.get128(hash);

    if (created_tables_hashes.insert(hash).second)
        original_table_name_to_fuzzed[original_name].push_back(new_name);
}

/// Replaces the declared type of a column with a fuzzed one. The fuzzed type
/// is turned back into an AST by parsing its name. Columns without a type are
/// left alone.
void QueryFuzzer::fuzzColumnDeclaration(ASTColumnDeclaration & column)
{
    if (column.type)
    {
        auto data_type = fuzzDataType(DataTypeFactory::instance().get(column.type));

        ParserDataType parser;
        column.type = parseQuery(parser, data_type->getName(),
            DBMS_DEFAULT_MAX_QUERY_SIZE, DBMS_DEFAULT_MAX_PARSER_DEPTH);
    }
}

/// Returns a randomly modified variant of `type`.
///
/// Array, Tuple and Map usually keep their shape and only have their nested
/// types fuzzed. Nullable and LowCardinality wrappers may be dropped or have
/// their nested type fuzzed. Otherwise the type may be wrapped into Nullable
/// or LowCardinality, replaced by a random type, or returned unchanged.
DataTypePtr QueryFuzzer::fuzzDataType(DataTypePtr type)
{
    /// Do not replace Array with not Array too often.
    const auto * type_array = typeid_cast<const DataTypeArray *>(type.get());
    if (type_array && fuzz_rand() % 5 != 0)
        return std::make_shared<DataTypeArray>(fuzzDataType(type_array->getNestedType()));

    const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get());
    if (type_tuple && fuzz_rand() % 5 != 0)
    {
        DataTypes elements;
        for (const auto & element : type_tuple->getElements())
            elements.push_back(fuzzDataType(element));

        return type_tuple->haveExplicitNames()
            ? std::make_shared<DataTypeTuple>(elements, type_tuple->getElementNames())
            : std::make_shared<DataTypeTuple>(elements);
    }

    const auto * type_map = typeid_cast<const DataTypeMap *>(type.get());
    if (type_map && fuzz_rand() % 5 != 0)
    {
        auto key_type = fuzzDataType(type_map->getKeyType());
        auto value_type = fuzzDataType(type_map->getValueType());
        // Fall back to the original key type if the fuzzed one is rejected.
        if (!DataTypeMap::checkKeyType(key_type))
            key_type = type_map->getKeyType();

        return std::make_shared<DataTypeMap>(key_type, value_type);
    }

    const auto * type_nullable = typeid_cast<const DataTypeNullable *>(type.get());
    if (type_nullable)
    {
        size_t tmp = fuzz_rand() % 3;
        if (tmp == 0)
            return type_nullable->getNestedType();

        if (tmp == 1)
        {
            auto nested_type = fuzzDataType(type_nullable->getNestedType());
            if (nested_type->canBeInsideNullable())
                return std::make_shared<DataTypeNullable>(nested_type);
        }
    }

    const auto * type_low_cardinality = typeid_cast<const DataTypeLowCardinality *>(type.get());
    if (type_low_cardinality)
    {
        size_t tmp = fuzz_rand() % 3;
        if (tmp == 0)
            return type_low_cardinality->getDictionaryType();

        if (tmp == 1)
        {
            auto nested_type = fuzzDataType(type_low_cardinality->getDictionaryType());
            if (nested_type->canBeInsideLowCardinality())
                return std::make_shared<DataTypeLowCardinality>(nested_type);
        }
    }

    size_t tmp = fuzz_rand() % 10;
    if (tmp <= 1 && type->canBeInsideNullable())
        return std::make_shared<DataTypeNullable>(type);

    if (tmp <= 3 && type->canBeInsideLowCardinality())
        return std::make_shared<DataTypeLowCardinality>(type);

    if (tmp == 4)
        return getRandomType();

    return type;
}

/// Builds a random data type whose TypeIndex lies in [1, TypeIndex::Tuple].
/// Tuples get 1..6 random elements, arrays a random nested type, decimals the
/// maximum precision with a random scale, and fixed strings a random length.
/// Everything else is created by name through the type factory.
DataTypePtr QueryFuzzer::getRandomType()
{
    auto type_id = static_cast<TypeIndex>(fuzz_rand() % static_cast<size_t>(TypeIndex::Tuple) + 1);

    if (type_id == TypeIndex::Tuple)
    {
        size_t tuple_size = fuzz_rand() % 6 + 1;
        DataTypes elements;
        for (size_t i = 0; i < tuple_size; ++i)
            elements.push_back(getRandomType());
        return std::make_shared<DataTypeTuple>(elements);
    }

    if (type_id == TypeIndex::Array)
        return std::make_shared<DataTypeArray>(getRandomType());

/// NOLINTNEXTLINE
#define DISPATCH(DECIMAL) \
    if (type_id == TypeIndex::DECIMAL) \
        return std::make_shared<DataTypeDecimal<DECIMAL>>( \
            DataTypeDecimal<DECIMAL>::maxPrecision(), \
            fuzz_rand() % DataTypeDecimal<DECIMAL>::maxPrecision() + 1);

    DISPATCH(Decimal32)
    DISPATCH(Decimal64)
    DISPATCH(Decimal128)
    DISPATCH(Decimal256)
#undef DISPATCH

    // FixedString(N) requires N >= 1, so draw the length from 1..20 instead
    // of 0..19.
    if (type_id == TypeIndex::FixedString)
        return std::make_shared<DataTypeFixedString>(fuzz_rand() % 20 + 1);

    // NOTE(review): the concrete types in the two enum cases below are a best
    // guess (integers of matching width, constructible without arguments) --
    // confirm against the project history.
    if (type_id == TypeIndex::Enum8)
        return std::make_shared<DataTypeUInt8>();

    if (type_id == TypeIndex::Enum16)
        return std::make_shared<DataTypeUInt16>();

    return DataTypeFactory::instance().get(String(magic_enum::enum_name(type_id)));
}

/// With probability 2/3, redirects a table expression that names a table
/// with known fuzzed copies to one of those copies (same database).
void QueryFuzzer::fuzzTableName(ASTTableExpression & table)
{
    if (!table.database_and_table_name || fuzz_rand() % 3 == 0)
        return;

    const auto * identifier = table.database_and_table_name->as<ASTTableIdentifier>();
    if (!identifier)
        return;

    auto table_id = identifier->getTableId();
    if (table_id.empty())
        return;

    auto it = original_table_name_to_fuzzed.find(table_id.getTableName());
    if (it != original_table_name_to_fuzzed.end() && !it->second.empty())
    {
        const auto & new_table_name = it->second[fuzz_rand() % it->second.size()];
        StorageID new_table_id(table_id.database_name, new_table_name);
        table.database_and_table_name = std::make_shared<ASTTableIdentifier>(new_table_id);
    }
}

/// Tries to parse `full_query` as an INSERT query; returns nullptr on failure.
static ASTPtr tryParseInsertQuery(const String & full_query)
{
    const char * pos = full_query.data();
    const char * end = full_query.data() + full_query.size();

    ParserInsertQuery parser(end, false);
    String message;

    return tryParseQuery(parser, pos, end, message, false, "", false,
        DBMS_DEFAULT_MAX_QUERY_SIZE, DBMS_DEFAULT_MAX_PARSER_DEPTH);
}

/// For an INSERT into a table that has fuzzed copies, returns one INSERT
/// query per copy, each targeting that copy. Returns an empty list if the
/// text is not an INSERT, names no table, or the table has no fuzzed copies.
ASTs QueryFuzzer::getInsertQueriesForFuzzedTables(const String & full_query)
{
    auto parsed_query = tryParseInsertQuery(full_query);
    if (!parsed_query)
        return {};

    const auto * insert = parsed_query->as<ASTInsertQuery>();
    if (!insert || !insert->table)
        return {};

    auto table_name = insert->getTable();
    auto it = original_table_name_to_fuzzed.find(table_name);
    if (it == original_table_name_to_fuzzed.end())
        return {};

    ASTs queries;
    for (const auto & fuzzed_name : it->second)
    {
        // Parse again for every copy so each query owns an independent AST.
        auto query = tryParseInsertQuery(full_query);
        auto * fuzzed_insert = query ? query->as<ASTInsertQuery>() : nullptr;
        if (!fuzzed_insert)
            continue;

        fuzzed_insert->setTable(fuzzed_name);
        queries.push_back(std::move(query));
    }

    return queries;
}

/// Fuzzes every AST in the list in place.
void QueryFuzzer::fuzz(ASTs & asts)
{
    for (auto & ast : asts)
    {
        fuzz(ast);
    }
}

/// Increments a counter on construction and decrements it on destruction.
struct ScopedIncrement
{
    size_t & counter;

    explicit ScopedIncrement(size_t & counter_) : counter(counter_) { ++counter; }
    ~ScopedIncrement() { --counter; }
};

/// Recursively fuzzes one AST node in place, dispatching on its concrete type.
///
/// @throws Exception with TOO_DEEP_RECURSION when the depth exceeds 500.
/// Visiting the same node twice dumps the top-level AST to stderr and asserts.
void QueryFuzzer::fuzz(ASTPtr & ast)
{
    if (!ast)
        return;

    // Check for exceeding max depth.
    ScopedIncrement depth_increment(current_ast_depth);
    if (current_ast_depth > 500)
    {
        // The AST is too deep (see the comment for current_ast_depth). Throw
        // an exception to fail fast and not use this query as an etalon, or we'll
        // end up in a very slow and useless loop. It also makes sense to set it
        // lower than the default max parse depth on the server (1000), so that
        // we don't get the useless error about parse depth from the server either.
        throw Exception(ErrorCodes::TOO_DEEP_RECURSION,
            "AST depth exceeded while fuzzing ({})", current_ast_depth);
    }

    // Check for loops.
    auto [_, inserted] = debug_visited_nodes.insert(ast.get());
    if (!inserted)
    {
        fmt::print(stderr, "The AST node '{}' was already visited before."
            " Depth {}, {} visited nodes, current top AST:\n{}\n",
            static_cast<void *>(ast.get()), current_ast_depth,
            debug_visited_nodes.size(), (*debug_top_ast)->dumpTree());
        assert(false);
    }

    // The fuzzing.
    if (auto * with_union = typeid_cast<ASTSelectWithUnionQuery *>(ast.get()))
    {
        fuzz(with_union->list_of_selects);
    }
    else if (auto * with_intersect_except = typeid_cast<ASTSelectIntersectExceptQuery *>(ast.get()))
    {
        auto selects = with_intersect_except->getListOfSelects();
        fuzz(selects);
    }
    else if (auto * tables = typeid_cast<ASTTablesInSelectQuery *>(ast.get()))
    {
        fuzz(tables->children);
    }
    else if (auto * tables_element = typeid_cast<ASTTablesInSelectQueryElement *>(ast.get()))
    {
        fuzz(tables_element->table_join);
        fuzz(tables_element->table_expression);
        fuzz(tables_element->array_join);
    }
    else if (auto * table_expr = typeid_cast<ASTTableExpression *>(ast.get()))
    {
        fuzzTableName(*table_expr);
        fuzz(table_expr->children);
    }
    else if (auto * expr_list = typeid_cast<ASTExpressionList *>(ast.get()))
    {
        fuzz(expr_list->children);
    }
    else if (auto * order_by_element = typeid_cast<ASTOrderByElement *>(ast.get()))
    {
        fuzzOrderByElement(order_by_element);
    }
    else if (auto * fn = typeid_cast<ASTFunction *>(ast.get()))
    {
        fuzzColumnLikeExpressionList(fn->arguments.get());
        fuzzColumnLikeExpressionList(fn->parameters.get());

        if (fn->is_window_function && fn->window_definition)
        {
            auto & def = fn->window_definition->as<ASTWindowDefinition &>();
            fuzzColumnLikeExpressionList(def.partition_by.get());
            fuzzOrderByList(def.order_by.get());
            fuzzWindowFrame(def);
        }

        fuzz(fn->children);
    }
    else if (auto * select = typeid_cast<ASTSelectQuery *>(ast.get()))
    {
        fuzzColumnLikeExpressionList(select->select().get());
        fuzzColumnLikeExpressionList(select->groupBy().get());
        fuzzOrderByList(select->orderBy().get());

        fuzz(select->children);
    }
    /*
     * The time to fuzz the settings has not yet come.
     * Apparently we don't have any infrastructure to validate the values of
     * the settings, and the first query with max_block_size = -1 breaks
     * because of overflows here and there.
     *//*
     * else if (auto * set = typeid_cast<ASTSetQuery *>(ast.get()))
     * {
     *      for (auto & c : set->changes)
     *      {
     *          if (fuzz_rand() % 50 == 0)
     *          {
     *              c.value = fuzzField(c.value);
     *          }
     *      }
     * }
     */
    else if (auto * literal = typeid_cast<ASTLiteral *>(ast.get()))
    {
        // There is a caveat with fuzzing the children: many ASTs also keep the
        // links to particular children in own fields. This means that replacing
        // the child with another object might lead to error. Many of these fields
        // are ASTPtr -- this is redundant ownership, but hides the error if the
        // child field is replaced. Others can be ASTLiteral * or the like, which
        // leads to segfault if the pointed-to AST is replaced.
        // Replacing children is safe in case of ASTExpressionList. In a more
        // general case, we can change the value of ASTLiteral, which is what we
        // do here.
        if (fuzz_rand() % 11 == 0)
        {
            literal->value = fuzzField(literal->value);
        }
    }
    else if (auto * create_query = typeid_cast<ASTCreateQuery *>(ast.get()))
    {
        fuzzCreateQuery(*create_query);
    }
    else
    {
        fuzz(ast->children);
    }
}

/*
 * This function collects various parts of a query that we can then substitute
 * into a query being fuzzed.
 *
 * TODO: we just stop remembering new parts after our corpus reaches certain size.
 * This is boring, should implement a random replacement of existing parts with
 * small probability. Do this after we add this fuzzer to CI and fix all the
 * problems it can routinely find even in this boring version.
 */
void QueryFuzzer::collectFuzzInfoMain(ASTPtr ast)
{
    collectFuzzInfoRecurse(ast);

    // Rebuild the indexable lists from the sets/maps filled by the recursion.
    aliases.clear();
    for (const auto & alias : aliases_set)
    {
        aliases.push_back(alias);
    }

    column_like.clear();
    for (const auto & [name, value] : column_like_map)
    {
        column_like.push_back(value);
    }

    table_like.clear();
    for (const auto & [name, value] : table_like_map)
    {
        table_like.push_back(value);
    }
}

/// Remembers `ast` as a table-like fragment, keyed by its formatted text.
/// The map is cleared once it holds more than 1000 entries, and fragments
/// whose text is 200 characters or longer are skipped.
void QueryFuzzer::addTableLike(ASTPtr ast)
{
    if (table_like_map.size() > 1000)
    {
        table_like_map.clear();
    }

    const auto name = ast->formatForErrorMessage();
    if (name.size() < 200)
    {
        table_like_map.insert({name, ast});
    }
}

/// Remembers `ast` as a column-like fragment, keyed by its formatted text.
/// The map is cleared once it holds more than 1000 entries, and fragments
/// whose text is 200 characters or longer are skipped.
void QueryFuzzer::addColumnLike(ASTPtr ast)
{
    if (column_like_map.size() > 1000)
    {
        column_like_map.clear();
    }

    const auto name = ast->formatForErrorMessage();
    if (name == "Null")
    {
        // The `Null` identifier from FORMAT Null clause. We don't quote it
        // properly when formatting the AST, and while the resulting query
        // technically works, it has non-standard case for Null (the standard
        // is NULL), so it breaks the query formatting idempotence check.
        // Just plug this particular case for now.
        return;
    }
    if (name.size() < 200)
    {
        column_like_map.insert({name, ast});
    }
}

/// Walks the AST and records aliases, column-like nodes (literals,
/// identifiers, functions) and table-like nodes (table expressions,
/// subqueries) for later substitution. Null nodes are skipped.
void QueryFuzzer::collectFuzzInfoRecurse(ASTPtr ast)
{
    if (!ast)
    {
        return;
    }

    if (auto * impl = dynamic_cast<ASTWithAlias *>(ast.get()))
    {
        if (aliases_set.size() > 1000)
        {
            aliases_set.clear();
        }

        aliases_set.insert(impl->alias);
    }

    if (typeid_cast<ASTLiteral *>(ast.get()))
    {
        addColumnLike(ast);
    }
    else if (typeid_cast<ASTIdentifier *>(ast.get()))
    {
        addColumnLike(ast);
    }
    else if (typeid_cast<ASTFunction *>(ast.get()))
    {
        addColumnLike(ast);
    }
    else if (typeid_cast<ASTTableExpression *>(ast.get()))
    {
        addTableLike(ast);
    }
    else if (typeid_cast<ASTSubquery *>(ast.get()))
    {
        addTableLike(ast);
    }

    for (const auto & child : ast->children)
    {
        collectFuzzInfoRecurse(child);
    }
}

/// Entry point: resets the per-run state, collects reusable fragments from
/// `ast`, fuzzes it in place and prints the formatted result to stdout.
void QueryFuzzer::fuzzMain(ASTPtr & ast)
{
    current_ast_depth = 0;
    debug_visited_nodes.clear();
    // fuzz() dumps the tree through this pointer when it finds a loop.
    debug_top_ast = &ast;

    collectFuzzInfoMain(ast);
    fuzz(ast);

    std::cout << std::endl;
    WriteBufferFromOStream ast_buf(std::cout, 4096);
    formatAST(*ast, ast_buf, false /*highlight*/);
    ast_buf.next();
    std::cout << std::endl << std::endl;
}

}