mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 16:42:05 +00:00
644 lines
17 KiB
C++
644 lines
17 KiB
C++
#include "QueryFuzzer.h"
|
|
|
|
#include <unordered_set>
|
|
|
|
#include <pcg_random.hpp>
|
|
#include <Common/assert_cast.h>
|
|
#include <Common/typeid_cast.h>
|
|
#include <Core/Types.h>
|
|
#include <IO/Operators.h>
|
|
#include <IO/UseSSL.h>
|
|
#include <IO/WriteBufferFromOStream.h>
|
|
#include <Parsers/ASTExpressionList.h>
|
|
#include <Parsers/ASTFunction.h>
|
|
#include <Parsers/ASTIdentifier.h>
|
|
#include <Parsers/ASTInsertQuery.h>
|
|
#include <Parsers/ASTLiteral.h>
|
|
#include <Parsers/ASTOrderByElement.h>
|
|
#include <Parsers/ASTQueryWithOutput.h>
|
|
#include <Parsers/ASTSelectQuery.h>
|
|
#include <Parsers/ASTSelectWithUnionQuery.h>
|
|
#include <Parsers/ASTSetQuery.h>
|
|
#include <Parsers/ASTSubquery.h>
|
|
#include <Parsers/ASTTablesInSelectQuery.h>
|
|
#include <Parsers/ASTUseQuery.h>
|
|
#include <Parsers/ASTWindowDefinition.h>
|
|
#include <Parsers/ParserQuery.h>
|
|
#include <Parsers/formatAST.h>
|
|
#include <Parsers/parseQuery.h>
|
|
|
|
namespace DB
|
|
{
|
|
|
|
namespace ErrorCodes
|
|
{
|
|
extern const int TOO_DEEP_RECURSION;
|
|
}
|
|
|
|
// Returns a random "interesting" constant of the requested kind.
//
// `type` selects what is produced:
//   0 -- an Int64 picked from a fixed table of boundary values;
//   1 -- a float picked from a fixed table of special values (NaN, infinities,
//        values close to 0 and 1, ...);
//   2 -- a Decimal64 built from the same integer table and a randomly picked scale.
// Any other `type` is a programming error: it trips the assert, and Null is
// returned if asserts are compiled out.
Field QueryFuzzer::getRandomField(int type)
{
    // Integers around typical overflow boundaries. Used both for plain integer
    // fields and as the underlying value of decimal fields.
    static constexpr Int64 int_values[]
        = {-2, -1, 0, 1, 2, 3, 7, 10, 100, 255, 256, 257, 1023, 1024,
           1025, 65535, 65536, 65537, 1024 * 1024 - 1, 1024 * 1024,
           1024 * 1024 + 1, INT64_MIN, INT64_MAX};

    switch (type)
    {
        case 0:
        {
            return int_values[fuzz_rand() % std::size(int_values)];
        }
        case 1:
        {
            static constexpr float float_values[]
                = {NAN, INFINITY, -INFINITY, 0., 0.0001, 0.5, 0.9999,
                   1., 1.0001, 2., 10.0001, 100.0001, 1000.0001};
            return float_values[fuzz_rand() % std::size(float_values)];
        }
        case 2:
        {
            static constexpr UInt64 scales[] = {0, 1, 2, 10};

            // Draw the two random numbers in separate statements. When both
            // draws are written as arguments of one call, their relative order
            // is unspecified, so the same generator state could yield different
            // fields depending on the compiler.
            const Int64 value = int_values[fuzz_rand() % std::size(int_values)];
            const UInt64 scale = scales[fuzz_rand() % std::size(scales)];
            return DecimalField<Decimal64>(value, scale);
        }
        default:
            assert(false);
            return Null{};
    }
}
|
|
|
|
// Returns a randomly mutated copy of `field`.
//
//  - Roughly one time in 20 the result is Null, whatever the input type is.
//  - Int64/UInt64, Float64 and Decimal32/64/128 inputs are replaced by a value
//    from getRandomField() of the matching kind; one time in 20 the kind itself
//    is re-rolled.
//  - Strings are emptied, doubled, quadrupled, get a zero byte written at a
//    random position, or are left as they are.
//  - Arrays and tuples may lose a random element, may gain one, and afterwards
//    every element is fuzzed recursively.
//  - Fields of any other type are returned unchanged.
Field QueryFuzzer::fuzzField(Field field)
{
    const auto type = field.getType();

    // Index understood by getRandomField(); -1 means "not a numeric field".
    int type_index = -1;

    if (type == Field::Types::Int64
        || type == Field::Types::UInt64)
    {
        type_index = 0;
    }
    else if (type == Field::Types::Float64)
    {
        type_index = 1;
    }
    else if (type == Field::Types::Decimal32
        || type == Field::Types::Decimal64
        || type == Field::Types::Decimal128)
    {
        type_index = 2;
    }

    if (fuzz_rand() % 20 == 0)
    {
        return Null{};
    }

    if (type_index >= 0)
    {
        if (fuzz_rand() % 20 == 0)
        {
            // Change type sometimes, but not often, because it mostly leads to
            // boring errors.
            type_index = fuzz_rand() % 3;
        }
        return getRandomField(type_index);
    }

    if (type == Field::Types::String)
    {
        auto & str = field.get<std::string>();
        const UInt64 action = fuzz_rand() % 10;
        switch (action)
        {
            case 0:
                str = "";
                break;
            case 1:
                str = str + str;
                break;
            case 2:
                str = str + str + str + str;
                break;
            case 4:
                // Put a zero byte somewhere inside a non-empty string.
                if (!str.empty())
                {
                    str[fuzz_rand() % str.size()] = '\0';
                }
                break;
            default:
                // Do nothing
                break;
        }
    }
    else if (type == Field::Types::Array || type == Field::Types::Tuple)
    {
        auto & arr = field.reinterpret<FieldVector>();

        // Sometimes drop a random element.
        if (fuzz_rand() % 5 == 0 && !arr.empty())
        {
            const size_t pos = fuzz_rand() % arr.size();
            arr.erase(arr.begin() + pos);
            fprintf(stderr, "erased\n");
        }

        // Sometimes add an element: a fuzzed copy of an existing one, or a
        // random integer if the container is empty.
        if (fuzz_rand() % 5 == 0)
        {
            if (!arr.empty())
            {
                const size_t pos = fuzz_rand() % arr.size();
                arr.insert(arr.begin() + pos, fuzzField(arr[pos]));
                // `pos` is a size_t, so the matching conversion is %zu.
                fprintf(stderr, "inserted (pos %zu)\n", pos);
            }
            else
            {
                arr.insert(arr.begin(), getRandomField(0));
                fprintf(stderr, "inserted (0)\n");
            }
        }

        for (auto & element : arr)
        {
            element = fuzzField(element);
        }
    }

    return field;
}
|
|
|
|
// Picks a random entry of `column_like`, clones it and returns the clone with
// its alias reset to an empty string. Returns nullptr when `column_like` is empty.
ASTPtr QueryFuzzer::getRandomColumnLike()
{
    if (column_like.empty())
        return nullptr;

    const auto index = fuzz_rand() % column_like.size();

    ASTPtr copy = column_like[index]->clone();
    copy->setAlias("");
    return copy;
}
|
|
|
|
// Replaces `ast` with a clone of a random collected column-like expression,
// carrying over the alias that `ast` had before. Does nothing when no
// column-like expressions have been collected.
void QueryFuzzer::replaceWithColumnLike(ASTPtr & ast)
{
    if (column_like.empty())
        return;

    // Read the alias before `ast` is overwritten.
    const std::string preserved_alias = ast->tryGetAlias();

    ast = getRandomColumnLike();
    ast->setAlias(preserved_alias);
}
|
|
|
|
// Replaces `ast` with a clone of a random collected table-like expression,
// carrying over the alias that `ast` had before. Does nothing when no
// table-like expressions have been collected.
void QueryFuzzer::replaceWithTableLike(ASTPtr & ast)
{
    if (table_like.empty())
        return;

    const auto index = fuzz_rand() % table_like.size();
    ASTPtr replacement = table_like[index]->clone();

    // The alias is taken from the old node, so it must be read before the
    // assignment below.
    const std::string preserved_alias = ast->tryGetAlias();
    replacement->setAlias(preserved_alias);

    ast = replacement;
}
|
|
|
|
// Randomly tweaks one ORDER BY element. A single draw in [0, 10) decides:
//   0, 1 -- set the sort direction to -1 or 1;
//   2, 3 -- set the nulls direction to -1 or 1 and mark it as explicitly specified;
//   4    -- make the nulls direction equal to the sort direction and mark it
//           as not explicitly specified;
//   5..9 -- leave the element alone.
void QueryFuzzer::fuzzOrderByElement(ASTOrderByElement * elem)
{
    const auto action = fuzz_rand() % 10;

    if (action == 0 || action == 1)
    {
        elem->direction = (action == 0) ? -1 : 1;
    }
    else if (action == 2 || action == 3)
    {
        elem->nulls_direction = (action == 2) ? -1 : 1;
        elem->nulls_direction_was_explicitly_specified = true;
    }
    else if (action == 4)
    {
        elem->nulls_direction = elem->direction;
        elem->nulls_direction_was_explicitly_specified = false;
    }
    // Any other value: do nothing.
}
|
|
|
|
// Randomly removes and/or adds elements of an ORDER BY list.
// `ast` may be null (nothing is done then); otherwise it must be an
// ASTExpressionList. The elements themselves are not fuzzed here: that is
// handled by the generic recursion into IAST.children.
void QueryFuzzer::fuzzOrderByList(IAST * ast)
{
    if (ast == nullptr)
        return;

    auto & elements = assert_cast<ASTExpressionList *>(ast)->children;

    // Removal. The last remaining element is never removed -- this leads to
    // questionable constructs such as empty select.
    if (fuzz_rand() % 50 == 0 && elements.size() > 1)
    {
        const auto victim = fuzz_rand() % elements.size();
        elements.erase(elements.begin() + victim);
    }

    // Addition.
    if (fuzz_rand() % 50 != 0)
        return;

    // The position is drawn before the column, and only for a non-empty list.
    auto insert_pos = elements.begin();
    if (!elements.empty())
        insert_pos += fuzz_rand() % elements.size();

    const auto column = getRandomColumnLike();
    if (!column)
    {
        fprintf(stderr, "no random col!\n");
        return;
    }

    auto order_elem = std::make_shared<ASTOrderByElement>();
    order_elem->children.push_back(column);
    order_elem->direction = 1;
    order_elem->nulls_direction = 1;
    order_elem->nulls_direction_was_explicitly_specified = false;
    order_elem->with_fill = false;

    elements.insert(insert_pos, order_elem);
}
|
|
|
|
// Randomly removes and/or adds elements of a list of column-like expressions.
// `ast` may be null (nothing is done then); otherwise it must be an
// ASTExpressionList. The elements themselves are not fuzzed here: that is
// handled by the generic recursion into IAST.children.
void QueryFuzzer::fuzzColumnLikeExpressionList(IAST * ast)
{
    if (ast == nullptr)
        return;

    auto & elements = assert_cast<ASTExpressionList *>(ast)->children;

    // Removal. The last remaining element is never removed -- this leads to
    // questionable constructs such as empty select.
    if (fuzz_rand() % 50 == 0 && elements.size() > 1)
    {
        const auto victim = fuzz_rand() % elements.size();
        elements.erase(elements.begin() + victim);
    }

    // Addition.
    if (fuzz_rand() % 50 != 0)
        return;

    // The position is drawn before the column, and only for a non-empty list.
    auto insert_pos = elements.begin();
    if (!elements.empty())
        insert_pos += fuzz_rand() % elements.size();

    const auto column = getRandomColumnLike();
    if (!column)
    {
        fprintf(stderr, "no random col!\n");
        return;
    }

    elements.insert(insert_pos, column);
}
|
|
|
|
// Randomly changes at most one property of a window frame. A draw in [0, 40)
// selects the property (frame type, begin/end boundary type, begin/end offset,
// begin/end "preceding" flag); values 7..39 change nothing. Afterwards
// `is_default` is recomputed by comparing the frame with a default-constructed one.
void QueryFuzzer::fuzzWindowFrame(WindowFrame & frame)
{
    using FrameType = WindowFrame::FrameType;
    using BoundaryType = WindowFrame::BoundaryType;

    // Candidates, indexed by a draw in [0, 3).
    static constexpr FrameType frame_types[]
        = {FrameType::Rows, FrameType::Range, FrameType::Groups};
    static constexpr BoundaryType boundary_types[]
        = {BoundaryType::Unbounded, BoundaryType::Current, BoundaryType::Offset};

    switch (fuzz_rand() % 40)
    {
        case 0:
            frame.type = frame_types[fuzz_rand() % 3];
            break;
        case 1:
            frame.begin_type = boundary_types[fuzz_rand() % 3];
            break;
        case 2:
            frame.end_type = boundary_types[fuzz_rand() % 3];
            break;
        case 3:
            frame.begin_offset = getRandomField(0).get<Int64>();
            break;
        case 4:
            frame.end_offset = getRandomField(0).get<Int64>();
            break;
        case 5:
            frame.begin_preceding = fuzz_rand() % 2;
            break;
        case 6:
            frame.end_preceding = fuzz_rand() % 2;
            break;
        default:
            break;
    }

    frame.is_default = (frame == WindowFrame{});
}
|
|
|
|
// Fuzzes every AST of the list, in order.
void QueryFuzzer::fuzz(ASTs & asts)
{
    for (auto it = asts.begin(), last = asts.end(); it != last; ++it)
        fuzz(*it);
}
|
|
|
|
// Scope guard for a counter: increments the referenced counter on construction
// and decrements it again on destruction.
struct ScopedIncrement
{
    size_t & counter;

    explicit ScopedIncrement(size_t & counter_) : counter(counter_) { ++counter; }
    ~ScopedIncrement() { --counter; }

    // A copy would refer to the same counter and decrement it a second time
    // when destroyed, so copying is forbidden.
    ScopedIncrement(const ScopedIncrement &) = delete;
    ScopedIncrement & operator=(const ScopedIncrement &) = delete;
};
|
|
|
|
// Recursively fuzzes the AST rooted at `ast`. A null `ast` is ignored.
//
// Throws an Exception with code TOO_DEEP_RECURSION when the recursion gets
// deeper than 500 levels. Every visited node is remembered in
// `debug_visited_nodes`; reaching the same node twice is reported to stderr
// and trips an assert.
void QueryFuzzer::fuzz(ASTPtr & ast)
{
    if (!ast)
        return;

    // Check for exceeding max depth.
    constexpr size_t max_fuzz_depth = 500;
    ScopedIncrement depth_guard(current_ast_depth);
    if (current_ast_depth > max_fuzz_depth)
    {
        // The AST is too deep (see the comment for current_ast_depth). Throw
        // an exception to fail fast and not use this query as an etalon, or
        // we'll end up in a very slow and useless loop. It also makes sense to
        // keep the limit lower than the default max parse depth on the server
        // (1000), so that we don't get the useless error about parse depth
        // from the server either.
        throw Exception(ErrorCodes::TOO_DEEP_RECURSION,
            "AST depth exceeded while fuzzing ({})", current_ast_depth);
    }

    // Check for loops.
    const bool first_visit = debug_visited_nodes.insert(ast.get()).second;
    if (!first_visit)
    {
        fmt::print(stderr, "The AST node '{}' was already visited before."
            " Depth {}, {} visited nodes, current top AST:\n{}\n",
            static_cast<void *>(ast.get()), current_ast_depth,
            debug_visited_nodes.size(), (*debug_top_ast)->dumpTree());
        assert(false);
    }

    // The fuzzing. Dispatch on the concrete node type; the first match wins.
    if (auto * union_query = typeid_cast<ASTSelectWithUnionQuery *>(ast.get()))
    {
        fuzz(union_query->list_of_selects);
    }
    else if (auto * tables_in_select = typeid_cast<ASTTablesInSelectQuery *>(ast.get()))
    {
        fuzz(tables_in_select->children);
    }
    else if (auto * tables_elem = typeid_cast<ASTTablesInSelectQueryElement *>(ast.get()))
    {
        fuzz(tables_elem->table_join);
        fuzz(tables_elem->table_expression);
        fuzz(tables_elem->array_join);
    }
    else if (auto * table_expression = typeid_cast<ASTTableExpression *>(ast.get()))
    {
        fuzz(table_expression->database_and_table_name);
        fuzz(table_expression->subquery);
        fuzz(table_expression->table_function);
    }
    else if (auto * expression_list = typeid_cast<ASTExpressionList *>(ast.get()))
    {
        fuzz(expression_list->children);
    }
    else if (auto * order_by_elem = typeid_cast<ASTOrderByElement *>(ast.get()))
    {
        fuzzOrderByElement(order_by_elem);
    }
    else if (auto * function = typeid_cast<ASTFunction *>(ast.get()))
    {
        fuzzColumnLikeExpressionList(function->arguments.get());
        fuzzColumnLikeExpressionList(function->parameters.get());

        if (function->is_window_function && function->window_definition)
        {
            auto & window = function->window_definition->as<ASTWindowDefinition &>();
            fuzzColumnLikeExpressionList(window.partition_by.get());
            fuzzOrderByList(window.order_by.get());
            fuzzWindowFrame(window.frame);
        }

        fuzz(function->children);
    }
    else if (auto * select_query = typeid_cast<ASTSelectQuery *>(ast.get()))
    {
        fuzzColumnLikeExpressionList(select_query->select().get());
        fuzzColumnLikeExpressionList(select_query->groupBy().get());
        fuzzOrderByList(select_query->orderBy().get());

        fuzz(select_query->children);
    }
    /*
     * The time to fuzz the settings has not yet come.
     * Apparently we don't have any infrastructure to validate the values of
     * the settings, and the first query with max_block_size = -1 breaks
     * because of overflows here and there.
     *//*
     * else if (auto * set = typeid_cast<ASTSetQuery *>(ast.get()))
     * {
     *      for (auto & c : set->changes)
     *      {
     *          if (fuzz_rand() % 50 == 0)
     *          {
     *              c.value = fuzzField(c.value);
     *          }
     *      }
     * }
     */
    else if (auto * literal_node = typeid_cast<ASTLiteral *>(ast.get()))
    {
        // There is a caveat with fuzzing the children: many ASTs also keep
        // links to particular children in their own fields. This means that
        // replacing a child with another object might lead to an error. Many
        // of these fields are ASTPtr -- this is redundant ownership, but it
        // hides the error if the child is replaced. Others can be
        // ASTLiteral * or the like, which leads to a segfault if the
        // pointed-to AST is replaced.
        // Replacing children is safe in case of ASTExpressionList. In a more
        // general case, we can change the value of an ASTLiteral, which is
        // what we do here.
        if (fuzz_rand() % 11 == 0)
        {
            literal_node->value = fuzzField(literal_node->value);
        }
    }
    else
    {
        fuzz(ast->children);
    }
}
|
|
|
|
/*
|
|
* This functions collects various parts of query that we can then substitute
|
|
* to a query being fuzzed.
|
|
*
|
|
* TODO: we just stop remembering new parts after our corpus reaches certain size.
|
|
* This is boring, should implement a random replacement of existing parst with
|
|
* small probability. Do this after we add this fuzzer to CI and fix all the
|
|
* problems it can routinely find even in this boring version.
|
|
*/
|
|
void QueryFuzzer::collectFuzzInfoMain(const ASTPtr ast)
|
|
{
|
|
collectFuzzInfoRecurse(ast);
|
|
|
|
aliases.clear();
|
|
for (const auto & alias : aliases_set)
|
|
{
|
|
aliases.push_back(alias);
|
|
}
|
|
|
|
column_like.clear();
|
|
for (const auto & [name, value] : column_like_map)
|
|
{
|
|
column_like.push_back(value);
|
|
}
|
|
|
|
table_like.clear();
|
|
for (const auto & [name, value] : table_like_map)
|
|
{
|
|
table_like.push_back(value);
|
|
}
|
|
}
|
|
|
|
void QueryFuzzer::addTableLike(const ASTPtr ast)
|
|
{
|
|
if (table_like_map.size() > 1000)
|
|
{
|
|
table_like_map.clear();
|
|
}
|
|
|
|
const auto name = ast->formatForErrorMessage();
|
|
if (name.size() < 200)
|
|
{
|
|
table_like_map.insert({name, ast});
|
|
}
|
|
}
|
|
|
|
void QueryFuzzer::addColumnLike(const ASTPtr ast)
|
|
{
|
|
if (column_like_map.size() > 1000)
|
|
{
|
|
column_like_map.clear();
|
|
}
|
|
|
|
const auto name = ast->formatForErrorMessage();
|
|
if (name == "Null")
|
|
{
|
|
// The `Null` identifier from FORMAT Null clause. We don't quote it
|
|
// properly when formatting the AST, and while the resulting query
|
|
// technically works, it has non-standard case for Null (the standard
|
|
// is NULL), so it breaks the query formatting idempotence check.
|
|
// Just plug this particular case for now.
|
|
return;
|
|
}
|
|
if (name.size() < 200)
|
|
{
|
|
column_like_map.insert({name, ast});
|
|
}
|
|
}
|
|
|
|
void QueryFuzzer::collectFuzzInfoRecurse(const ASTPtr ast)
|
|
{
|
|
if (auto * impl = dynamic_cast<ASTWithAlias *>(ast.get()))
|
|
{
|
|
if (aliases_set.size() > 1000)
|
|
{
|
|
aliases_set.clear();
|
|
}
|
|
|
|
aliases_set.insert(impl->alias);
|
|
}
|
|
|
|
if (typeid_cast<ASTLiteral *>(ast.get()))
|
|
{
|
|
addColumnLike(ast);
|
|
}
|
|
else if (typeid_cast<ASTIdentifier *>(ast.get()))
|
|
{
|
|
addColumnLike(ast);
|
|
}
|
|
else if (typeid_cast<ASTFunction *>(ast.get()))
|
|
{
|
|
addColumnLike(ast);
|
|
}
|
|
else if (typeid_cast<ASTTableExpression *>(ast.get()))
|
|
{
|
|
addTableLike(ast);
|
|
}
|
|
else if (typeid_cast<ASTSubquery *>(ast.get()))
|
|
{
|
|
addTableLike(ast);
|
|
}
|
|
|
|
for (const auto & child : ast->children)
|
|
{
|
|
collectFuzzInfoRecurse(child);
|
|
}
|
|
}
|
|
|
|
// Entry point of the fuzzer: fuzzes `ast` in place and prints the result.
//
// Resets the per-run state (recursion depth, set of visited nodes, pointer to
// the top-level AST used in diagnostics), collects reusable fragments from the
// query, fuzzes it, and then writes the formatted fuzzed query to std::cout,
// preceded by one line break and followed by two.
void QueryFuzzer::fuzzMain(ASTPtr & ast)
{
    current_ast_depth = 0;
    debug_visited_nodes.clear();
    // fuzz() dereferences this pointer to dump the whole tree when it finds a
    // node that was already visited, so it has to point at the top-level AST.
    debug_top_ast = &ast;

    collectFuzzInfoMain(ast);
    fuzz(ast);

    std::cout << std::endl;
    WriteBufferFromOStream ast_buf(std::cout, 4096);
    formatAST(*ast, ast_buf, false /*highlight*/);
    ast_buf.next();
    std::cout << std::endl << std::endl;
}
|
|
|
|
}
|