ClickHouse/src/Interpreters/TreeCNFConverter.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

407 lines
11 KiB
C++
Raw Normal View History

2021-01-04 20:55:32 +00:00
#include <Interpreters/TreeCNFConverter.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTFunction.h>
2021-11-18 14:24:06 +00:00
#include <Parsers/ASTIdentifier.h>
#include <Parsers/queryToString.h>
2021-11-18 14:24:06 +00:00
#include <Common/checkStackSize.h>
2022-01-17 04:33:47 +00:00
#include <IO/Operators.h>
2021-11-18 14:24:06 +00:00
2021-01-04 20:55:32 +00:00
namespace DB
{
2021-11-10 17:57:59 +00:00
2021-05-06 08:29:24 +00:00
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
2021-05-07 10:49:05 +00:00
extern const int INCORRECT_QUERY;
2021-11-18 14:24:06 +00:00
extern const int TOO_MANY_TEMPORARY_COLUMNS;
2021-05-06 08:29:24 +00:00
}
2021-01-04 20:55:32 +00:00
2021-11-10 17:57:59 +00:00
namespace
{
2021-11-18 14:24:06 +00:00
/// Tells whether `func` is one of the boolean connectives (and / or / not)
/// that the CNF conversion treats as structure rather than as an atom.
bool isLogicalFunction(const ASTFunction & func)
{
    static constexpr const char * connectives[] = {"and", "or", "not"};

    for (const char * connective : connectives)
        if (func.name == connective)
            return true;

    return false;
}
/// Counts the atoms of the boolean formula rooted at `node`.
/// An identifier, a literal, or any function that is not and/or/not counts as one atom;
/// every other node contributes the sum of the atoms found in its children.
size_t countAtoms(const ASTPtr & node)
{
    checkStackSize();

    const bool is_plain_leaf = node->as<ASTIdentifier>() || node->as<ASTLiteral>();
    if (is_plain_leaf)
        return 1;

    /// A non-logical function (comparison, arithmetic, ...) is opaque for CNF purposes.
    if (const auto * func = node->as<ASTFunction>(); func && !isLogicalFunction(*func))
        return 1;

    size_t total = 0;
    for (const auto & child : node->children)
        total += countAtoms(child);

    return total;
}
2021-01-04 20:55:32 +00:00
/// Rewrites every n-ary AND/OR below `node` into a left-nested chain of binary calls,
/// e.g. AND(a, b, c) becomes AND(AND(a, b), c). Arguments of NOT are processed recursively;
/// any other node is left untouched.
/// Throws INCORRECT_QUERY if an AND/OR has fewer than two arguments.
void splitMultiLogic(ASTPtr & node)
{
    checkStackSize();

    auto * func = node->as<ASTFunction>();
    if (!func)
        return;

    const bool is_and_or = func->name == "and" || func->name == "or";
    if (!is_and_or && func->name != "not")
        return;

    if (is_and_or)
    {
        const auto & operands = func->arguments->children;

        if (operands.size() < 2)
            throw Exception(ErrorCodes::INCORRECT_QUERY, "Bad AND or OR function. Expected at least 2 arguments");

        if (operands.size() > 2)
        {
            /// Fold the operands left to right into nested binary calls.
            ASTPtr folded = operands.front()->clone();
            for (auto it = operands.begin() + 1; it != operands.end(); ++it)
                folded = makeASTFunction(func->name, folded, (*it)->clone());

            /// Replacing `node` may release the old function, so re-fetch the pointer afterwards.
            node = folded;
            func = node->as<ASTFunction>();
        }
    }

    for (auto & operand : func->arguments->children)
        splitMultiLogic(operand);
}
2021-01-05 20:51:19 +00:00
/// Push NOT to leafs, remove NOT NOT ...
2021-01-04 20:55:32 +00:00
void traversePushNot(ASTPtr & node, bool add_negation)
{
2021-11-18 14:24:06 +00:00
checkStackSize();
2021-01-04 20:55:32 +00:00
auto * func = node->as<ASTFunction>();
if (func && (func->name == "and" || func->name == "or"))
{
2021-05-04 18:43:58 +00:00
if (add_negation)
{
2021-05-05 13:29:28 +00:00
if (func->arguments->children.size() != 2)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Bad AND or OR function. Expected at least 2 arguments");
2021-11-10 17:57:59 +00:00
2021-01-04 20:55:32 +00:00
/// apply De Morgan's Law
node = makeASTFunction(
(func->name == "and" ? "or" : "and"),
2021-02-14 14:47:15 +00:00
func->arguments->children[0]->clone(),
func->arguments->children[1]->clone());
2021-01-04 20:55:32 +00:00
}
auto * new_func = node->as<ASTFunction>();
for (auto & child : new_func->arguments->children)
traversePushNot(child, add_negation);
}
else if (func && func->name == "not")
{
2021-05-05 13:29:28 +00:00
if (func->arguments->children.size() != 1)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Bad NOT function. Expected 1 argument");
2021-01-04 20:55:32 +00:00
/// delete NOT
2021-02-14 14:47:15 +00:00
node = func->arguments->children[0]->clone();
2021-01-04 20:55:32 +00:00
traversePushNot(node, !add_negation);
}
else
{
if (add_negation)
2021-02-14 14:47:15 +00:00
node = makeASTFunction("not", node->clone());
2021-01-04 20:55:32 +00:00
}
}
2021-05-19 19:07:27 +00:00
/// Push Or inside And (actually pull AND to top)
2021-11-18 14:24:06 +00:00
bool traversePushOr(ASTPtr & node, size_t num_atoms, size_t max_atoms)
2021-05-19 19:43:02 +00:00
{
2021-11-18 14:24:06 +00:00
if (max_atoms && num_atoms > max_atoms)
return false;
checkStackSize();
2021-01-04 20:55:32 +00:00
auto * func = node->as<ASTFunction>();
2021-05-19 19:07:27 +00:00
if (func && (func->name == "or" || func->name == "and"))
2021-01-04 20:55:32 +00:00
{
for (auto & child : func->arguments->children)
2021-11-18 14:24:06 +00:00
if (!traversePushOr(child, num_atoms, max_atoms))
return false;
2021-01-04 20:55:32 +00:00
}
2021-05-19 19:07:27 +00:00
if (func && func->name == "or")
2021-01-04 20:55:32 +00:00
{
2021-11-10 17:57:59 +00:00
assert(func->arguments->children.size() == 2);
2021-05-19 19:07:27 +00:00
size_t and_node_id = func->arguments->children.size();
for (size_t i = 0; i < func->arguments->children.size(); ++i)
2021-01-04 20:55:32 +00:00
{
2021-05-19 19:07:27 +00:00
auto & child = func->arguments->children[i];
2021-01-04 20:55:32 +00:00
auto * and_func = child->as<ASTFunction>();
2021-05-04 21:36:30 +00:00
if (and_func && and_func->name == "and")
2021-01-04 20:55:32 +00:00
and_node_id = i;
}
2021-11-18 14:24:06 +00:00
2021-05-19 19:07:27 +00:00
if (and_node_id == func->arguments->children.size())
2021-11-18 14:24:06 +00:00
return true;
2021-01-04 20:55:32 +00:00
2021-11-18 14:24:06 +00:00
const size_t other_node_id = 1 - and_node_id;
2021-05-19 19:07:27 +00:00
const auto * and_func = func->arguments->children[and_node_id]->as<ASTFunction>();
2021-11-18 14:24:06 +00:00
2021-05-19 19:07:27 +00:00
auto a = func->arguments->children[other_node_id];
2021-01-04 20:55:32 +00:00
auto b = and_func->arguments->children[0];
auto c = and_func->arguments->children[1];
/// apply the distributive law ( a or (b and c) -> (a or b) and (a or c) )
2021-05-19 19:07:27 +00:00
node = makeASTFunction(
"and",
makeASTFunction("or", a->clone(), b),
makeASTFunction("or", a, c));
2021-11-18 14:24:06 +00:00
/// Count all atoms from 'a', because it was cloned.
num_atoms += countAtoms(a);
return traversePushOr(node, num_atoms, max_atoms);
2021-01-04 20:55:32 +00:00
}
2021-11-18 14:24:06 +00:00
return true;
2021-01-04 20:55:32 +00:00
}
/// transform ast into cnf groups
void traverseCNF(const ASTPtr & node, CNFQuery::AndGroup & and_group, CNFQuery::OrGroup & or_group)
{
2021-11-18 14:24:06 +00:00
checkStackSize();
2021-01-04 20:55:32 +00:00
auto * func = node->as<ASTFunction>();
if (func && func->name == "and")
{
for (auto & child : func->arguments->children)
{
CNFQuery::OrGroup group;
traverseCNF(child, and_group, group);
if (!group.empty())
and_group.insert(std::move(group));
}
}
else if (func && func->name == "or")
{
for (auto & child : func->arguments->children)
{
traverseCNF(child, and_group, or_group);
}
}
2021-03-04 12:11:43 +00:00
else if (func && func->name == "not")
{
2021-05-07 10:49:05 +00:00
if (func->arguments->children.size() != 1)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Bad NOT function. Expected 1 argument");
2021-03-04 12:11:43 +00:00
or_group.insert(CNFQuery::AtomicFormula{true, func->arguments->children.front()});
}
2021-01-04 20:55:32 +00:00
else
{
2021-03-04 12:11:43 +00:00
or_group.insert(CNFQuery::AtomicFormula{false, node});
2021-01-04 20:55:32 +00:00
}
}
/// Convenience overload: collects the formula rooted at `node` into `result`.
/// Atoms that end up in the top-level OR group after the traversal are added
/// to `result` as one more clause, provided that group is not empty.
void traverseCNF(const ASTPtr & node, CNFQuery::AndGroup & result)
{
    CNFQuery::OrGroup or_group;
    traverseCNF(node, result, or_group);

    /// The local group is not used afterwards, so hand it over instead of copying it.
    if (!or_group.empty())
        result.insert(std::move(or_group));
}
2021-11-10 17:57:59 +00:00
}
2021-11-18 14:24:06 +00:00
std::optional<CNFQuery> TreeCNFConverter::tryConvertToCNF(
2021-11-26 14:47:40 +00:00
const ASTPtr & query, size_t max_growth_multiplier)
2021-01-04 20:55:32 +00:00
{
auto cnf = query->clone();
2021-11-18 14:24:06 +00:00
size_t num_atoms = countAtoms(cnf);
2021-01-04 20:55:32 +00:00
splitMultiLogic(cnf);
traversePushNot(cnf, false);
2021-11-18 14:24:06 +00:00
2021-11-26 14:47:40 +00:00
size_t max_atoms = max_growth_multiplier
? std::max(MAX_ATOMS_WITHOUT_CHECK, num_atoms * max_growth_multiplier)
2021-11-18 14:24:06 +00:00
: 0;
if (!traversePushOr(cnf, num_atoms, max_atoms))
return {};
2021-01-04 20:55:32 +00:00
CNFQuery::AndGroup and_group;
traverseCNF(cnf, and_group);
CNFQuery result{std::move(and_group)};
return result;
}
2021-11-18 14:24:06 +00:00
/// Converts `query` to CNF.
/// Throws TOO_MANY_TEMPORARY_COLUMNS if tryConvertToCNF gives up for the given `max_growth_multiplier`.
CNFQuery TreeCNFConverter::toCNF(
    const ASTPtr & query, size_t max_growth_multiplier)
{
    auto cnf = tryConvertToCNF(query, max_growth_multiplier);
    if (!cnf)
        throw Exception(ErrorCodes::TOO_MANY_TEMPORARY_COLUMNS,
            "Cannot convert expression '{}' to CNF, because it produces too many clauses. "
            "Size of boolean formula in CNF can be exponential of size of source formula.",
            queryToString(query));

    /// The optional is a local, so its payload can be moved out instead of copied.
    return std::move(*cnf);
}
2021-01-04 20:55:32 +00:00
/// Builds an AST from a CNF query.
/// Returns nullptr when there are no statements. A group with a single atom becomes that atom
/// (wrapped into NOT if negative), a larger group becomes an OR over its atoms, and empty groups are skipped.
/// If exactly one expression was produced it is returned as is; otherwise the expressions are combined with AND.
/// All atoms are cloned, so the result does not share nodes with `cnf`.
ASTPtr TreeCNFConverter::fromCNF(const CNFQuery & cnf)
{
    const auto & clauses = cnf.getStatements();
    if (clauses.empty())
        return nullptr;

    const auto atom_to_ast = [](const CNFQuery::AtomicFormula & atom) -> ASTPtr
    {
        if (atom.negative)
            return makeASTFunction("not", atom.ast->clone());
        return atom.ast->clone();
    };

    ASTs disjunctions;
    for (const auto & clause : clauses)
    {
        if (clause.empty())
            continue;

        if (clause.size() == 1)
        {
            disjunctions.push_back(atom_to_ast(*clause.begin()));
            continue;
        }

        ASTPtr disjunction = makeASTFunction("or");
        auto & terms = disjunction->as<ASTFunction>()->arguments->children;
        for (const auto & atom : clause)
            terms.push_back(atom_to_ast(atom));
        disjunctions.push_back(std::move(disjunction));
    }

    /// A single clause does not need an enclosing AND.
    if (disjunctions.size() == 1)
        return disjunctions.front();

    ASTPtr conjunction = makeASTFunction("and");
    auto & conjuncts = conjunction->as<ASTFunction>()->arguments->children;
    for (const auto & disjunction : disjunctions)
        conjuncts.push_back(disjunction);

    return conjunction;
}
2021-11-10 17:57:59 +00:00
/// If the atom is a function whose name is a key of `inverse_relations`, replaces the function
/// (on a clone of the AST) with the mapped one and flips the atom's negation flag.
/// Atoms that are not functions, or whose function is not in the table, are left unchanged.
static void pushPullNotInAtom(CNFQuery::AtomicFormula & atom, const std::unordered_map<std::string, std::string> & inverse_relations)
{
    const auto * current = atom.ast->as<ASTFunction>();
    if (!current)
        return;

    const auto found = inverse_relations.find(current->name);
    if (found == inverse_relations.end())
        return;

    /// Rename on a private copy rather than on the AST the atom currently points to.
    atom.ast = atom.ast->clone();
    atom.ast->as<ASTFunction>()->name = found->second;

    /// The relation was inverted, so the sign of the atom flips as well.
    atom.negative = !atom.negative;
}
2021-11-10 17:57:59 +00:00
/// Pulls a negation out of the atom's function: the relations listed as keys below are replaced
/// by their mapped counterparts and the atom's negation flag is flipped
/// (e.g. notEquals(a, b) becomes a negated equals(a, b)).
static void pullNotOut(CNFQuery::AtomicFormula & atom)
{
    static const std::unordered_map<std::string, std::string> positive_forms
    {
        {"notEquals", "equals"},
        {"greaterOrEquals", "less"},
        {"greater", "lessOrEquals"},
        {"notIn", "in"},
        {"notLike", "like"},
        {"notEmpty", "empty"},
    };

    pushPullNotInAtom(atom, positive_forms);
}
2021-03-04 12:11:43 +00:00
/// Pushes the negation of a negative atom into its function: if the function has a known
/// inverse relation it is replaced by that inverse and the negation flag is flipped.
/// Positive atoms are left as they are.
void pushNotIn(CNFQuery::AtomicFormula & atom)
{
    if (atom.negative)
    {
        /// The table covers both directions of every supported relation pair.
        static const std::unordered_map<std::string, std::string> opposite_relations
        {
            {"equals", "notEquals"},
            {"less", "greaterOrEquals"},
            {"lessOrEquals", "greater"},
            {"in", "notIn"},
            {"like", "notLike"},
            {"empty", "notEmpty"},
            {"notEquals", "equals"},
            {"greaterOrEquals", "less"},
            {"greater", "lessOrEquals"},
            {"notIn", "in"},
            {"notLike", "like"},
            {"notEmpty", "empty"},
        };

        pushPullNotInAtom(atom, opposite_relations);
    }
}
/// Applies pullNotOut to a copy of every atom (with a cloned AST) via transformAtoms.
/// Returns *this to allow chaining.
CNFQuery & CNFQuery::pullNotOutFunctions()
{
    const auto pull_not_out_of_atom = [](const AtomicFormula & atom) -> AtomicFormula
    {
        AtomicFormula rewritten{atom.negative, atom.ast->clone()};
        pullNotOut(rewritten);
        return rewritten;
    };

    transformAtoms(pull_not_out_of_atom);
    return *this;
}
2022-09-05 01:50:24 +00:00
/// Applies pushNotIn to a copy of every atom (with a cloned AST) via transformAtoms.
/// Returns *this to allow chaining.
CNFQuery & CNFQuery::pushNotInFunctions()
{
    const auto push_not_into_atom = [](const AtomicFormula & atom) -> AtomicFormula
    {
        AtomicFormula rewritten{atom.negative, atom.ast->clone()};
        pushNotIn(rewritten);
        return rewritten;
    };

    transformAtoms(push_not_into_atom);
    return *this;
}
2021-05-05 08:51:25 +00:00
/// Repeatedly applies reduceOnceCNFStatements until the statements stop changing,
/// then applies filterCNFSubsets to the result. Returns *this to allow chaining.
CNFQuery & CNFQuery::reduce()
{
    while (true)
    {
        AndGroup new_statements = reduceOnceCNFStatements(statements);

        if (statements == new_statements)
        {
            /// Fixed point reached.
            statements = filterCNFSubsets(statements);
            return *this;
        }

        /// The temporary is not needed anymore: move it instead of copying the whole set.
        statements = std::move(new_statements);
    }
}
2021-01-04 20:55:32 +00:00
std::string CNFQuery::dump() const
{
2021-05-04 18:43:58 +00:00
WriteBufferFromOwnString res;
2021-01-04 20:55:32 +00:00
bool first = true;
for (const auto & group : statements)
{
if (!first)
res << " AND ";
first = false;
res << "(";
bool first_in_group = true;
2021-03-04 12:11:43 +00:00
for (const auto & atom : group)
2021-01-04 20:55:32 +00:00
{
if (!first_in_group)
res << " OR ";
first_in_group = false;
2021-03-04 12:11:43 +00:00
if (atom.negative)
res << " NOT ";
res << atom.ast->getColumnName();
2021-01-04 20:55:32 +00:00
}
res << ")";
}
return res.str();
}
}