more optimization

This commit is contained in:
Nikita Vasilev 2021-01-05 23:51:19 +03:00
parent 92e4f1370e
commit 10fecb9183
6 changed files with 288 additions and 25 deletions

View File

@ -30,7 +30,7 @@ void splitMultiLogic(ASTPtr & node)
}
}
/// Push NOT to leafs
/// Push NOT to leafs, remove NOT NOT ...
void traversePushNot(ASTPtr & node, bool add_negation)
{
auto * func = node->as<ASTFunction>();
@ -196,7 +196,7 @@ ASTPtr TreeCNFConverter::fromCNF(const CNFQuery & cnf)
or_groups.push_back(makeASTFunction("or"));
auto * func = or_groups.back()->as<ASTFunction>();
for (const auto & ast : group)
func->arguments->children.push_back(ast);
func->arguments->children.push_back(ast->clone());
}
}
@ -211,6 +211,90 @@ ASTPtr TreeCNFConverter::fromCNF(const CNFQuery & cnf)
return res;
}
/// Rewrites a "negative" relation as NOT applied to its positive counterpart,
/// e.g. `a != b` becomes `NOT (a = b)` and `a >= b` becomes `NOT (a < b)`.
/// If `node` is not a function, or its name is not listed in the table below, nothing happens.
/// The matched function is cloned before it is renamed, so the object `node` pointed to
/// on entry is left as it was; `node` itself is rebound to the new `not(...)` function.
void pullNotOut(ASTPtr & node)
{
    /// relation -> the relation it is the negation of
    static const std::map<std::string, std::string> inverse_relations = {
        {"notEquals", "equals"},
        {"greaterOrEquals", "less"},
        {"greater", "lessOrEquals"},
        {"notIn", "in"},
        {"notLike", "like"},
        {"notEmpty", "empty"},
    };

    const auto * relation = node->as<ASTFunction>();
    if (!relation)
        return;

    const auto found = inverse_relations.find(relation->name);
    if (found == inverse_relations.end())
        return;

    /// Rename a copy to the inverse relation ...
    ASTPtr inverted = node->clone();
    inverted->as<ASTFunction>()->name = found->second;

    /// ... and wrap it in NOT so the overall meaning is preserved.
    node = makeASTFunction("not", inverted);
}
/// Counterpart of pullNotOut: replaces a "positive" relation with NOT applied to its negation,
/// e.g. `a = b` becomes `NOT (a != b)` and `a < b` becomes `NOT (a >= b)`.
/// Note that the result is still wrapped in NOT; CNFQuery::pushNotInFuntions calls this on the
/// argument of an existing NOT and then runs traversePushNot on the atom.
/// If `node` is not a function, or its name is not listed in the table below, nothing happens.
/// The matched function is cloned before it is renamed; `node` is rebound to the new `not(...)`.
void pushNotIn(ASTPtr & node)
{
    /// relation -> its negation
    static const std::map<std::string, std::string> inverse_relations = {
        {"equals", "notEquals"},
        {"less", "greaterOrEquals"},
        {"lessOrEquals", "greater"},
        {"in", "notIn"},
        {"like", "notLike"},
        {"empty", "notEmpty"},
    };

    const auto * relation = node->as<ASTFunction>();
    if (!relation)
        return;

    const auto found = inverse_relations.find(relation->name);
    if (found == inverse_relations.end())
        return;

    /// Rename a copy to the negated relation ...
    ASTPtr negated = node->clone();
    negated->as<ASTFunction>()->name = found->second;

    /// ... and wrap it in NOT so the overall meaning is preserved.
    node = makeASTFunction("not", negated);
}
/// Normalizes every atom of the CNF so that "negative" relations are expressed through NOT
/// (see pullNotOut), which makes atoms easier to compare with each other.
/// Atoms that are not functions are kept as they are. Returns *this to allow chaining.
CNFQuery & CNFQuery::pullNotOutFunctions()
{
    transformAtoms([](const ASTPtr & node) -> ASTPtr
    {
        if (!node->as<ASTFunction>())
            return node;

        /// All rewriting is done on the copy that is returned. Previously the function pointer
        /// was taken from `node`, so for `NOT f(...)` the rewrite was applied to the child of the
        /// original atom: the returned copy was unaffected and the source atom was modified.
        ASTPtr result = node->clone();
        auto * func = result->as<ASTFunction>();

        if (func->name == "not")
            pullNotOut(func->arguments->children.front());
        else
            pullNotOut(result);

        /// Push NOT to leafs and remove NOT NOT pairs produced by the rewrite above.
        traversePushNot(result, false);
        return result;
    });
    return *this;
}
/// Reverts pullNotOutFunctions: for every atom of the form `NOT f(...)` the argument is rewritten
/// with pushNotIn and the resulting NOT NOT pair is removed, e.g. `NOT (a = b)` becomes `a != b`.
/// Atoms that are not functions are kept as they are. Returns *this to allow chaining.
CNFQuery & CNFQuery::pushNotInFuntions()
{
    transformAtoms([](const ASTPtr & node) -> ASTPtr
    {
        if (!node->as<ASTFunction>())
            return node;

        /// All rewriting is done on the copy that is returned. Previously the function pointer
        /// was taken from `node`, so pushNotIn changed the child of the original atom while the
        /// returned copy stayed as it was, i.e. the revert had no effect on the result.
        ASTPtr result = node->clone();
        auto * func = result->as<ASTFunction>();

        if (func->name == "not")
            pushNotIn(func->arguments->children.front());

        /// Push NOT to leafs and remove NOT NOT pairs produced by the rewrite above.
        traversePushNot(result, false);
        return result;
    });
    return *this;
}
std::string CNFQuery::dump() const
{
std::stringstream res;

View File

@ -1,6 +1,7 @@
#pragma once
#include <Parsers/IAST_fwd.h>
#include <Parsers/ASTLiteral.h>
#include <vector>
#include <set>
#include <unordered_map>
@ -14,11 +15,11 @@ public:
using OrGroup = std::set<ASTPtr>; // Add NOT container???
using AndGroup = std::set<OrGroup>;
CNFQuery(AndGroup && statements_)
: statements(std::move(statements_)) {}
CNFQuery(AndGroup && statements_) : statements(std::move(statements_)) { }
template <typename P>
void filterGroups(P predicate) {
CNFQuery & filterGroups(P predicate) /// delete always true groups
{
AndGroup filtered;
for (const auto & or_group : statements)
{
@ -26,28 +27,78 @@ public:
filtered.insert(or_group);
}
std::swap(statements, filtered);
return *this;
}
template <typename P>
void filterAtoms(P predicate) {
CNFQuery & filterAtoms(P predicate) /// delete always false atoms
{
AndGroup filtered;
for (const auto & or_group : statements)
{
OrGroup filtered_group;
for (auto ast : or_group) {
for (auto ast : or_group)
{
if (predicate(ast))
filtered_group.insert(ast);
}
if (!filtered_group.empty())
filtered.insert(filtered_group);
else
{
/// all atoms false -> group false -> CNF false
filtered.clear();
filtered_group.clear();
filtered_group.insert(std::make_shared<ASTLiteral>(static_cast<UInt8>(0)));
filtered.insert(filtered_group);
std::swap(statements, filtered);
return *this;
}
}
std::swap(statements, filtered);
return *this;
}
/// Replaces every OR-group of the CNF with `func(group)`.
/// Groups for which `func` returns an empty set are not added to the result.
/// Returns *this to allow chaining.
template <typename F>
CNFQuery & transformGroups(F func)
{
    AndGroup transformed;
    for (const auto & or_group : statements)
    {
        if (auto replacement = func(or_group); !replacement.empty())
            transformed.insert(std::move(replacement));
    }
    statements.swap(transformed);
    return *this;
}
/// Replaces every atom of every OR-group with `func(atom)`.
/// Atoms for which `func` returns a null pointer are dropped from their group;
/// the rebuilt groups are then passed through transformGroups.
/// Returns *this to allow chaining.
template <typename F>
CNFQuery & transformAtoms(F func)
{
    const auto rewrite_group = [func](const OrGroup & or_group) -> OrGroup
    {
        OrGroup rewritten;
        for (const auto & atom : or_group)
        {
            if (auto replacement = func(atom))
                rewritten.insert(std::move(replacement));
        }
        return rewritten;
    };
    return transformGroups(rewrite_group);
}
const AndGroup & getStatements() const { return statements; }
std::string dump() const;
/// Converts != -> NOT =; <,>= -> (NOT) <; >,<= -> (NOT) <= for simpler matching
CNFQuery & pullNotOutFunctions();
/// Revert pullNotOutFunctions actions
CNFQuery & pushNotInFuntions();
private:
AndGroup statements;
};

View File

@ -514,6 +514,10 @@ void optimizeWithConstraints(ASTSelectQuery * select_query, Aliases & aliases, c
const StorageMetadataPtr & metadata_snapshot)
{
WhereConstraintsOptimizer(select_query, aliases, source_columns_set, tables_with_columns, metadata_snapshot).perform();
if (select_query->where())
Poco::Logger::get("KEK").information(select_query->where()->dumpTree());
else
Poco::Logger::get("KEK").information("NO WHERE");
}
/// transform where to CNF for more convenient optimization
@ -524,6 +528,8 @@ void convertQueryToCNF(ASTSelectQuery * select_query)
auto cnf_form = TreeCNFConverter::toCNF(select_query->where());
select_query->refWhere() = TreeCNFConverter::fromCNF(cnf_form);
}
if (select_query->where())
Poco::Logger::get("KEK").information(select_query->where()->dumpTree());
}
/// Remove duplicated columns from USING(...).

View File

@ -1,5 +1,6 @@
#include <Interpreters/WhereConstraintsOptimizer.h>
#include <Interpreters/TreeCNFConverter.h>
#include <Interpreters/ConstraintMatcherVisitor.h>
#include <Parsers/ASTConstraintDeclaration.h>
#include <Storages/StorageInMemoryMetadata.h>
@ -7,16 +8,17 @@
namespace DB
{
ConstraintMatcherVisitor::Data getConstraintData(const StorageMetadataPtr & metadata_snapshot)
std::vector<std::vector<ASTPtr>> getConstraintData(const StorageMetadataPtr & metadata_snapshot)
{
ConstraintMatcherVisitor::Data constraint_data;
std::vector<std::vector<ASTPtr>> constraint_data;
for (const auto & constraint : metadata_snapshot->getConstraints().filterConstraints(ConstraintsDescription::ConstraintType::ALWAYS_TRUE))
for (const auto & constraint :
metadata_snapshot->getConstraints().filterConstraints(ConstraintsDescription::ConstraintType::ALWAYS_TRUE))
{
const auto expr = constraint->as<ASTConstraintDeclaration>()->expr->clone();
constraint_data.constraints[expr->getTreeHash().second].push_back(expr);
const auto cnf = TreeCNFConverter::toCNF(constraint->as<ASTConstraintDeclaration>()->expr->ptr())
.pullNotOutFunctions();
for (const auto & group : cnf.getStatements())
constraint_data.emplace_back(std::begin(group), std::end(group));
}
return constraint_data;
@ -29,21 +31,134 @@ WhereConstraintsOptimizer::WhereConstraintsOptimizer(
const std::vector<TableWithColumnNamesAndTypes> & /*tables_with_columns_*/,
const StorageMetadataPtr & metadata_snapshot_)
: select_query(select_query_)
/* , aliases(aliases_)
/* , aliases(aliases_)
, source_columns_set(source_columns_set_)
, tables_with_columns(tables_with_columns_)*/
, metadata_snapshot(metadata_snapshot_)
{
}
/// Returns true if `group` (a disjunction of atoms) is implied by one of `constraints`.
/// Each constraint is itself a disjunction of atoms. The group is considered always true when
/// there is a constraint every atom of which is present in the group with the same polarity.
/// Atoms are compared after stripping one leading NOT, by tree hash and column name.
/// For each constraint atom the first group atom with the same underlying expression decides
/// the outcome: same polarity counts as a match, opposite polarity does not.
bool checkIfGroupAlwaysTrue(const CNFQuery::OrGroup & group, const std::vector<std::vector<ASTPtr>> & constraints)
{
    /// Replaces `ast` with the argument of NOT if it is `not(...)`; tells whether it did so.
    const auto strip_not = [](ASTPtr & ast) -> bool
    {
        const auto * function = ast->as<ASTFunction>();
        if (!function || function->name != "not")
            return false;
        ast = function->arguments->children.front();
        return true;
    };

    /// TODO: this is temporary; need to write more effective search
    /// TODO: go deeper into asts (a < b, a = b,...) with z3 or some visitor
    /// One constraint contained in the group is enough, otherwise it's difficult
    /// to make judgements without using constraint solving (z3..)
    for (const auto & constraint : constraints)
    {
        bool all_atoms_covered = true;
        for (const auto & constraint_ast : constraint)
        {
            ASTPtr lhs = constraint_ast;
            const bool lhs_negated = strip_not(lhs);

            bool covered = false;
            for (const auto & group_ast : group)
            {
                ASTPtr rhs = group_ast;
                const bool rhs_negated = strip_not(rhs);

                const bool same_expression = lhs->getTreeHash() == rhs->getTreeHash()
                    && lhs->getColumnName() == rhs->getColumnName();
                if (!same_expression)
                    continue;

                covered = (lhs_negated == rhs_negated);
                break;
            }

            if (!covered)
            {
                all_atoms_covered = false;
                break;
            }
        }

        if (all_atoms_covered)
            return true;
    }
    return false;
}
bool checkIfAtomAlwaysFalse(const ASTPtr & atom, const std::vector<std::vector<ASTPtr>> & constraints)
{
/// TODO: more efficient matching
for (const auto & constraint : constraints)
{
if (constraint.size() > 1)
continue; /// TMP; Too hard to do something at current time (without more powerful instruments)
for (const auto & constraint_ast : constraint)
{
bool match_means_ok = true;
ASTPtr a = constraint_ast;
ASTPtr b = atom;
{
auto * func_a = a->as<ASTFunction>();
if (func_a && func_a->name == "not")
{
a = func_a->arguments->children.front();
match_means_ok ^= true;
}
}
{
auto * func_b = b->as<ASTFunction>();
if (func_b && func_b->name == "not")
{
b = func_b->arguments->children.front();
match_means_ok ^= true;
}
}
Poco::Logger::get("MATCHER a").information(a->dumpTree());
Poco::Logger::get("MATCHER b").information(b->dumpTree());
Poco::Logger::get("MATCHER a>>").information(a->getColumnName());
Poco::Logger::get("MATCHER b>>" ).information(b->getColumnName());
if (a->getTreeHash() == b->getTreeHash() &&
a->getColumnName() == b->getColumnName())
{
Poco::Logger::get("MATCH").information(std::to_string(static_cast<int>(match_means_ok)));
return !match_means_ok;
}
}
}
return false;
}
void WhereConstraintsOptimizer::perform()
{
auto constraint_data = getConstraintData(metadata_snapshot);
/// TODO: split != -> NOT =, >= -> NOT <, ...
if (select_query->where())
ConstraintMatcherVisitor(constraint_data).visit(select_query->refWhere());
{
auto cnf = TreeCNFConverter::toCNF(select_query->where());
cnf.pullNotOutFunctions()
.filterGroups([&constraint_data](const auto & group)
{ return !checkIfGroupAlwaysTrue(group, constraint_data); }) /// remove always true functions in CNF
.filterAtoms([&constraint_data](const auto & ast)
{ return !checkIfAtomAlwaysFalse(ast, constraint_data); }) /// TODO: remove always false atoms in CNF
.pushNotInFuntions();
//ConstraintMatcherVisitor(constraint_data).visit(select_query->refWhere());
Poco::Logger::get("AFTER OPT").information(cnf.dump());
select_query->setExpression(ASTSelectQuery::Expression::WHERE, TreeCNFConverter::fromCNF(cnf));
}
}
}

View File

@ -4,18 +4,22 @@ DROP TABLE IF EXISTS constraint_test.constrained;
SET optimize_using_constraints = 1;
CREATE DATABASE constraint_test;
CREATE TABLE constraint_test.assumption (URL String, CONSTRAINT is_yandex ASSUME domainWithoutWWW(URL) = 'yandex.ru', CONSTRAINT is_utf8 CHECK isValidUTF8(URL)) ENGINE = TinyLog;
CREATE TABLE constraint_test.assumption (URL String, a Int32, CONSTRAINT c1 ASSUME domainWithoutWWW(URL) = 'yandex.ru', CONSTRAINT c2 ASSUME URL > 'zzz', CONSTRAINT c3 CHECK isValidUTF8(URL)) ENGINE = TinyLog;
--- Add wrong rows in order to check optimization
INSERT INTO constraint_test.assumption (URL) VALUES ('1');
INSERT INTO constraint_test.assumption (URL) VALUES ('2');
INSERT INTO constraint_test.assumption (URL) VALUES ('yandex.ru');
INSERT INTO constraint_test.assumption (URL) VALUES ('3');
INSERT INTO constraint_test.assumption (URL, a) VALUES ('1', 1);
INSERT INTO constraint_test.assumption (URL, a) VALUES ('2', 2);
INSERT INTO constraint_test.assumption (URL, a) VALUES ('yandex.ru', 3);
INSERT INTO constraint_test.assumption (URL, a) VALUES ('3', 4);
SELECT count() FROM constraint_test.assumption WHERE domainWithoutWWW(URL) = 'yandex.ru'; --- assumption -> 4
SELECT count() FROM constraint_test.assumption WHERE NOT (domainWithoutWWW(URL) = 'yandex.ru'); --- assumption -> 0
SELECT count() FROM constraint_test.assumption WHERE domainWithoutWWW(URL) != 'yandex.ru'; --- not optimized -> 3
SELECT count() FROM constraint_test.assumption WHERE domainWithoutWWW(URL) != 'yandex.ru'; --- assumption -> 0
SELECT count() FROM constraint_test.assumption WHERE domainWithoutWWW(URL) = 'nothing'; --- not optimized -> 0
SELECT count() FROM constraint_test.assumption WHERE (domainWithoutWWW(URL) = 'yandex.ru' AND URL > 'zzz'); ---> assumption -> 4
SELECT count() FROM constraint_test.assumption WHERE (domainWithoutWWW(URL) = 'yandex.ru' AND URL > 'zzz') OR (a = 10 AND a + 5 < 100); ---> assumption -> 4
SELECT count() FROM constraint_test.assumption WHERE (domainWithoutWWW(URL) = 'yandex.ru' AND URL = '111'); ---> assumption & no assumption -> 0
DROP TABLE constraint_test.assumption;
DROP DATABASE constraint_test;