Replace missed Object(Nullable(json)) subcolumns

This commit is contained in:
VanDarkholme7 2023-09-19 09:55:45 +08:00
parent cf8c614fed
commit 78660eb859
6 changed files with 179 additions and 0 deletions

View File

@ -30,6 +30,7 @@
#include <Interpreters/getTableExpressions.h>
#include <Interpreters/replaceAliasColumnsInQuery.h>
#include <Interpreters/replaceForPositionalArguments.h>
#include <Interpreters/replaceMissedSubcolumnsInQuery.h>
#include <Functions/UserDefined/UserDefinedSQLFunctionFactory.h>
#include <Functions/UserDefined/UserDefinedSQLFunctionVisitor.h>
@ -48,6 +49,7 @@
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <IO/WriteHelpers.h>
@ -948,6 +950,10 @@ void TreeRewriterResult::collectSourceColumns(bool add_special)
source_columns.swap(columns_from_storage);
else
source_columns.insert(source_columns.end(), columns_from_storage.begin(), columns_from_storage.end());
auto metadata_snapshot = storage->getInMemoryMetadataPtr();
auto metadata_column_descriptions = metadata_snapshot->getColumns();
source_columns_ordinary = metadata_column_descriptions.getOrdinary();
}
source_columns_set = removeDuplicateColumns(source_columns);
@ -1117,6 +1123,33 @@ bool TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
}
}
/// Collect missed object subcolumns.
/// An unknown required column of the form `<object>.<path>` is moved from
/// unknown_required_source_columns to missed_subcolumns only when `<object>` is the name of
/// an ordinary source column whose type is Object with the 'json' schema format and nullable subcolumns.
/// Any other unknown column stays in unknown_required_source_columns and is reported below.
if (!unknown_required_source_columns.empty())
{
    for (const NameAndTypePair & pair : source_columns_ordinary)
    {
        /// The type checks do not depend on the unknown column, so do them once per source column.
        if (pair.type->getTypeId() != TypeIndex::Object)
            continue;

        const auto * object_type = typeid_cast<const DataTypeObject *>(pair.type.get());
        if (!object_type || object_type->getSchemaFormat() != "json" || !object_type->hasNullableSubcolumns())
            continue;

        for (auto it = unknown_required_source_columns.begin(); it != unknown_required_source_columns.end();)
        {
            const size_t object_pos = it->find('.');

            /// The part before the first dot must be exactly the name of this object column,
            /// otherwise a dotted name that refers to something else would be silently replaced.
            const bool belongs_to_object = object_pos != std::string::npos
                && it->compare(0, object_pos, pair.name) == 0;

            if (belongs_to_object)
            {
                missed_subcolumns.insert(*it);
                it = unknown_required_source_columns.erase(it);
            }
            else
            {
                ++it;
            }
        }
    }
}
if (!unknown_required_source_columns.empty())
{
constexpr auto format_string = "Missing columns: {} while processing query: '{}', required columns:{}{}";
@ -1301,6 +1334,13 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
result.collectUsedColumns(query, true, settings.query_plan_optimize_primary_key);
if (!result.missed_subcolumns.empty())
{
for (const String & column_name : result.missed_subcolumns)
replaceMissedSubcolumnsInQuery(query, column_name);
result.missed_subcolumns.clear();
}
result.required_source_columns_before_expanding_alias_columns = result.required_source_columns.getNames();
/// rewrite filters for select query, must go after getArrayJoinedColumns
@ -1399,6 +1439,14 @@ TreeRewriterResultPtr TreeRewriter::analyze(
bool is_ok = result.collectUsedColumns(query, false, settings.query_plan_optimize_primary_key, no_throw);
if (!is_ok)
return {};
if (!result.missed_subcolumns.empty())
{
for (const String & column_name : result.missed_subcolumns)
replaceMissedSubcolumnsInQuery(query, column_name);
result.missed_subcolumns.clear();
}
return std::make_shared<const TreeRewriterResult>(result);
}

View File

@ -36,6 +36,11 @@ struct TreeRewriterResult
/// Same as above but also record alias columns which are expanded. This is for RBAC access check.
Names required_source_columns_before_expanding_alias_columns;
/// Ordinary columns taken from the storage metadata, in which Object columns are not expanded. This is for distinguishing JSON and Tuple types.
NamesAndTypesList source_columns_ordinary;
NameSet missed_subcolumns;
/// Set of alias columns that are expanded to their alias expressions. We still need the original columns to check access permission.
NameSet expanded_aliases;

View File

@ -0,0 +1,70 @@
#include <Interpreters/replaceMissedSubcolumnsInQuery.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
namespace DB
{
/// Replaces every identifier whose column name equals `column_name` with a NULL literal (without alias).
/// For a function only its arguments are visited; for any other node all children are visited.
/// Returns true if at least one identifier was replaced.
bool replaceMissedSubcolumnsInFunction(ASTPtr & ast, const String & column_name)
{
    if (const auto * identifier = ast->as<ASTIdentifier>())
    {
        if (identifier->getColumnName() != column_name)
            return false;

        ast = std::make_shared<ASTLiteral>(Field());
        return true;
    }

    /// Pick the list of nodes to descend into.
    auto * nodes_to_visit = &ast->children;
    if (auto * function = ast->as<ASTFunction>())
    {
        if (!function->arguments)
            return false;
        nodes_to_visit = &function->arguments->children;
    }

    /// Every node is visited: no short-circuit, all occurrences must be replaced.
    bool any_replaced = false;
    for (auto & node : *nodes_to_visit)
    {
        const bool replaced_here = replaceMissedSubcolumnsInFunction(node, column_name);
        any_replaced = any_replaced || replaced_here;
    }
    return any_replaced;
}
/// Replaces occurrences of the missed subcolumn `column_name` in the query with NULL literals.
/// The resulting expression keeps the name it had before the replacement:
///  - a replaced identifier gets its alias or column name as the alias of the NULL literal;
///  - a function in which something was replaced gets its previous alias or column name as alias.
void replaceMissedSubcolumnsInQuery(ASTPtr & ast, const String & column_name)
{
    if (const auto * identifier = ast->as<ASTIdentifier>())
    {
        if (identifier->getColumnName() != column_name)
            return;

        auto null_literal = std::make_shared<ASTLiteral>(Field());
        null_literal->setAlias(identifier->getAliasOrColumnName());
        ast = null_literal;
        return;
    }

    if (const auto * function = ast->as<ASTFunction>())
    {
        /// The name must be remembered before the replacement, because the replacement changes the column name.
        const String name_before_replacement = function->getAliasOrColumnName();
        if (replaceMissedSubcolumnsInFunction(ast, column_name))
            ast->setAlias(name_before_replacement);
        return;
    }

    for (auto & child : ast->children)
        replaceMissedSubcolumnsInQuery(child, column_name);
}
}

View File

@ -0,0 +1,20 @@
#pragma once
#include <Core/Names.h>
#include <Parsers/IAST_fwd.h>
#include <base/types.h>
namespace DB
{
/// Replace missed Object(Nullable('json')) subcolumns with NULL in the query.
/// Every identifier whose column name equals `column_name` is replaced with a NULL literal
/// that is aliased with the identifier's alias or column name.
void replaceMissedSubcolumnsInQuery(ASTPtr & ast, const String & column_name);
/// Replace missed object subcolumns inside the AST with NULL and return true if anything was replaced.
/// Functions are resolved separately because otherwise we may lose the column name of an expression with missed object subcolumns.
/// For example, if `b.d` is a missed object subcolumn, the column name of `b.d * 2 + 3` will be `plus(multiply(NULL, 2), 3)`,
/// while we want to keep it as `plus(multiply(b.d, 2), 3)`.
bool replaceMissedSubcolumnsInFunction(ASTPtr & ast, const String & column_name);
}

View File

@ -0,0 +1,7 @@
4 1
{"id":"1","n":"aaa","obj.k4":null}
{"id":"2","n":"bbb","obj.k4":null}
{"id":"3","n":"ccc","obj.k4":null}
{"id":"4","n":"ddd","obj.k4":null}
4 1
4 1

View File

@ -0,0 +1,29 @@
-- Queries and mutations that reference subcolumns missing from an Object(Nullable('json')) column.
-- Drop the table this test creates, so that a leftover from a previous run cannot break CREATE TABLE.
DROP TABLE IF EXISTS t_missed_subcolumns;
SET allow_experimental_object_type = 1;
CREATE TABLE t_missed_subcolumns (id UInt64, n String, obj Object(Nullable('json')))
ENGINE = MergeTree ORDER BY id;
-- None of the inserted objects has a top-level key `k4` or a nested key `k1.k3`.
INSERT INTO t_missed_subcolumns VALUES (1, 'aaa', '{"k1": {"k2": "foo"}, "k3": 5}');
INSERT INTO t_missed_subcolumns VALUES (2, 'bbb', '{"k1": {"k2": "fee"}, "k3": 4}');
INSERT INTO t_missed_subcolumns VALUES (3, 'ccc', '{"k1": {"k2": "foo", "k4": "baz"}, "k3": 4}');
INSERT INTO t_missed_subcolumns VALUES (4, 'ddd', '{"k1": {"k2": "foo"}, "k3": 4}');
OPTIMIZE TABLE t_missed_subcolumns FINAL;
SELECT count(), min(id) FROM t_missed_subcolumns;
-- Filters on missed subcolumns.
SELECT * FROM t_missed_subcolumns WHERE obj.k4 = 5 ORDER BY id FORMAT JSONEachRow;
SELECT * FROM t_missed_subcolumns WHERE obj.k1.k3 = 'fee' ORDER BY id FORMAT JSONEachRow;
-- A missed subcolumn in the select list.
SELECT id, n, obj.k4 FROM t_missed_subcolumns ORDER BY id FORMAT JSONEachRow;
-- Mutations filtering on missed subcolumns; the counts are checked again after each one.
ALTER TABLE t_missed_subcolumns DELETE WHERE obj.k4 = 5;
SELECT count(), min(id) FROM t_missed_subcolumns;
DELETE FROM t_missed_subcolumns WHERE obj.k1.k3 = 'fee';
SELECT count(), min(id) FROM t_missed_subcolumns;
DROP TABLE IF EXISTS t_missed_subcolumns;