diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp
index d87ac1ed435..244a4c6fdb0 100644
--- a/src/Interpreters/TreeRewriter.cpp
+++ b/src/Interpreters/TreeRewriter.cpp
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -48,6 +49,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -948,6 +950,10 @@ void TreeRewriterResult::collectSourceColumns(bool add_special)
             source_columns.swap(columns_from_storage);
         else
             source_columns.insert(source_columns.end(), columns_from_storage.begin(), columns_from_storage.end());
+
+        auto metadata_snapshot = storage->getInMemoryMetadataPtr();
+        const auto & metadata_column_descriptions = metadata_snapshot->getColumns();
+        source_columns_ordinary = metadata_column_descriptions.getOrdinary();
     }
     source_columns_set = removeDuplicateColumns(source_columns);
@@ -1117,6 +1123,33 @@ bool TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
         }
     }
+    /// Collect missed object subcolumns: an unknown identifier `<name>.<path>` is not an error when `<name>` is an
+    /// Object(Nullable('json')) column; it is moved to `missed_subcolumns` and replaced with NULL in the query afterwards.
+    if (!unknown_required_source_columns.empty())
+    {
+        for (const NameAndTypePair & pair : source_columns_ordinary)
+        {
+            /// The column type does not depend on the identifier being checked, so test it once per column.
+            if (pair.type->getTypeId() != TypeIndex::Object)
+                continue;
+            const auto * object_type = typeid_cast<const DataTypeObject *>(pair.type.get());
+            if (!object_type || object_type->getSchemaFormat() != "json" || !object_type->hasNullableSubcolumns())
+                continue;
+            for (auto it = unknown_required_source_columns.begin(); it != unknown_required_source_columns.end();)
+            {
+                /// The part before the first dot must name this very column; otherwise the identifier stays unknown.
+                const size_t object_pos = it->find('.');
+                if (object_pos == std::string::npos || it->compare(0, object_pos, pair.name) != 0)
+                {
+                    ++it;
+                    continue;
+                }
+                missed_subcolumns.insert(*it);
+                it = unknown_required_source_columns.erase(it);
+            }
+        }
+    }
+
     if (!unknown_required_source_columns.empty())
     {
         constexpr auto format_string = "Missing columns: {} while processing query: '{}', required columns:{}{}";
@@ -1301,6 +1334,13 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
     result.collectUsedColumns(query, true, settings.query_plan_optimize_primary_key);
+    if (!result.missed_subcolumns.empty())
+    {
+        for (const String & column_name : result.missed_subcolumns)
+            replaceMissedSubcolumnsInQuery(query, column_name);
+        result.missed_subcolumns.clear();
+    }
+
     result.required_source_columns_before_expanding_alias_columns = result.required_source_columns.getNames();
     /// rewrite filters for select query, must go after getArrayJoinedColumns
@@ -1399,6 +1439,14 @@ TreeRewriterResultPtr TreeRewriter::analyze(
     bool is_ok = result.collectUsedColumns(query, false, settings.query_plan_optimize_primary_key, no_throw);
     if (!is_ok)
         return {};
+
+    if (!result.missed_subcolumns.empty())
+    {
+        for (const String & column_name : result.missed_subcolumns)
+            replaceMissedSubcolumnsInQuery(query, column_name);
+        result.missed_subcolumns.clear();
+    }
+
+    return std::make_shared(result);
 }
diff --git a/src/Interpreters/TreeRewriter.h b/src/Interpreters/TreeRewriter.h
index 60832f49b35..75d8fcf0223 100644
--- a/src/Interpreters/TreeRewriter.h
+++ b/src/Interpreters/TreeRewriter.h
@@ -36,6 +36,12 @@ struct TreeRewriterResult
     /// Same as above but also record alias columns which are expanded. This is for RBAC access check.
     Names required_source_columns_before_expanding_alias_columns;
+    /// Ordinary columns taken from the storage's in-memory metadata, i.e. with object columns not extended. This is for distinguishing JSON and Tuple type.
+    NamesAndTypesList source_columns_ordinary;
+
+    /// Unknown identifiers recognised as absent subcolumns of Object(Nullable('json')) columns; they are replaced with NULL in the query.
+    NameSet missed_subcolumns;
+
     /// Set of alias columns that are expanded to their alias expressions. We still need the original columns to check access permission.
NameSet expanded_aliases;
diff --git a/src/Interpreters/replaceMissedSubcolumnsInQuery.cpp b/src/Interpreters/replaceMissedSubcolumnsInQuery.cpp
new file mode 100644
index 00000000000..f8e628c0a34
--- /dev/null
+++ b/src/Interpreters/replaceMissedSubcolumnsInQuery.cpp
@@ -0,0 +1,77 @@
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+/// Replaces every identifier named `column_name` below `ast` with a NULL literal.
+/// No alias is attached to the new literals here; replaceMissedSubcolumnsInQuery() restores
+/// the name of the enclosing function instead.
+/// For a function only its arguments are visited; any other node is traversed through all its children.
+/// Returns true if at least one identifier was replaced.
+bool replaceMissedSubcolumnsInFunction(ASTPtr & ast, const String & column_name)
+{
+    if (const auto * identifier = ast->as<ASTIdentifier>())
+    {
+        if (column_name != identifier->getColumnName())
+            return false;
+
+        /// The missed subcolumn is represented by a literal built from an empty Field.
+        ast = std::make_shared<ASTLiteral>(Field());
+        return true;
+    }
+
+    bool is_replaced = false;
+
+    if (auto * function = ast->as<ASTFunction>())
+    {
+        if (function->arguments)
+        {
+            for (auto & argument : function->arguments->children)
+                is_replaced |= replaceMissedSubcolumnsInFunction(argument, column_name);
+        }
+    }
+    else
+    {
+        for (auto & child : ast->children)
+            is_replaced |= replaceMissedSubcolumnsInFunction(child, column_name);
+    }
+
+    return is_replaced;
+}
+
+/// Replaces identifiers named `column_name` in the query with NULL literals while keeping
+/// the name under which each affected expression is exposed:
+///  - a bare identifier becomes a literal aliased with the identifier's alias or column name;
+///  - a function is rewritten via replaceMissedSubcolumnsInFunction() and, if anything changed,
+///    gets the alias (or column name) it had before the rewrite;
+///  - any other node is traversed recursively.
+void replaceMissedSubcolumnsInQuery(ASTPtr & ast, const String & column_name)
+{
+    if (const auto * identifier = ast->as<ASTIdentifier>())
+    {
+        if (column_name == identifier->getColumnName())
+        {
+            auto literal = std::make_shared<ASTLiteral>(Field());
+            literal->setAlias(identifier->getAliasOrColumnName());
+            ast = literal;
+        }
+        return;
+    }
+
+    if (const auto * function = ast->as<ASTFunction>())
+    {
+        /// Remember the name first: it is derived from the arguments and changes once they are replaced.
+        const String function_alias = function->getAliasOrColumnName();
+        if (replaceMissedSubcolumnsInFunction(ast, column_name))
+            ast->setAlias(function_alias);
+        return;
+    }
+
+    for (auto & child : ast->children)
+        replaceMissedSubcolumnsInQuery(child, column_name);
+}
+
+}
diff --git a/src/Interpreters/replaceMissedSubcolumnsInQuery.h b/src/Interpreters/replaceMissedSubcolumnsInQuery.h
new file mode 100644
index 00000000000..7f4b53bbbf2
--- /dev/null
+++ b/src/Interpreters/replaceMissedSubcolumnsInQuery.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include
+#include
+#include
+
+namespace DB
+{
+
+/// Replace missed Object(Nullable('json')) subcolumns with NULL in the query.
+void replaceMissedSubcolumnsInQuery(ASTPtr & ast, const String & column_name);
+
+/// Return true if the ASTFunction has missed object subcolumns.
+/// ASTFunction is resolved separately because otherwise we may lose the column name built from the missed object subcolumns.
+/// For example, if `b.d` is a missed object subcolumn, the column name of `b.d * 2 + 3` would become `plus(multiply(NULL, 2), 3)`,
+/// while we want to keep it as `plus(multiply(b.d, 2), 3)`.
+bool replaceMissedSubcolumnsInFunction(ASTPtr & ast, const String & column_name);
+
+}
+
diff --git a/tests/queries/0_stateless/02886_missed_json_subcolumns.reference b/tests/queries/0_stateless/02886_missed_json_subcolumns.reference
new file mode 100644
index 00000000000..d2bb7e33e7e
--- /dev/null
+++ b/tests/queries/0_stateless/02886_missed_json_subcolumns.reference
@@ -0,0 +1,7 @@
+4 1
+{"id":"1","n":"aaa","obj.k4":null}
+{"id":"2","n":"bbb","obj.k4":null}
+{"id":"3","n":"ccc","obj.k4":null}
+{"id":"4","n":"ddd","obj.k4":null}
+4 1
+4 1
diff --git a/tests/queries/0_stateless/02886_missed_json_subcolumns.sql b/tests/queries/0_stateless/02886_missed_json_subcolumns.sql
new file mode 100644
index 00000000000..90a80509e99
--- /dev/null
+++ b/tests/queries/0_stateless/02886_missed_json_subcolumns.sql
@@ -0,0 +1,29 @@
+DROP TABLE IF EXISTS t_missed_subcolumns;
+
+SET allow_experimental_object_type = 1;
+
+CREATE TABLE t_missed_subcolumns (id UInt64, n String, obj Object(Nullable('json')))
+ENGINE = MergeTree ORDER BY id;
+
+INSERT INTO t_missed_subcolumns VALUES (1, 'aaa', '{"k1": {"k2": "foo"}, "k3": 5}');
+INSERT INTO t_missed_subcolumns VALUES (2, 'bbb', '{"k1": {"k2": "fee"}, "k3": 4}');
+INSERT INTO t_missed_subcolumns VALUES (3, 'ccc', '{"k1": {"k2": "foo", "k4": "baz"}, "k3": 4}');
+INSERT INTO t_missed_subcolumns VALUES (4, 'ddd', '{"k1": {"k2": "foo"}, "k3": 4}');
+
+OPTIMIZE TABLE t_missed_subcolumns FINAL;
+
+SELECT count(), min(id) FROM t_missed_subcolumns;
+
+SELECT * FROM t_missed_subcolumns WHERE obj.k4 = 5 ORDER BY id FORMAT JSONEachRow;
+
+SELECT * FROM t_missed_subcolumns WHERE obj.k1.k3 = 'fee' ORDER BY id FORMAT JSONEachRow;
+
+SELECT id, n, obj.k4 FROM t_missed_subcolumns ORDER BY id FORMAT JSONEachRow;
+
+ALTER TABLE t_missed_subcolumns DELETE WHERE obj.k4 = 5;
+SELECT count(), min(id) FROM t_missed_subcolumns;
+
+DELETE FROM t_missed_subcolumns WHERE obj.k1.k3 = 'fee';
+SELECT count(), min(id) FROM t_missed_subcolumns;
+
+DROP TABLE IF EXISTS t_missed_subcolumns;