Merge pull request #45055 from kitaisreal/analyzer-aggregation-without-column-fix

Analyzer aggregation without column fix
This commit is contained in:
Maksim Kita 2023-01-11 12:03:32 +03:00 committed by GitHub
commit f0567f7a25
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 86 additions and 15 deletions

View File

@ -790,10 +790,10 @@ void ExpressionActions::assertDeterministic() const
}
std::string ExpressionActions::getSmallestColumn(const NamesAndTypesList & columns)
NameAndTypePair ExpressionActions::getSmallestColumn(const NamesAndTypesList & columns)
{
std::optional<size_t> min_size;
String res;
NameAndTypePair result;
for (const auto & column : columns)
{
@ -807,14 +807,14 @@ std::string ExpressionActions::getSmallestColumn(const NamesAndTypesList & colum
if (!min_size || size < *min_size)
{
min_size = size;
res = column.name;
result = column;
}
}
if (!min_size)
throw Exception("No available columns", ErrorCodes::LOGICAL_ERROR);
return res;
return result;
}
std::string ExpressionActions::dumpActions() const

View File

@ -111,7 +111,7 @@ public:
std::string dumpActions() const;
JSONBuilder::ItemPtr toTree() const;
static std::string getSmallestColumn(const NamesAndTypesList & columns);
static NameAndTypePair getSmallestColumn(const NamesAndTypesList & columns);
/// Check if column is always zero. True if it's definite, false if we can't say for sure.
/// Call it only after subqueries for sets were executed.

View File

@ -1146,7 +1146,7 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
required.insert(std::min_element(columns.begin(), columns.end())->name);
else if (!source_columns.empty())
/// If we have no information about columns sizes, choose a column of minimum size of its data type.
required.insert(ExpressionActions::getSmallestColumn(source_columns));
required.insert(ExpressionActions::getSmallestColumn(source_columns).name);
}
else if (is_select && storage_snapshot && !columns_context.has_array_join)
{

View File

@ -81,6 +81,63 @@ void checkAccessRights(const TableNode & table_node, const Names & column_names,
query_context->checkAccess(AccessType::SELECT, storage_id, column_names);
}
/** At least one column has to be read to find the number of rows, so pick the cheapest one.
  *
  * Candidates are the columns the snapshot returns for `AllPhysical` with subcolumns.
  * Every candidate for which the storage reports a size takes part in the comparison, and the one
  * with the minimum <compressed_size, type_size, uncompressed_size> is returned.
  * If no candidate has a reported size, the choice is delegated to ExpressionActions::getSmallestColumn.
  */
NameAndTypePair chooseSmallestColumnToReadFromStorage(const StoragePtr & storage, const StorageSnapshotPtr & storage_snapshot)
{
    /// A column together with the sizes used to rank it.
    struct Candidate
    {
        Candidate(NameAndTypePair column_, ColumnSize column_size_)
            : column(std::move(column_))
            , compressed_size(column_size_.data_compressed)
            , uncompressed_size(column_size_.data_uncompressed)
            /// Reads the already initialized `column` member; 100 is used when the type reports no maximum value size.
            , type_size(column.type->haveMaximumSizeOfValue() ? column.type->getMaximumSizeOfValueInMemory() : 100)
        {
        }

        NameAndTypePair column;
        size_t compressed_size = 0;
        size_t uncompressed_size = 0;
        size_t type_size = 0;
    };

    /// Lexicographic order: compressed size first, then type size, then uncompressed size.
    const auto cheaper_to_read = [](const Candidate & lhs, const Candidate & rhs)
    {
        return std::tie(lhs.compressed_size, lhs.type_size, lhs.uncompressed_size)
            < std::tie(rhs.compressed_size, rhs.type_size, rhs.uncompressed_size);
    };

    std::vector<Candidate> candidates;

    auto known_sizes = storage->getColumnSizes();
    auto physical_columns = storage_snapshot->getColumns(GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns());

    if (!known_sizes.empty())
    {
        for (auto & physical_column : physical_columns)
        {
            auto size_it = known_sizes.find(physical_column.name);

            /// Columns without a reported size do not take part in the comparison.
            if (size_it != known_sizes.end())
                candidates.emplace_back(physical_column, size_it->second);
        }
    }

    /// If we have no information about columns sizes, choose a column of minimum size of its data type
    if (candidates.empty())
        return ExpressionActions::getSmallestColumn(physical_columns);

    return std::min_element(candidates.begin(), candidates.end(), cheaper_to_read)->column;
}
QueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expression,
SelectQueryInfo & select_query_info,
const SelectQueryOptions & select_query_options,
@ -127,9 +184,7 @@ QueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expression,
if (columns_names.empty())
{
auto column_names_and_types = storage_snapshot->getColumns(GetColumnsOptions(GetColumnsOptions::All).withSubcolumns());
auto additional_column_to_read = column_names_and_types.front();
auto additional_column_to_read = chooseSmallestColumnToReadFromStorage(storage, storage_snapshot);
const auto & column_identifier = planner_context->getGlobalPlannerContext()->createColumnIdentifier(additional_column_to_read, table_expression);
columns_names.push_back(additional_column_to_read.name);
table_expression_data.addColumn(additional_column_to_read, column_identifier);

View File

@ -1023,7 +1023,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl(
if (result.column_names_to_read.empty())
{
NamesAndTypesList available_real_columns = metadata_snapshot->getColumns().getAllPhysical();
result.column_names_to_read.push_back(ExpressionActions::getSmallestColumn(available_real_columns));
result.column_names_to_read.push_back(ExpressionActions::getSmallestColumn(available_real_columns).name);
}
// storage_snapshot->check(result.column_names_to_read);

View File

@ -599,7 +599,7 @@ Pipe StorageHDFS::read(
{ return std::any_of(virtuals.begin(), virtuals.end(), [&](const NameAndTypePair & virtual_col){ return col == virtual_col.name; }); });
if (fetch_columns.empty())
fetch_columns.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()));
fetch_columns.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()).name);
columns_description = storage_snapshot->getDescriptionForColumns(fetch_columns);
block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());

View File

@ -706,7 +706,7 @@ Pipe StorageFile::read(
});
if (fetch_columns.empty())
fetch_columns.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()));
fetch_columns.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()).name);
columns_description = storage_snapshot->getDescriptionForColumns(fetch_columns);
}
else

View File

@ -488,7 +488,7 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu
column_names_as_aliases = alias_actions->getRequiredColumns().getNames();
if (column_names_as_aliases.empty())
column_names_as_aliases.push_back(ExpressionActions::getSmallestColumn(storage_metadata_snapshot->getColumns().getAllPhysical()));
column_names_as_aliases.push_back(ExpressionActions::getSmallestColumn(storage_metadata_snapshot->getColumns().getAllPhysical()).name);
}
auto source_pipeline = createSources(
@ -574,7 +574,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources(
{
/// If there are only virtual columns in query, you must request at least one other column.
if (real_column_names.empty())
real_column_names.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()));
real_column_names.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()).name);
QueryPlan plan;
if (StorageView * view = dynamic_cast<StorageView *>(storage.get()))

View File

@ -1057,7 +1057,7 @@ Pipe StorageS3::read(
{ return std::any_of(virtuals.begin(), virtuals.end(), [&](const NameAndTypePair & virtual_col){ return col == virtual_col.name; }); });
if (fetch_columns.empty())
fetch_columns.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()));
fetch_columns.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()).name);
columns_description = storage_snapshot->getDescriptionForColumns(fetch_columns);
block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());

View File

@ -0,0 +1,15 @@
-- Run the whole script with the experimental analyzer enabled.
SET allow_experimental_analyzer = 1;
-- Start from a clean state in case the table already exists.
DROP TABLE IF EXISTS test_table;
-- c0 is declared first and is an ALIAS of c1; c1 and c2 are ordinary String columns.
CREATE TABLE test_table
(
c0 String ALIAS c1,
c1 String,
c2 String,
) ENGINE = MergeTree ORDER BY c1;
-- Two values are supplied; presumably they populate c1 and c2 because c0 is an ALIAS (confirm).
INSERT INTO test_table VALUES ('a', 'b');
-- The aggregate takes a constant argument, so the query names no column of test_table.
SELECT MAX(1) FROM test_table;
-- Remove the table created by this test.
DROP TABLE test_table;