More correct version

This commit is contained in:
kssenii 2021-05-30 22:54:42 +00:00
parent d18609467b
commit c11ad44aad
5 changed files with 120 additions and 31 deletions

View File

@ -38,11 +38,33 @@ namespace ErrorCodes
String DataTypeAggregateFunction::doGetName() const
{
LOG_TRACE(&Poco::Logger::get("kssenii"), "doGetName stack trace:{}", StackTrace().toString());
return getNameImpl(true);
}
String DataTypeAggregateFunction::getNameWithoutVersion() const
{
return getNameImpl(false);
}
size_t DataTypeAggregateFunction::getVersion() const
{
if (version)
return *version;
return function->getDefaultVersion();
}
String DataTypeAggregateFunction::getNameImpl(bool with_version) const
{
WriteBufferFromOwnString stream;
stream << "AggregateFunction(";
if (version)
stream << version << ", ";
/// If aggregate function does not support versioning its version is 0 and is not printed.
auto data_type_version = getVersion();
if (with_version && data_type_version)
stream << data_type_version << ", ";
stream << function->getName();
if (!parameters.empty())
@ -64,12 +86,16 @@ String DataTypeAggregateFunction::doGetName() const
return stream.str();
}
MutableColumnPtr DataTypeAggregateFunction::createColumn() const
{
/// FIXME: ColumnAggregateFunction also uses function->serialize methods
/// FIXME: There are a lot of function->serialize inside ColumnAggregateFunction.
/// Looks like it also needs version.
LOG_TRACE(&Poco::Logger::get("kssenii"), "KSSENII COLUMN");
return ColumnAggregateFunction::create(function);
}
/// Create empty state
Field DataTypeAggregateFunction::getDefault() const
{
@ -100,20 +126,18 @@ Field DataTypeAggregateFunction::getDefault() const
bool DataTypeAggregateFunction::equals(const IDataType & rhs) const
{
return typeid(rhs) == typeid(*this) && getName() == rhs.getName();
return typeid(rhs) == typeid(*this) && getNameWithoutVersion() == typeid_cast<const DataTypeAggregateFunction &>(rhs).getNameWithoutVersion();
}
SerializationPtr DataTypeAggregateFunction::doGetDefaultSerialization() const
{
LOG_TRACE(&Poco::Logger::get("kssenii"), "get serializaton version: {}, name: {}, stack: {}", version, getName(), StackTrace().toString());
return std::make_shared<SerializationAggregateFunction>(function, version);
return std::make_shared<SerializationAggregateFunction>(function, getVersion());
}
static DataTypePtr create(const ASTPtr & arguments)
{
LOG_TRACE(&Poco::Logger::get("kssenii"), "create data type: {}", StackTrace().toString());
String function_name;
AggregateFunctionPtr function;
DataTypes argument_types;
@ -189,6 +213,7 @@ static DataTypePtr create(const ASTPtr & arguments)
return std::make_shared<DataTypeAggregateFunction>(function, argument_types, params_row, version);
}
void registerDataTypeAggregateFunction(DataTypeFactory & factory)
{
factory.registerDataType("AggregateFunction", create);

View File

@ -11,6 +11,11 @@ namespace DB
/** Type - the state of the aggregate function.
* Type parameters is an aggregate function, the types of its arguments, and its parameters (for parametric aggregate functions).
*
* Data type can support versioning for serialization of aggregate function state.
* Version 0 also means no versioning. When a table with versioned data type is attached, its version is parsed from AST. If
* there is no version in AST, then it is either attach with no version in metadata (then version is 0) or it
* is a new data type (then version is default). In distributed queries version of data type is known from data type name.
*/
class DataTypeAggregateFunction final : public IDataType
{
@ -18,7 +23,9 @@ private:
AggregateFunctionPtr function;
DataTypes argument_types;
Array parameters;
mutable size_t version;
mutable std::optional<size_t> version;
String getNameImpl(bool with_version) const;
public:
static constexpr bool is_parametric = true;
@ -28,7 +35,7 @@ public:
: function(function_)
, argument_types(argument_types_)
, parameters(parameters_)
, version(version_ ? *version_ : function_->getDefaultVersion())
, version(version_)
{
}
@ -36,6 +43,7 @@ public:
AggregateFunctionPtr getFunction() const { return function; }
String doGetName() const override;
String getNameWithoutVersion() const;
const char * getFamilyName() const override { return "AggregateFunction"; }
TypeIndex getTypeId() const override { return TypeIndex::AggregateFunction; }
@ -57,8 +65,18 @@ public:
SerializationPtr doGetDefaultSerialization() const override;
bool isVersioned() const { return function->isVersioned(); }
void setVersion(size_t version_) const { version = version_; }
/// Version of aggregate function state serialization.
size_t getVersion() const;
/// Version is not empty only if it was parsed from AST.
/// It is ok to have an empty version value here - then for serialization
/// a default (latest) version is used. This method is used to force some
/// zero version to be used instead of default - if there was no version in AST.
void setVersionIfEmpty(size_t version_) const
{
if (!version)
version = version_;
}
};
}

View File

@ -389,7 +389,7 @@ std::pair<time_t, time_t> IMergeTreeDataPart::getMinMaxTime() const
}
void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns)
void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns, bool loaded_from_disk)
{
columns = new_columns;
column_name_to_position.clear();
@ -398,8 +398,15 @@ void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns)
for (const auto & column : columns)
{
column_name_to_position.emplace(column.name, pos);
/// TODO: May be there is a better way or a better place for that.
const auto * aggregate_function_data_type = typeid_cast<const DataTypeAggregateFunction *>(column.type.get());
if (loaded_from_disk && aggregate_function_data_type)
aggregate_function_data_type->setVersionIfEmpty(0);
for (const auto & subcolumn : column.type->getSubcolumnNames())
column_name_to_position.emplace(Nested::concatenateName(column.name, subcolumn), pos);
++pos;
}
}
@ -1007,6 +1014,7 @@ void IMergeTreeDataPart::loadColumns(bool require)
metadata_snapshot = metadata_snapshot->projections.get(name).metadata;
NamesAndTypesList loaded_columns;
bool loaded_from_disk = false;
if (!volume->getDisk()->exists(path))
{
/// We can get list of columns only from columns.txt in compact parts.
@ -1027,18 +1035,15 @@ void IMergeTreeDataPart::loadColumns(bool require)
loaded_columns.writeText(*buf);
}
volume->getDisk()->moveFile(path + ".tmp", path);
LOG_TRACE(&Poco::Logger::get("kssenii"), "Loaded from metadata");
}
else
{
//LOG_TRACE(&Poco::Logger::get("kssenii"), "Loading columns stacktrace: {}", col.name, col.type->getName());
loaded_columns.readText(*volume->getDisk()->readFile(path));
LOG_TRACE(&Poco::Logger::get("kssenii"), "Loaded from disk");
loaded_from_disk = true;
for (auto & col : loaded_columns)
{
LOG_TRACE(&Poco::Logger::get("kssenii"), "Setting version for columns: {}, {}", col.name, col.type->getName());
if (auto agg = typeid_cast<const DataTypeAggregateFunction *>(col.type.get()))
agg->setVersion(0);
}
}
@ -1047,7 +1052,7 @@ void IMergeTreeDataPart::loadColumns(bool require)
LOG_TRACE(&Poco::Logger::get("kssenii"), "Loaded columns: {}, {}", col.name, col.type->getName());
}
setColumns(loaded_columns);
setColumns(loaded_columns, loaded_from_disk);
}
bool IMergeTreeDataPart::shallParticipateInMerges(const StoragePolicyPtr & storage_policy) const

View File

@ -121,7 +121,7 @@ public:
String getTypeName() const { return getType().toString(); }
void setColumns(const NamesAndTypesList & new_columns);
void setColumns(const NamesAndTypesList & new_columns, bool loaded_from_disk = false);
const NamesAndTypesList & getColumns() const { return columns; }

View File

@ -6,7 +6,7 @@ from helpers.test_tools import assert_eq_with_retry, exec_query_with_retry
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance('node1', main_configs=["configs/log_conf.xml"])
node1 = cluster.add_instance('node1', main_configs=["configs/log_conf.xml"], stay_alive=True)
node2 = cluster.add_instance('node2', main_configs=["configs/log_conf.xml"],
image='yandex/clickhouse-server',
tag='21.5', with_installed_binary=True, stay_alive=True)
@ -14,6 +14,12 @@ node2 = cluster.add_instance('node2', main_configs=["configs/log_conf.xml"],
node3 = cluster.add_instance('node3', with_zookeeper=True, image='yandex/clickhouse-server', tag='21.2', with_installed_binary=True, stay_alive=True)
def insert_data(node):
node.query(""" INSERT INTO test_table
SELECT toDateTime('2020-10-01 19:20:30'), 1,
sumMapState(arrayMap(i -> 1, range(300)), arrayMap(i -> 1, range(300)));""")
def create_and_fill_table(node):
node.query("DROP TABLE IF EXISTS test_table;")
node.query("""
@ -21,13 +27,10 @@ def create_and_fill_table(node):
(
`col1` DateTime,
`col2` Int64,
`col3` AggregateFunction(sumMap, Tuple(Array(UInt8), Array(UInt8)))
`col3` AggregateFunction(sumMap, Array(UInt8), Array(UInt8))
)
ENGINE = AggregatingMergeTree() ORDER BY (col1, col2) """)
node.query(""" INSERT INTO test_table
SELECT toDateTime('2020-10-01 19:20:30'), 1,
sumMapState((arrayMap(i -> 1, range(300)), arrayMap(i -> 1, range(300))));""")
insert_data(node)
@pytest.fixture(scope="module")
@ -92,14 +95,52 @@ def test_aggregate_function_versioning_server_upgrade(start_cluster):
for node in [node1, node2]:
create_and_fill_table(node)
expected = "([1],[300])"
new_server_data = node1.query("select finalizeAggregation(col3) from default.test_table;").strip()
assert(new_server_data == "([1],[300])")
old_server_data = node2.query("select finalizeAggregation(col3) from default.test_table;").strip()
assert(old_server_data != new_server_data)
assert(old_server_data == "([1],[44])")
node2.restart_with_latest_version()
# Check that after server upgrade aggregate function is serialized according to older version.
upgraded_server_data = node2.query("select finalizeAggregation(col3) from default.test_table;").strip()
assert(upgraded_server_data == old_server_data)
assert(upgraded_server_data == "([1],[44])")
# Remote fetches are still with older version.
data_from_upgraded_to_new_server = node1.query("select finalizeAggregation(col3) from remote('node2', default.test_table);").strip()
assert(data_from_upgraded_to_new_server == upgraded_server_data == "([1],[44])")
# Check it is ok to write into table with older version of aggregate function.
insert_data(node2)
# Hm, should newly inserted data be serialized as old version?
upgraded_server_data = node2.query("select finalizeAggregation(col3) from default.test_table;").strip()
assert(upgraded_server_data == "([1],[300])\n([1],[44])")
def test_aggregate_function_versioning_persisting_metadata(start_cluster):
for node in [node1, node2]:
create_and_fill_table(node)
node2.restart_with_latest_version()
for node in [node1, node2]:
node.query("DETACH TABLE test_table")
node.query("ATTACH TABLE test_table")
for node in [node1, node2]:
insert_data(node)
new_server_data = node1.query("select finalizeAggregation(col3) from default.test_table;").strip()
assert(new_server_data == "([1],[300])\n([1],[300])")
upgraded_server_data = node2.query("select finalizeAggregation(col3) from default.test_table;").strip()
assert(upgraded_server_data == "([1],[44])\n([1],[44])")
for node in [node1, node2]:
node.restart_clickhouse()
insert_data(node)
result = node1.query("select finalizeAggregation(col3) from remote('127.0.0.{1,2}', default.test_table);").strip()
assert(result == "([1],[300])\n([1],[300])\n([1],[300])\n([1],[300])")
result = node2.query("select finalizeAggregation(col3) from remote('127.0.0.{1,2}', default.test_table);").strip()
assert(result == "([1],[44])\n([1],[44])\n([1],[44])\n([1],[44])")