Add blockSerializedSize() function (size on disk without compression)

Sometimes it is useful to know how much space some data will take on
disk; blockSerializedSize() reports this (although without taking
compression into account).

This can be valuable information for various aggregate functions that
track some state (e.g. uniqCombined).
This commit is contained in:
Azat Khuzhin 2020-02-01 22:41:35 +03:00
parent f979ce31ca
commit e89ceae61a
5 changed files with 149 additions and 0 deletions

View File

@ -0,0 +1,66 @@
#include <Functions/IFunctionImpl.h>
#include <Functions/FunctionFactory.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/NullWriteBuffer.h>
namespace DB
{
/// Returns size on disk for *block* (without taking into account compression).
class FunctionBlockSerializedSize : public IFunction
{
public:
static constexpr auto name = "blockSerializedSize";
static FunctionPtr create(const Context &)
{
return std::make_shared<FunctionBlockSerializedSize>();
}
String getName() const override { return name; }
bool useDefaultImplementationForNulls() const override { return false; }
size_t getNumberOfArguments() const override { return 0; }
bool isVariadic() const override { return true; }
DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override
{
return std::make_shared<DataTypeUInt64>();
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
UInt64 size = 0;
for (size_t i = 0; i < arguments.size(); ++i)
size += blockSerializedSizeOne(block.getByPosition(arguments[i]));
block.getByPosition(result).column = DataTypeUInt64().createColumnConst(
input_rows_count, size)->convertToFullColumnIfConst();
}
UInt64 blockSerializedSizeOne(const ColumnWithTypeAndName & elem) const
{
ColumnPtr full_column = elem.column->convertToFullColumnIfConst();
IDataType::SerializeBinaryBulkSettings settings;
NullWriteBuffer out;
settings.getter = [&out](IDataType::SubstreamPath) -> WriteBuffer * { return &out; };
IDataType::SerializeBinaryBulkStatePtr state;
elem.type->serializeBinaryBulkWithMultipleStreams(*full_column,
0 /** offset */, 0 /** limit */,
settings, state);
return out.count();
}
};
/// Makes blockSerializedSize() available through the given function factory.
void registerFunctionBlockSerializedSize(FunctionFactory & factory)
{
    factory.registerFunction<FunctionBlockSerializedSize>();
}
}

View File

@ -14,6 +14,7 @@ void registerFunctionFQDN(FunctionFactory &);
void registerFunctionVisibleWidth(FunctionFactory &);
void registerFunctionToTypeName(FunctionFactory &);
void registerFunctionGetSizeOfEnumType(FunctionFactory &);
void registerFunctionBlockSerializedSize(FunctionFactory &);
void registerFunctionToColumnTypeName(FunctionFactory &);
void registerFunctionDumpColumnStructure(FunctionFactory &);
void registerFunctionDefaultValueOfArgumentType(FunctionFactory &);
@ -72,6 +73,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory)
registerFunctionVisibleWidth(factory);
registerFunctionToTypeName(factory);
registerFunctionGetSizeOfEnumType(factory);
registerFunctionBlockSerializedSize(factory);
registerFunctionToColumnTypeName(factory);
registerFunctionDumpColumnStructure(factory);
registerFunctionDefaultValueOfArgumentType(factory);

View File

@ -0,0 +1,24 @@
UInt8 1
Nullable(UInt8) 2
UInt32 4
UInt64 8
Nullable(UInt64) 9
String 4
FixedString(32) 32
Enum8 1
Array 12
uniqCombinedState(100) 402
uniqCombinedState(10000) 81993
uniqCombinedState(100000) 81993
uniqCombinedState(1000000) 81993
uniqCombinedState(10000000) 81993
uniqCombined64State(10000000) 98505
String,UInt8 5
Block(UInt32) 16
Block(UInt32) 16

View File

@ -0,0 +1,29 @@
-- Each query prints a label and the serialized (uncompressed) size in bytes.
-- The `select ''` statements only separate the groups in the output.

-- Numeric types, plain and wrapped into Nullable.
select 'UInt8', blockSerializedSize(0);
select 'Nullable(UInt8)', blockSerializedSize(toNullable(0));
select 'UInt32', blockSerializedSize(0xdeadbeaf);
select 'UInt64', blockSerializedSize(0xdeadbeafdead);
select 'Nullable(UInt64)', blockSerializedSize(toNullable(0xdeadbeafdead));
select '';
-- Variable-length and fixed-length strings.
select 'String', blockSerializedSize('foo');
select 'FixedString(32)', blockSerializedSize(cast('foo', 'FixedString(32)'));
select '';
select 'Enum8', blockSerializedSize(cast('a' as Enum8('a' = 1, 'b' = 2)));
select '';
select 'Array', blockSerializedSize(['foo']);
select '';
-- Aggregate function states over a growing number of distinct values.
select 'uniqCombinedState(100)', blockSerializedSize(uniqCombinedState(number)) from (select number from system.numbers limit 100);
select 'uniqCombinedState(10000)', blockSerializedSize(uniqCombinedState(number)) from (select number from system.numbers limit 10000);
select 'uniqCombinedState(100000)', blockSerializedSize(uniqCombinedState(number)) from (select number from system.numbers limit 100000);
select 'uniqCombinedState(1000000)', blockSerializedSize(uniqCombinedState(number)) from (select number from system.numbers limit 1000000);
select 'uniqCombinedState(10000000)', blockSerializedSize(uniqCombinedState(number)) from (select number from system.numbers limit 10000000);
select 'uniqCombined64State(10000000)', blockSerializedSize(uniqCombined64State(number)) from (select number from system.numbers limit 10000000);
select '';
-- Several arguments: the sizes of all of them are summed up.
select 'String,UInt8', blockSerializedSize('foo', 1);
select '';
-- Two rows: the size of the whole block is reported for every row.
-- NOTE(review): the label says UInt32, but 16 bytes for 2 rows suggests 8-byte values - confirm the type of `number`.
select 'Block(UInt32)', blockSerializedSize(number) from numbers(2);

View File

@ -603,6 +603,34 @@ SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x
└───┘
```
## blockSerializedSize
Returns the size on disk of a block of values (without taking compression into account).
```sql
blockSerializedSize(value[, value[, ...]])
```
**Parameters**
- `value` — Any value.
**Returned values**
- The number of bytes that will be written to disk for a block of values (without compression).
**Example**
```sql
SELECT blockSerializedSize(maxState(1)) as x
```
```text
┌─x─┐
│ 2 │
└───┘
```
## toColumnTypeName
Returns the name of the class that represents the data type of the column in RAM.