Merge pull request #65546 from Avogar/data-types-binary-encoding

Implement binary encoding for ClickHouse data types
This commit is contained in:
Kruglov Pavel 2024-07-10 09:13:32 +00:00 committed by GitHub
commit 52690b64c6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
42 changed files with 2146 additions and 149 deletions

View File

@ -1535,6 +1535,10 @@ the columns from input data will be mapped to the columns from the table by thei
Otherwise, the first row will be skipped.
If setting [input_format_with_types_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_types_use_header) is set to 1,
the types from input data will be compared with the types of the corresponding columns from the table. Otherwise, the second row will be skipped.
If setting [output_format_binary_encode_types_in_binary_format](/docs/en/operations/settings/settings-formats.md/#output_format_binary_encode_types_in_binary_format) is set to 1,
the types in header will be written using [binary encoding](/docs/en/sql-reference/data-types/data-types-binary-encoding.md) instead of strings with type names in RowBinaryWithNamesAndTypes output format.
If setting [input_format_binary_encode_types_in_binary_format](/docs/en/operations/settings/settings-formats.md/#input_format_binary_encode_types_in_binary_format) is set to 1,
the types in header will be read using [binary encoding](/docs/en/sql-reference/data-types/data-types-binary-encoding.md) instead of strings with type names in RowBinaryWithNamesAndTypes input format.
:::
## RowBinaryWithDefaults {#rowbinarywithdefaults}

View File

@ -1951,6 +1951,18 @@ The maximum allowed size for String in RowBinary format. It prevents allocating
Default value: `1GiB`.
### output_format_binary_encode_types_in_binary_format {#output_format_binary_encode_types_in_binary_format}
Write data types in [binary format](../../sql-reference/data-types/data-types-binary-encoding.md) instead of type names in RowBinaryWithNamesAndTypes output format.
Disabled by default.
### input_format_binary_decode_types_in_binary_format {#input_format_binary_decode_types_in_binary_format}
Read data types in [binary format](../../sql-reference/data-types/data-types-binary-encoding.md) instead of type names in RowBinaryWithNamesAndTypes input format.
Disabled by default.
## Native format settings {#native-format-settings}
### input_format_native_allow_types_conversion {#input_format_native_allow_types_conversion}
@ -1958,3 +1970,15 @@ Default value: `1GiB`.
Allow types conversion in Native input format between columns from input data and requested columns.
Enabled by default.
### output_format_native_encode_types_in_binary_format {#output_format_native_encode_types_in_binary_format}
Write data types in [binary format](../../sql-reference/data-types/data-types-binary-encoding.md) instead of type names in Native output format.
Disabled by default.
### input_format_native_decode_types_in_binary_format {#input_format_native_decode_types_in_binary_format}
Read data types in [binary format](../../sql-reference/data-types/data-types-binary-encoding.md) instead of type names in Native input format.
Disabled by default.

View File

@ -0,0 +1,115 @@
---
slug: /en/sql-reference/data-types/data-types-binary-encoding
sidebar_position: 56
sidebar_label: Data types binary encoding specification.
---
# Data types binary encoding specification
This specification describes the binary format that can be used for binary encoding and decoding of ClickHouse data types. This format is used in `Dynamic` column [binary serialization](dynamic.md#binary-output-format) and can be used in input/output formats [RowBinaryWithNamesAndTypes](../../interfaces/formats.md#rowbinarywithnamesandtypes) and [Native](../../interfaces/formats.md#native) under corresponding settings.
The table below describes how each data type is represented in binary format. Each data type encoding consist of 1 byte that indicates the type and some optional additional information.
`var_uint` in the binary encoding means that the size is encoded using Variable-Length Quantity compression.
| ClickHouse data type | Binary encoding |
|--------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `Nothing` | `0x00` |
| `UInt8` | `0x01` |
| `UInt16` | `0x02` |
| `UInt32` | `0x03` |
| `UInt64` | `0x04` |
| `UInt128` | `0x05` |
| `UInt256` | `0x06` |
| `Int8` | `0x07` |
| `Int16` | `0x08` |
| `Int32` | `0x09` |
| `Int64` | `0x0A` |
| `Int128` | `0x0B` |
| `Int256` | `0x0C` |
| `Float32` | `0x0D` |
| `Float64` | `0x0E` |
| `Date` | `0x0F` |
| `Date32` | `0x10` |
| `DateTime` | `0x11` |
| `DateTime(time_zone)` | `0x12<var_uint_time_zone_name_size><time_zone_name_data>` |
| `DateTime64(P)` | `0x13<uint8_precision>` |
| `DateTime64(P, time_zone)` | `0x14<uint8_precision><var_uint_time_zone_name_size><time_zone_name_data>` |
| `String` | `0x15` |
| `FixedString(N)` | `0x16<var_uint_size>` |
| `Enum8` | `0x17<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int8_value_1>...<var_uint_name_size_N><name_data_N><int8_value_N>` |
| `Enum16` | `0x18<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int16_little_endian_value_1>...><var_uint_name_size_N><name_data_N><int16_little_endian_value_N>` |
| `Decimal32(P, S)` | `0x19<uint8_precision><uint8_scale>` |
| `Decimal64(P, S)` | `0x1A<uint8_precision><uint8_scale>` |
| `Decimal128(P, S)` | `0x1B<uint8_precision><uint8_scale>` |
| `Decimal256(P, S)` | `0x1C<uint8_precision><uint8_scale>` |
| `UUID` | `0x1D` |
| `Array(T)` | `0x1E<nested_type_encoding>` |
| `Tuple(T1, ..., TN)` | `0x1F<var_uint_number_of_elements><nested_type_encoding_1>...<nested_type_encoding_N>` |
| `Tuple(name1 T1, ..., nameN TN)` | `0x20<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N>` |
| `Set` | `0x21` |
| `Interval` | `0x22<interval_kind>` (see [interval kind binary encoding](#interval-kind-binary-encoding)) |
| `Nullable(T)` | `0x23<nested_type_encoding>` |
| `Function` | `0x24<var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N><return_type_encoding>` |
| `AggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN)` | `0x25<var_uint_version><var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N>` (see [aggregate function parameter binary encoding](#aggregate-function-parameter-binary-encoding)) |
| `LowCardinality(T)` | `0x26<nested_type_encoding>` |
| `Map(K, V)` | `0x27<key_type_encoding><value_type_encoding>` |
| `IPv4` | `0x28` |
| `IPv6` | `0x29` |
| `Variant(T1, ..., TN)` | `0x2A<var_uint_number_of_variants><variant_type_encoding_1>...<variant_type_encoding_N>` |
| `Dynamic(max_types=N)` | `0x2B<uint8_max_types>` |
| `Custom type` (`Ring`, `Polygon`, etc) | `0x2C<var_uint_type_name_size><type_name_data>` |
| `Bool` | `0x2D` |
| `SimpleAggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN)` | `0x2E<var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N>` (see [aggregate function parameter binary encoding](#aggregate-function-parameter-binary-encoding)) |
| `Nested(name1 T1, ..., nameN TN)` | `0x2F<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N>` |
### Interval kind binary encoding
The table below describes how different interval kinds of `Interval` data type are encoded.
| Interval kind | Binary encoding |
|---------------|-----------------|
| `Nanosecond` | `0x00` |
| `Microsecond` | `0x01` |
| `Millisecond` | `0x02` |
| `Second` | `0x03` |
| `Minute` | `0x04` |
| `Hour` | `0x05` |
| `Day` | `0x06` |
| `Week` | `0x07` |
| `Month` | `0x08` |
| `Quarter` | `0x09` |
| `Year` | `0x1A` |
### Aggregate function parameter binary encoding
The table below describes how parameters of `AggragateFunction` and `SimpleAggregateFunction` are encoded.
The encoding of a parameter consists of 1 byte indicating the type of the parameter and the value itself.
| Parameter type | Binary encoding |
|--------------------------|--------------------------------------------------------------------------------------------------------------------------------|
| `Null` | `0x00` |
| `UInt64` | `0x01<var_uint_value>` |
| `Int64` | `0x02<var_int_value>` |
| `UInt128` | `0x03<uint128_little_endian_value>` |
| `Int128` | `0x04<int128_little_endian_value>` |
| `UInt128` | `0x05<uint128_little_endian_value>` |
| `Int128` | `0x06<int128_little_endian_value>` |
| `Float64` | `0x07<float64_little_endian_value>` |
| `Decimal32` | `0x08<var_uint_scale><int32_little_endian_value>` |
| `Decimal64` | `0x09<var_uint_scale><int64_little_endian_value>` |
| `Decimal128` | `0x0A<var_uint_scale><int128_little_endian_value>` |
| `Decimal256` | `0x0B<var_uint_scale><int256_little_endian_value>` |
| `String` | `0x0C<var_uint_size><data>` |
| `Array` | `0x0D<var_uint_size><value_encoding_1>...<value_encoding_N>` |
| `Tuple` | `0x0E<var_uint_size><value_encoding_1>...<value_encoding_N>` |
| `Map` | `0x0F<var_uint_size><key_encoding_1><value_encoding_1>...<key_endoding_N><value_encoding_N>` |
| `IPv4` | `0x10<uint32_little_endian_value>` |
| `IPv6` | `0x11<uint128_little_endian_value>` |
| `UUID` | `0x12<uuid_value>` |
| `Bool` | `0x13<bool_value>` |
| `Object` | `0x14<var_uint_size><var_uint_key_size_1><key_data_1><value_encoding_1>...<var_uint_key_size_N><key_data_N><value_encoding_N>` |
| `AggregateFunctionState` | `0x15<var_uint_name_size><name_data><var_uint_data_size><data>` |
| `Negative infinity` | `0xFE` |
| `Positive infinity` | `0xFF` |

View File

@ -493,3 +493,14 @@ SELECT count(), dynamicType(d), _part FROM test GROUP BY _part, dynamicType(d) O
```
As we can see, ClickHouse kept the most frequent types `UInt64` and `Array(UInt64)` and casted all other types to `String`.
### Binary output format
In [RowBinary](../../interfaces/formats.md#rowbinary-rowbinary) format values of `Dynamic` type are serialized in the following format:
```text
<binary_encoded_data_type><value_in_binary_format_according_to_the_data_type>
```
See the [data types binary encoding specification](../../sql-reference/data-types/data-types-binary-encoding.md)

View File

@ -4,7 +4,9 @@
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/FieldToDataType.h>
#include <DataTypes/DataTypesBinaryEncoding.h>
#include <Common/Arena.h>
#include <Common/SipHash.h>
#include <Processors/Transforms/ColumnGathererTransform.h>
@ -481,7 +483,7 @@ StringRef ColumnDynamic::serializeValueIntoArena(size_t n, DB::Arena & arena, co
/// We cannot use Variant serialization here as it serializes discriminator + value,
/// but Dynamic doesn't have fixed mapping discriminator <-> variant type
/// as different Dynamic column can have different Variants.
/// Instead, we serialize null bit + variant type name (size + bytes) + value.
/// Instead, we serialize null bit + variant type in binary format (size + bytes) + value.
const auto & variant_col = assert_cast<const ColumnVariant &>(*variant_column);
auto discr = variant_col.globalDiscriminatorAt(n);
StringRef res;
@ -495,14 +497,15 @@ StringRef ColumnDynamic::serializeValueIntoArena(size_t n, DB::Arena & arena, co
return res;
}
const auto & variant_name = variant_info.variant_names[discr];
size_t variant_name_size = variant_name.size();
char * pos = arena.allocContinue(sizeof(UInt8) + sizeof(size_t) + variant_name.size(), begin);
const auto & variant_type = assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariant(discr);
String variant_type_binary_data = encodeDataType(variant_type);
size_t variant_type_binary_data_size = variant_type_binary_data.size();
char * pos = arena.allocContinue(sizeof(UInt8) + sizeof(size_t) + variant_type_binary_data.size(), begin);
memcpy(pos, &null_bit, sizeof(UInt8));
memcpy(pos + sizeof(UInt8), &variant_name_size, sizeof(size_t));
memcpy(pos + sizeof(UInt8) + sizeof(size_t), variant_name.data(), variant_name.size());
memcpy(pos + sizeof(UInt8), &variant_type_binary_data_size, sizeof(size_t));
memcpy(pos + sizeof(UInt8) + sizeof(size_t), variant_type_binary_data.data(), variant_type_binary_data.size());
res.data = pos;
res.size = sizeof(UInt8) + sizeof(size_t) + variant_name.size();
res.size = sizeof(UInt8) + sizeof(size_t) + variant_type_binary_data.size();
auto value_ref = variant_col.getVariantByGlobalDiscriminator(discr).serializeValueIntoArena(variant_col.offsetAt(n), arena, begin);
res.data = value_ref.data - res.size;
@ -521,13 +524,15 @@ const char * ColumnDynamic::deserializeAndInsertFromArena(const char * pos)
return pos;
}
/// Read variant type name.
const size_t variant_name_size = unalignedLoad<size_t>(pos);
pos += sizeof(variant_name_size);
String variant_name;
variant_name.resize(variant_name_size);
memcpy(variant_name.data(), pos, variant_name_size);
pos += variant_name_size;
/// Read variant type in binary format.
const size_t variant_type_binary_data_size = unalignedLoad<size_t>(pos);
pos += sizeof(variant_type_binary_data_size);
String variant_type_binary_data;
variant_type_binary_data.resize(variant_type_binary_data_size);
memcpy(variant_type_binary_data.data(), pos, variant_type_binary_data_size);
pos += variant_type_binary_data_size;
auto variant_type = decodeDataType(variant_type_binary_data);
auto variant_name = variant_type->getName();
/// If we already have such variant, just deserialize it into corresponding variant column.
auto it = variant_info.variant_name_to_discriminator.find(variant_name);
if (it != variant_info.variant_name_to_discriminator.end())
@ -537,7 +542,6 @@ const char * ColumnDynamic::deserializeAndInsertFromArena(const char * pos)
}
/// If we don't have such variant, add it.
auto variant_type = DataTypeFactory::instance().get(variant_name);
if (likely(addNewVariant(variant_type)))
{
auto discr = variant_info.variant_name_to_discriminator[variant_name];
@ -563,13 +567,13 @@ const char * ColumnDynamic::skipSerializedInArena(const char * pos) const
if (null_bit)
return pos;
const size_t variant_name_size = unalignedLoad<size_t>(pos);
pos += sizeof(variant_name_size);
String variant_name;
variant_name.resize(variant_name_size);
memcpy(variant_name.data(), pos, variant_name_size);
pos += variant_name_size;
auto tmp_variant_column = DataTypeFactory::instance().get(variant_name)->createColumn();
const size_t variant_type_binary_data_size = unalignedLoad<size_t>(pos);
pos += sizeof(variant_type_binary_data_size);
String variant_type_binary_data;
variant_type_binary_data.resize(variant_type_binary_data_size);
memcpy(variant_type_binary_data.data(), pos, variant_type_binary_data_size);
pos += variant_type_binary_data_size;
auto tmp_variant_column = decodeDataType(variant_type_binary_data)->createColumn();
return tmp_variant_column->skipSerializedInArena(pos);
}

View File

@ -0,0 +1,389 @@
#include <Common/FieldBinaryEncoding.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int UNSUPPORTED_METHOD;
extern const int INCORRECT_DATA;
}
namespace
{
enum class FieldBinaryTypeIndex: uint8_t
{
Null = 0x00,
UInt64 = 0x01,
Int64 = 0x02,
UInt128 = 0x03,
Int128 = 0x04,
UInt256 = 0x05,
Int256 = 0x06,
Float64 = 0x07,
Decimal32 = 0x08,
Decimal64 = 0x09,
Decimal128 = 0x0A,
Decimal256 = 0x0B,
String = 0x0C,
Array = 0x0D,
Tuple = 0x0E,
Map = 0x0F,
IPv4 = 0x10,
IPv6 = 0x11,
UUID = 0x12,
Bool = 0x13,
Object = 0x14,
AggregateFunctionState = 0x15,
NegativeInfinity = 0xFE,
PositiveInfinity = 0xFF,
};
class FieldVisitorEncodeBinary
{
public:
void operator() (const Null & x, WriteBuffer & buf) const;
void operator() (const UInt64 & x, WriteBuffer & buf) const;
void operator() (const UInt128 & x, WriteBuffer & buf) const;
void operator() (const UInt256 & x, WriteBuffer & buf) const;
void operator() (const Int64 & x, WriteBuffer & buf) const;
void operator() (const Int128 & x, WriteBuffer & buf) const;
void operator() (const Int256 & x, WriteBuffer & buf) const;
void operator() (const UUID & x, WriteBuffer & buf) const;
void operator() (const IPv4 & x, WriteBuffer & buf) const;
void operator() (const IPv6 & x, WriteBuffer & buf) const;
void operator() (const Float64 & x, WriteBuffer & buf) const;
void operator() (const String & x, WriteBuffer & buf) const;
void operator() (const Array & x, WriteBuffer & buf) const;
void operator() (const Tuple & x, WriteBuffer & buf) const;
void operator() (const Map & x, WriteBuffer & buf) const;
void operator() (const Object & x, WriteBuffer & buf) const;
void operator() (const DecimalField<Decimal32> & x, WriteBuffer & buf) const;
void operator() (const DecimalField<Decimal64> & x, WriteBuffer & buf) const;
void operator() (const DecimalField<Decimal128> & x, WriteBuffer & buf) const;
void operator() (const DecimalField<Decimal256> & x, WriteBuffer & buf) const;
void operator() (const AggregateFunctionStateData & x, WriteBuffer & buf) const;
[[noreturn]] void operator() (const CustomType & x, WriteBuffer & buf) const;
void operator() (const bool & x, WriteBuffer & buf) const;
};
void FieldVisitorEncodeBinary::operator() (const Null & x, WriteBuffer & buf) const
{
if (x.isNull())
writeBinary(UInt8(FieldBinaryTypeIndex::Null), buf);
else if (x.isPositiveInfinity())
writeBinary(UInt8(FieldBinaryTypeIndex::PositiveInfinity), buf);
else if (x.isNegativeInfinity())
writeBinary(UInt8(FieldBinaryTypeIndex::NegativeInfinity), buf);
}
void FieldVisitorEncodeBinary::operator() (const UInt64 & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::UInt64), buf);
writeVarUInt(x, buf);
}
void FieldVisitorEncodeBinary::operator() (const Int64 & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Int64), buf);
writeVarInt(x, buf);
}
void FieldVisitorEncodeBinary::operator() (const Float64 & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Float64), buf);
writeBinaryLittleEndian(x, buf);
}
void FieldVisitorEncodeBinary::operator() (const String & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::String), buf);
writeStringBinary(x, buf);
}
void FieldVisitorEncodeBinary::operator() (const UInt128 & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::UInt128), buf);
writeBinaryLittleEndian(x, buf);
}
void FieldVisitorEncodeBinary::operator() (const Int128 & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Int128), buf);
writeBinaryLittleEndian(x, buf);
}
void FieldVisitorEncodeBinary::operator() (const UInt256 & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::UInt256), buf);
writeBinaryLittleEndian(x, buf);
}
void FieldVisitorEncodeBinary::operator() (const Int256 & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Int256), buf);
writeBinaryLittleEndian(x, buf);
}
void FieldVisitorEncodeBinary::operator() (const UUID & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::UUID), buf);
writeBinaryLittleEndian(x, buf);
}
void FieldVisitorEncodeBinary::operator() (const IPv4 & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::IPv4), buf);
writeBinaryLittleEndian(x, buf);
}
void FieldVisitorEncodeBinary::operator() (const IPv6 & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::IPv6), buf);
writeBinaryLittleEndian(x, buf);
}
void FieldVisitorEncodeBinary::operator() (const DecimalField<Decimal32> & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Decimal32), buf);
writeVarUInt(x.getScale(), buf);
writeBinaryLittleEndian(x.getValue(), buf);
}
void FieldVisitorEncodeBinary::operator() (const DecimalField<Decimal64> & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Decimal64), buf);
writeVarUInt(x.getScale(), buf);
writeBinaryLittleEndian(x.getValue(), buf);
}
void FieldVisitorEncodeBinary::operator() (const DecimalField<Decimal128> & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Decimal128), buf);
writeVarUInt(x.getScale(), buf);
writeBinaryLittleEndian(x.getValue(), buf);
}
void FieldVisitorEncodeBinary::operator() (const DecimalField<Decimal256> & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Decimal256), buf);
writeVarUInt(x.getScale(), buf);
writeBinaryLittleEndian(x.getValue(), buf);
}
void FieldVisitorEncodeBinary::operator() (const AggregateFunctionStateData & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::AggregateFunctionState), buf);
writeStringBinary(x.name, buf);
writeStringBinary(x.data, buf);
}
void FieldVisitorEncodeBinary::operator() (const Array & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Array), buf);
size_t size = x.size();
writeVarUInt(size, buf);
for (size_t i = 0; i < size; ++i)
Field::dispatch([&buf] (const auto & value) { FieldVisitorEncodeBinary()(value, buf); }, x[i]);
}
void FieldVisitorEncodeBinary::operator() (const Tuple & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Tuple), buf);
size_t size = x.size();
writeVarUInt(size, buf);
for (size_t i = 0; i < size; ++i)
Field::dispatch([&buf] (const auto & value) { FieldVisitorEncodeBinary()(value, buf); }, x[i]);
}
void FieldVisitorEncodeBinary::operator() (const Map & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Map), buf);
size_t size = x.size();
writeVarUInt(size, buf);
for (size_t i = 0; i < size; ++i)
{
const Tuple & key_and_value = x[i].get<Tuple>();
Field::dispatch([&buf] (const auto & value) { FieldVisitorEncodeBinary()(value, buf); }, key_and_value[0]);
Field::dispatch([&buf] (const auto & value) { FieldVisitorEncodeBinary()(value, buf); }, key_and_value[1]);
}
}
void FieldVisitorEncodeBinary::operator() (const Object & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Object), buf);
size_t size = x.size();
writeVarUInt(size, buf);
for (const auto & [key, value] : x)
{
writeStringBinary(key, buf);
Field::dispatch([&buf] (const auto & val) { FieldVisitorEncodeBinary()(val, buf); }, value);
}
}
void FieldVisitorEncodeBinary::operator()(const bool & x, WriteBuffer & buf) const
{
writeBinary(UInt8(FieldBinaryTypeIndex::Bool), buf);
writeBinary(static_cast<UInt8>(x), buf);
}
[[noreturn]] void FieldVisitorEncodeBinary::operator()(const CustomType &, WriteBuffer &) const
{
/// TODO: Support binary encoding/decoding for custom types somehow.
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Binary encoding of Field with custom type is not supported");
}
template <typename T>
Field decodeBigInteger(ReadBuffer & buf)
{
T value;
readBinaryLittleEndian(value, buf);
return value;
}
template <typename T>
DecimalField<T> decodeDecimal(ReadBuffer & buf)
{
UInt32 scale;
readVarUInt(scale, buf);
T value;
readBinaryLittleEndian(value, buf);
return DecimalField<T>(value, scale);
}
template <typename T>
T decodeValueLittleEndian(ReadBuffer & buf)
{
T value;
readBinaryLittleEndian(value, buf);
return value;
}
template <typename T>
T decodeArrayLikeField(ReadBuffer & buf)
{
size_t size;
readVarUInt(size, buf);
T value;
for (size_t i = 0; i != size; ++i)
value.push_back(decodeField(buf));
return value;
}
}
void encodeField(const Field & x, WriteBuffer & buf)
{
Field::dispatch([&buf] (const auto & val) { FieldVisitorEncodeBinary()(val, buf); }, x);
}
Field decodeField(ReadBuffer & buf)
{
UInt8 type;
readBinary(type, buf);
switch (FieldBinaryTypeIndex(type))
{
case FieldBinaryTypeIndex::Null:
return Null();
case FieldBinaryTypeIndex::PositiveInfinity:
return POSITIVE_INFINITY;
case FieldBinaryTypeIndex::NegativeInfinity:
return NEGATIVE_INFINITY;
case FieldBinaryTypeIndex::Int64:
{
Int64 value;
readVarInt(value, buf);
return value;
}
case FieldBinaryTypeIndex::UInt64:
{
UInt64 value;
readVarUInt(value, buf);
return value;
}
case FieldBinaryTypeIndex::Int128:
return decodeBigInteger<Int128>(buf);
case FieldBinaryTypeIndex::UInt128:
return decodeBigInteger<UInt128>(buf);
case FieldBinaryTypeIndex::Int256:
return decodeBigInteger<Int256>(buf);
case FieldBinaryTypeIndex::UInt256:
return decodeBigInteger<UInt256>(buf);
case FieldBinaryTypeIndex::Float64:
return decodeValueLittleEndian<Float64>(buf);
case FieldBinaryTypeIndex::Decimal32:
return decodeDecimal<Decimal32>(buf);
case FieldBinaryTypeIndex::Decimal64:
return decodeDecimal<Decimal64>(buf);
case FieldBinaryTypeIndex::Decimal128:
return decodeDecimal<Decimal128>(buf);
case FieldBinaryTypeIndex::Decimal256:
return decodeDecimal<Decimal256>(buf);
case FieldBinaryTypeIndex::String:
{
String value;
readStringBinary(value, buf);
return value;
}
case FieldBinaryTypeIndex::UUID:
return decodeValueLittleEndian<UUID>(buf);
case FieldBinaryTypeIndex::IPv4:
return decodeValueLittleEndian<IPv4>(buf);
case FieldBinaryTypeIndex::IPv6:
return decodeValueLittleEndian<IPv6>(buf);
case FieldBinaryTypeIndex::Bool:
{
bool value;
readBinary(value, buf);
return value;
}
case FieldBinaryTypeIndex::Array:
return decodeArrayLikeField<Array>(buf);
case FieldBinaryTypeIndex::Tuple:
return decodeArrayLikeField<Tuple>(buf);
case FieldBinaryTypeIndex::Map:
{
size_t size;
readVarUInt(size, buf);
Map map;
for (size_t i = 0; i != size; ++i)
{
Tuple key_and_value;
key_and_value.push_back(decodeField(buf));
key_and_value.push_back(decodeField(buf));
map.push_back(key_and_value);
}
return map;
}
case FieldBinaryTypeIndex::Object:
{
size_t size;
readVarUInt(size, buf);
Object value;
for (size_t i = 0; i != size; ++i)
{
String name;
readStringBinary(name, buf);
value[name] = decodeField(buf);
}
return value;
}
case FieldBinaryTypeIndex::AggregateFunctionState:
{
String name;
readStringBinary(name, buf);
String data;
readStringBinary(data, buf);
return AggregateFunctionStateData{.name = name, .data = data};
}
}
throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown Field type: {0:#04x}", UInt64(type));
}
}

View File

@ -0,0 +1,43 @@
#pragma once
#include <Core/Field.h>
namespace DB
{
/**
Binary encoding for Fields:
|--------------------------|--------------------------------------------------------------------------------------------------------------------------------|
| Field type | Binary encoding |
|--------------------------|--------------------------------------------------------------------------------------------------------------------------------|
| `Null` | `0x00` |
| `UInt64` | `0x01<var_uint_value>` |
| `Int64` | `0x02<var_int_value>` |
| `UInt128` | `0x03<uint128_little_endian_value>` |
| `Int128` | `0x04<int128_little_endian_value>` |
| `UInt128` | `0x05<uint128_little_endian_value>` |
| `Int128` | `0x06<int128_little_endian_value>` |
| `Float64` | `0x07<float64_little_endian_value>` |
| `Decimal32` | `0x08<var_uint_scale><int32_little_endian_value>` |
| `Decimal64` | `0x09<var_uint_scale><int64_little_endian_value>` |
| `Decimal128` | `0x0A<var_uint_scale><int128_little_endian_value>` |
| `Decimal256` | `0x0B<var_uint_scale><int256_little_endian_value>` |
| `String` | `0x0C<var_uint_size><data>` |
| `Array` | `0x0D<var_uint_size><value_encoding_1>...<value_encoding_N>` |
| `Tuple` | `0x0E<var_uint_size><value_encoding_1>...<value_encoding_N>` |
| `Map` | `0x0F<var_uint_size><key_encoding_1><value_encoding_1>...<key_endoding_N><value_encoding_N>` |
| `IPv4` | `0x10<uint32_little_endian_value>` |
| `IPv6` | `0x11<uint128_little_endian_value>` |
| `UUID` | `0x12<uuid_value>` |
| `Bool` | `0x13<bool_value>` |
| `Object` | `0x14<var_uint_size><var_uint_key_size_1><key_data_1><value_encoding_1>...<var_uint_key_size_N><key_data_N><value_encoding_N>` |
| `AggregateFunctionState` | `0x15<var_uint_name_size><name_data><var_uint_data_size><data>` |
| `Negative infinity` | `0xFE` |
| `Positive infinity` | `0xFF` |
|--------------------------|--------------------------------------------------------------------------------------------------------------------------------|
*/
void encodeField(const Field &, WriteBuffer & buf);
Field decodeField(ReadBuffer & buf);
}

View File

@ -7,19 +7,20 @@ namespace DB
/// Kind of a temporal interval.
struct IntervalKind
{
/// note: The order and numbers are important and used in binary encoding, append new interval kinds to the end of list.
enum class Kind : uint8_t
{
Nanosecond,
Microsecond,
Millisecond,
Second,
Minute,
Hour,
Day,
Week,
Month,
Quarter,
Year,
Nanosecond = 0x00,
Microsecond = 0x01,
Millisecond = 0x02,
Second = 0x03,
Minute = 0x04,
Hour = 0x05,
Day = 0x06,
Week = 0x07,
Month = 0x08,
Quarter = 0x09,
Year = 0x0A,
};
Kind kind = Kind::Second;

View File

@ -1132,6 +1132,8 @@ class IColumn;
M(Bool, input_format_tsv_crlf_end_of_line, false, "If it is set true, file function will read TSV format with \\r\\n instead of \\n.", 0) \
\
M(Bool, input_format_native_allow_types_conversion, true, "Allow data types conversion in Native input format", 0) \
M(Bool, input_format_native_decode_types_in_binary_format, false, "Read data types in binary format instead of type names in Native input format", 0) \
M(Bool, output_format_native_encode_types_in_binary_format, false, "Write data types in binary format instead of type names in Native output format", 0) \
\
M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic', 'best_effort' and 'best_effort_us'.", 0) \
M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \
@ -1151,6 +1153,8 @@ class IColumn;
M(Bool, input_format_avro_null_as_default, false, "For Avro/AvroConfluent format: insert default in case of null and non Nullable column", 0) \
M(UInt64, format_binary_max_string_size, 1_GiB, "The maximum allowed size for String in RowBinary format. It prevents allocating large amount of memory in case of corrupted data. 0 means there is no limit", 0) \
M(UInt64, format_binary_max_array_size, 1_GiB, "The maximum allowed size for Array in RowBinary format. It prevents allocating large amount of memory in case of corrupted data. 0 means there is no limit", 0) \
M(Bool, input_format_binary_decode_types_in_binary_format, false, "Read data types in binary format instead of type names in RowBinaryWithNamesAndTypes input format", 0) \
M(Bool, output_format_binary_encode_types_in_binary_format, false, "Write data types in binary format instead of type names in RowBinaryWithNamesAndTypes output format ", 0) \
M(URI, format_avro_schema_registry_url, "", "For AvroConfluent format: Confluent Schema Registry URL.", 0) \
\
M(Bool, output_format_json_quote_64bit_integers, true, "Controls quoting of 64-bit integers in JSON output format.", 0) \
@ -1171,9 +1175,9 @@ class IColumn;
M(UInt64, output_format_pretty_max_value_width, 10000, "Maximum width of value to display in Pretty formats. If greater - it will be cut.", 0) \
M(UInt64, output_format_pretty_max_value_width_apply_for_single_value, false, "Only cut values (see the `output_format_pretty_max_value_width` setting) when it is not a single value in a block. Otherwise output it entirely, which is useful for the `SHOW CREATE TABLE` query.", 0) \
M(UInt64Auto, output_format_pretty_color, "auto", "Use ANSI escape sequences in Pretty formats. 0 - disabled, 1 - enabled, 'auto' - enabled if a terminal.", 0) \
M(String, output_format_pretty_grid_charset, "UTF-8", "Charset for printing grid borders. Available charsets: ASCII, UTF-8 (default one).", 0) \
M(UInt64, output_format_pretty_display_footer_column_names, true, "Display column names in the footer if there are 999 or more rows.", 0) \
M(UInt64, output_format_pretty_display_footer_column_names_min_rows, 50, "Sets the minimum threshold value of rows for which to enable displaying column names in the footer. 50 (default)", 0) \
M(String, output_format_pretty_grid_charset, "UTF-8", "Charset for printing grid borders. Available charsets: ASCII, UTF-8 (default one).", 0) \
M(UInt64, output_format_pretty_display_footer_column_names, true, "Display column names in the footer if there are 999 or more rows.", 0) \
M(UInt64, output_format_pretty_display_footer_column_names_min_rows, 50, "Sets the minimum threshold value of rows for which to enable displaying column names in the footer. 50 (default)", 0) \
M(UInt64, output_format_parquet_row_group_size, 1000000, "Target row group size in rows.", 0) \
M(UInt64, output_format_parquet_row_group_size_bytes, 512 * 1024 * 1024, "Target row group size in bytes, before compression.", 0) \
M(Bool, output_format_parquet_string_as_string, true, "Use Parquet String type instead of Binary for String columns.", 0) \

View File

@ -58,6 +58,10 @@ String ClickHouseVersion::toString() const
static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory::SettingsChanges>> settings_changes_history_initializer =
{
{"24.7", {{"output_format_parquet_write_page_index", false, true, "Add a possibility to write page index into parquet files."},
{"output_format_binary_encode_types_in_binary_format", false, false, "Added new setting to allow to write type names in binary format in RowBinaryWithNamesAndTypes output format"},
{"input_format_binary_decode_types_in_binary_format", false, false, "Added new setting to allow to read type names in binary format in RowBinaryWithNamesAndTypes input format"},
{"output_format_native_encode_types_in_binary_format", false, false, "Added new setting to allow to write type names in binary format in Native output format"},
{"input_format_native_decode_types_in_binary_format", false, false, "Added new setting to allow to read type names in binary format in Native output format"},
{"read_in_order_use_buffering", false, true, "Use buffering before merging while reading in order of primary key"},
{"optimize_functions_to_subcolumns", false, true, "Enable optimization by default"},
{"enable_named_columns_in_function_tuple", false, true, "Generate named tuples in function tuple() when all names are unique and can be treated as unquoted identifiers."},

View File

@ -0,0 +1,65 @@
#include <gtest/gtest.h>
#include <Common/FieldBinaryEncoding.h>
#include <IO/WriteBufferFromString.h>
#include <IO/ReadBufferFromString.h>
using namespace DB;
namespace DB::ErrorCodes
{
extern const int UNSUPPORTED_METHOD;
}
void check(const Field & field)
{
// std::cerr << "Check " << toString(field) << "\n";
WriteBufferFromOwnString ostr;
encodeField(field, ostr);
ReadBufferFromString istr(ostr.str());
Field decoded_field = decodeField(istr);
ASSERT_TRUE(istr.eof());
ASSERT_EQ(field, decoded_field);
}
GTEST_TEST(FieldBinaryEncoding, EncodeAndDecode)
{
check(Null());
check(POSITIVE_INFINITY);
check(NEGATIVE_INFINITY);
check(true);
check(UInt64(42));
check(Int64(-42));
check(UInt128(42));
check(Int128(-42));
check(UInt256(42));
check(Int256(-42));
check(UUID(42));
check(IPv4(42));
check(IPv6(42));
check(Float64(42.42));
check(String("Hello, World!"));
check(Array({Field(UInt64(42)), Field(UInt64(43))}));
check(Tuple({Field(UInt64(42)), Field(Null()), Field(UUID(42)), Field(String("Hello, World!"))}));
check(Map({Tuple{Field(UInt64(42)), Field(String("str_42"))}, Tuple{Field(UInt64(43)), Field(String("str_43"))}}));
check(Object({{String("key_1"), Field(UInt64(42))}, {String("key_2"), Field(UInt64(43))}}));
check(DecimalField<Decimal32>(4242, 3));
check(DecimalField<Decimal64>(4242, 3));
check(DecimalField<Decimal128>(Int128(4242), 3));
check(DecimalField<Decimal256>(Int256(4242), 3));
check(AggregateFunctionStateData{.name="some_name", .data="some_data"});
try
{
check(CustomType());
}
catch (const Exception & e)
{
ASSERT_EQ(e.code(), ErrorCodes::UNSUPPORTED_METHOD);
}
check(Array({
Tuple({Field(UInt64(42)), Map({Tuple{Field(UInt64(42)), Field(String("str_42"))}, Tuple{Field(UInt64(43)), Field(String("str_43"))}}), Field(UUID(42)), Field(String("Hello, World!"))}),
Tuple({Field(UInt64(43)), Map({Tuple{Field(UInt64(43)), Field(String("str_43"))}, Tuple{Field(UInt64(44)), Field(String("str_44"))}}), Field(UUID(43)), Field(String("Hello, World 2!"))})
}));
}

View File

@ -25,7 +25,6 @@ private:
mutable std::optional<size_t> version;
String getNameImpl(bool with_version) const;
size_t getVersion() const;
public:
static constexpr bool is_parametric = true;
@ -39,6 +38,8 @@ public:
{
}
size_t getVersion() const;
String getFunctionName() const;
AggregateFunctionPtr getFunction() const { return function; }

View File

@ -165,6 +165,19 @@ static std::pair<DataTypePtr, DataTypeCustomDescPtr> create(const ASTPtr & argum
return std::make_pair(storage_type, std::make_unique<DataTypeCustomDesc>(std::move(custom_name), nullptr));
}
String DataTypeCustomSimpleAggregateFunction::getFunctionName() const
{
return function->getName();
}
DataTypePtr createSimpleAggregateFunctionType(const AggregateFunctionPtr & function, const DataTypes & argument_types, const Array & parameters)
{
auto custom_desc = std::make_unique<DataTypeCustomDesc>(
std::make_unique<DataTypeCustomSimpleAggregateFunction>(function, argument_types, parameters));
return DataTypeFactory::instance().getCustom(std::move(custom_desc));
}
void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory)
{
factory.registerDataTypeCustom("SimpleAggregateFunction", create);

View File

@ -40,8 +40,13 @@ public:
: function(function_), argument_types(argument_types_), parameters(parameters_) {}
AggregateFunctionPtr getFunction() const { return function; }
String getFunctionName() const;
const DataTypes & getArgumentsDataTypes() const { return argument_types; }
const Array & getParameters() const { return parameters; }
String getName() const override;
static void checkSupportedFunctions(const AggregateFunctionPtr & function);
};
DataTypePtr createSimpleAggregateFunctionType(const AggregateFunctionPtr & function, const DataTypes & argument_types, const Array & parameters);
}

View File

@ -71,7 +71,7 @@ static DataTypePtr create(const ASTPtr & arguments)
auto * literal = argument->arguments->children[1]->as<ASTLiteral>();
if (!literal || literal->value.getType() != Field::Types::UInt64 || literal->value.get<UInt64>() == 0 || literal->value.get<UInt64>() > 255)
if (!literal || literal->value.getType() != Field::Types::UInt64 || literal->value.get<UInt64>() == 0 || literal->value.get<UInt64>() > ColumnVariant::MAX_NESTED_COLUMNS)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "'max_types' argument for Dynamic type should be a positive integer between 1 and 255");
return std::make_shared<DataTypeDynamic>(literal->value.get<UInt64>());

View File

@ -19,6 +19,8 @@ public:
}
String getName() const override;
const DataTypes & getElements() const { return elems; }
const Names & getNames() const { return names; }
};
DataTypePtr createNested(const DataTypes & types, const Names & names);

View File

@ -0,0 +1,705 @@
#include <DataTypes/DataTypesBinaryEncoding.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeFunction.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypeSet.h>
#include <DataTypes/DataTypeInterval.h>
#include <DataTypes/DataTypeIPv4andIPv6.h>
#include <DataTypes/DataTypeAggregateFunction.h>
#include <DataTypes/DataTypeCustomSimpleAggregateFunction.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeDynamic.h>
#include <DataTypes/DataTypeNested.h>
#include <DataTypes/DataTypeFactory.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <Parsers/NullsAction.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <Common/FieldBinaryEncoding.h>
#include <Common/assert_cast.h>
namespace DB
{
namespace ErrorCodes
{
extern const int UNSUPPORTED_METHOD;
extern const int INCORRECT_DATA;
}
namespace
{
enum class BinaryTypeIndex : uint8_t
{
Nothing = 0x00,
UInt8 = 0x01,
UInt16 = 0x02,
UInt32 = 0x03,
UInt64 = 0x04,
UInt128 = 0x05,
UInt256 = 0x06,
Int8 = 0x07,
Int16 = 0x08,
Int32 = 0x09,
Int64 = 0x0A,
Int128 = 0x0B,
Int256 = 0x0C,
Float32 = 0x0D,
Float64 = 0x0E,
Date = 0x0F,
Date32 = 0x10,
DateTimeUTC = 0x11,
DateTimeWithTimezone = 0x12,
DateTime64UTC = 0x13,
DateTime64WithTimezone = 0x14,
String = 0x15,
FixedString = 0x16,
Enum8 = 0x17,
Enum16 = 0x18,
Decimal32 = 0x19,
Decimal64 = 0x1A,
Decimal128 = 0x1B,
Decimal256 = 0x1C,
UUID = 0x1D,
Array = 0x1E,
UnnamedTuple = 0x1F,
NamedTuple = 0x20,
Set = 0x21,
Interval = 0x22,
Nullable = 0x23,
Function = 0x24,
AggregateFunction = 0x25,
LowCardinality = 0x26,
Map = 0x27,
IPv4 = 0x28,
IPv6 = 0x29,
Variant = 0x2A,
Dynamic = 0x2B,
Custom = 0x2C,
Bool = 0x2D,
SimpleAggregateFunction = 0x2E,
Nested = 0x2F,
};
BinaryTypeIndex getBinaryTypeIndex(const DataTypePtr & type)
{
/// By default custom types don't have their own BinaryTypeIndex.
if (type->hasCustomName())
{
/// Some widely used custom types have separate BinaryTypeIndex for better serialization.
/// Right now it's Bool, SimpleAggregateFunction and Nested types.
/// TODO: Consider adding BinaryTypeIndex for more custom types.
if (isBool(type))
return BinaryTypeIndex::Bool;
if (typeid_cast<const DataTypeCustomSimpleAggregateFunction *>(type->getCustomName()))
return BinaryTypeIndex::SimpleAggregateFunction;
if (isNested(type))
return BinaryTypeIndex::Nested;
return BinaryTypeIndex::Custom;
}
switch (type->getTypeId())
{
case TypeIndex::Nothing:
return BinaryTypeIndex::Nothing;
case TypeIndex::UInt8:
return BinaryTypeIndex::UInt8;
case TypeIndex::UInt16:
return BinaryTypeIndex::UInt16;
case TypeIndex::UInt32:
return BinaryTypeIndex::UInt32;
case TypeIndex::UInt64:
return BinaryTypeIndex::UInt64;
case TypeIndex::UInt128:
return BinaryTypeIndex::UInt128;
case TypeIndex::UInt256:
return BinaryTypeIndex::UInt256;
case TypeIndex::Int8:
return BinaryTypeIndex::Int8;
case TypeIndex::Int16:
return BinaryTypeIndex::Int16;
case TypeIndex::Int32:
return BinaryTypeIndex::Int32;
case TypeIndex::Int64:
return BinaryTypeIndex::Int64;
case TypeIndex::Int128:
return BinaryTypeIndex::Int128;
case TypeIndex::Int256:
return BinaryTypeIndex::Int256;
case TypeIndex::Float32:
return BinaryTypeIndex::Float32;
case TypeIndex::Float64:
return BinaryTypeIndex::Float64;
case TypeIndex::Date:
return BinaryTypeIndex::Date;
case TypeIndex::Date32:
return BinaryTypeIndex::Date32;
case TypeIndex::DateTime:
if (assert_cast<const DataTypeDateTime &>(*type).hasExplicitTimeZone())
return BinaryTypeIndex::DateTimeWithTimezone;
return BinaryTypeIndex::DateTimeUTC;
case TypeIndex::DateTime64:
if (assert_cast<const DataTypeDateTime64 &>(*type).hasExplicitTimeZone())
return BinaryTypeIndex::DateTime64WithTimezone;
return BinaryTypeIndex::DateTime64UTC;
case TypeIndex::String:
return BinaryTypeIndex::String;
case TypeIndex::FixedString:
return BinaryTypeIndex::FixedString;
case TypeIndex::Enum8:
return BinaryTypeIndex::Enum8;
case TypeIndex::Enum16:
return BinaryTypeIndex::Enum16;
case TypeIndex::Decimal32:
return BinaryTypeIndex::Decimal32;
case TypeIndex::Decimal64:
return BinaryTypeIndex::Decimal64;
case TypeIndex::Decimal128:
return BinaryTypeIndex::Decimal128;
case TypeIndex::Decimal256:
return BinaryTypeIndex::Decimal256;
case TypeIndex::UUID:
return BinaryTypeIndex::UUID;
case TypeIndex::Array:
return BinaryTypeIndex::Array;
case TypeIndex::Tuple:
{
const auto & tuple_type = assert_cast<const DataTypeTuple &>(*type);
if (tuple_type.haveExplicitNames())
return BinaryTypeIndex::NamedTuple;
return BinaryTypeIndex::UnnamedTuple;
}
case TypeIndex::Set:
return BinaryTypeIndex::Set;
case TypeIndex::Interval:
return BinaryTypeIndex::Interval;
case TypeIndex::Nullable:
return BinaryTypeIndex::Nullable;
case TypeIndex::Function:
return BinaryTypeIndex::Function;
case TypeIndex::AggregateFunction:
return BinaryTypeIndex::AggregateFunction;
case TypeIndex::LowCardinality:
return BinaryTypeIndex::LowCardinality;
case TypeIndex::Map:
return BinaryTypeIndex::Map;
case TypeIndex::Object:
/// Object type will be deprecated and replaced by new implementation. No need to support it here.
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Binary encoding of type Object is not supported");
case TypeIndex::IPv4:
return BinaryTypeIndex::IPv4;
case TypeIndex::IPv6:
return BinaryTypeIndex::IPv6;
case TypeIndex::Variant:
return BinaryTypeIndex::Variant;
case TypeIndex::Dynamic:
return BinaryTypeIndex::Dynamic;
/// JSONPaths is used only during schema inference and cannot be used anywhere else.
case TypeIndex::JSONPaths:
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Binary encoding of type JSONPaths is not supported");
}
}
template <typename T>
void encodeEnumValues(const DataTypePtr & type, WriteBuffer & buf)
{
const auto & enum_type = assert_cast<const DataTypeEnum<T> &>(*type);
const auto & values = enum_type.getValues();
writeVarUInt(values.size(), buf);
for (const auto & [name, value] : values)
{
writeStringBinary(name, buf);
writeBinaryLittleEndian(value, buf);
}
}
template <typename T>
DataTypePtr decodeEnum(ReadBuffer & buf)
{
typename DataTypeEnum<T>::Values values;
size_t size;
readVarUInt(size, buf);
for (size_t i = 0; i != size; ++i)
{
String name;
readStringBinary(name, buf);
T value;
readBinaryLittleEndian(value, buf);
values.emplace_back(name, value);
}
return std::make_shared<DataTypeEnum<T>>(values);
}
template <typename T>
void encodeDecimal(const DataTypePtr & type, WriteBuffer & buf)
{
const auto & decimal_type = assert_cast<const DataTypeDecimal<T> &>(*type);
/// Both precision and scale should be less than 76, so we can decode it in 1 byte.
writeBinary(UInt8(decimal_type.getPrecision()), buf);
writeBinary(UInt8(decimal_type.getScale()), buf);
}
template <typename T>
DataTypePtr decodeDecimal(ReadBuffer & buf)
{
UInt8 precision;
readBinary(precision, buf);
UInt8 scale;
readBinary(scale, buf);
return std::make_shared<DataTypeDecimal<T>>(precision, scale);
}
void encodeAggregateFunction(const String & function_name, const Array & parameters, const DataTypes & arguments_types, WriteBuffer & buf)
{
writeStringBinary(function_name, buf);
writeVarUInt(parameters.size(), buf);
for (const auto & param : parameters)
encodeField(param, buf);
writeVarUInt(arguments_types.size(), buf);
for (const auto & argument_type : arguments_types)
encodeDataType(argument_type, buf);
}
std::tuple<AggregateFunctionPtr, Array, DataTypes> decodeAggregateFunction(ReadBuffer & buf)
{
String function_name;
readStringBinary(function_name, buf);
size_t num_parameters;
readVarUInt(num_parameters, buf);
Array parameters;
parameters.reserve(num_parameters);
for (size_t i = 0; i != num_parameters; ++i)
parameters.push_back(decodeField(buf));
size_t num_arguments;
readVarUInt(num_arguments, buf);
DataTypes arguments_types;
arguments_types.reserve(num_arguments);
for (size_t i = 0; i != num_arguments; ++i)
arguments_types.push_back(decodeDataType(buf));
AggregateFunctionProperties properties;
auto action = NullsAction::EMPTY;
auto function = AggregateFunctionFactory::instance().get(function_name, action, arguments_types, parameters, properties);
return {function, parameters, arguments_types};
}
}
void encodeDataType(const DataTypePtr & type, WriteBuffer & buf)
{
/// First, write the BinaryTypeIndex byte.
auto binary_type_index = getBinaryTypeIndex(type);
buf.write(UInt8(binary_type_index));
/// Then, write additional information depending on the data type.
switch (binary_type_index)
{
case BinaryTypeIndex::DateTimeWithTimezone:
{
const auto & datetime_type = assert_cast<const DataTypeDateTime &>(*type);
writeStringBinary(datetime_type.getTimeZone().getTimeZone(), buf);
break;
}
case BinaryTypeIndex::DateTime64UTC:
{
const auto & datetime64_type = assert_cast<const DataTypeDateTime64 &>(*type);
/// Maximum scale for DateTime64 is 9, so we can write it as 1 byte.
buf.write(UInt8(datetime64_type.getScale()));
break;
}
case BinaryTypeIndex::DateTime64WithTimezone:
{
const auto & datetime64_type = assert_cast<const DataTypeDateTime64 &>(*type);
buf.write(UInt8(datetime64_type.getScale()));
writeStringBinary(datetime64_type.getTimeZone().getTimeZone(), buf);
break;
}
case BinaryTypeIndex::FixedString:
{
const auto & fixed_string_type = assert_cast<const DataTypeFixedString &>(*type);
writeVarUInt(fixed_string_type.getN(), buf);
break;
}
case BinaryTypeIndex::Enum8:
{
encodeEnumValues<Int8>(type, buf);
break;
}
case BinaryTypeIndex::Enum16:
{
encodeEnumValues<Int16>(type, buf);
break;
}
case BinaryTypeIndex::Decimal32:
{
encodeDecimal<Decimal32>(type, buf);
break;
}
case BinaryTypeIndex::Decimal64:
{
encodeDecimal<Decimal64>(type, buf);
break;
}
case BinaryTypeIndex::Decimal128:
{
encodeDecimal<Decimal128>(type, buf);
break;
}
case BinaryTypeIndex::Decimal256:
{
encodeDecimal<Decimal256>(type, buf);
break;
}
case BinaryTypeIndex::Array:
{
const auto & array_type = assert_cast<const DataTypeArray &>(*type);
encodeDataType(array_type.getNestedType(), buf);
break;
}
case BinaryTypeIndex::NamedTuple:
{
const auto & tuple_type = assert_cast<const DataTypeTuple &>(*type);
const auto & types = tuple_type.getElements();
const auto & names = tuple_type.getElementNames();
writeVarUInt(types.size(), buf);
for (size_t i = 0; i != types.size(); ++i)
{
writeStringBinary(names[i], buf);
encodeDataType(types[i], buf);
}
break;
}
case BinaryTypeIndex::UnnamedTuple:
{
const auto & tuple_type = assert_cast<const DataTypeTuple &>(*type);
const auto & element_types = tuple_type.getElements();
writeVarUInt(element_types.size(), buf);
for (const auto & element_type : element_types)
encodeDataType(element_type, buf);
break;
}
case BinaryTypeIndex::Interval:
{
const auto & interval_type = assert_cast<const DataTypeInterval &>(*type);
writeBinary(UInt8(interval_type.getKind().kind), buf);
break;
}
case BinaryTypeIndex::Nullable:
{
const auto & nullable_type = assert_cast<const DataTypeNullable &>(*type);
encodeDataType(nullable_type.getNestedType(), buf);
break;
}
case BinaryTypeIndex::Function:
{
const auto & function_type = assert_cast<const DataTypeFunction &>(*type);
const auto & arguments_types = function_type.getArgumentTypes();
const auto & return_type = function_type.getReturnType();
writeVarUInt(arguments_types.size(), buf);
for (const auto & argument_type : arguments_types)
encodeDataType(argument_type, buf);
encodeDataType(return_type, buf);
break;
}
case BinaryTypeIndex::LowCardinality:
{
const auto & low_cardinality_type = assert_cast<const DataTypeLowCardinality &>(*type);
encodeDataType(low_cardinality_type.getDictionaryType(), buf);
break;
}
case BinaryTypeIndex::Map:
{
const auto & map_type = assert_cast<const DataTypeMap &>(*type);
encodeDataType(map_type.getKeyType(), buf);
encodeDataType(map_type.getValueType(), buf);
break;
}
case BinaryTypeIndex::Variant:
{
const auto & variant_type = assert_cast<const DataTypeVariant &>(*type);
const auto & variants = variant_type.getVariants();
writeVarUInt(variants.size(), buf);
for (const auto & variant : variants)
encodeDataType(variant, buf);
break;
}
case BinaryTypeIndex::Dynamic:
{
const auto & dynamic_type = assert_cast<const DataTypeDynamic &>(*type);
/// Maximum number of dynamic types is 255, we can write it as 1 byte.
writeBinary(UInt8(dynamic_type.getMaxDynamicTypes()), buf);
break;
}
case BinaryTypeIndex::AggregateFunction:
{
const auto & aggregate_function_type = assert_cast<const DataTypeAggregateFunction &>(*type);
writeVarUInt(aggregate_function_type.getVersion(), buf);
encodeAggregateFunction(aggregate_function_type.getFunctionName(), aggregate_function_type.getParameters(), aggregate_function_type.getArgumentsDataTypes(), buf);
break;
}
case BinaryTypeIndex::SimpleAggregateFunction:
{
const auto & simple_aggregate_function_type = assert_cast<const DataTypeCustomSimpleAggregateFunction &>(*type->getCustomName());
encodeAggregateFunction(simple_aggregate_function_type.getFunctionName(), simple_aggregate_function_type.getParameters(), simple_aggregate_function_type.getArgumentsDataTypes(), buf);
break;
}
case BinaryTypeIndex::Nested:
{
const auto & nested_type = assert_cast<const DataTypeNestedCustomName &>(*type->getCustomName());
const auto & elements = nested_type.getElements();
const auto & names = nested_type.getNames();
writeVarUInt(elements.size(), buf);
for (size_t i = 0; i != elements.size(); ++i)
{
writeStringBinary(names[i], buf);
encodeDataType(elements[i], buf);
}
break;
}
case BinaryTypeIndex::Custom:
{
const auto & type_name = type->getName();
writeStringBinary(type_name, buf);
break;
}
default:
break;
}
}
String encodeDataType(const DataTypePtr & type)
{
WriteBufferFromOwnString buf;
encodeDataType(type, buf);
return buf.str();
}
DataTypePtr decodeDataType(ReadBuffer & buf)
{
UInt8 type;
readBinary(type, buf);
switch (BinaryTypeIndex(type))
{
case BinaryTypeIndex::Nothing:
return std::make_shared<DataTypeNothing>();
case BinaryTypeIndex::UInt8:
return std::make_shared<DataTypeUInt8>();
case BinaryTypeIndex::Bool:
return DataTypeFactory::instance().get("Bool");
case BinaryTypeIndex::UInt16:
return std::make_shared<DataTypeUInt16>();
case BinaryTypeIndex::UInt32:
return std::make_shared<DataTypeUInt32>();
case BinaryTypeIndex::UInt64:
return std::make_shared<DataTypeUInt64>();
case BinaryTypeIndex::UInt128:
return std::make_shared<DataTypeUInt128>();
case BinaryTypeIndex::UInt256:
return std::make_shared<DataTypeUInt256>();
case BinaryTypeIndex::Int8:
return std::make_shared<DataTypeInt8>();
case BinaryTypeIndex::Int16:
return std::make_shared<DataTypeInt16>();
case BinaryTypeIndex::Int32:
return std::make_shared<DataTypeInt32>();
case BinaryTypeIndex::Int64:
return std::make_shared<DataTypeInt64>();
case BinaryTypeIndex::Int128:
return std::make_shared<DataTypeInt128>();
case BinaryTypeIndex::Int256:
return std::make_shared<DataTypeInt256>();
case BinaryTypeIndex::Float32:
return std::make_shared<DataTypeFloat32>();
case BinaryTypeIndex::Float64:
return std::make_shared<DataTypeFloat64>();
case BinaryTypeIndex::Date:
return std::make_shared<DataTypeDate>();
case BinaryTypeIndex::Date32:
return std::make_shared<DataTypeDate32>();
case BinaryTypeIndex::DateTimeUTC:
return std::make_shared<DataTypeDateTime>();
case BinaryTypeIndex::DateTimeWithTimezone:
{
String time_zone;
readStringBinary(time_zone, buf);
return std::make_shared<DataTypeDateTime>(time_zone);
}
case BinaryTypeIndex::DateTime64UTC:
{
UInt8 scale;
readBinary(scale, buf);
return std::make_shared<DataTypeDateTime64>(scale);
}
case BinaryTypeIndex::DateTime64WithTimezone:
{
UInt8 scale;
readBinary(scale, buf);
String time_zone;
readStringBinary(time_zone, buf);
return std::make_shared<DataTypeDateTime64>(scale, time_zone);
}
case BinaryTypeIndex::String:
return std::make_shared<DataTypeString>();
case BinaryTypeIndex::FixedString:
{
UInt64 size;
readVarUInt(size, buf);
return std::make_shared<DataTypeFixedString>(size);
}
case BinaryTypeIndex::Enum8:
return decodeEnum<Int8>(buf);
case BinaryTypeIndex::Enum16:
return decodeEnum<Int16>(buf);
case BinaryTypeIndex::Decimal32:
return decodeDecimal<Decimal32>(buf);
case BinaryTypeIndex::Decimal64:
return decodeDecimal<Decimal64>(buf);
case BinaryTypeIndex::Decimal128:
return decodeDecimal<Decimal128>(buf);
case BinaryTypeIndex::Decimal256:
return decodeDecimal<Decimal256>(buf);
case BinaryTypeIndex::UUID:
return std::make_shared<DataTypeUUID>();
case BinaryTypeIndex::Array:
return std::make_shared<DataTypeArray>(decodeDataType(buf));
case BinaryTypeIndex::NamedTuple:
{
size_t size;
readVarUInt(size, buf);
DataTypes elements;
elements.reserve(size);
Names names;
names.reserve(size);
for (size_t i = 0; i != size; ++i)
{
names.emplace_back();
readStringBinary(names.back(), buf);
elements.push_back(decodeDataType(buf));
}
return std::make_shared<DataTypeTuple>(elements, names);
}
case BinaryTypeIndex::UnnamedTuple:
{
size_t size;
readVarUInt(size, buf);
DataTypes elements;
elements.reserve(size);
for (size_t i = 0; i != size; ++i)
elements.push_back(decodeDataType(buf));
return std::make_shared<DataTypeTuple>(elements);
}
case BinaryTypeIndex::Set:
return std::make_shared<DataTypeSet>();
case BinaryTypeIndex::Interval:
{
UInt8 kind;
readBinary(kind, buf);
return std::make_shared<DataTypeInterval>(IntervalKind(IntervalKind::Kind(kind)));
}
case BinaryTypeIndex::Nullable:
return std::make_shared<DataTypeNullable>(decodeDataType(buf));
case BinaryTypeIndex::Function:
{
size_t arguments_size;
readVarUInt(arguments_size, buf);
DataTypes arguments;
arguments.reserve(arguments_size);
for (size_t i = 0; i != arguments_size; ++i)
arguments.push_back(decodeDataType(buf));
auto return_type = decodeDataType(buf);
return std::make_shared<DataTypeFunction>(arguments, return_type);
}
case BinaryTypeIndex::LowCardinality:
return std::make_shared<DataTypeLowCardinality>(decodeDataType(buf));
case BinaryTypeIndex::Map:
{
auto key_type = decodeDataType(buf);
auto value_type = decodeDataType(buf);
return std::make_shared<DataTypeMap>(key_type, value_type);
}
case BinaryTypeIndex::IPv4:
return std::make_shared<DataTypeIPv4>();
case BinaryTypeIndex::IPv6:
return std::make_shared<DataTypeIPv6>();
case BinaryTypeIndex::Variant:
{
size_t size;
readVarUInt(size, buf);
DataTypes variants;
variants.reserve(size);
for (size_t i = 0; i != size; ++i)
variants.push_back(decodeDataType(buf));
return std::make_shared<DataTypeVariant>(variants);
}
case BinaryTypeIndex::Dynamic:
{
UInt8 max_dynamic_types;
readBinary(max_dynamic_types, buf);
return std::make_shared<DataTypeDynamic>(max_dynamic_types);
}
case BinaryTypeIndex::AggregateFunction:
{
size_t version;
readVarUInt(version, buf);
const auto & [function, parameters, arguments_types] = decodeAggregateFunction(buf);
return std::make_shared<DataTypeAggregateFunction>(function, arguments_types, parameters, version);
}
case BinaryTypeIndex::SimpleAggregateFunction:
{
const auto & [function, parameters, arguments_types] = decodeAggregateFunction(buf);
return createSimpleAggregateFunctionType(function, arguments_types, parameters);
}
case BinaryTypeIndex::Nested:
{
size_t size;
readVarUInt(size, buf);
Names names;
names.reserve(size);
DataTypes elements;
elements.reserve(size);
for (size_t i = 0; i != size; ++i)
{
names.emplace_back();
readStringBinary(names.back(), buf);
elements.push_back(decodeDataType(buf));
}
return createNested(elements, names);
}
case BinaryTypeIndex::Custom:
{
String type_name;
readStringBinary(type_name, buf);
return DataTypeFactory::instance().get(type_name);
}
}
throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown type code: {0:#04x}", UInt64(type));
}
DataTypePtr decodeDataType(const String & data)
{
ReadBufferFromString buf(data);
return decodeDataType(buf);
}
}

View File

@ -0,0 +1,118 @@
#pragma once
#include <DataTypes/IDataType.h>
namespace DB
{
/**
Binary encoding for ClickHouse data types:
|------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| ClickHouse data type | Binary encoding |
|------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Nothing | 0x00 |
| UInt8 | 0x01 |
| UInt16 | 0x02 |
| UInt32 | 0x03 |
| UInt64 | 0x04 |
| UInt128 | 0x05 |
| UInt256 | 0x06 |
| Int8 | 0x07 |
| Int16 | 0x08 |
| Int32 | 0x09 |
| Int64 | 0x0A |
| Int128 | 0x0B |
| Int256 | 0x0C |
| Float32 | 0x0D |
| Float64 | 0x0E |
| Date | 0x0F |
| Date32 | 0x10 |
| DateTime | 0x11 |
| DateTime(time_zone) | 0x12<var_uint_time_zone_name_size><time_zone_name_data> |
| DateTime64(P) | 0x13<uint8_precision> |
| DateTime64(P, time_zone) | 0x14<uint8_precision><var_uint_time_zone_name_size><time_zone_name_data> |
| String | 0x15 |
| FixedString(N) | 0x16<var_uint_size> |
| Enum8 | 0x17<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int8_value_1>...<var_uint_name_size_N><name_data_N><int8_value_N> |
| Enum16 | 0x18<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int16_little_endian_value_1>...><var_uint_name_size_N><name_data_N><int16_little_endian_value_N> |
| Decimal32(P, S) | 0x19<uint8_precision><uint8_scale> |
| Decimal64(P, S) | 0x1A<uint8_precision><uint8_scale> |
| Decimal128(P, S) | 0x1B<uint8_precision><uint8_scale> |
| Decimal256(P, S) | 0x1C<uint8_precision><uint8_scale> |
| UUID | 0x1D |
| Array(T) | 0x1E<nested_type_encoding> |
| Tuple(T1, ..., TN) | 0x1F<var_uint_number_of_elements><nested_type_encoding_1>...<nested_type_encoding_N> |
| Tuple(name1 T1, ..., nameN TN) | 0x20<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N> |
| Set | 0x21 |
| Interval | 0x22<interval_kind> |
| Nullable(T) | 0x23<nested_type_encoding> |
| Function | 0x24<var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N><return_type_encoding> |
| AggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN) | 0x25<var_uint_version><var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N> |
| LowCardinality(T) | 0x26<nested_type_encoding> |
| Map(K, V) | 0x27<key_type_encoding><value_type_encoding> |
| IPv4 | 0x28 |
| IPv6 | 0x29 |
| Variant(T1, ..., TN) | 0x2A<var_uint_number_of_variants><variant_type_encoding_1>...<variant_type_encoding_N> |
| Dynamic(max_types=N) | 0x2B<uint8_max_types> |
| Custom type (Ring, Polygon, etc) | 0x2C<var_uint_type_name_size><type_name_data> |
| Bool | 0x2D |
| SimpleAggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN) | 0x2E<var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N> |
| Nested(name1 T1, ..., nameN TN) | 0x2F<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N> |
|------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
Interval kind binary encoding:
|---------------|-----------------|
| Interval kind | Binary encoding |
|---------------|-----------------|
| Nanosecond | 0x00 |
| Microsecond | 0x01 |
| Millisecond | 0x02 |
| Second | 0x03 |
| Minute | 0x04 |
| Hour | 0x05 |
| Day | 0x06 |
| Week | 0x07 |
| Month | 0x08 |
| Quarter | 0x09 |
| Year | 0x1A |
|---------------|-----------------|
Aggregate function parameter binary encoding (binary encoding of a Field, see src/Common/FieldBinaryEncoding.h):
|------------------------|------------------------------------------------------------------------------------------------------------------------------|
| Parameter type | Binary encoding |
|------------------------|------------------------------------------------------------------------------------------------------------------------------|
| Null | 0x00 |
| UInt64 | 0x01<var_uint_value> |
| Int64 | 0x02<var_int_value> |
| UInt128 | 0x03<uint128_little_endian_value> |
| Int128 | 0x04<int128_little_endian_value> |
| UInt128 | 0x05<uint128_little_endian_value> |
| Int128 | 0x06<int128_little_endian_value> |
| Float64 | 0x07<float64_little_endian_value> |
| Decimal32 | 0x08<var_uint_scale><int32_little_endian_value> |
| Decimal64 | 0x09<var_uint_scale><int64_little_endian_value> |
| Decimal128 | 0x0A<var_uint_scale><int128_little_endian_value> |
| Decimal256 | 0x0B<var_uint_scale><int256_little_endian_value> |
| String | 0x0C<var_uint_size><data> |
| Array | 0x0D<var_uint_size><value_encoding_1>...<value_encoding_N> |
| Tuple | 0x0E<var_uint_size><value_encoding_1>...<value_encoding_N> |
| Map | 0x0F<var_uint_size><key_encoding_1><value_encoding_1>...<key_endoding_N><value_encoding_N> |
| IPv4 | 0x10<uint32_little_endian_value> |
| IPv6 | 0x11<uint128_little_endian_value> |
| UUID | 0x12<uuid_value> |
| Bool | 0x13<bool_value> |
| Object | 0x14<var_uint_size><var_uint_key_size_1><key_data_1><value_encoding_1>...<var_uint_key_size_N><key_data_N><value_encoding_N> |
| AggregateFunctionState | 0x15<var_uint_name_size><name_data><var_uint_data_size><data> |
| Negative infinity | 0xFE |
| Positive infinity | 0xFF |
|------------------------|------------------------------------------------------------------------------------------------------------------------------|
*/
String encodeDataType(const DataTypePtr & type);
void encodeDataType(const DataTypePtr & type, WriteBuffer & buf);
DataTypePtr decodeDataType(const String & data);
DataTypePtr decodeDataType(ReadBuffer & buf);
}

View File

@ -257,6 +257,9 @@ public:
bool position_independent_encoding = true;
/// True if data type names should be serialized in binary encoding.
bool data_types_binary_encoding = false;
bool use_compact_variant_discriminators_serialization = false;
enum class DynamicStatisticsMode
@ -278,6 +281,9 @@ public:
bool position_independent_encoding = true;
/// True if data type names should be deserialized in binary encoding.
bool data_types_binary_encoding = false;
bool native_format = false;
/// If not zero, may be used to avoid reallocations while reading column of String type.

View File

@ -42,13 +42,13 @@ void SerializationArray::deserializeBinary(Field & field, ReadBuffer & istr, con
{
size_t size;
readVarUInt(size, istr);
if (settings.max_binary_array_size && size > settings.max_binary_array_size)
if (settings.binary.max_binary_string_size && size > settings.binary.max_binary_string_size)
throw Exception(
ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size: {}. The maximum is: {}. To increase the maximum, use setting "
"format_binary_max_array_size",
size,
settings.max_binary_array_size);
settings.binary.max_binary_string_size);
field = Array();
Array & arr = field.get<Array &>();
@ -82,13 +82,13 @@ void SerializationArray::deserializeBinary(IColumn & column, ReadBuffer & istr,
size_t size;
readVarUInt(size, istr);
if (settings.max_binary_array_size && size > settings.max_binary_array_size)
if (settings.binary.max_binary_string_size && size > settings.binary.max_binary_string_size)
throw Exception(
ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size: {}. The maximum is: {}. To increase the maximum, use setting "
"format_binary_max_array_size",
size,
settings.max_binary_array_size);
settings.binary.max_binary_string_size);
IColumn & nested_column = column_array.getData();

View File

@ -4,6 +4,8 @@
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypesBinaryEncoding.h>
#include <Columns/ColumnDynamic.h>
#include <Columns/ColumnString.h>
@ -109,7 +111,10 @@ void SerializationDynamic::serializeBinaryBulkStatePrefix(
const auto & variant_column = column_dynamic.getVariantColumn();
/// Write internal Variant type name.
writeStringBinary(dynamic_state->variant_type->getName(), *stream);
if (settings.data_types_binary_encoding)
encodeDataType(dynamic_state->variant_type, *stream);
else
writeStringBinary(dynamic_state->variant_type->getName(), *stream);
/// Write statistics in prefix if needed.
if (settings.dynamic_write_statistics == SerializeBinaryBulkSettings::DynamicStatisticsMode::PREFIX)
@ -178,9 +183,16 @@ ISerialization::DeserializeBinaryBulkStatePtr SerializationDynamic::deserializeD
readBinaryLittleEndian(structure_version, *structure_stream);
auto structure_state = std::make_shared<DeserializeBinaryBulkStateDynamicStructure>(structure_version);
/// Read internal Variant type name.
String data_type_name;
readStringBinary(data_type_name, *structure_stream);
structure_state->variant_type = DataTypeFactory::instance().get(data_type_name);
if (settings.data_types_binary_encoding)
{
structure_state->variant_type = decodeDataType(*structure_stream);
}
else
{
String data_type_name;
readStringBinary(data_type_name, *structure_stream);
structure_state->variant_type = DataTypeFactory::instance().get(data_type_name);
}
const auto * variant_type = typeid_cast<const DataTypeVariant *>(structure_state->variant_type.get());
if (!variant_type)
throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect type of Dynamic nested column, expected Variant, got {}", structure_state->variant_type->getName());
@ -280,33 +292,27 @@ void SerializationDynamic::deserializeBinaryBulkWithMultipleStreams(
void SerializationDynamic::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const
{
UInt8 null_bit = field.isNull();
writeBinary(null_bit, ostr);
if (null_bit)
/// Serialize NULL as Nothing type with no value.
if (field.isNull())
{
encodeDataType(std::make_shared<DataTypeNothing>(), ostr);
return;
}
auto field_type = applyVisitor(FieldToDataType(), field);
auto field_type_name = field_type->getName();
writeVarUInt(field_type_name.size(), ostr);
writeString(field_type_name, ostr);
encodeDataType(field_type, ostr);
field_type->getDefaultSerialization()->serializeBinary(field, ostr, settings);
}
void SerializationDynamic::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const
{
UInt8 null_bit;
readBinary(null_bit, istr);
if (null_bit)
auto field_type = decodeDataType(istr);
if (isNothing(field_type))
{
field = Null();
return;
}
size_t field_type_name_size;
readVarUInt(field_type_name_size, istr);
String field_type_name(field_type_name_size, 0);
istr.readStrict(field_type_name.data(), field_type_name_size);
auto field_type = DataTypeFactory::instance().get(field_type_name);
field_type->getDefaultSerialization()->deserializeBinary(field, istr, settings);
}
@ -317,15 +323,15 @@ void SerializationDynamic::serializeBinary(const IColumn & column, size_t row_nu
const auto & variant_column = dynamic_column.getVariantColumn();
auto global_discr = variant_column.globalDiscriminatorAt(row_num);
UInt8 null_bit = global_discr == ColumnVariant::NULL_DISCRIMINATOR;
writeBinary(null_bit, ostr);
if (null_bit)
/// Serialize NULL as Nothing type with no value.
if (global_discr == ColumnVariant::NULL_DISCRIMINATOR)
{
encodeDataType(std::make_shared<DataTypeNothing>(), ostr);
return;
}
const auto & variant_type = assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariant(global_discr);
const auto & variant_type_name = variant_info.variant_names[global_discr];
writeVarUInt(variant_type_name.size(), ostr);
writeString(variant_type_name, ostr);
encodeDataType(variant_type, ostr);
variant_type->getDefaultSerialization()->serializeBinary(variant_column.getVariantByGlobalDiscriminator(global_discr), variant_column.offsetAt(row_num), ostr, settings);
}
@ -346,30 +352,23 @@ static void deserializeVariant(
void SerializationDynamic::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
auto & dynamic_column = assert_cast<ColumnDynamic &>(column);
UInt8 null_bit;
readBinary(null_bit, istr);
if (null_bit)
auto variant_type = decodeDataType(istr);
if (isNothing(variant_type))
{
dynamic_column.insertDefault();
return;
}
size_t variant_type_name_size;
readVarUInt(variant_type_name_size, istr);
String variant_type_name(variant_type_name_size, 0);
istr.readStrict(variant_type_name.data(), variant_type_name_size);
auto variant_type_name = variant_type->getName();
const auto & variant_info = dynamic_column.getVariantInfo();
auto it = variant_info.variant_name_to_discriminator.find(variant_type_name);
if (it != variant_info.variant_name_to_discriminator.end())
{
const auto & variant_type = assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariant(it->second);
deserializeVariant(dynamic_column.getVariantColumn(), variant_type, it->second, istr, [&settings](const ISerialization & serialization, IColumn & variant, ReadBuffer & buf){ serialization.deserializeBinary(variant, buf, settings); });
return;
}
/// We don't have this variant yet. Let's try to add it.
auto variant_type = DataTypeFactory::instance().get(variant_type_name);
if (dynamic_column.addNewVariant(variant_type))
{
auto discr = variant_info.variant_name_to_discriminator.at(variant_type_name);

View File

@ -55,13 +55,13 @@ void SerializationMap::deserializeBinary(Field & field, ReadBuffer & istr, const
{
size_t size;
readVarUInt(size, istr);
if (settings.max_binary_array_size && size > settings.max_binary_array_size)
if (settings.binary.max_binary_string_size && size > settings.binary.max_binary_string_size)
throw Exception(
ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large map size: {}. The maximum is: {}. To increase the maximum, use setting "
"format_binary_max_array_size",
size,
settings.max_binary_array_size);
settings.binary.max_binary_string_size);
field = Map();
Map & map = field.get<Map &>();
map.reserve(size);

View File

@ -33,13 +33,13 @@ namespace ErrorCodes
void SerializationString::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const
{
const String & s = field.get<const String &>();
if (settings.max_binary_string_size && s.size() > settings.max_binary_string_size)
if (settings.binary.max_binary_string_size && s.size() > settings.binary.max_binary_string_size)
throw Exception(
ErrorCodes::TOO_LARGE_STRING_SIZE,
"Too large string size: {}. The maximum is: {}. To increase the maximum, use setting "
"format_binary_max_string_size",
s.size(),
settings.max_binary_string_size);
settings.binary.max_binary_string_size);
writeVarUInt(s.size(), ostr);
writeString(s, ostr);
@ -50,13 +50,13 @@ void SerializationString::deserializeBinary(Field & field, ReadBuffer & istr, co
{
UInt64 size;
readVarUInt(size, istr);
if (settings.max_binary_string_size && size > settings.max_binary_string_size)
if (settings.binary.max_binary_string_size && size > settings.binary.max_binary_string_size)
throw Exception(
ErrorCodes::TOO_LARGE_STRING_SIZE,
"Too large string size: {}. The maximum is: {}. To increase the maximum, use setting "
"format_binary_max_string_size",
size,
settings.max_binary_string_size);
settings.binary.max_binary_string_size);
field = String();
String & s = field.get<String &>();
@ -68,13 +68,13 @@ void SerializationString::deserializeBinary(Field & field, ReadBuffer & istr, co
void SerializationString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
const StringRef & s = assert_cast<const ColumnString &>(column).getDataAt(row_num);
if (settings.max_binary_string_size && s.size > settings.max_binary_string_size)
if (settings.binary.max_binary_string_size && s.size > settings.binary.max_binary_string_size)
throw Exception(
ErrorCodes::TOO_LARGE_STRING_SIZE,
"Too large string size: {}. The maximum is: {}. To increase the maximum, use setting "
"format_binary_max_string_size",
s.size,
settings.max_binary_string_size);
settings.binary.max_binary_string_size);
writeVarUInt(s.size, ostr);
writeString(s, ostr);
@ -89,13 +89,13 @@ void SerializationString::deserializeBinary(IColumn & column, ReadBuffer & istr,
UInt64 size;
readVarUInt(size, istr);
if (settings.max_binary_string_size && size > settings.max_binary_string_size)
if (settings.binary.max_binary_string_size && size > settings.binary.max_binary_string_size)
throw Exception(
ErrorCodes::TOO_LARGE_STRING_SIZE,
"Too large string size: {}. The maximum is: {}. To increase the maximum, use setting "
"format_binary_max_string_size",
size,
settings.max_binary_string_size);
settings.binary.max_binary_string_size);
size_t old_chars_size = data.size();
size_t offset = old_chars_size + size + 1;

View File

@ -72,8 +72,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size)
DataTypePtr type = DataTypeFactory::instance().get(data_type);
FormatSettings settings;
settings.max_binary_string_size = 100;
settings.max_binary_array_size = 100;
settings.binary.max_binary_string_size = 100;
settings.binary.max_binary_string_size = 100;
Field field;
type->getDefaultSerialization()->deserializeBinary(field, in, settings);

View File

@ -0,0 +1,129 @@
#include <gtest/gtest.h>
#include <Core/Field.h>
#include <DataTypes/DataTypesBinaryEncoding.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeFunction.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypeSet.h>
#include <DataTypes/DataTypeInterval.h>
#include <DataTypes/DataTypeIPv4andIPv6.h>
#include <DataTypes/DataTypeAggregateFunction.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeDynamic.h>
#include <DataTypes/DataTypeFactory.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/registerAggregateFunctions.h>
#include <IO/WriteBufferFromString.h>
#include <IO/ReadBufferFromString.h>
#include <Common/tests/gtest_global_register.h>
using namespace DB;
namespace DB::ErrorCodes
{
extern const int UNSUPPORTED_METHOD;
}
void check(const DataTypePtr & type)
{
// std::cerr << "Check " << type->getName() << "\n";
WriteBufferFromOwnString ostr;
encodeDataType(type, ostr);
ReadBufferFromString istr(ostr.str());
DataTypePtr decoded_type = decodeDataType(istr);
ASSERT_TRUE(istr.eof());
ASSERT_EQ(type->getName(), decoded_type->getName());
ASSERT_TRUE(type->equals(*decoded_type));
}
GTEST_TEST(DataTypesBinaryEncoding, EncodeAndDecode)
{
tryRegisterAggregateFunctions();
check(std::make_shared<DataTypeNothing>());
check(std::make_shared<DataTypeInt8>());
check(std::make_shared<DataTypeUInt8>());
check(std::make_shared<DataTypeInt16>());
check(std::make_shared<DataTypeUInt16>());
check(std::make_shared<DataTypeInt32>());
check(std::make_shared<DataTypeUInt32>());
check(std::make_shared<DataTypeInt64>());
check(std::make_shared<DataTypeUInt64>());
check(std::make_shared<DataTypeInt128>());
check(std::make_shared<DataTypeUInt128>());
check(std::make_shared<DataTypeInt256>());
check(std::make_shared<DataTypeUInt256>());
check(std::make_shared<DataTypeFloat32>());
check(std::make_shared<DataTypeFloat64>());
check(std::make_shared<DataTypeDate>());
check(std::make_shared<DataTypeDate32>());
check(std::make_shared<DataTypeDateTime>());
check(std::make_shared<DataTypeDateTime>("EST"));
check(std::make_shared<DataTypeDateTime>("CET"));
check(std::make_shared<DataTypeDateTime64>(3));
check(std::make_shared<DataTypeDateTime64>(3, "EST"));
check(std::make_shared<DataTypeDateTime64>(3, "CET"));
check(std::make_shared<DataTypeString>());
check(std::make_shared<DataTypeFixedString>(10));
check(DataTypeFactory::instance().get("Enum8('a' = 1, 'b' = 2, 'c' = 3, 'd' = -128)"));
check(DataTypeFactory::instance().get("Enum16('a' = 1, 'b' = 2, 'c' = 3, 'd' = -1000)"));
check(std::make_shared<DataTypeDecimal32>(3, 6));
check(std::make_shared<DataTypeDecimal64>(3, 6));
check(std::make_shared<DataTypeDecimal128>(3, 6));
check(std::make_shared<DataTypeDecimal256>(3, 6));
check(std::make_shared<DataTypeUUID>());
check(DataTypeFactory::instance().get("Array(UInt32)"));
check(DataTypeFactory::instance().get("Array(Array(Array(UInt32)))"));
check(DataTypeFactory::instance().get("Tuple(UInt32, String, UUID)"));
check(DataTypeFactory::instance().get("Tuple(UInt32, String, Tuple(UUID, Date, IPv4))"));
check(DataTypeFactory::instance().get("Tuple(c1 UInt32, c2 String, c3 UUID)"));
check(DataTypeFactory::instance().get("Tuple(c1 UInt32, c2 String, c3 Tuple(c4 UUID, c5 Date, c6 IPv4))"));
check(std::make_shared<DataTypeSet>());
check(std::make_shared<DataTypeInterval>(IntervalKind::Kind::Nanosecond));
check(std::make_shared<DataTypeInterval>(IntervalKind::Kind::Microsecond));
check(DataTypeFactory::instance().get("Nullable(UInt32)"));
check(DataTypeFactory::instance().get("Nullable(Nothing)"));
check(DataTypeFactory::instance().get("Nullable(UUID)"));
check(std::make_shared<DataTypeFunction>(
DataTypes{
std::make_shared<DataTypeInt8>(),
std::make_shared<DataTypeDate>(),
DataTypeFactory::instance().get("Array(Array(Array(UInt32)))")},
DataTypeFactory::instance().get("Tuple(c1 UInt32, c2 String, c3 UUID)")));
DataTypes argument_types = {std::make_shared<DataTypeUInt64>()};
Array parameters = {Field(0.1), Field(0.2)};
AggregateFunctionProperties properties;
AggregateFunctionPtr function = AggregateFunctionFactory::instance().get("quantiles", NullsAction::EMPTY, argument_types, parameters, properties);
check(std::make_shared<DataTypeAggregateFunction>(function, argument_types, parameters));
check(std::make_shared<DataTypeAggregateFunction>(function, argument_types, parameters, 2));
check(DataTypeFactory::instance().get("AggregateFunction(sum, UInt64)"));
check(DataTypeFactory::instance().get("AggregateFunction(quantiles(0.5, 0.9), UInt64)"));
check(DataTypeFactory::instance().get("AggregateFunction(sequenceMatch('(?1)(?2)'), Date, UInt8, UInt8)"));
check(DataTypeFactory::instance().get("AggregateFunction(sumMapFiltered([1, 4, 8]), Array(UInt64), Array(UInt64))"));
check(DataTypeFactory::instance().get("LowCardinality(UInt32)"));
check(DataTypeFactory::instance().get("LowCardinality(Nullable(String))"));
check(DataTypeFactory::instance().get("Map(String, UInt32)"));
check(DataTypeFactory::instance().get("Map(String, Map(String, Map(String, UInt32)))"));
check(std::make_shared<DataTypeIPv4>());
check(std::make_shared<DataTypeIPv6>());
check(DataTypeFactory::instance().get("Variant(String, UInt32, Date32)"));
check(std::make_shared<DataTypeDynamic>());
check(std::make_shared<DataTypeDynamic>(10));
check(std::make_shared<DataTypeDynamic>(255));
check(DataTypeFactory::instance().get("Bool"));
check(DataTypeFactory::instance().get("SimpleAggregateFunction(sum, UInt64)"));
check(DataTypeFactory::instance().get("SimpleAggregateFunction(maxMap, Tuple(Array(UInt32), Array(UInt32)))"));
check(DataTypeFactory::instance().get("SimpleAggregateFunction(groupArrayArray(19), Array(UInt64))"));
check(DataTypeFactory::instance().get("Nested(a UInt32, b UInt32)"));
check(DataTypeFactory::instance().get("Nested(a UInt32, b Nested(c String, d Nested(e Date)))"));
check(DataTypeFactory::instance().get("Ring"));
check(DataTypeFactory::instance().get("Point"));
check(DataTypeFactory::instance().get("Polygon"));
check(DataTypeFactory::instance().get("MultiPolygon"));
check(DataTypeFactory::instance().get("Tuple(Map(LowCardinality(String), Array(AggregateFunction(2, quantiles(0.1, 0.2), Float32))), Array(Array(Tuple(UInt32, Tuple(a Map(String, String), b Nullable(Date), c Variant(Tuple(g String, d Array(UInt32)), Date, Map(String, String)))))))"));
}

View File

@ -270,9 +270,13 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.markdown.escape_special_characters = settings.output_format_markdown_escape_special_characters;
format_settings.bson.output_string_as_string = settings.output_format_bson_string_as_string;
format_settings.bson.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_bson_skip_fields_with_unsupported_types_in_schema_inference;
format_settings.max_binary_string_size = settings.format_binary_max_string_size;
format_settings.max_binary_array_size = settings.format_binary_max_array_size;
format_settings.binary.max_binary_string_size = settings.format_binary_max_string_size;
format_settings.binary.max_binary_array_size = settings.format_binary_max_array_size;
format_settings.binary.encode_types_in_binary_format = settings.output_format_binary_encode_types_in_binary_format;
format_settings.binary.decode_types_in_binary_format = settings.input_format_binary_decode_types_in_binary_format;
format_settings.native.allow_types_conversion = settings.input_format_native_allow_types_conversion;
format_settings.native.encode_types_in_binary_format = settings.output_format_native_encode_types_in_binary_format;
format_settings.native.decode_types_in_binary_format = settings.input_format_native_decode_types_in_binary_format;
format_settings.max_parser_depth = context->getSettingsRef().max_parser_depth;
format_settings.client_protocol_version = context->getClientProtocolVersion();
format_settings.date_time_overflow_behavior = settings.date_time_overflow_behavior;

View File

@ -106,8 +106,6 @@ struct FormatSettings
UInt64 input_allow_errors_num = 0;
Float32 input_allow_errors_ratio = 0;
UInt64 max_binary_string_size = 1_GiB;
UInt64 max_binary_array_size = 1_GiB;
UInt64 client_protocol_version = 0;
UInt64 max_parser_depth = DBMS_DEFAULT_MAX_PARSER_DEPTH;
@ -121,6 +119,14 @@ struct FormatSettings
ZSTD
};
struct
{
UInt64 max_binary_string_size = 1_GiB;
UInt64 max_binary_array_size = 1_GiB;
bool encode_types_in_binary_format = false;
bool decode_types_in_binary_format = false;
} binary{};
struct
{
UInt64 row_group_size = 1000000;
@ -459,6 +465,8 @@ struct FormatSettings
struct
{
bool allow_types_conversion = true;
bool encode_types_in_binary_format = false;
bool decode_types_in_binary_format = false;
} native{};
struct

View File

@ -6,6 +6,7 @@
#include <Compression/CompressedReadBufferFromFile.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypesBinaryEncoding.h>
#include <Common/typeid_cast.h>
#include <base/range.h>
@ -31,8 +32,8 @@ namespace ErrorCodes
}
NativeReader::NativeReader(ReadBuffer & istr_, UInt64 server_revision_)
: istr(istr_), server_revision(server_revision_)
NativeReader::NativeReader(ReadBuffer & istr_, UInt64 server_revision_, std::optional<FormatSettings> format_settings_)
: istr(istr_), server_revision(server_revision_), format_settings(format_settings_)
{
}
@ -40,16 +41,12 @@ NativeReader::NativeReader(
ReadBuffer & istr_,
const Block & header_,
UInt64 server_revision_,
bool skip_unknown_columns_,
bool null_as_default_,
bool allow_types_conversion_,
std::optional<FormatSettings> format_settings_,
BlockMissingValues * block_missing_values_)
: istr(istr_)
, header(header_)
, server_revision(server_revision_)
, skip_unknown_columns(skip_unknown_columns_)
, null_as_default(null_as_default_)
, allow_types_conversion(allow_types_conversion_)
, format_settings(std::move(format_settings_))
, block_missing_values(block_missing_values_)
{
}
@ -83,13 +80,14 @@ void NativeReader::resetParser()
use_index = false;
}
void NativeReader::readData(const ISerialization & serialization, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint)
static void readData(const ISerialization & serialization, ColumnPtr & column, ReadBuffer & istr, const std::optional<FormatSettings> & format_settings, size_t rows, double avg_value_size_hint)
{
ISerialization::DeserializeBinaryBulkSettings settings;
settings.getter = [&](ISerialization::SubstreamPath) -> ReadBuffer * { return &istr; };
settings.avg_value_size_hint = avg_value_size_hint;
settings.position_independent_encoding = false;
settings.native_format = true;
settings.data_types_binary_encoding = format_settings && format_settings->native.decode_types_in_binary_format;
ISerialization::DeserializeBinaryBulkStatePtr state;
@ -167,8 +165,16 @@ Block NativeReader::read()
/// Type
String type_name;
readBinary(type_name, istr);
column.type = data_type_factory.get(type_name);
if (format_settings && format_settings->native.decode_types_in_binary_format)
{
column.type = decodeDataType(istr);
type_name = column.type->getName();
}
else
{
readBinary(type_name, istr);
column.type = data_type_factory.get(type_name);
}
setVersionToAggregateFunctions(column.type, true, server_revision);
@ -203,7 +209,7 @@ Block NativeReader::read()
double avg_value_size_hint = avg_value_size_hints.empty() ? 0 : avg_value_size_hints[i];
if (rows) /// If no rows, nothing to read.
readData(*serialization, read_column, istr, rows, avg_value_size_hint);
readData(*serialization, read_column, istr, format_settings, rows, avg_value_size_hint);
column.column = std::move(read_column);
@ -214,12 +220,12 @@ Block NativeReader::read()
{
auto & header_column = header.getByName(column.name);
if (null_as_default)
if (format_settings && format_settings->null_as_default)
insertNullAsDefaultIfNeeded(column, header_column, header.getPositionByName(column.name), block_missing_values);
if (!header_column.type->equals(*column.type))
{
if (allow_types_conversion)
if (format_settings && format_settings->native.allow_types_conversion)
{
try
{
@ -246,7 +252,7 @@ Block NativeReader::read()
}
else
{
if (!skip_unknown_columns)
if (format_settings && !format_settings->skip_unknown_fields)
throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown column with name {} found while reading data in Native format", column.name);
use_in_result = false;
}

View File

@ -20,7 +20,7 @@ class NativeReader
{
public:
/// If a non-zero server_revision is specified, additional block information may be expected and read.
NativeReader(ReadBuffer & istr_, UInt64 server_revision_);
NativeReader(ReadBuffer & istr_, UInt64 server_revision_, std::optional<FormatSettings> format_settings_ = std::nullopt);
/// For cases when data structure (header) is known in advance.
/// NOTE We may use header for data validation and/or type conversions. It is not implemented.
@ -28,9 +28,7 @@ public:
ReadBuffer & istr_,
const Block & header_,
UInt64 server_revision_,
bool skip_unknown_columns_ = false,
bool null_as_default_ = false,
bool allow_types_conversion_ = false,
std::optional<FormatSettings> format_settings_ = std::nullopt,
BlockMissingValues * block_missing_values_ = nullptr);
/// For cases when we have an index. It allows to skip columns. Only columns specified in the index will be read.
@ -38,8 +36,6 @@ public:
IndexForNativeFormat::Blocks::const_iterator index_block_it_,
IndexForNativeFormat::Blocks::const_iterator index_block_end_);
static void readData(const ISerialization & serialization, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint);
Block getHeader() const;
void resetParser();
@ -50,9 +46,7 @@ private:
ReadBuffer & istr;
Block header;
UInt64 server_revision;
bool skip_unknown_columns = false;
bool null_as_default = false;
bool allow_types_conversion = false;
std::optional<FormatSettings> format_settings = std::nullopt;
BlockMissingValues * block_missing_values = nullptr;
bool use_index = false;

View File

@ -14,6 +14,7 @@
#include <Columns/ColumnSparse.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeAggregateFunction.h>
#include <DataTypes/DataTypesBinaryEncoding.h>
namespace DB
{
@ -25,10 +26,20 @@ namespace ErrorCodes
NativeWriter::NativeWriter(
WriteBuffer & ostr_, UInt64 client_revision_, const Block & header_, bool remove_low_cardinality_,
IndexForNativeFormat * index_, size_t initial_size_of_file_)
: ostr(ostr_), client_revision(client_revision_), header(header_),
index(index_), initial_size_of_file(initial_size_of_file_), remove_low_cardinality(remove_low_cardinality_)
WriteBuffer & ostr_,
UInt64 client_revision_,
const Block & header_,
std::optional<FormatSettings> format_settings_,
bool remove_low_cardinality_,
IndexForNativeFormat * index_,
size_t initial_size_of_file_)
: ostr(ostr_)
, client_revision(client_revision_)
, header(header_)
, index(index_)
, initial_size_of_file(initial_size_of_file_)
, remove_low_cardinality(remove_low_cardinality_)
, format_settings(std::move(format_settings_))
{
if (index)
{
@ -45,7 +56,7 @@ void NativeWriter::flush()
}
static void writeData(const ISerialization & serialization, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit)
static void writeData(const ISerialization & serialization, const ColumnPtr & column, WriteBuffer & ostr, const std::optional<FormatSettings> & format_settings, UInt64 offset, UInt64 limit)
{
/** If there are columns-constants - then we materialize them.
* (Since the data type does not know how to serialize / deserialize constants.)
@ -57,6 +68,7 @@ static void writeData(const ISerialization & serialization, const ColumnPtr & co
settings.getter = [&ostr](ISerialization::SubstreamPath) -> WriteBuffer * { return &ostr; };
settings.position_independent_encoding = false;
settings.low_cardinality_max_dictionary_size = 0;
settings.data_types_binary_encoding = format_settings && format_settings->native.encode_types_in_binary_format;
ISerialization::SerializeBinaryBulkStatePtr state;
serialization.serializeBinaryBulkStatePrefix(*full_column, settings, state);
@ -121,15 +133,22 @@ size_t NativeWriter::write(const Block & block)
setVersionToAggregateFunctions(column.type, include_version, include_version ? std::optional<size_t>(client_revision) : std::nullopt);
/// Type
String type_name = column.type->getName();
if (format_settings && format_settings->native.encode_types_in_binary_format)
{
encodeDataType(column.type, ostr);
}
else
{
String type_name = column.type->getName();
/// For compatibility, we will not send explicit timezone parameter in DateTime data type
/// to older clients, that cannot understand it.
if (client_revision < DBMS_MIN_REVISION_WITH_TIME_ZONE_PARAMETER_IN_DATETIME_DATA_TYPE
&& startsWith(type_name, "DateTime("))
type_name = "DateTime";
/// For compatibility, we will not send explicit timezone parameter in DateTime data type
/// to older clients, that cannot understand it.
if (client_revision < DBMS_MIN_REVISION_WITH_TIME_ZONE_PARAMETER_IN_DATETIME_DATA_TYPE
&& startsWith(type_name, "DateTime("))
type_name = "DateTime";
writeStringBinary(type_name, ostr);
writeStringBinary(type_name, ostr);
}
/// Serialization. Dynamic, if client supports it.
SerializationPtr serialization;
@ -161,7 +180,7 @@ size_t NativeWriter::write(const Block & block)
/// Data
if (rows) /// Zero items of data is always represented as zero number of bytes.
writeData(*serialization, column.column, ostr, 0, 0);
writeData(*serialization, column.column, ostr, format_settings, 0, 0);
if (index)
{

View File

@ -3,6 +3,7 @@
#include <base/types.h>
#include <DataTypes/IDataType.h>
#include <Core/Block.h>
#include <Formats/FormatSettings.h>
namespace DB
{
@ -23,7 +24,7 @@ public:
/** If non-zero client_revision is specified, additional block information can be written.
*/
NativeWriter(
WriteBuffer & ostr_, UInt64 client_revision_, const Block & header_, bool remove_low_cardinality_ = false,
WriteBuffer & ostr_, UInt64 client_revision_, const Block & header_, std::optional<FormatSettings> format_settings_ = std::nullopt, bool remove_low_cardinality_ = false,
IndexForNativeFormat * index_ = nullptr, size_t initial_size_of_file_ = 0);
Block getHeader() const { return header; }
@ -44,6 +45,7 @@ private:
CompressedWriteBuffer * ostr_concrete = nullptr;
bool remove_low_cardinality;
std::optional<FormatSettings> format_settings;
};
}

View File

@ -4051,7 +4051,6 @@ std::shared_ptr<QueryThreadLog> Context::getQueryThreadLog() const
std::shared_ptr<QueryViewsLog> Context::getQueryViewsLog() const
{
SharedLockGuard lock(shared->mutex);
if (!shared->system_logs)
return {};

View File

@ -4,6 +4,7 @@
#include <Formats/FormatFactory.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypesBinaryEncoding.h>
namespace DB
{
@ -55,10 +56,25 @@ std::vector<String> BinaryFormatReader<with_defaults>::readNames()
template <bool with_defaults>
std::vector<String> BinaryFormatReader<with_defaults>::readTypes()
{
auto types = readHeaderRow();
for (const auto & type_name : types)
read_data_types.push_back(DataTypeFactory::instance().get(type_name));
return types;
read_data_types.reserve(read_columns);
Names type_names;
if (format_settings.binary.decode_types_in_binary_format)
{
type_names.reserve(read_columns);
for (size_t i = 0; i < read_columns; ++i)
{
read_data_types.push_back(decodeDataType(*in));
type_names.push_back(read_data_types.back()->getName());
}
}
else
{
type_names = readHeaderRow();
for (const auto & type_name : type_names)
read_data_types.push_back(DataTypeFactory::instance().get(type_name));
}
return type_names;
}
template <bool with_defaults>

View File

@ -2,6 +2,7 @@
#include <IO/WriteHelpers.h>
#include <Columns/IColumn.h>
#include <DataTypes/IDataType.h>
#include <DataTypes/DataTypesBinaryEncoding.h>
#include <Processors/Formats/Impl/BinaryRowOutputFormat.h>
#include <Formats/FormatFactory.h>
#include <Formats/registerWithNamesAndTypes.h>
@ -35,9 +36,15 @@ void BinaryRowOutputFormat::writePrefix()
if (with_types)
{
for (size_t i = 0; i < columns; ++i)
if (format_settings.binary.encode_types_in_binary_format)
{
writeStringBinary(header.safeGetByPosition(i).type->getName(), out);
for (size_t i = 0; i < columns; ++i)
encodeDataType(header.safeGetByPosition(i).type, out);
}
else
{
for (size_t i = 0; i < columns; ++i)
writeStringBinary(header.safeGetByPosition(i).type->getName(), out);
}
}
}

View File

@ -21,9 +21,7 @@ public:
buf,
header_,
0,
settings.skip_unknown_fields,
settings.null_as_default,
settings.native.allow_types_conversion,
settings,
settings.defaults_for_omitted_fields ? &block_missing_values : nullptr))
, header(header_) {}
@ -72,9 +70,9 @@ private:
class NativeOutputFormat final : public IOutputFormat
{
public:
NativeOutputFormat(WriteBuffer & buf, const Block & header, UInt64 client_protocol_version = 0)
NativeOutputFormat(WriteBuffer & buf, const Block & header, const FormatSettings & settings, UInt64 client_protocol_version = 0)
: IOutputFormat(header, buf)
, writer(buf, client_protocol_version, header)
, writer(buf, client_protocol_version, header, settings)
{
}
@ -103,14 +101,17 @@ private:
class NativeSchemaReader : public ISchemaReader
{
public:
explicit NativeSchemaReader(ReadBuffer & in_) : ISchemaReader(in_) {}
explicit NativeSchemaReader(ReadBuffer & in_, const FormatSettings & settings_) : ISchemaReader(in_), settings(settings_) {}
NamesAndTypesList readSchema() override
{
auto reader = NativeReader(in, 0);
auto reader = NativeReader(in, 0, settings);
auto block = reader.read();
return block.getNamesAndTypesList();
}
private:
const FormatSettings settings;
};
@ -134,16 +135,16 @@ void registerOutputFormatNative(FormatFactory & factory)
const Block & sample,
const FormatSettings & settings)
{
return std::make_shared<NativeOutputFormat>(buf, sample, settings.client_protocol_version);
return std::make_shared<NativeOutputFormat>(buf, sample, settings, settings.client_protocol_version);
});
}
void registerNativeSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &)
factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<NativeSchemaReader>(buf);
return std::make_shared<NativeSchemaReader>(buf, settings);
});
}

View File

@ -2107,6 +2107,7 @@ void TCPHandler::initBlockOutput(const Block & block)
*state.maybe_compressed_out,
client_tcp_protocol_version,
block.cloneEmpty(),
std::nullopt,
!query_settings.low_cardinality_allow_in_native_format);
}
}
@ -2121,6 +2122,7 @@ void TCPHandler::initLogsBlockOutput(const Block & block)
*out,
client_tcp_protocol_version,
block.cloneEmpty(),
std::nullopt,
!query_settings.low_cardinality_allow_in_native_format);
}
}
@ -2135,6 +2137,7 @@ void TCPHandler::initProfileEventsBlockOutput(const Block & block)
*out,
client_tcp_protocol_version,
block.cloneEmpty(),
std::nullopt,
!query_settings.low_cardinality_allow_in_native_format);
}
}

View File

@ -408,7 +408,7 @@ namespace
auto data_file_path = temp_dir / fs::path{file_paths[data_bin_pos]}.filename();
auto data_out_compressed = temp_disk->writeFile(data_file_path);
auto data_out = std::make_unique<CompressedWriteBuffer>(*data_out_compressed, CompressionCodecFactory::instance().getDefaultCodec(), max_compress_block_size);
NativeWriter block_out{*data_out, 0, metadata_snapshot->getSampleBlock(), false, &index};
NativeWriter block_out{*data_out, 0, metadata_snapshot->getSampleBlock(), std::nullopt, false, &index};
for (const auto & block : *blocks)
block_out.write(block);
data_out->finalize();

View File

@ -193,7 +193,7 @@ public:
storage.saveFileSizes(lock);
size_t initial_data_size = storage.file_checker.getFileSize(storage.data_file_path);
block_out = std::make_unique<NativeWriter>(*data_out, 0, metadata_snapshot->getSampleBlock(), false, &storage.indices, initial_data_size);
block_out = std::make_unique<NativeWriter>(*data_out, 0, metadata_snapshot->getSampleBlock(), std::nullopt, false, &storage.indices, initial_data_size);
}
String getName() const override { return "StripeLogSink"; }

View File

@ -0,0 +1,50 @@
\N None
42 UInt8
-42 Int8
42 UInt16
-42 Int16
42 UInt32
-42 Int32
42 UInt64
-42 Int64
42 UInt128
-42 Int128
42 UInt256
-42 Int256
42.42 Float32
42.42 Float64
2020-01-01 Date
2020-01-01 Date32
2020-01-01 00:00:00 DateTime(\'EST\')
2020-01-01 00:00:00 DateTime(\'CET\')
2020-01-01 00:00:00.000000 DateTime64(6, \'EST\')
2020-01-01 00:00:00.000000 DateTime64(6, \'CET\')
Hello, World! String
aaaaa FixedString(5)
a Enum8(\'c\' = -128, \'a\' = 1, \'b\' = 2)
a Enum16(\'c\' = -1280, \'a\' = 1, \'b\' = 2)
42.42 Decimal(9, 3)
42.42 Decimal(18, 3)
42.42 Decimal(38, 3)
42.42 Decimal(76, 3)
984ac60f-4d08-4ef1-9c62-d82f343fbc90 UUID
[1,2,3] Array(UInt64)
[[[1],[2]],[[3,4,5]]] Array(Array(Array(UInt64)))
(1,'str',42.42) Tuple(UInt32, String, Float32)
(1,'str',42.42) Tuple(a UInt32, b String, c Float32)
(1,('str',(42.42,-30))) Tuple(UInt32, Tuple(String, Tuple(Float32, Int8)))
(1,('str',(42.42,-30))) Tuple(a UInt32, b Tuple(c String, d Tuple(e Float32, f Int8)))
\0 \0\0\0\0\0\0\0\0\0\0\0\0\06364136223846793005 0 123459*\0\0\0\0\0\0\0 AggregateFunction(quantile(0.5), UInt64)
42 SimpleAggregateFunction(sum, UInt64)
Hello, World! LowCardinality(String)
{1:'str1',2:'str2'} Map(UInt64, String)
{1:{1:{1:'str1'}},2:{2:{2:'str2'}}} Map(UInt64, Map(UInt64, Map(UInt64, String)))
127.0.0.0 IPv4
2001:db8:cafe:1::1 IPv6
true Bool
[(1,2),(3,4)] Nested(a UInt32, b UInt32)
[(0,0),(10,0),(10,10),(0,10)] Ring
(0,0) Point
[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]] Polygon
[[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] MultiPolygon
[{42:(1,[(2,{1:2})])}] Array(Map(UInt8, Tuple(UInt8, Array(Tuple(UInt8, Map(UInt8, UInt8))))))

View File

@ -0,0 +1,63 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -q "drop table if exists test"
$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "create table test (id UInt64, d Dynamic(max_types=255)) engine=Memory"
$CLICKHOUSE_CLIENT -q "insert into test select 0, NULL"
$CLICKHOUSE_CLIENT -q "insert into test select 1, materialize(42)::UInt8"
$CLICKHOUSE_CLIENT -q "insert into test select 2, materialize(-42)::Int8"
$CLICKHOUSE_CLIENT -q "insert into test select 3, materialize(42)::UInt16"
$CLICKHOUSE_CLIENT -q "insert into test select 4, materialize(-42)::Int16"
$CLICKHOUSE_CLIENT -q "insert into test select 5, materialize(42)::UInt32"
$CLICKHOUSE_CLIENT -q "insert into test select 6, materialize(-42)::Int32"
$CLICKHOUSE_CLIENT -q "insert into test select 7, materialize(42)::UInt64"
$CLICKHOUSE_CLIENT -q "insert into test select 8, materialize(-42)::Int64"
$CLICKHOUSE_CLIENT -q "insert into test select 9, materialize(42)::UInt128"
$CLICKHOUSE_CLIENT -q "insert into test select 10, materialize(-42)::Int128"
$CLICKHOUSE_CLIENT -q "insert into test select 11, materialize(42)::UInt256"
$CLICKHOUSE_CLIENT -q "insert into test select 12, materialize(-42)::Int256"
$CLICKHOUSE_CLIENT -q "insert into test select 13, materialize(42.42)::Float32"
$CLICKHOUSE_CLIENT -q "insert into test select 14, materialize(42.42)::Float64"
$CLICKHOUSE_CLIENT -q "insert into test select 15, materialize('2020-01-01')::Date"
$CLICKHOUSE_CLIENT -q "insert into test select 16, materialize('2020-01-01')::Date32"
$CLICKHOUSE_CLIENT -q "insert into test select 17, materialize('2020-01-01 00:00:00')::DateTime('EST')"
$CLICKHOUSE_CLIENT -q "insert into test select 18, materialize('2020-01-01 00:00:00')::DateTime('CET')"
$CLICKHOUSE_CLIENT -q "insert into test select 19, materialize('2020-01-01 00:00:00.000000')::DateTime64(6, 'EST')"
$CLICKHOUSE_CLIENT -q "insert into test select 20, materialize('2020-01-01 00:00:00.000000')::DateTime64(6, 'CET')"
$CLICKHOUSE_CLIENT -q "insert into test select 21, materialize('Hello, World!')"
$CLICKHOUSE_CLIENT -q "insert into test select 22, materialize('aaaaa')::FixedString(5)"
$CLICKHOUSE_CLIENT -q "insert into test select 23, materialize('a')::Enum8('a' = 1, 'b' = 2, 'c' = -128)"
$CLICKHOUSE_CLIENT -q "insert into test select 24, materialize('a')::Enum16('a' = 1, 'b' = 2, 'c' = -1280)"
$CLICKHOUSE_CLIENT -q "insert into test select 25, materialize(42.42)::Decimal32(3)"
$CLICKHOUSE_CLIENT -q "insert into test select 26, materialize(42.42)::Decimal64(3)"
$CLICKHOUSE_CLIENT -q "insert into test select 27, materialize(42.42)::Decimal128(3)"
$CLICKHOUSE_CLIENT -q "insert into test select 28, materialize(42.42)::Decimal256(3)"
$CLICKHOUSE_CLIENT -q "insert into test select 29, materialize('984ac60f-4d08-4ef1-9c62-d82f343fbc90')::UUID"
$CLICKHOUSE_CLIENT -q "insert into test select 30, materialize([1, 2, 3])::Array(UInt64)"
$CLICKHOUSE_CLIENT -q "insert into test select 31, materialize([[[1], [2]], [[3, 4, 5]]])::Array(Array(Array(UInt64)))"
$CLICKHOUSE_CLIENT -q "insert into test select 32, materialize(tuple(1, 'str', 42.42))::Tuple(UInt32, String, Float32)"
$CLICKHOUSE_CLIENT -q "insert into test select 33, materialize(tuple(1, 'str', 42.42))::Tuple(a UInt32, b String, c Float32)"
$CLICKHOUSE_CLIENT -q "insert into test select 34, materialize(tuple(1, tuple('str', tuple(42.42, -30))))::Tuple(UInt32, Tuple(String, Tuple(Float32, Int8)))"
$CLICKHOUSE_CLIENT -q "insert into test select 35, materialize(tuple(1, tuple('str', tuple(42.42, -30))))::Tuple(a UInt32, b Tuple(c String, d Tuple(e Float32, f Int8)))"
$CLICKHOUSE_CLIENT -q "insert into test select 36, quantileState(0.5)(42::UInt64)"
$CLICKHOUSE_CLIENT -q "insert into test select 37, sumSimpleState(42::UInt64)"
$CLICKHOUSE_CLIENT -q "insert into test select 38, toLowCardinality('Hello, World!')"
$CLICKHOUSE_CLIENT -q "insert into test select 39, materialize(map(1, 'str1', 2, 'str2'))::Map(UInt64, String)"
$CLICKHOUSE_CLIENT -q "insert into test select 40, materialize(map(1, map(1, map(1, 'str1')), 2, map(2, map(2, 'str2'))))::Map(UInt64, Map(UInt64, Map(UInt64, String)))"
$CLICKHOUSE_CLIENT -q "insert into test select 41, materialize('127.0.0.0')::IPv4"
$CLICKHOUSE_CLIENT -q "insert into test select 42, materialize('2001:db8:cafe:1:0:0:0:1')::IPv6"
$CLICKHOUSE_CLIENT -q "insert into test select 43, materialize(true)::Bool"
$CLICKHOUSE_CLIENT -q "insert into test select 44, materialize([tuple(1, 2), tuple(3, 4)])::Nested(a UInt32, b UInt32)"
$CLICKHOUSE_CLIENT -q "insert into test select 45, materialize([(0, 0), (10, 0), (10, 10), (0, 10)])::Ring"
$CLICKHOUSE_CLIENT -q "insert into test select 46, materialize((0, 0))::Point"
$CLICKHOUSE_CLIENT -q "insert into test select 47, materialize([[(20, 20), (50, 20), (50, 50), (20, 50)], [(30, 30), (50, 50), (50, 30)]])::Polygon"
$CLICKHOUSE_CLIENT -q "insert into test select 48, materialize([[[(0, 0), (10, 0), (10, 10), (0, 10)]], [[(20, 20), (50, 20), (50, 50), (20, 50)],[(30, 30), (50, 50), (50, 30)]]])::MultiPolygon"
$CLICKHOUSE_CLIENT -q "insert into test select 49, materialize([map(42, tuple(1, [tuple(2, map(1, 2))]))])"
$CLICKHOUSE_CLIENT -q "select * from test format RowBinary" | $CLICKHOUSE_LOCAL --allow_experimental_dynamic_type=1 --input-format RowBinary --structure 'id UInt64, d Dynamic(max_types=255)' -q "select d, dynamicType(d) from table order by id"
$CLICKHOUSE_CLIENT -q "drop table test"

View File

@ -0,0 +1,114 @@
42 UInt8
42 UInt8
\N Nullable(Nothing)
\N Nullable(Nothing)
42 UInt8
42 UInt8
-42 Int8
-42 Int8
42 UInt16
42 UInt16
-42 Int16
-42 Int16
42 UInt32
42 UInt32
-42 Int32
-42 Int32
42 UInt64
42 UInt64
-42 Int64
-42 Int64
42 UInt128
42 UInt128
-42 Int128
-42 Int128
42 UInt256
42 UInt256
-42 Int256
-42 Int256
42.42 Float32
42.42 Float32
42.42 Float64
42.42 Float64
2020-01-01 Date
2020-01-01 Date
2020-01-01 Date32
2020-01-01 Date32
2020-01-01 00:00:00 DateTime
2020-01-01 00:00:00 DateTime
2020-01-01 00:00:00 DateTime(\'EST\')
2020-01-01 00:00:00 DateTime(\'EST\')
2020-01-01 00:00:00 DateTime(\'CET\')
2020-01-01 00:00:00 DateTime(\'CET\')
2020-01-01 00:00:00.000000 DateTime64(6)
2020-01-01 00:00:00.000000 DateTime64(6)
2020-01-01 00:00:00.000000 DateTime64(6, \'EST\')
2020-01-01 00:00:00.000000 DateTime64(6, \'EST\')
2020-01-01 00:00:00.000000 DateTime64(6, \'CET\')
2020-01-01 00:00:00.000000 DateTime64(6, \'CET\')
Hello, World! String
Hello, World! String
aaaaa FixedString(5)
aaaaa FixedString(5)
a Enum8(\'c\' = -128, \'a\' = 1, \'b\' = 2)
a Enum8(\'c\' = -128, \'a\' = 1, \'b\' = 2)
a Enum16(\'c\' = -1280, \'a\' = 1, \'b\' = 2)
a Enum16(\'c\' = -1280, \'a\' = 1, \'b\' = 2)
42.42 Decimal(9, 3)
42.42 Decimal(9, 3)
42.42 Decimal(18, 3)
42.42 Decimal(18, 3)
42.42 Decimal(38, 3)
42.42 Decimal(38, 3)
42.42 Decimal(76, 3)
42.42 Decimal(76, 3)
984ac60f-4d08-4ef1-9c62-d82f343fbc90 UUID
984ac60f-4d08-4ef1-9c62-d82f343fbc90 UUID
[1,2,3] Array(UInt64)
[1,2,3] Array(UInt64)
[[[1],[2]],[[3,4,5]]] Array(Array(Array(UInt64)))
[[[1],[2]],[[3,4,5]]] Array(Array(Array(UInt64)))
(1,'str',42.42) Tuple(UInt32, String, Float32)
(1,'str',42.42) Tuple(UInt32, String, Float32)
(1,'str',42.42) Tuple(\n a UInt32,\n b String,\n c Float32)
(1,'str',42.42) Tuple(\n a UInt32,\n b String,\n c Float32)
(1,('str',(42.42,-30))) Tuple(UInt32, Tuple(String, Tuple(Float32, Int8)))
(1,('str',(42.42,-30))) Tuple(UInt32, Tuple(String, Tuple(Float32, Int8)))
(1,('str',(42.42,-30))) Tuple(\n a UInt32,\n b Tuple(\n c String,\n d Tuple(\n e Float32,\n f Int8)))
(1,('str',(42.42,-30))) Tuple(\n a UInt32,\n b Tuple(\n c String,\n d Tuple(\n e Float32,\n f Int8)))
\0 \0\0\0\0\0\0\0\0\0\0\0\0\06364136223846793005 0 123459*\0\0\0\0\0\0\0 AggregateFunction(quantile(0.5), UInt64)
\0 \0\0\0\0\0\0\0\0\0\0\0\0\06364136223846793005 0 123459*\0\0\0\0\0\0\0 AggregateFunction(quantile(0.5), UInt64)
42 SimpleAggregateFunction(sum, UInt64)
42 SimpleAggregateFunction(sum, UInt64)
Hello, World! LowCardinality(String)
Hello, World! LowCardinality(String)
{1:'str1',2:'str2'} Map(UInt64, String)
{1:'str1',2:'str2'} Map(UInt64, String)
{1:{1:{1:'str1'}},2:{2:{2:'str2'}}} Map(UInt64, Map(UInt64, Map(UInt64, String)))
{1:{1:{1:'str1'}},2:{2:{2:'str2'}}} Map(UInt64, Map(UInt64, Map(UInt64, String)))
127.0.0.0 IPv4
127.0.0.0 IPv4
2001:db8:cafe:1::1 IPv6
2001:db8:cafe:1::1 IPv6
true Bool
true Bool
[(1,2),(3,4)] Nested(a UInt32, b UInt32)
[(1,2),(3,4)] Nested(a UInt32, b UInt32)
[(0,0),(10,0),(10,10),(0,10)] Ring
[(0,0),(10,0),(10,10),(0,10)] Ring
(0,0) Point
(0,0) Point
[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]] Polygon
[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]] Polygon
[[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] MultiPolygon
[[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] MultiPolygon
[{42:(1,[(2,{1:2})])}] Array(Map(UInt8, Tuple(UInt8, Array(Tuple(UInt8, Map(UInt8, UInt8))))))
[{42:(1,[(2,{1:2})])}] Array(Map(UInt8, Tuple(UInt8, Array(Tuple(UInt8, Map(UInt8, UInt8))))))
42 Variant(String, Tuple(\n a UInt32,\n b Array(Map(String, String))), UInt32)
42 Variant(String, Tuple(\n a UInt32,\n b Array(Map(String, String))), UInt32)
[{42:(1,[(2,{1:2})])}] Dynamic
[{42:(1,[(2,{1:2})])}] Dynamic
[{42:(1,[(2,{1:2})])}] Dynamic(max_types=10)
[{42:(1,[(2,{1:2})])}] Dynamic(max_types=10)
[{42:(1,[(2,{1:2})])}] Dynamic(max_types=255)
[{42:(1,[(2,{1:2})])}] Dynamic(max_types=255)

View File

@ -0,0 +1,69 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
function test
{
$CLICKHOUSE_LOCAL --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --output_format_binary_encode_types_in_binary_format=1 -q "select $1 as value format RowBinaryWithNamesAndTypes" | $CLICKHOUSE_LOCAL --input-format RowBinaryWithNamesAndTypes --input_format_binary_decode_types_in_binary_format=1 -q "select value, toTypeName(value) from table"
$CLICKHOUSE_LOCAL --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --output_format_native_encode_types_in_binary_format=1 -q "select $1 as value format Native" | $CLICKHOUSE_LOCAL --input-format Native --input_format_native_decode_types_in_binary_format=1 -q "select value, toTypeName(value) from table"
}
test "materialize(42)::UInt8"
test "NULL"
test "materialize(42)::UInt8"
test "materialize(-42)::Int8"
test "materialize(42)::UInt16"
test "materialize(-42)::Int16"
test "materialize(42)::UInt32"
test "materialize(-42)::Int32"
test "materialize(42)::UInt64"
test "materialize(-42)::Int64"
test "materialize(42)::UInt128"
test "materialize(-42)::Int128"
test "materialize(42)::UInt256"
test "materialize(-42)::Int256"
test "materialize(42.42)::Float32"
test "materialize(42.42)::Float64"
test "materialize('2020-01-01')::Date"
test "materialize('2020-01-01')::Date32"
test "materialize('2020-01-01 00:00:00')::DateTime"
test "materialize('2020-01-01 00:00:00')::DateTime('EST')"
test "materialize('2020-01-01 00:00:00')::DateTime('CET')"
test "materialize('2020-01-01 00:00:00.000000')::DateTime64(6)"
test "materialize('2020-01-01 00:00:00.000000')::DateTime64(6, 'EST')"
test "materialize('2020-01-01 00:00:00.000000')::DateTime64(6, 'CET')"
test "materialize('Hello, World!')"
test "materialize('aaaaa')::FixedString(5)"
test "materialize('a')::Enum8('a' = 1, 'b' = 2, 'c' = -128)"
test "materialize('a')::Enum16('a' = 1, 'b' = 2, 'c' = -1280)"
test "materialize(42.42)::Decimal32(3)"
test "materialize(42.42)::Decimal64(3)"
test "materialize(42.42)::Decimal128(3)"
test "materialize(42.42)::Decimal256(3)"
test "materialize('984ac60f-4d08-4ef1-9c62-d82f343fbc90')::UUID"
test "materialize([1, 2, 3])::Array(UInt64)"
test "materialize([[[1], [2]], [[3, 4, 5]]])::Array(Array(Array(UInt64)))"
test "materialize(tuple(1, 'str', 42.42))::Tuple(UInt32, String, Float32)"
test "materialize(tuple(1, 'str', 42.42))::Tuple(a UInt32, b String, c Float32)"
test "materialize(tuple(1, tuple('str', tuple(42.42, -30))))::Tuple(UInt32, Tuple(String, Tuple(Float32, Int8)))"
test "materialize(tuple(1, tuple('str', tuple(42.42, -30))))::Tuple(a UInt32, b Tuple(c String, d Tuple(e Float32, f Int8)))"
test "quantileState(0.5)(42::UInt64)"
test "sumSimpleState(42::UInt64)"
test "toLowCardinality('Hello, World!')"
test "materialize(map(1, 'str1', 2, 'str2'))::Map(UInt64, String)"
test "materialize(map(1, map(1, map(1, 'str1')), 2, map(2, map(2, 'str2'))))::Map(UInt64, Map(UInt64, Map(UInt64, String)))"
test "materialize('127.0.0.0')::IPv4"
test "materialize('2001:db8:cafe:1:0:0:0:1')::IPv6"
test "materialize(true)::Bool"
test "materialize([tuple(1, 2), tuple(3, 4)])::Nested(a UInt32, b UInt32)"
test "materialize([(0, 0), (10, 0), (10, 10), (0, 10)])::Ring"
test "materialize((0, 0))::Point"
test "materialize([[(20, 20), (50, 20), (50, 50), (20, 50)], [(30, 30), (50, 50), (50, 30)]])::Polygon"
test "materialize([[[(0, 0), (10, 0), (10, 10), (0, 10)]], [[(20, 20), (50, 20), (50, 50), (20, 50)],[(30, 30), (50, 50), (50, 30)]]])::MultiPolygon"
test "materialize([map(42, tuple(1, [tuple(2, map(1, 2))]))])"
test "materialize(42::UInt32)::Variant(UInt32, String, Tuple(a UInt32, b Array(Map(String, String))))"
test "materialize([map(42, tuple(1, [tuple(2, map(1, 2))]))])::Dynamic"
test "materialize([map(42, tuple(1, [tuple(2, map(1, 2))]))])::Dynamic(max_types=10)"
test "materialize([map(42, tuple(1, [tuple(2, map(1, 2))]))])::Dynamic(max_types=255)"