Merge pull request #48122 from Avogar/bson-more-types

Support Enum output/input in BSONEachRow, allow all map key types and avoid extra calculations
Kruglov Pavel 2023-04-05 18:26:19 +02:00 committed by GitHub
commit 9331c6c260
9 changed files with 133 additions and 98 deletions

View File

@ -1235,8 +1235,8 @@ For output it uses the following correspondence between ClickHouse types and BSO
| ClickHouse type | BSON Type |
|-----------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|
| [Bool](/docs/en/sql-reference/data-types/boolean.md) | `\x08` boolean |
| [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 |
| [Int16UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 |
| [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `\x10` int32 |
| [Int16/UInt16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `\x10` int32 |
| [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 |
| [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `\x12` int64 |
| [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `\x12` int64 |
@ -1255,30 +1255,30 @@ For output it uses the following correspondence between ClickHouse types and BSO
| [Array](/docs/en/sql-reference/data-types/array.md) | `\x04` array |
| [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `\x04` array |
| [Named Tuple](/docs/en/sql-reference/data-types/tuple.md) | `\x03` document |
| [Map](/docs/en/sql-reference/data-types/map.md) (with String keys) | `\x03` document |
| [Map](/docs/en/sql-reference/data-types/map.md) | `\x03` document |
| [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `\x10` int32 |
| [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `\x05` binary, `\x00` binary subtype |
For input it uses the following correspondence between BSON types and ClickHouse types:
| BSON Type | ClickHouse Type |
|------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `\x01` double | [Float32/Float64](/docs/en/sql-reference/data-types/float.md) |
| `\x02` string | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x03` document | [Map](/docs/en/sql-reference/data-types/map.md)/[Named Tuple](/docs/en/sql-reference/data-types/tuple.md) |
| `\x04` array | [Array](/docs/en/sql-reference/data-types/array.md)/[Tuple](/docs/en/sql-reference/data-types/tuple.md) |
| `\x05` binary, `\x00` binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md)/[IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) |
| `\x05` binary, `\x02` old binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x05` binary, `\x03` old uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
| `\x05` binary, `\x04` uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
| `\x07` ObjectId | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x08` boolean | [Bool](/docs/en/sql-reference/data-types/boolean.md) |
| `\x09` datetime | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
| `\x0A` null value | [NULL](/docs/en/sql-reference/data-types/nullable.md) |
| `\x0D` JavaScript code | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x0E` symbol | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x10` int32 | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal32](/docs/en/sql-reference/data-types/decimal.md)/[IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) |
| `\x12` int64 | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal64](/docs/en/sql-reference/data-types/decimal.md)/[DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
| BSON Type | ClickHouse Type |
|------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `\x01` double | [Float32/Float64](/docs/en/sql-reference/data-types/float.md) |
| `\x02` string | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x03` document | [Map](/docs/en/sql-reference/data-types/map.md)/[Named Tuple](/docs/en/sql-reference/data-types/tuple.md) |
| `\x04` array | [Array](/docs/en/sql-reference/data-types/array.md)/[Tuple](/docs/en/sql-reference/data-types/tuple.md) |
| `\x05` binary, `\x00` binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md)/[IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) |
| `\x05` binary, `\x02` old binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x05` binary, `\x03` old uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
| `\x05` binary, `\x04` uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
| `\x07` ObjectId | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x08` boolean | [Bool](/docs/en/sql-reference/data-types/boolean.md) |
| `\x09` datetime | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
| `\x0A` null value | [NULL](/docs/en/sql-reference/data-types/nullable.md) |
| `\x0D` JavaScript code | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x0E` symbol | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x10` int32 | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal32](/docs/en/sql-reference/data-types/decimal.md)/[IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md)/[Enum8/Enum16](/docs/en/sql-reference/data-types/enum.md) |
| `\x12` int64 | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal64](/docs/en/sql-reference/data-types/decimal.md)/[DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
Other BSON types are not supported. The format also performs conversion between different integer types (for example, you can insert a BSON int32 value into a ClickHouse UInt8 column).
Big integers and decimals (Int128/UInt128/Int256/UInt256/Decimal128/Decimal256) can be parsed from a BSON Binary value with the `\x00` binary subtype. In this case, the format validates that the size of the binary data equals the size of the expected value.
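The round trip below is a minimal sketch of the new behaviour using `clickhouse-local` (column names and values are illustrative, not taken from the PR): Enum values travel as BSON int32, and a map with non-String keys becomes a regular BSON document whose keys are the serialized key values.

```bash
# Write Enum8/Enum16 and a Map with UInt32 keys as BSONEachRow,
# then read the same bytes back with an explicit structure.
clickhouse-local -q "select 'a'::Enum8('a' = 1) as e8, 'b'::Enum16('b' = 1) as e16, map(1::UInt32, 42::UInt32) as m format BSONEachRow" \
  | clickhouse-local --input-format BSONEachRow --table t \
      --structure "e8 Enum8('a' = 1), e16 Enum16('b' = 1), m Map(UInt32, UInt32)" \
      -q "select * from t"
# Expected row: a   b   {1:42}
```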

View File

@ -446,11 +446,6 @@ void BSONEachRowRowInputFormat::readMap(IColumn & column, const DataTypePtr & da
const auto * data_type_map = assert_cast<const DataTypeMap *>(data_type.get());
const auto & key_data_type = data_type_map->getKeyType();
if (!isStringOrFixedString(key_data_type))
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
"Only maps with String key type are supported in BSON, got key type: {}",
key_data_type->getName());
const auto & value_data_type = data_type_map->getValueType();
auto & column_map = assert_cast<ColumnMap &>(column);
auto & key_column = column_map.getNestedData().getColumn(0);
@ -464,7 +459,8 @@ void BSONEachRowRowInputFormat::readMap(IColumn & column, const DataTypePtr & da
{
auto nested_bson_type = getBSONType(readBSONType(*in));
auto name = readBSONKeyName(*in, current_key_name);
key_column.insertData(name.data, name.size);
ReadBufferFromMemory buf(name.data, name.size);
key_data_type->getDefaultSerialization()->deserializeWholeText(key_column, buf, format_settings);
readField(value_column, value_data_type, nested_bson_type);
}
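In user-visible terms, sub-document keys are no longer copied verbatim into the key column; they are parsed with the key type's whole-text deserialization, so non-String key types round-trip. A sketch of the effect (names and values are illustrative):

```bash
# BSON stores map keys as document field names (text); on input they are now
# parsed back into the declared key type, here UInt64.
clickhouse-local -q "select map(10::UInt64, 'x', 20::UInt64, 'y') as m format BSONEachRow" \
  | clickhouse-local --input-format BSONEachRow --table t \
      --structure "m Map(UInt64, String)" -q "select m[20] from t"
# Expected to print: y
```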
@ -511,6 +507,7 @@ bool BSONEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr &
lc_column.insertFromFullColumn(*tmp_column, 0);
return res;
}
case TypeIndex::Enum8: [[fallthrough]];
case TypeIndex::Int8:
{
readAndInsertInteger<Int8>(*in, column, data_type, bson_type);
@ -521,6 +518,7 @@ bool BSONEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr &
readAndInsertInteger<UInt8>(*in, column, data_type, bson_type);
return true;
}
case TypeIndex::Enum16: [[fallthrough]];
case TypeIndex::Int16:
{
readAndInsertInteger<Int16>(*in, column, data_type, bson_type);
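With Enum8 and Enum16 falling through to the integer readers, a BSON int32 produced from a plain integer column can be read straight into an Enum column, relying on the usual integer conversion. A small sketch (the enum definition is illustrative):

```bash
# The value 1 is written as BSON int32 and then interpreted as Enum8('a' = 1) on input.
clickhouse-local -q "select 1::Int8 as e format BSONEachRow" \
  | clickhouse-local --input-format BSONEachRow --table t \
      --structure "e Enum8('a' = 1, 'b' = 2)" -q "select * from t"
# Expected to print: a
```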
@ -1008,6 +1006,9 @@ fileSegmentationEngineBSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t
"the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely BSON is malformed",
min_bytes, document_size);
if (document_size < sizeof(document_size))
throw ParsingException(ErrorCodes::INCORRECT_DATA, "Size of BSON document is invalid");
size_t old_size = memory.size();
memory.resize(old_size + document_size);
unalignedStore<BSONSizeT>(memory.data() + old_size, document_size);
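The added check guards the segmentation engine against a size prefix smaller than the prefix itself. A quick way to poke at it (a sketch; depending on whether the parallel-parsing path is taken, the exact error code may differ, but the malformed input is rejected either way):

```bash
# Four bytes claiming a 2-byte BSON document: impossible, since the size field
# alone occupies 4 bytes, so parsing must fail.
printf '\x02\x00\x00\x00' \
  | clickhouse-local --input-format BSONEachRow --table t --structure "x Int32" \
      -q "select * from t" 2>&1 | grep -q 'Exception' && echo "rejected"
```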

View File

@ -33,13 +33,14 @@ namespace ErrorCodes
}
/// In BSON all names should be valid UTF8 sequences
static String toValidUTF8String(const String & name)
static String toValidUTF8String(const String & name, const FormatSettings & settings)
{
WriteBufferFromOwnString buf;
WriteBufferValidUTF8 validating_buf(buf);
writeString(name, validating_buf);
writeJSONString(name, validating_buf, settings);
validating_buf.finalize();
return buf.str();
/// Return value without quotes
return buf.str().substr(1, buf.str().size() - 2);
}
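The switch from `writeString` to `writeJSONString` matters because BSON field names are null-terminated cstrings: a raw `\0` inside a key would cut the name short and corrupt the document. Escaping the name JSON-style (and stripping the surrounding quotes) keeps such keys representable, which is exactly what the new test at the bottom of this PR exercises:

```bash
# A map key containing an embedded NUL survives the round trip as the
# escaped sequence \u0000 (same command as the new test below).
clickhouse-local -q "select map('a\0b', 42) as m format BSONEachRow" \
  | clickhouse-local --input-format BSONEachRow --table t \
      --structure "m Map(String, UInt32)" -q "select * from t"
# Expected to print: {'a\u0000b':42}
```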
BSONEachRowRowOutputFormat::BSONEachRowRowOutputFormat(
@ -49,7 +50,7 @@ BSONEachRowRowOutputFormat::BSONEachRowRowOutputFormat(
const auto & sample = getPort(PortKind::Main).getHeader();
fields.reserve(sample.columns());
for (const auto & field : sample.getNamesAndTypes())
fields.emplace_back(toValidUTF8String(field.name), field.type);
fields.emplace_back(toValidUTF8String(field.name, settings), field.type);
}
static void writeBSONSize(size_t size, WriteBuffer & buf)
@ -112,7 +113,7 @@ static void writeBSONBigInteger(const IColumn & column, size_t row_num, const St
buf.write(data.data, data.size);
}
size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name)
size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name, const String & path, std::unordered_map<String, size_t> & nested_document_sizes)
{
size_t size = 1; // Field type
size += name.size() + 1; // Field name and \0
@ -125,6 +126,8 @@ size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, co
case TypeIndex::Date32: [[fallthrough]];
case TypeIndex::Decimal32: [[fallthrough]];
case TypeIndex::IPv4: [[fallthrough]];
case TypeIndex::Enum8: [[fallthrough]];
case TypeIndex::Enum16: [[fallthrough]];
case TypeIndex::Int32:
{
return size + sizeof(Int32);
@ -183,7 +186,7 @@ size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, co
auto dict_type = assert_cast<const DataTypeLowCardinality *>(data_type.get())->getDictionaryType();
auto dict_column = lc_column.getDictionary().getNestedColumn();
size_t index = lc_column.getIndexAt(row_num);
return countBSONFieldSize(*dict_column, dict_type, index, name);
return countBSONFieldSize(*dict_column, dict_type, index, name, path, nested_document_sizes);
}
case TypeIndex::Nullable:
{
@ -191,11 +194,11 @@ size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, co
const ColumnNullable & column_nullable = assert_cast<const ColumnNullable &>(column);
if (column_nullable.isNullAt(row_num))
return size; /// Null has no value, just type
return countBSONFieldSize(column_nullable.getNestedColumn(), nested_type, row_num, name);
return countBSONFieldSize(column_nullable.getNestedColumn(), nested_type, row_num, name, path, nested_document_sizes);
}
case TypeIndex::Array:
{
size += sizeof(BSONSizeT); // Size of a document
size_t document_size = sizeof(BSONSizeT); // Size of a document
const auto & nested_type = assert_cast<const DataTypeArray *>(data_type.get())->getNestedType();
const ColumnArray & column_array = assert_cast<const ColumnArray &>(column);
@ -204,39 +207,41 @@ size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, co
size_t offset = offsets[row_num - 1];
size_t array_size = offsets[row_num] - offset;
String current_path = path + "." + name;
for (size_t i = 0; i < array_size; ++i)
size += countBSONFieldSize(nested_column, nested_type, offset + i, std::to_string(i)); // Add size of each value from array
document_size += countBSONFieldSize(nested_column, nested_type, offset + i, std::to_string(i), current_path, nested_document_sizes); // Add size of each value from array
return size + sizeof(BSON_DOCUMENT_END); // Add final \0
document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
nested_document_sizes[current_path] = document_size;
return size + document_size;
}
case TypeIndex::Tuple:
{
size += sizeof(BSONSizeT); // Size of a document
size_t document_size = sizeof(BSONSizeT); // Size of a document
const auto * tuple_type = assert_cast<const DataTypeTuple *>(data_type.get());
const auto & nested_types = tuple_type->getElements();
bool have_explicit_names = tuple_type->haveExplicitNames();
const auto & nested_names = tuple_type->getElementNames();
const auto & tuple_column = assert_cast<const ColumnTuple &>(column);
const auto & nested_columns = tuple_column.getColumns();
String current_path = path + "." + name;
for (size_t i = 0; i < nested_columns.size(); ++i)
{
String key_name = have_explicit_names ? toValidUTF8String(nested_names[i]) : std::to_string(i);
size += countBSONFieldSize(*nested_columns[i], nested_types[i], row_num, key_name); // Add size of each value from tuple
String key_name = toValidUTF8String(nested_names[i], settings);
document_size += countBSONFieldSize(*nested_columns[i], nested_types[i], row_num, key_name, current_path, nested_document_sizes); // Add size of each value from tuple
}
return size + sizeof(BSON_DOCUMENT_END); // Add final \0
document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
nested_document_sizes[current_path] = document_size;
return size + document_size;
}
case TypeIndex::Map:
{
size += sizeof(BSONSizeT); // Size of a document
size_t document_size = sizeof(BSONSizeT); // Size of a document
const auto & map_type = assert_cast<const DataTypeMap &>(*data_type);
if (!isStringOrFixedString(map_type.getKeyType()))
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
"Only maps with String key type are supported in BSON, got key type: {}",
map_type.getKeyType()->getName());
const auto & key_type = map_type.getKeyType();
const auto & value_type = map_type.getValueType();
const auto & map_column = assert_cast<const ColumnMap &>(column);
@ -248,20 +253,26 @@ size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, co
size_t offset = offsets[row_num - 1];
size_t map_size = offsets[row_num] - offset;
WriteBufferFromOwnString buf;
String current_path = path + "." + name;
for (size_t i = 0; i < map_size; ++i)
{
String key = toValidUTF8String(key_column->getDataAt(offset + i).toString());
size += countBSONFieldSize(*value_column, value_type, offset + i, key);
key_type->getDefaultSerialization()->serializeText(*key_column, offset + i, buf, settings);
auto s = countBSONFieldSize(*value_column, value_type, offset + i, toValidUTF8String(buf.str(), settings), current_path, nested_document_sizes);
document_size += s;
buf.restart();
}
return size + sizeof(BSON_DOCUMENT_END); // Add final \0
document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
nested_document_sizes[current_path] = document_size;
return size + document_size;
}
default:
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not supported in BSON output format", data_type->getName());
}
}
void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name)
void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name, const String & path, std::unordered_map<String, size_t> & nested_document_sizes)
{
switch (data_type->getTypeId())
{
@ -275,6 +286,7 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
writeBSONNumber<ColumnFloat64, double>(BSONType::DOUBLE, column, row_num, name, out);
break;
}
case TypeIndex::Enum8: [[fallthrough]];
case TypeIndex::Int8:
{
writeBSONNumber<ColumnInt8, Int32>(BSONType::INT32, column, row_num, name, out);
@ -288,6 +300,7 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
writeBSONNumber<ColumnUInt8, Int32>(BSONType::INT32, column, row_num, name, out);
break;
}
case TypeIndex::Enum16: [[fallthrough]];
case TypeIndex::Int16:
{
writeBSONNumber<ColumnInt16, Int32>(BSONType::INT32, column, row_num, name, out);
@ -403,7 +416,7 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
auto dict_type = assert_cast<const DataTypeLowCardinality *>(data_type.get())->getDictionaryType();
auto dict_column = lc_column.getDictionary().getNestedColumn();
size_t index = lc_column.getIndexAt(row_num);
serializeField(*dict_column, dict_type, index, name);
serializeField(*dict_column, dict_type, index, name, path, nested_document_sizes);
break;
}
case TypeIndex::Nullable:
@ -411,7 +424,7 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
auto nested_type = removeNullable(data_type);
const ColumnNullable & column_nullable = assert_cast<const ColumnNullable &>(column);
if (!column_nullable.isNullAt(row_num))
serializeField(column_nullable.getNestedColumn(), nested_type, row_num, name);
serializeField(column_nullable.getNestedColumn(), nested_type, row_num, name, path, nested_document_sizes);
else
writeBSONTypeAndKeyName(BSONType::NULL_VALUE, name, out);
break;
@ -427,15 +440,12 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
writeBSONTypeAndKeyName(BSONType::ARRAY, name, out);
size_t document_size = sizeof(BSONSizeT);
for (size_t i = 0; i < array_size; ++i)
document_size += countBSONFieldSize(nested_column, nested_type, offset + i, std::to_string(i)); // Add size of each value from array
document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
String current_path = path + "." + name;
size_t document_size = nested_document_sizes[current_path];
writeBSONSize(document_size, out);
for (size_t i = 0; i < array_size; ++i)
serializeField(nested_column, nested_type, offset + i, std::to_string(i));
serializeField(nested_column, nested_type, offset + i, std::to_string(i), current_path, nested_document_sizes);
writeChar(BSON_DOCUMENT_END, out);
break;
@ -444,26 +454,19 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
{
const auto * tuple_type = assert_cast<const DataTypeTuple *>(data_type.get());
const auto & nested_types = tuple_type->getElements();
bool have_explicit_names = tuple_type->haveExplicitNames();
const auto & nested_names = tuple_type->getElementNames();
const auto & tuple_column = assert_cast<const ColumnTuple &>(column);
const auto & nested_columns = tuple_column.getColumns();
BSONType bson_type = have_explicit_names ? BSONType::DOCUMENT : BSONType::ARRAY;
BSONType bson_type = tuple_type->haveExplicitNames() ? BSONType::DOCUMENT : BSONType::ARRAY;
writeBSONTypeAndKeyName(bson_type, name, out);
size_t document_size = sizeof(BSONSizeT);
for (size_t i = 0; i < nested_columns.size(); ++i)
{
String key_name = have_explicit_names ? toValidUTF8String(nested_names[i]) : std::to_string(i);
document_size += countBSONFieldSize(*nested_columns[i], nested_types[i], row_num, key_name); // Add size of each value from tuple
}
document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
String current_path = path + "." + name;
size_t document_size = nested_document_sizes[current_path];
writeBSONSize(document_size, out);
for (size_t i = 0; i < nested_columns.size(); ++i)
serializeField(*nested_columns[i], nested_types[i], row_num, have_explicit_names ? toValidUTF8String(nested_names[i]) : std::to_string(i));
serializeField(*nested_columns[i], nested_types[i], row_num, toValidUTF8String(nested_names[i], settings), current_path, nested_document_sizes);
writeChar(BSON_DOCUMENT_END, out);
break;
@ -471,10 +474,7 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
case TypeIndex::Map:
{
const auto & map_type = assert_cast<const DataTypeMap &>(*data_type);
if (!isStringOrFixedString(map_type.getKeyType()))
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
"Only maps with String key type are supported in BSON, got key type: {}",
map_type.getKeyType()->getName());
const auto & key_type = map_type.getKeyType();
const auto & value_type = map_type.getValueType();
const auto & map_column = assert_cast<const ColumnMap &>(column);
@ -488,20 +488,16 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
writeBSONTypeAndKeyName(BSONType::DOCUMENT, name, out);
size_t document_size = sizeof(BSONSizeT);
for (size_t i = 0; i < map_size; ++i)
{
String key = toValidUTF8String(key_column->getDataAt(offset + i).toString());
document_size += countBSONFieldSize(*value_column, value_type, offset + i, key);
}
document_size += sizeof(BSON_DOCUMENT_END);
String current_path = path + "." + name;
size_t document_size = nested_document_sizes[current_path];
writeBSONSize(document_size, out);
WriteBufferFromOwnString buf;
for (size_t i = 0; i < map_size; ++i)
{
String key = toValidUTF8String(key_column->getDataAt(offset + i).toString());
serializeField(*value_column, value_type, offset + i, key);
key_type->getDefaultSerialization()->serializeText(*key_column, offset + i, buf, settings);
serializeField(*value_column, value_type, offset + i, toValidUTF8String(buf.str(), settings), current_path, nested_document_sizes);
buf.restart();
}
writeChar(BSON_DOCUMENT_END, out);
@ -516,15 +512,18 @@ void BSONEachRowRowOutputFormat::write(const Columns & columns, size_t row_num)
{
/// We should calculate and write document size before its content
size_t document_size = sizeof(BSONSizeT);
/// Remember calculated sizes for nested documents (map document path -> size), so we won't need
/// to recalculate them while serializing.
std::unordered_map<String, size_t> nested_document_sizes;
for (size_t i = 0; i != columns.size(); ++i)
document_size += countBSONFieldSize(*columns[i], fields[i].type, row_num, fields[i].name);
document_size += countBSONFieldSize(*columns[i], fields[i].type, row_num, fields[i].name, "$", nested_document_sizes);
document_size += sizeof(BSON_DOCUMENT_END);
size_t document_start = out.count();
writeBSONSize(document_size, out);
for (size_t i = 0; i != columns.size(); ++i)
serializeField(*columns[i], fields[i].type, row_num, fields[i].name);
serializeField(*columns[i], fields[i].type, row_num, fields[i].name, "$", nested_document_sizes);
writeChar(BSON_DOCUMENT_END, out);
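This is the "avoid extra calculations" part of the PR: previously `serializeField` recomputed the size of every nested document that `countBSONFieldSize` had already measured. Now the counting pass records each nested document's size under its path (the root is `"$"`, children append `"." + name`), and serialization only looks the sizes up. For a column `m Map(String, Array(UInt8))`, for example, the sizes land under `$.m` (the map document) and `$.m.k` (the array behind key `'k'`). A round-trip sketch with such a nested value (names and values are illustrative):

```bash
# Nested documents (the map and the array inside it) are sized once in the
# counting pass and reused during serialization.
clickhouse-local -q "select map('k', [1, 2, 3]) as m format BSONEachRow" \
  | clickhouse-local --input-format BSONEachRow --table t \
      --structure "m Map(String, Array(UInt8))" -q "select * from t"
# Expected to print: {'k':[1,2,3]}
```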

View File

@ -17,8 +17,8 @@ namespace DB
*
* ClickHouse type | BSON Type
* Bool | \x08 boolean
* Int8/UInt8 | \x10 int32
* Int16UInt16 | \x10 int32
* Int8/UInt8/Enum8 | \x10 int32
* Int16/UInt16/Enum16 | \x10 int32
* Int32 | \x10 int32
* UInt32 | \x12 int64
* Int64 | \x12 int64
@ -38,7 +38,7 @@ namespace DB
* Array | \x04 array
* Tuple | \x04 array
* Named Tuple | \x03 document
* Map (with String keys) | \x03 document
* Map | \x03 document
*
* Note: on Big-Endian platforms this format will not work properly.
*/
@ -55,12 +55,24 @@ private:
void write(const Columns & columns, size_t row_num) override;
void writeField(const IColumn &, const ISerialization &, size_t) override { }
void serializeField(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name);
void serializeField(
const IColumn & column,
const DataTypePtr & data_type,
size_t row_num,
const String & name,
const String & path,
std::unordered_map<String, size_t> & nested_document_sizes);
/// Count field size in bytes that we will get after serialization in BSON format.
/// It's needed to calculate document size before actual serialization,
/// because in BSON format we should write the size of the document before its content.
size_t countBSONFieldSize(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name);
size_t countBSONFieldSize(
const IColumn & column,
const DataTypePtr & data_type,
size_t row_num,
const String & name,
const String & path,
std::unordered_map<String, size_t> & nested_document_sizes);
NamesAndTypes fields;
FormatSettings settings;
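As the comment above notes, BSON puts each document's total byte size before its content, which is why the output format needs a counting pass before it can emit anything. The size prefix is easy to see directly (a sketch; the byte values assume a single non-nullable Int32 column named `x`):

```bash
# The first four bytes are the little-endian int32 document size.
clickhouse-local -q "select 1::Int32 as x format BSONEachRow" | od -An -tu1 | head -n 1
# Expected to start with: 12 0 0 0 (a 12-byte document), then 16 (\x10 int32),
# 120 ('x'), 0 (name terminator), the value 1 0 0 0, and a trailing 0.
```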

View File

@ -121,7 +121,7 @@ Nullable
2
0
4
FAIL
OK
null Nullable(Int64)
0
\N
@ -191,8 +191,11 @@ tuple Tuple(Nullable(Int64), Nullable(String))
(3,'Hello')
(4,'Hello')
Map
OK
OK
{1:0,2:1}
{1:1,2:2}
{1:2,2:3}
{1:3,2:4}
{1:4,2:5}
{'a':0,'b':1}
{'a':1,'b':2}
{'a':2,'b':3}

View File

@ -88,7 +88,7 @@ echo "Nullable"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow, auto, 'null Nullable(UInt32)') select number % 2 ? NULL : number from numbers(5) settings engine_file_truncate_on_insert=1"
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'null Nullable(UInt32)')"
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'null UInt32')"
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'null UInt32') settings input_format_null_as_default=0" 2>&1 | grep -q -F "INCORRECT_DATA" && echo "OK" || echo "FAIL"
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'null UInt32') settings input_format_null_as_default=0" 2>&1 | grep -q -F "ILLEGAL_COLUMN" && echo "OK" || echo "FAIL"
$CLICKHOUSE_CLIENT -q "desc file(02475_data.bsonEachRow)"
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow)"
@ -132,10 +132,10 @@ $CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow)"
echo "Map"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow, auto, 'map Map(UInt64, UInt64)') select map(1, number, 2, number + 1) from numbers(5) settings engine_file_truncate_on_insert=1" 2>&1 | grep -q -F "ILLEGAL_COLUMN" && echo "OK" || echo "FAIL"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow, auto, 'map Map(String, UInt64)') select map('a', number, 'b', number + 1) from numbers(5) settings engine_file_truncate_on_insert=1"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow, auto, 'map Map(UInt64, UInt64)') select map(1, number, 2, number + 1) from numbers(5) settings engine_file_truncate_on_insert=1"
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'map Map(UInt64, UInt64)')"
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'map Map(UInt64, UInt64)')" 2>&1 | grep -q -F "ILLEGAL_COLUMN" && echo "OK" || echo "FAIL"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow, auto, 'map Map(String, UInt64)') select map('a', number, 'b', number + 1) from numbers(5) settings engine_file_truncate_on_insert=1"
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'map Map(String, UInt64)')"
$CLICKHOUSE_CLIENT -q "desc file(02475_data.bsonEachRow)"

View File

@ -0,0 +1,5 @@
{'a\\u0000b':42}
c1 Nullable(Int32)
c2 Nullable(Int32)
c3 Map(String, Nullable(Int32))
a b {42:42}

View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Tags: no-fasttest
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_LOCAL -q "select map('a\0b', 42) as c1 format BSONEachRow" | $CLICKHOUSE_LOCAL --input-format BSONEachRow --table test --structure "c1 Map(String, UInt32)" -q "select * from test"
$CLICKHOUSE_LOCAL -q "select 'a'::Enum8('a' = 1) as c1, 'b'::Enum16('b' = 1) as c2, map(42, 42) as c3 format BSONEachRow" | $CLICKHOUSE_LOCAL --input-format BSONEachRow --table test -q "desc test"
$CLICKHOUSE_LOCAL -q "select 'a'::Enum8('a' = 1) as c1, 'b'::Enum16('b' = 1) as c2, map(42, 42) as c3 format BSONEachRow" | $CLICKHOUSE_LOCAL --input-format BSONEachRow --table test --structure "c1 Enum8('a' = 1), c2 Enum16('b' = 1), c3 Map(UInt32, UInt32)" -q "select * from test"