mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 09:32:06 +00:00
Merge pull request #48122 from Avogar/bson-more-types
Support Enum output/input in BSONEachRow, allow all map key types and avoid extra calculations
This commit is contained in:
commit
9331c6c260
@ -1235,8 +1235,8 @@ For output it uses the following correspondence between ClickHouse types and BSO
|
||||
| ClickHouse type | BSON Type |
|
||||
|-----------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|
|
||||
| [Bool](/docs/en/sql-reference/data-types/boolean.md) | `\x08` boolean |
|
||||
| [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 |
|
||||
| [Int16UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 |
|
||||
| [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `\x10` int32 |
|
||||
| [Int16/UInt16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `\x10` int32 |
|
||||
| [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 |
|
||||
| [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `\x12` int64 |
|
||||
| [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `\x12` int64 |
|
||||
@ -1255,30 +1255,30 @@ For output it uses the following correspondence between ClickHouse types and BSO
|
||||
| [Array](/docs/en/sql-reference/data-types/array.md) | `\x04` array |
|
||||
| [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `\x04` array |
|
||||
| [Named Tuple](/docs/en/sql-reference/data-types/tuple.md) | `\x03` document |
|
||||
| [Map](/docs/en/sql-reference/data-types/map.md) (with String keys) | `\x03` document |
|
||||
| [Map](/docs/en/sql-reference/data-types/map.md) | `\x03` document |
|
||||
| [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `\x10` int32 |
|
||||
| [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `\x05` binary, `\x00` binary subtype |
|
||||
|
||||
For input it uses the following correspondence between BSON types and ClickHouse types:
|
||||
|
||||
| BSON Type | ClickHouse Type |
|
||||
|------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| `\x01` double | [Float32/Float64](/docs/en/sql-reference/data-types/float.md) |
|
||||
| `\x02` string | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
|
||||
| `\x03` document | [Map](/docs/en/sql-reference/data-types/map.md)/[Named Tuple](/docs/en/sql-reference/data-types/tuple.md) |
|
||||
| `\x04` array | [Array](/docs/en/sql-reference/data-types/array.md)/[Tuple](/docs/en/sql-reference/data-types/tuple.md) |
|
||||
| `\x05` binary, `\x00` binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md)/[IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) |
|
||||
| `\x05` binary, `\x02` old binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
|
||||
| `\x05` binary, `\x03` old uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
|
||||
| `\x05` binary, `\x04` uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
|
||||
| `\x07` ObjectId | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
|
||||
| `\x08` boolean | [Bool](/docs/en/sql-reference/data-types/boolean.md) |
|
||||
| `\x09` datetime | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
|
||||
| `\x0A` null value | [NULL](/docs/en/sql-reference/data-types/nullable.md) |
|
||||
| `\x0D` JavaScript code | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
|
||||
| `\x0E` symbol | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
|
||||
| `\x10` int32 | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal32](/docs/en/sql-reference/data-types/decimal.md)/[IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) |
|
||||
| `\x12` int64 | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal64](/docs/en/sql-reference/data-types/decimal.md)/[DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
|
||||
| BSON Type | ClickHouse Type |
|
||||
|------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| `\x01` double | [Float32/Float64](/docs/en/sql-reference/data-types/float.md) |
|
||||
| `\x02` string | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
|
||||
| `\x03` document | [Map](/docs/en/sql-reference/data-types/map.md)/[Named Tuple](/docs/en/sql-reference/data-types/tuple.md) |
|
||||
| `\x04` array | [Array](/docs/en/sql-reference/data-types/array.md)/[Tuple](/docs/en/sql-reference/data-types/tuple.md) |
|
||||
| `\x05` binary, `\x00` binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md)/[IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) |
|
||||
| `\x05` binary, `\x02` old binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
|
||||
| `\x05` binary, `\x03` old uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
|
||||
| `\x05` binary, `\x04` uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
|
||||
| `\x07` ObjectId | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
|
||||
| `\x08` boolean | [Bool](/docs/en/sql-reference/data-types/boolean.md) |
|
||||
| `\x09` datetime | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
|
||||
| `\x0A` null value | [NULL](/docs/en/sql-reference/data-types/nullable.md) |
|
||||
| `\x0D` JavaScript code | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
|
||||
| `\x0E` symbol | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
|
||||
| `\x10` int32 | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal32](/docs/en/sql-reference/data-types/decimal.md)/[IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md)/[Enum8/Enum16](/docs/en/sql-reference/data-types/enum.md) |
|
||||
| `\x12` int64 | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal64](/docs/en/sql-reference/data-types/decimal.md)/[DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
|
||||
|
||||
Other BSON types are not supported. Also, it performs conversion between different integer types (for example, you can insert a BSON int32 value into a ClickHouse UInt8 column).
|
||||
Big integers and decimals (Int128/UInt128/Int256/UInt256/Decimal128/Decimal256) can be parsed from a BSON Binary value with the `\x00` binary subtype. In this case, the format validates that the size of the binary data equals the size of the expected value.
|
||||
|
@ -446,11 +446,6 @@ void BSONEachRowRowInputFormat::readMap(IColumn & column, const DataTypePtr & da
|
||||
|
||||
const auto * data_type_map = assert_cast<const DataTypeMap *>(data_type.get());
|
||||
const auto & key_data_type = data_type_map->getKeyType();
|
||||
if (!isStringOrFixedString(key_data_type))
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
|
||||
"Only maps with String key type are supported in BSON, got key type: {}",
|
||||
key_data_type->getName());
|
||||
|
||||
const auto & value_data_type = data_type_map->getValueType();
|
||||
auto & column_map = assert_cast<ColumnMap &>(column);
|
||||
auto & key_column = column_map.getNestedData().getColumn(0);
|
||||
@ -464,7 +459,8 @@ void BSONEachRowRowInputFormat::readMap(IColumn & column, const DataTypePtr & da
|
||||
{
|
||||
auto nested_bson_type = getBSONType(readBSONType(*in));
|
||||
auto name = readBSONKeyName(*in, current_key_name);
|
||||
key_column.insertData(name.data, name.size);
|
||||
ReadBufferFromMemory buf(name.data, name.size);
|
||||
key_data_type->getDefaultSerialization()->deserializeWholeText(key_column, buf, format_settings);
|
||||
readField(value_column, value_data_type, nested_bson_type);
|
||||
}
|
||||
|
||||
@ -511,6 +507,7 @@ bool BSONEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr &
|
||||
lc_column.insertFromFullColumn(*tmp_column, 0);
|
||||
return res;
|
||||
}
|
||||
case TypeIndex::Enum8: [[fallthrough]];
|
||||
case TypeIndex::Int8:
|
||||
{
|
||||
readAndInsertInteger<Int8>(*in, column, data_type, bson_type);
|
||||
@ -521,6 +518,7 @@ bool BSONEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr &
|
||||
readAndInsertInteger<UInt8>(*in, column, data_type, bson_type);
|
||||
return true;
|
||||
}
|
||||
case TypeIndex::Enum16: [[fallthrough]];
|
||||
case TypeIndex::Int16:
|
||||
{
|
||||
readAndInsertInteger<Int16>(*in, column, data_type, bson_type);
|
||||
@ -1008,6 +1006,9 @@ fileSegmentationEngineBSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t
|
||||
"the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely BSON is malformed",
|
||||
min_bytes, document_size);
|
||||
|
||||
if (document_size < sizeof(document_size))
|
||||
throw ParsingException(ErrorCodes::INCORRECT_DATA, "Size of BSON document is invalid");
|
||||
|
||||
size_t old_size = memory.size();
|
||||
memory.resize(old_size + document_size);
|
||||
unalignedStore<BSONSizeT>(memory.data() + old_size, document_size);
|
||||
|
@ -33,13 +33,14 @@ namespace ErrorCodes
|
||||
}
|
||||
|
||||
/// In BSON all names should be valid UTF8 sequences
|
||||
static String toValidUTF8String(const String & name)
|
||||
static String toValidUTF8String(const String & name, const FormatSettings & settings)
|
||||
{
|
||||
WriteBufferFromOwnString buf;
|
||||
WriteBufferValidUTF8 validating_buf(buf);
|
||||
writeString(name, validating_buf);
|
||||
writeJSONString(name, validating_buf, settings);
|
||||
validating_buf.finalize();
|
||||
return buf.str();
|
||||
/// Return value without quotes
|
||||
return buf.str().substr(1, buf.str().size() - 2);
|
||||
}
|
||||
|
||||
BSONEachRowRowOutputFormat::BSONEachRowRowOutputFormat(
|
||||
@ -49,7 +50,7 @@ BSONEachRowRowOutputFormat::BSONEachRowRowOutputFormat(
|
||||
const auto & sample = getPort(PortKind::Main).getHeader();
|
||||
fields.reserve(sample.columns());
|
||||
for (const auto & field : sample.getNamesAndTypes())
|
||||
fields.emplace_back(toValidUTF8String(field.name), field.type);
|
||||
fields.emplace_back(toValidUTF8String(field.name, settings), field.type);
|
||||
}
|
||||
|
||||
static void writeBSONSize(size_t size, WriteBuffer & buf)
|
||||
@ -112,7 +113,7 @@ static void writeBSONBigInteger(const IColumn & column, size_t row_num, const St
|
||||
buf.write(data.data, data.size);
|
||||
}
|
||||
|
||||
size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name)
|
||||
size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name, const String & path, std::unordered_map<String, size_t> & nested_document_sizes)
|
||||
{
|
||||
size_t size = 1; // Field type
|
||||
size += name.size() + 1; // Field name and \0
|
||||
@ -125,6 +126,8 @@ size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, co
|
||||
case TypeIndex::Date32: [[fallthrough]];
|
||||
case TypeIndex::Decimal32: [[fallthrough]];
|
||||
case TypeIndex::IPv4: [[fallthrough]];
|
||||
case TypeIndex::Enum8: [[fallthrough]];
|
||||
case TypeIndex::Enum16: [[fallthrough]];
|
||||
case TypeIndex::Int32:
|
||||
{
|
||||
return size + sizeof(Int32);
|
||||
@ -183,7 +186,7 @@ size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, co
|
||||
auto dict_type = assert_cast<const DataTypeLowCardinality *>(data_type.get())->getDictionaryType();
|
||||
auto dict_column = lc_column.getDictionary().getNestedColumn();
|
||||
size_t index = lc_column.getIndexAt(row_num);
|
||||
return countBSONFieldSize(*dict_column, dict_type, index, name);
|
||||
return countBSONFieldSize(*dict_column, dict_type, index, name, path, nested_document_sizes);
|
||||
}
|
||||
case TypeIndex::Nullable:
|
||||
{
|
||||
@ -191,11 +194,11 @@ size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, co
|
||||
const ColumnNullable & column_nullable = assert_cast<const ColumnNullable &>(column);
|
||||
if (column_nullable.isNullAt(row_num))
|
||||
return size; /// Null has no value, just type
|
||||
return countBSONFieldSize(column_nullable.getNestedColumn(), nested_type, row_num, name);
|
||||
return countBSONFieldSize(column_nullable.getNestedColumn(), nested_type, row_num, name, path, nested_document_sizes);
|
||||
}
|
||||
case TypeIndex::Array:
|
||||
{
|
||||
size += sizeof(BSONSizeT); // Size of a document
|
||||
size_t document_size = sizeof(BSONSizeT); // Size of a document
|
||||
|
||||
const auto & nested_type = assert_cast<const DataTypeArray *>(data_type.get())->getNestedType();
|
||||
const ColumnArray & column_array = assert_cast<const ColumnArray &>(column);
|
||||
@ -204,39 +207,41 @@ size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, co
|
||||
size_t offset = offsets[row_num - 1];
|
||||
size_t array_size = offsets[row_num] - offset;
|
||||
|
||||
String current_path = path + "." + name;
|
||||
for (size_t i = 0; i < array_size; ++i)
|
||||
size += countBSONFieldSize(nested_column, nested_type, offset + i, std::to_string(i)); // Add size of each value from array
|
||||
document_size += countBSONFieldSize(nested_column, nested_type, offset + i, std::to_string(i), current_path, nested_document_sizes); // Add size of each value from array
|
||||
|
||||
return size + sizeof(BSON_DOCUMENT_END); // Add final \0
|
||||
document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
|
||||
nested_document_sizes[current_path] = document_size;
|
||||
return size + document_size;
|
||||
}
|
||||
case TypeIndex::Tuple:
|
||||
{
|
||||
size += sizeof(BSONSizeT); // Size of a document
|
||||
size_t document_size = sizeof(BSONSizeT); // Size of a document
|
||||
|
||||
const auto * tuple_type = assert_cast<const DataTypeTuple *>(data_type.get());
|
||||
const auto & nested_types = tuple_type->getElements();
|
||||
bool have_explicit_names = tuple_type->haveExplicitNames();
|
||||
const auto & nested_names = tuple_type->getElementNames();
|
||||
const auto & tuple_column = assert_cast<const ColumnTuple &>(column);
|
||||
const auto & nested_columns = tuple_column.getColumns();
|
||||
|
||||
String current_path = path + "." + name;
|
||||
for (size_t i = 0; i < nested_columns.size(); ++i)
|
||||
{
|
||||
String key_name = have_explicit_names ? toValidUTF8String(nested_names[i]) : std::to_string(i);
|
||||
size += countBSONFieldSize(*nested_columns[i], nested_types[i], row_num, key_name); // Add size of each value from tuple
|
||||
String key_name = toValidUTF8String(nested_names[i], settings);
|
||||
document_size += countBSONFieldSize(*nested_columns[i], nested_types[i], row_num, key_name, current_path, nested_document_sizes); // Add size of each value from tuple
|
||||
}
|
||||
|
||||
return size + sizeof(BSON_DOCUMENT_END); // Add final \0
|
||||
document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
|
||||
nested_document_sizes[current_path] = document_size;
|
||||
return size + document_size;
|
||||
}
|
||||
case TypeIndex::Map:
|
||||
{
|
||||
size += sizeof(BSONSizeT); // Size of a document
|
||||
size_t document_size = sizeof(BSONSizeT); // Size of a document
|
||||
|
||||
const auto & map_type = assert_cast<const DataTypeMap &>(*data_type);
|
||||
if (!isStringOrFixedString(map_type.getKeyType()))
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
|
||||
"Only maps with String key type are supported in BSON, got key type: {}",
|
||||
map_type.getKeyType()->getName());
|
||||
const auto & key_type = map_type.getKeyType();
|
||||
const auto & value_type = map_type.getValueType();
|
||||
|
||||
const auto & map_column = assert_cast<const ColumnMap &>(column);
|
||||
@ -248,20 +253,26 @@ size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, co
|
||||
size_t offset = offsets[row_num - 1];
|
||||
size_t map_size = offsets[row_num] - offset;
|
||||
|
||||
WriteBufferFromOwnString buf;
|
||||
String current_path = path + "." + name;
|
||||
for (size_t i = 0; i < map_size; ++i)
|
||||
{
|
||||
String key = toValidUTF8String(key_column->getDataAt(offset + i).toString());
|
||||
size += countBSONFieldSize(*value_column, value_type, offset + i, key);
|
||||
key_type->getDefaultSerialization()->serializeText(*key_column, offset + i, buf, settings);
|
||||
auto s = countBSONFieldSize(*value_column, value_type, offset + i, toValidUTF8String(buf.str(), settings), current_path, nested_document_sizes);
|
||||
document_size += s;
|
||||
buf.restart();
|
||||
}
|
||||
|
||||
return size + sizeof(BSON_DOCUMENT_END); // Add final \0
|
||||
document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
|
||||
nested_document_sizes[current_path] = document_size;
|
||||
return size + document_size;
|
||||
}
|
||||
default:
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not supported in BSON output format", data_type->getName());
|
||||
}
|
||||
}
|
||||
|
||||
void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name)
|
||||
void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name, const String & path, std::unordered_map<String, size_t> & nested_document_sizes)
|
||||
{
|
||||
switch (data_type->getTypeId())
|
||||
{
|
||||
@ -275,6 +286,7 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
|
||||
writeBSONNumber<ColumnFloat64, double>(BSONType::DOUBLE, column, row_num, name, out);
|
||||
break;
|
||||
}
|
||||
case TypeIndex::Enum8: [[fallthrough]];
|
||||
case TypeIndex::Int8:
|
||||
{
|
||||
writeBSONNumber<ColumnInt8, Int32>(BSONType::INT32, column, row_num, name, out);
|
||||
@ -288,6 +300,7 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
|
||||
writeBSONNumber<ColumnUInt8, Int32>(BSONType::INT32, column, row_num, name, out);
|
||||
break;
|
||||
}
|
||||
case TypeIndex::Enum16: [[fallthrough]];
|
||||
case TypeIndex::Int16:
|
||||
{
|
||||
writeBSONNumber<ColumnInt16, Int32>(BSONType::INT32, column, row_num, name, out);
|
||||
@ -403,7 +416,7 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
|
||||
auto dict_type = assert_cast<const DataTypeLowCardinality *>(data_type.get())->getDictionaryType();
|
||||
auto dict_column = lc_column.getDictionary().getNestedColumn();
|
||||
size_t index = lc_column.getIndexAt(row_num);
|
||||
serializeField(*dict_column, dict_type, index, name);
|
||||
serializeField(*dict_column, dict_type, index, name, path, nested_document_sizes);
|
||||
break;
|
||||
}
|
||||
case TypeIndex::Nullable:
|
||||
@ -411,7 +424,7 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
|
||||
auto nested_type = removeNullable(data_type);
|
||||
const ColumnNullable & column_nullable = assert_cast<const ColumnNullable &>(column);
|
||||
if (!column_nullable.isNullAt(row_num))
|
||||
serializeField(column_nullable.getNestedColumn(), nested_type, row_num, name);
|
||||
serializeField(column_nullable.getNestedColumn(), nested_type, row_num, name, path, nested_document_sizes);
|
||||
else
|
||||
writeBSONTypeAndKeyName(BSONType::NULL_VALUE, name, out);
|
||||
break;
|
||||
@ -427,15 +440,12 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
|
||||
|
||||
writeBSONTypeAndKeyName(BSONType::ARRAY, name, out);
|
||||
|
||||
size_t document_size = sizeof(BSONSizeT);
|
||||
for (size_t i = 0; i < array_size; ++i)
|
||||
document_size += countBSONFieldSize(nested_column, nested_type, offset + i, std::to_string(i)); // Add size of each value from array
|
||||
document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
|
||||
|
||||
String current_path = path + "." + name;
|
||||
size_t document_size = nested_document_sizes[current_path];
|
||||
writeBSONSize(document_size, out);
|
||||
|
||||
for (size_t i = 0; i < array_size; ++i)
|
||||
serializeField(nested_column, nested_type, offset + i, std::to_string(i));
|
||||
serializeField(nested_column, nested_type, offset + i, std::to_string(i), current_path, nested_document_sizes);
|
||||
|
||||
writeChar(BSON_DOCUMENT_END, out);
|
||||
break;
|
||||
@ -444,26 +454,19 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
|
||||
{
|
||||
const auto * tuple_type = assert_cast<const DataTypeTuple *>(data_type.get());
|
||||
const auto & nested_types = tuple_type->getElements();
|
||||
bool have_explicit_names = tuple_type->haveExplicitNames();
|
||||
const auto & nested_names = tuple_type->getElementNames();
|
||||
const auto & tuple_column = assert_cast<const ColumnTuple &>(column);
|
||||
const auto & nested_columns = tuple_column.getColumns();
|
||||
|
||||
BSONType bson_type = have_explicit_names ? BSONType::DOCUMENT : BSONType::ARRAY;
|
||||
BSONType bson_type = tuple_type->haveExplicitNames() ? BSONType::DOCUMENT : BSONType::ARRAY;
|
||||
writeBSONTypeAndKeyName(bson_type, name, out);
|
||||
|
||||
size_t document_size = sizeof(BSONSizeT);
|
||||
for (size_t i = 0; i < nested_columns.size(); ++i)
|
||||
{
|
||||
String key_name = have_explicit_names ? toValidUTF8String(nested_names[i]) : std::to_string(i);
|
||||
document_size += countBSONFieldSize(*nested_columns[i], nested_types[i], row_num, key_name); // Add size of each value from tuple
|
||||
}
|
||||
document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
|
||||
|
||||
String current_path = path + "." + name;
|
||||
size_t document_size = nested_document_sizes[current_path];
|
||||
writeBSONSize(document_size, out);
|
||||
|
||||
for (size_t i = 0; i < nested_columns.size(); ++i)
|
||||
serializeField(*nested_columns[i], nested_types[i], row_num, have_explicit_names ? toValidUTF8String(nested_names[i]) : std::to_string(i));
|
||||
serializeField(*nested_columns[i], nested_types[i], row_num, toValidUTF8String(nested_names[i], settings), current_path, nested_document_sizes);
|
||||
|
||||
writeChar(BSON_DOCUMENT_END, out);
|
||||
break;
|
||||
@ -471,10 +474,7 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
|
||||
case TypeIndex::Map:
|
||||
{
|
||||
const auto & map_type = assert_cast<const DataTypeMap &>(*data_type);
|
||||
if (!isStringOrFixedString(map_type.getKeyType()))
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
|
||||
"Only maps with String key type are supported in BSON, got key type: {}",
|
||||
map_type.getKeyType()->getName());
|
||||
const auto & key_type = map_type.getKeyType();
|
||||
const auto & value_type = map_type.getValueType();
|
||||
|
||||
const auto & map_column = assert_cast<const ColumnMap &>(column);
|
||||
@ -488,20 +488,16 @@ void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const Da
|
||||
|
||||
writeBSONTypeAndKeyName(BSONType::DOCUMENT, name, out);
|
||||
|
||||
size_t document_size = sizeof(BSONSizeT);
|
||||
for (size_t i = 0; i < map_size; ++i)
|
||||
{
|
||||
String key = toValidUTF8String(key_column->getDataAt(offset + i).toString());
|
||||
document_size += countBSONFieldSize(*value_column, value_type, offset + i, key);
|
||||
}
|
||||
document_size += sizeof(BSON_DOCUMENT_END);
|
||||
|
||||
String current_path = path + "." + name;
|
||||
size_t document_size = nested_document_sizes[current_path];
|
||||
writeBSONSize(document_size, out);
|
||||
|
||||
WriteBufferFromOwnString buf;
|
||||
for (size_t i = 0; i < map_size; ++i)
|
||||
{
|
||||
String key = toValidUTF8String(key_column->getDataAt(offset + i).toString());
|
||||
serializeField(*value_column, value_type, offset + i, key);
|
||||
key_type->getDefaultSerialization()->serializeText(*key_column, offset + i, buf, settings);
|
||||
serializeField(*value_column, value_type, offset + i, toValidUTF8String(buf.str(), settings), current_path, nested_document_sizes);
|
||||
buf.restart();
|
||||
}
|
||||
|
||||
writeChar(BSON_DOCUMENT_END, out);
|
||||
@ -516,15 +512,18 @@ void BSONEachRowRowOutputFormat::write(const Columns & columns, size_t row_num)
|
||||
{
|
||||
/// We should calculate and write document size before its content
|
||||
size_t document_size = sizeof(BSONSizeT);
|
||||
/// Remember calculated sizes for nested documents (map document path -> size), so we won't need
|
||||
/// to recalculate it while serializing.
|
||||
std::unordered_map<String, size_t> nested_document_sizes;
|
||||
for (size_t i = 0; i != columns.size(); ++i)
|
||||
document_size += countBSONFieldSize(*columns[i], fields[i].type, row_num, fields[i].name);
|
||||
document_size += countBSONFieldSize(*columns[i], fields[i].type, row_num, fields[i].name, "$", nested_document_sizes);
|
||||
document_size += sizeof(BSON_DOCUMENT_END);
|
||||
|
||||
size_t document_start = out.count();
|
||||
writeBSONSize(document_size, out);
|
||||
|
||||
for (size_t i = 0; i != columns.size(); ++i)
|
||||
serializeField(*columns[i], fields[i].type, row_num, fields[i].name);
|
||||
serializeField(*columns[i], fields[i].type, row_num, fields[i].name, "$", nested_document_sizes);
|
||||
|
||||
writeChar(BSON_DOCUMENT_END, out);
|
||||
|
||||
|
@ -17,8 +17,8 @@ namespace DB
|
||||
*
|
||||
* ClickHouse type | BSON Type
|
||||
* Bool | \x08 boolean
|
||||
* Int8/UInt8 | \x10 int32
|
||||
* Int16UInt16 | \x10 int32
|
||||
* Int8/UInt8/Enum8 | \x10 int32
|
||||
* Int16/UInt16/Enum16 | \x10 int32
|
||||
* Int32 | \x10 int32
|
||||
* UInt32 | \x12 int64
|
||||
* Int64 | \x12 int64
|
||||
@ -38,7 +38,7 @@ namespace DB
|
||||
* Array | \x04 array
|
||||
* Tuple | \x04 array
|
||||
* Named Tuple | \x03 document
|
||||
* Map (with String keys) | \x03 document
|
||||
* Map | \x03 document
|
||||
*
|
||||
* Note: on Big-Endian platforms this format will not work properly.
|
||||
*/
|
||||
@ -55,12 +55,24 @@ private:
|
||||
void write(const Columns & columns, size_t row_num) override;
|
||||
void writeField(const IColumn &, const ISerialization &, size_t) override { }
|
||||
|
||||
void serializeField(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name);
|
||||
void serializeField(
|
||||
const IColumn & column,
|
||||
const DataTypePtr & data_type,
|
||||
size_t row_num,
|
||||
const String & name,
|
||||
const String & path,
|
||||
std::unordered_map<String, size_t> & nested_document_sizes);
|
||||
|
||||
/// Count field size in bytes that we will get after serialization in BSON format.
|
||||
/// It's needed to calculate document size before actual serialization,
|
||||
/// because in BSON format we should write the size of the document before its content.
|
||||
size_t countBSONFieldSize(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name);
|
||||
size_t countBSONFieldSize(
|
||||
const IColumn & column,
|
||||
const DataTypePtr & data_type,
|
||||
size_t row_num,
|
||||
const String & name,
|
||||
const String & path,
|
||||
std::unordered_map<String, size_t> & nested_document_sizes);
|
||||
|
||||
NamesAndTypes fields;
|
||||
FormatSettings settings;
|
||||
|
@ -121,7 +121,7 @@ Nullable
|
||||
2
|
||||
0
|
||||
4
|
||||
FAIL
|
||||
OK
|
||||
null Nullable(Int64)
|
||||
0
|
||||
\N
|
||||
@ -191,8 +191,11 @@ tuple Tuple(Nullable(Int64), Nullable(String))
|
||||
(3,'Hello')
|
||||
(4,'Hello')
|
||||
Map
|
||||
OK
|
||||
OK
|
||||
{1:0,2:1}
|
||||
{1:1,2:2}
|
||||
{1:2,2:3}
|
||||
{1:3,2:4}
|
||||
{1:4,2:5}
|
||||
{'a':0,'b':1}
|
||||
{'a':1,'b':2}
|
||||
{'a':2,'b':3}
|
||||
|
@ -88,7 +88,7 @@ echo "Nullable"
|
||||
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow, auto, 'null Nullable(UInt32)') select number % 2 ? NULL : number from numbers(5) settings engine_file_truncate_on_insert=1"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'null Nullable(UInt32)')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'null UInt32')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'null UInt32') settings input_format_null_as_default=0" 2>&1 | grep -q -F "INCORRECT_DATA" && echo "OK" || echo "FAIL"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'null UInt32') settings input_format_null_as_default=0" 2>&1 | grep -q -F "ILLEGAL_COLUMN" && echo "OK" || echo "FAIL"
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file(02475_data.bsonEachRow)"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow)"
|
||||
@ -132,10 +132,10 @@ $CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow)"
|
||||
|
||||
|
||||
echo "Map"
|
||||
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow, auto, 'map Map(UInt64, UInt64)') select map(1, number, 2, number + 1) from numbers(5) settings engine_file_truncate_on_insert=1" 2>&1 | grep -q -F "ILLEGAL_COLUMN" && echo "OK" || echo "FAIL"
|
||||
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow, auto, 'map Map(String, UInt64)') select map('a', number, 'b', number + 1) from numbers(5) settings engine_file_truncate_on_insert=1"
|
||||
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow, auto, 'map Map(UInt64, UInt64)') select map(1, number, 2, number + 1) from numbers(5) settings engine_file_truncate_on_insert=1"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'map Map(UInt64, UInt64)')"
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'map Map(UInt64, UInt64)')" 2>&1 | grep -q -F "ILLEGAL_COLUMN" && echo "OK" || echo "FAIL"
|
||||
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow, auto, 'map Map(String, UInt64)') select map('a', number, 'b', number + 1) from numbers(5) settings engine_file_truncate_on_insert=1"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'map Map(String, UInt64)')"
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file(02475_data.bsonEachRow)"
|
||||
|
Binary file not shown.
@ -0,0 +1,5 @@
|
||||
{'a\\u0000b':42}
|
||||
c1 Nullable(Int32)
|
||||
c2 Nullable(Int32)
|
||||
c3 Map(String, Nullable(Int32))
|
||||
a b {42:42}
|
15
tests/queries/0_stateless/02593_bson_more_types.sh
Executable file
15
tests/queries/0_stateless/02593_bson_more_types.sh
Executable file
@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-fasttest
|
||||
|
||||
# Round-trip checks for the BSONEachRow format: each command below writes a row
# with one clickhouse-local process and pipes it into a second one that reads it back.
# Resolve the directory that contains this script so shell_config.sh can be sourced relative to it.
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
|
||||
# Map whose String key is written as 'a\0b' in the query (presumably a NUL byte inside
# the key -- confirm against ClickHouse string-literal escaping), read back with an
# explicit Map(String, UInt32) structure.
$CLICKHOUSE_LOCAL -q "select map('a\0b', 42) as c1 format BSONEachRow" | $CLICKHOUSE_LOCAL --input-format BSONEachRow --table test --structure "c1 Map(String, UInt32)" -q "select * from test"
|
||||
|
||||
# Enum8, Enum16 and a Map with integer keys are written as BSONEachRow; the reader is
# given no --structure, so `desc test` prints whatever column types it derives from the data.
$CLICKHOUSE_LOCAL -q "select 'a'::Enum8('a' = 1) as c1, 'b'::Enum16('b' = 1) as c2, map(42, 42) as c3 format BSONEachRow" | $CLICKHOUSE_LOCAL --input-format BSONEachRow --table test -q "desc test"
|
||||
|
||||
# Same data as the previous command, but read back with an explicit structure that names
# the Enum8/Enum16 types and a Map(UInt32, UInt32) column, then selected in full.
$CLICKHOUSE_LOCAL -q "select 'a'::Enum8('a' = 1) as c1, 'b'::Enum16('b' = 1) as c2, map(42, 42) as c3 format BSONEachRow" | $CLICKHOUSE_LOCAL --input-format BSONEachRow --table test --structure "c1 Enum8('a' = 1), c2 Enum16('b' = 1), c3 Map(UInt32, UInt32)" -q "select * from test"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user