Merge pull request #44382 from Avogar/fix-bson-object-id

Fix reading ObjectId in BSON schema inference
This commit is contained in:
Kruglov Pavel 2022-12-21 10:48:50 +01:00 committed by GitHub
commit 09ab5832b1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 28 additions and 4 deletions

View File

@ -7,6 +7,8 @@ namespace DB
{ {
static const uint8_t BSON_DOCUMENT_END = 0x00; static const uint8_t BSON_DOCUMENT_END = 0x00;
static const size_t BSON_OBJECT_ID_SIZE = 12;
static const size_t BSON_DB_POINTER_SIZE = 12;
using BSONSizeT = uint32_t; using BSONSizeT = uint32_t;
static const BSONSizeT MAX_BSON_SIZE = std::numeric_limits<BSONSizeT>::max(); static const BSONSizeT MAX_BSON_SIZE = std::numeric_limits<BSONSizeT>::max();

View File

@ -18,6 +18,7 @@
#include <Columns/ColumnMap.h> #include <Columns/ColumnMap.h>
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeUUID.h> #include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypeDateTime64.h> #include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeLowCardinality.h> #include <DataTypes/DataTypeLowCardinality.h>
@ -282,7 +283,7 @@ static void readAndInsertString(ReadBuffer & in, IColumn & column, BSONType bson
} }
else if (bson_type == BSONType::OBJECT_ID) else if (bson_type == BSONType::OBJECT_ID)
{ {
readAndInsertStringImpl<is_fixed_string>(in, column, 12); readAndInsertStringImpl<is_fixed_string>(in, column, BSON_OBJECT_ID_SIZE);
} }
else else
{ {
@ -664,7 +665,7 @@ static void skipBSONField(ReadBuffer & in, BSONType type)
} }
case BSONType::OBJECT_ID: case BSONType::OBJECT_ID:
{ {
in.ignore(12); in.ignore(BSON_OBJECT_ID_SIZE);
break; break;
} }
case BSONType::REGEXP: case BSONType::REGEXP:
@ -677,7 +678,7 @@ static void skipBSONField(ReadBuffer & in, BSONType type)
{ {
BSONSizeT size; BSONSizeT size;
readBinary(size, in); readBinary(size, in);
in.ignore(size + 12); in.ignore(size + BSON_DB_POINTER_SIZE);
break; break;
} }
case BSONType::JAVA_SCRIPT_CODE_W_SCOPE: case BSONType::JAVA_SCRIPT_CODE_W_SCOPE:
@ -796,7 +797,6 @@ DataTypePtr BSONEachRowSchemaReader::getDataTypeFromBSONField(BSONType type, boo
} }
case BSONType::SYMBOL: [[fallthrough]]; case BSONType::SYMBOL: [[fallthrough]];
case BSONType::JAVA_SCRIPT_CODE: [[fallthrough]]; case BSONType::JAVA_SCRIPT_CODE: [[fallthrough]];
case BSONType::OBJECT_ID: [[fallthrough]];
case BSONType::STRING: case BSONType::STRING:
{ {
BSONSizeT size; BSONSizeT size;
@ -804,6 +804,11 @@ DataTypePtr BSONEachRowSchemaReader::getDataTypeFromBSONField(BSONType type, boo
in.ignore(size); in.ignore(size);
return std::make_shared<DataTypeString>(); return std::make_shared<DataTypeString>();
} }
case BSONType::OBJECT_ID:;
{
in.ignore(BSON_OBJECT_ID_SIZE);
return makeNullable(std::make_shared<DataTypeFixedString>(BSON_OBJECT_ID_SIZE));
}
case BSONType::DOCUMENT: case BSONType::DOCUMENT:
{ {
auto nested_names_and_types = getDataTypesFromBSONDocument(false); auto nested_names_and_types = getDataTypesFromBSONDocument(false);
@ -954,6 +959,7 @@ void registerInputFormatBSONEachRow(FormatFactory & factory)
"BSONEachRow", "BSONEachRow",
[](ReadBuffer & buf, const Block & sample, IRowInputFormat::Params params, const FormatSettings & settings) [](ReadBuffer & buf, const Block & sample, IRowInputFormat::Params params, const FormatSettings & settings)
{ return std::make_shared<BSONEachRowRowInputFormat>(buf, sample, std::move(params), settings); }); { return std::make_shared<BSONEachRowRowInputFormat>(buf, sample, std::move(params), settings); });
factory.registerFileExtension("bson", "BSONEachRow");
} }
void registerFileSegmentationEngineBSONEachRow(FormatFactory & factory) void registerFileSegmentationEngineBSONEachRow(FormatFactory & factory)

View File

@ -0,0 +1,6 @@
_id Nullable(FixedString(12))
name Nullable(String)
email Nullable(String)
movie_id Nullable(FixedString(12))
text Nullable(String)
date Nullable(DateTime64(6, \'UTC\'))

View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Tags: no-fasttest

# Exercises BSON handling of ObjectId fields against a sample BSON dump:
# the file is first described and then its `_id` column is read back.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Sample data file shared by both queries below.
bson_sample="$CURDIR/data_bson/comments.bson"

# Print the column names and types reported for the file.
$CLICKHOUSE_LOCAL -q "desc file('${bson_sample}')"

# Read the _id column; `format Null` discards the rows, so this step only
# checks that the read completes without an error.
$CLICKHOUSE_LOCAL -q "select _id from file('${bson_sample}') format Null"

Binary file not shown.