Merge branch 'master' into rocksdb_metacache

This commit is contained in:
李扬 2022-02-08 03:05:40 -06:00 committed by GitHub
commit f7c2a32b96
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 149 additions and 12 deletions

View File

@ -142,6 +142,7 @@ Checks: '-*,
clang-analyzer-cplusplus.PlacementNewChecker,
clang-analyzer-cplusplus.SelfAssignment,
clang-analyzer-deadcode.DeadStores,
clang-analyzer-cplusplus.Move,
clang-analyzer-optin.cplusplus.VirtualCall,
clang-analyzer-security.insecureAPI.UncheckedReturn,
clang-analyzer-security.insecureAPI.bcmp,

View File

@ -610,6 +610,7 @@ class IColumn;
M(Char, input_format_hive_text_collection_items_delimiter, '\x02', "Delimiter between collection(array or map) items in Hive Text File", 0) \
M(Char, input_format_hive_text_map_keys_delimiter, '\x03', "Delimiter between a pair of map key/values in Hive Text File", 0) \
M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \
M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \
M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum rows of data to read for automatic schema inference", 0) \
\
M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \

View File

@ -130,4 +130,10 @@ IMPLEMENT_SETTING_ENUM(EscapingRule, ErrorCodes::BAD_ARGUMENTS,
{"JSON", FormatSettings::EscapingRule::JSON},
{"XML", FormatSettings::EscapingRule::XML},
{"Raw", FormatSettings::EscapingRule::Raw}})
/// Textual names accepted for settings of type MsgPackUUIDRepresentation ("bin", "str", "ext")
/// and the FormatSettings::MsgPackUUIDRepresentation value each one selects.
/// ErrorCodes::BAD_ARGUMENTS is the error code handed to the macro
/// (presumably the one raised for an unknown name - confirm against the macro definition).
IMPLEMENT_SETTING_ENUM(MsgPackUUIDRepresentation, ErrorCodes::BAD_ARGUMENTS,
    {{"bin", FormatSettings::MsgPackUUIDRepresentation::BIN},
     {"str", FormatSettings::MsgPackUUIDRepresentation::STR},
     {"ext", FormatSettings::MsgPackUUIDRepresentation::EXT}})
}

View File

@ -171,4 +171,6 @@ DECLARE_SETTING_ENUM(ShortCircuitFunctionEvaluation)
DECLARE_SETTING_ENUM_WITH_RENAME(EnumComparingMode, FormatSettings::EnumComparingMode)
DECLARE_SETTING_ENUM_WITH_RENAME(EscapingRule, FormatSettings::EscapingRule)
DECLARE_SETTING_ENUM_WITH_RENAME(MsgPackUUIDRepresentation, FormatSettings::MsgPackUUIDRepresentation)
}

View File

@ -129,6 +129,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
format_settings.seekable_read = settings.input_format_allow_seeks;
format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns;
format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation;
format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference;
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context

View File

@ -231,9 +231,17 @@ struct FormatSettings
EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES;
} capn_proto;
/// Selects how a UUID value is encoded when it is written in MsgPack format.
enum class MsgPackUUIDRepresentation
{
    /// A string of 36 characters.
    STR,
    /// 16 bytes of binary data.
    BIN,
    /// A MsgPack extension object with ExtType = 2.
    EXT,
};
/// Options specific to the MsgPack format.
struct
{
    /// Number of columns in the MsgPack data; defaults to 0.
    UInt64 number_of_columns = 0;

    /// Encoding used for UUID values on output; defaults to the extension-type form.
    MsgPackUUIDRepresentation output_uuid_representation = MsgPackUUIDRepresentation::EXT;
} msgpack;
};

View File

@ -0,0 +1,11 @@
#pragma once
namespace DB
{

/// Type tags of the MsgPack extension objects used by the MsgPack format code.
enum class MsgPackExtensionTypes
{
    /// Extension object that carries a UUID.
    UUID = 0x02,
};

}

View File

@ -5,6 +5,7 @@
#include <cstdlib>
#include <Common/assert_cast.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromMemory.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeDateTime64.h>
@ -12,6 +13,7 @@
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeUUID.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnNullable.h>
@ -20,6 +22,8 @@
#include <Columns/ColumnMap.h>
#include <Columns/ColumnLowCardinality.h>
#include <Formats/MsgPackExtensionTypes.h>
namespace DB
{
@ -153,16 +157,29 @@ static void insertInteger(IColumn & column, DataTypePtr type, UInt64 value)
}
}
static void insertString(IColumn & column, DataTypePtr type, const char * value, size_t size)
static void insertString(IColumn & column, DataTypePtr type, const char * value, size_t size, bool bin)
{
auto insert_func = [&](IColumn & column_, DataTypePtr type_)
{
insertString(column_, type_, value, size);
insertString(column_, type_, value, size, bin);
};
if (checkAndInsertNullable(column, type, insert_func) || checkAndInsertLowCardinality(column, type, insert_func))
return;
if (isUUID(type))
{
ReadBufferFromMemory buf(value, size);
UUID uuid;
if (bin)
readBinary(uuid, buf);
else
readUUIDText(uuid, buf);
assert_cast<ColumnUUID &>(column).insertValue(uuid);
return;
}
if (!isStringOrFixedString(type))
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert MessagePack string into column with type {}.", type->getName());
@ -218,6 +235,15 @@ static void insertNull(IColumn & column, DataTypePtr type)
assert_cast<ColumnNullable &>(column).insertDefault();
}
/// Inserts a UUID that arrived as the payload of a MsgPack extension object into `column`.
///
/// `value` / `size` describe the payload only (the bytes after the extension type byte).
/// The payload is decoded as items[0] followed by items[1] of the UUID's underlying value,
/// each read big-endian.
///
/// Nullable and LowCardinality destinations are unwrapped through the same helpers that
/// insertString() uses, so the final cast is only ever applied to a plain UUID column.
///
/// Throws ILLEGAL_COLUMN if the destination type is not UUID, and INCORRECT_DATA if the
/// payload is not exactly sizeof(UUID) bytes long.
static void insertUUID(IColumn & column, DataTypePtr type, const char * value, size_t size)
{
    auto insert_func = [&](IColumn & column_, DataTypePtr type_)
    {
        insertUUID(column_, type_, value, size);
    };

    if (checkAndInsertNullable(column, type, insert_func) || checkAndInsertLowCardinality(column, type, insert_func))
        return;

    /// The cast to ColumnUUID below is only valid for a UUID destination, so reject anything else explicitly.
    if (!isUUID(type))
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert MessagePack UUID into column with type {}.", type->getName());

    /// A longer payload would otherwise be silently truncated; a shorter one would fail with a generic read error.
    if (size != sizeof(UUID))
        throw Exception(
            ErrorCodes::INCORRECT_DATA,
            "Unexpected size of MessagePack UUID extension: {} bytes instead of {}.",
            size,
            sizeof(UUID));

    ReadBufferFromMemory buf(value, size);
    UUID uuid;
    readBinaryBigEndian(uuid.toUnderType().items[0], buf);
    readBinaryBigEndian(uuid.toUnderType().items[1], buf);
    assert_cast<ColumnUUID &>(column).insertValue(uuid);
}
bool MsgPackVisitor::visit_positive_integer(UInt64 value) // NOLINT
{
insertInteger(info_stack.top().column, info_stack.top().type, value);
@ -232,13 +258,13 @@ bool MsgPackVisitor::visit_negative_integer(Int64 value) // NOLINT
bool MsgPackVisitor::visit_str(const char * value, size_t size) // NOLINT
{
insertString(info_stack.top().column, info_stack.top().type, value, size);
insertString(info_stack.top().column, info_stack.top().type, value, size, false);
return true;
}
bool MsgPackVisitor::visit_bin(const char * value, size_t size) // NOLINT
{
insertString(info_stack.top().column, info_stack.top().type, value, size);
insertString(info_stack.top().column, info_stack.top().type, value, size, true);
return true;
}
@ -324,6 +350,18 @@ bool MsgPackVisitor::visit_nil()
return true;
}
bool MsgPackVisitor::visit_ext(const char * value, uint32_t size)
{
int8_t type = *value;
if (*value == int8_t(MsgPackExtensionTypes::UUID))
{
insertUUID(info_stack.top().column, info_stack.top().type, value + 1, size - 1);
return true;
}
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported MsgPack extension type: {%x}", type);
}
void MsgPackVisitor::parse_error(size_t, size_t) // NOLINT
{
throw Exception("Error occurred while parsing msgpack data.", ErrorCodes::INCORRECT_DATA);
@ -455,8 +493,13 @@ DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object)
}
case msgpack::type::object_type::NIL:
return nullptr;
default:
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack type is not supported");
case msgpack::type::object_type::EXT:
{
    /// Only the UUID extension maps to a data type; any other extension type is rejected.
    msgpack::object_ext object_ext = object.via.ext;
    if (object_ext.type() == int8_t(MsgPackExtensionTypes::UUID))
        return std::make_shared<DataTypeUUID>();

    /// fmt-style placeholder: `{:x}` prints the type in hex (`{%x}` is not a valid format string).
    throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack extension type {:x} is not supported", object_ext.type());
}
}
}

View File

@ -42,6 +42,7 @@ public:
bool end_map_key();
bool start_map_value();
bool end_map_value();
bool visit_ext(const char * value, uint32_t size);
/// This function will be called if error occurs in parsing
[[noreturn]] void parse_error(size_t parsed_offset, size_t error_offset);
@ -55,7 +56,7 @@ private:
std::stack<Info> info_stack;
};
class MsgPackRowInputFormat final : public IRowInputFormat
class MsgPackRowInputFormat : public IRowInputFormat
{
public:
MsgPackRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_);

View File

@ -5,6 +5,9 @@
#include <Formats/FormatFactory.h>
#include <Common/assert_cast.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeNullable.h>
@ -19,6 +22,8 @@
#include <Columns/ColumnMap.h>
#include <Columns/ColumnLowCardinality.h>
#include <Formats/MsgPackExtensionTypes.h>
namespace DB
{
@ -27,8 +32,8 @@ namespace ErrorCodes
extern const int ILLEGAL_COLUMN;
}
MsgPackRowOutputFormat::MsgPackRowOutputFormat(WriteBuffer & out_, const Block & header_, const RowOutputFormatParams & params_)
: IRowOutputFormat(header_, out_, params_), packer(out_) {}
/// Creates the MsgPack row writer: the base class receives the header, the output buffer and
/// the row-output params, the packer is bound to the same output buffer, and the format
/// settings are stored in the `format_settings` member.
MsgPackRowOutputFormat::MsgPackRowOutputFormat(
    WriteBuffer & out_,
    const Block & header_,
    const RowOutputFormatParams & params_,
    const FormatSettings & format_settings_)
    : IRowOutputFormat(header_, out_, params_)
    , packer(out_)
    , format_settings(format_settings_)
{
}
void MsgPackRowOutputFormat::serializeField(const IColumn & column, DataTypePtr data_type, size_t row_num)
{
@ -164,6 +169,42 @@ void MsgPackRowOutputFormat::serializeField(const IColumn & column, DataTypePtr
serializeField(*dict_column, dict_type, index);
return;
}
case TypeIndex::UUID:
{
    /// The UUID is written in the representation chosen by format_settings.msgpack.output_uuid_representation.
    const auto & uuid_column = assert_cast<const ColumnUUID &>(column);
    switch (format_settings.msgpack.output_uuid_representation)
    {
        case FormatSettings::MsgPackUUIDRepresentation::BIN:
        {
            /// MsgPack bin object holding the output of writeBinary() for the value.
            WriteBufferFromOwnString buf;
            writeBinary(uuid_column.getElement(row_num), buf);
            StringRef uuid_bin = buf.stringRef();
            packer.pack_bin(uuid_bin.size);
            packer.pack_bin_body(uuid_bin.data, uuid_bin.size);
            return;
        }
        case FormatSettings::MsgPackUUIDRepresentation::STR:
        {
            /// MsgPack str object holding the textual form of the value.
            WriteBufferFromOwnString buf;
            writeText(uuid_column.getElement(row_num), buf);
            StringRef uuid_text = buf.stringRef();
            packer.pack_str(uuid_text.size);
            /// A str header is followed by a str body (the original wrote it with pack_bin_body).
            packer.pack_str_body(uuid_text.data, uuid_text.size);
            return;
        }
        case FormatSettings::MsgPackUUIDRepresentation::EXT:
        {
            /// MsgPack ext object tagged MsgPackExtensionTypes::UUID; the payload is items[0]
            /// followed by items[1] of the underlying value, each written big-endian.
            WriteBufferFromOwnString buf;
            UUID value = uuid_column.getElement(row_num);
            writeBinaryBigEndian(value.toUnderType().items[0], buf);
            writeBinaryBigEndian(value.toUnderType().items[1], buf);
            StringRef uuid_ext = buf.stringRef();
            packer.pack_ext(sizeof(UUID), int8_t(MsgPackExtensionTypes::UUID));
            packer.pack_ext_body(uuid_ext.data, uuid_ext.size);
            return;
        }
    }
}
default:
break;
}
@ -186,9 +227,9 @@ void registerOutputFormatMsgPack(FormatFactory & factory)
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings &)
const FormatSettings & settings)
{
return std::make_shared<MsgPackRowOutputFormat>(buf, sample, params);
return std::make_shared<MsgPackRowOutputFormat>(buf, sample, params, settings);
});
factory.markOutputFormatSupportsParallelFormatting("MsgPack");
}

View File

@ -18,7 +18,7 @@ namespace DB
class MsgPackRowOutputFormat final : public IRowOutputFormat
{
public:
MsgPackRowOutputFormat(WriteBuffer & out_, const Block & header_, const RowOutputFormatParams & params_);
MsgPackRowOutputFormat(WriteBuffer & out_, const Block & header_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_);
String getName() const override { return "MsgPackRowOutputFormat"; }
@ -28,6 +28,7 @@ private:
void serializeField(const IColumn & column, DataTypePtr data_type, size_t row_num);
msgpack::packer<DB::WriteBuffer> packer;
const FormatSettings format_settings;
};
}

View File

@ -0,0 +1,4 @@
5e7084e0-019f-461f-9e70-84e0019f561f
5e7084e0-019f-461f-9e70-84e0019f561f
5e7084e0-019f-461f-9e70-84e0019f561f
5e7084e0-019f-461f-9e70-84e0019f561f UUID

View File

@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Tags: no-parallel, no-fasttest

# Round-trips one UUID through the MsgPack format with every value of
# output_format_msgpack_uuid_representation (str, bin, ext), reading each file back with an
# explicit 'uuid UUID' structure. The last query reads the ext-encoded file without a
# structure and prints the inferred type of its single column.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# The file names are fixed, so every insert sets engine_file_truncate_on_insert=1: a re-run
# then overwrites the file instead of adding to data left behind by a previous run.

# str representation
$CLICKHOUSE_CLIENT -q "insert into table function file('uuid_str.msgpack', 'MsgPack', 'uuid UUID') select toUUID('5e7084e0-019f-461f-9e70-84e0019f561f') settings output_format_msgpack_uuid_representation='str', engine_file_truncate_on_insert=1"
$CLICKHOUSE_CLIENT -q "select * from file('uuid_str.msgpack', 'MsgPack', 'uuid UUID')"

# bin representation
$CLICKHOUSE_CLIENT -q "insert into table function file('uuid_bin.msgpack', 'MsgPack', 'uuid UUID') select toUUID('5e7084e0-019f-461f-9e70-84e0019f561f') settings output_format_msgpack_uuid_representation='bin', engine_file_truncate_on_insert=1"
$CLICKHOUSE_CLIENT -q "select * from file('uuid_bin.msgpack', 'MsgPack', 'uuid UUID')"

# ext representation
$CLICKHOUSE_CLIENT -q "insert into table function file('uuid_ext.msgpack', 'MsgPack', 'uuid UUID') select toUUID('5e7084e0-019f-461f-9e70-84e0019f561f') settings output_format_msgpack_uuid_representation='ext', engine_file_truncate_on_insert=1"
$CLICKHOUSE_CLIENT -q "select * from file('uuid_ext.msgpack', 'MsgPack', 'uuid UUID')"

# Schema inference: no structure is given, only the number of columns.
$CLICKHOUSE_CLIENT -q "select c1, toTypeName(c1) from file('uuid_ext.msgpack') settings input_format_msgpack_number_of_columns=1"