From 4f27391a72d3c87b351c5dbe13bd3c1c8aaeba08 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 31 Mar 2023 13:37:41 +0300 Subject: [PATCH 01/81] Revert "Revert "Randomize JIT settings in tests"" --- tests/clickhouse-test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index a355c2f8e73..fa88bc19efd 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -558,6 +558,9 @@ class SettingsRandomizer: "enable_memory_bound_merging_of_aggregation_results": lambda: random.randint( 0, 1 ), + "min_count_to_compile_expression": lambda: random.choice([0, 3]), + "min_count_to_compile_aggregate_expression": lambda: random.choice([0, 3]), + "min_count_to_compile_sort_description": lambda: random.choice([0, 3]), } @staticmethod From 08b5196070fcece2cb904790c8327dfc76c20319 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sun, 18 Jun 2023 19:54:31 +0200 Subject: [PATCH 02/81] Updated to support passing sas token authentication --- src/Storages/StorageAzureBlob.cpp | 78 +++++++++---------- .../TableFunctionAzureBlobStorage.cpp | 2 +- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 3ee176a68b7..41179d1e641 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -86,7 +86,7 @@ const std::unordered_set optional_configuration_keys = { bool isConnectionString(const std::string & candidate) { - return candidate.starts_with("DefaultEndpointsProtocol"); + return !candidate.starts_with("http"); } } @@ -320,52 +320,52 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co } else { + std::shared_ptr storage_shared_key_credential; + std::unique_ptr blob_service_client; if (configuration.account_name.has_value() && configuration.account_key.has_value()) { - auto storage_shared_key_credential = std::make_shared(*configuration.account_name, *configuration.account_key); - auto blob_service_client = std::make_unique(configuration.connection_url, storage_shared_key_credential); - try - { - result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); - } - catch (const Azure::Storage::StorageException & e) - { - if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) - { - auto final_url = configuration.connection_url - + (configuration.connection_url.back() == '/' ? "" : "/") - + configuration.container; - - result = std::make_unique(final_url, storage_shared_key_credential); - } - else - { - throw; - } - } + storage_shared_key_credential + = std::make_shared(*configuration.account_name, *configuration.account_key); + blob_service_client = std::make_unique(configuration.connection_url, storage_shared_key_credential); } else { - auto managed_identity_credential = std::make_shared(); - auto blob_service_client = std::make_unique(configuration.connection_url, managed_identity_credential); - try - { - result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); - } - catch (const Azure::Storage::StorageException & e) - { - if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) - { - auto final_url = configuration.connection_url - + (configuration.connection_url.back() == '/' ? "" : "/") - + configuration.container; + blob_service_client = std::make_unique(configuration.connection_url); + } - result = std::make_unique(final_url, managed_identity_credential); + + try + { + blob_service_client = std::make_unique(configuration.connection_url); + result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); + } + catch (const Azure::Storage::StorageException & e) + { + if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) + { + std::string final_url; + size_t pos = configuration.connection_url.find("?"); + if (pos != std::string::npos) + { + auto url_without_sas = configuration.connection_url.substr(0, pos); + final_url = url_without_sas + + (url_without_sas.back() == '/' ? "" : "/") + + configuration.container + + configuration.connection_url.substr(pos); } else - { - throw; - } + final_url = configuration.connection_url + + (configuration.connection_url.back() == '/' ? "" : "/") + + configuration.container; + + if (storage_shared_key_credential) + result = std::make_unique(final_url, storage_shared_key_credential); + else + result = std::make_unique(final_url); + } + else + { + throw; } } } diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index d2a96173491..d51942d133d 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -39,7 +39,7 @@ namespace bool isConnectionString(const std::string & candidate) { - return candidate.starts_with("DefaultEndpointsProtocol"); + return !candidate.starts_with("http"); } } From 9a5d917cb561fefb67dfc22fdf95f71970d54726 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 19 Jun 2023 14:59:21 +0200 Subject: [PATCH 03/81] Fixed clang build issue --- src/Storages/StorageAzureBlob.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 41179d1e641..4eb60db0b99 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -344,7 +344,7 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) { std::string final_url; - size_t pos = configuration.connection_url.find("?"); + size_t pos = configuration.connection_url.find('?'); if (pos != std::string::npos) { auto url_without_sas = configuration.connection_url.substr(0, pos); From 4060beae4928865d6f836f828769f2c8e3478e97 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 22 Jun 2023 17:58:57 +0000 Subject: [PATCH 04/81] Structure to CapnProto/Protobuf schema take 1 --- src/Core/Settings.h | 6 +- src/Core/SettingsEnums.cpp | 8 +- src/Core/SettingsEnums.h | 2 +- src/DataTypes/NestedUtils.cpp | 15 +- src/DataTypes/NestedUtils.h | 3 + src/Formats/CapnProtoSchema.cpp | 298 ++++ .../{CapnProtoUtils.h => CapnProtoSchema.h} | 13 +- src/Formats/CapnProtoSerializer.cpp | 1220 +++++++++++++++++ src/Formats/CapnProtoSerializer.h | 25 + src/Formats/CapnProtoUtils.cpp | 734 ---------- src/Formats/FormatFactory.cpp | 3 + src/Formats/FormatSchemaInfo.cpp | 82 ++ src/Formats/FormatSchemaInfo.h | 21 + src/Formats/FormatSettings.h | 9 +- src/Formats/ProtobufSerializer.cpp | 4 +- src/Formats/StructureToCapnProtoSchema.cpp | 236 ++++ src/Formats/StructureToCapnProtoSchema.h | 16 + src/Formats/StructureToFormatSchemaUtils.cpp | 113 ++ src/Formats/StructureToFormatSchemaUtils.h | 27 + src/Formats/StructureToProtobufSchema.cpp | 214 +++ src/Formats/StructureToProtobufSchema.h | 16 + src/Functions/generateRandomStructure.cpp | 507 +++++++ src/Functions/structureToFormatSchema.cpp | 137 ++ .../Formats/Impl/CapnProtoRowInputFormat.cpp | 252 +--- .../Formats/Impl/CapnProtoRowInputFormat.h | 12 +- .../Formats/Impl/CapnProtoRowOutputFormat.cpp | 275 +--- .../Formats/Impl/CapnProtoRowOutputFormat.h | 21 +- .../Formats/Impl/ProtobufListInputFormat.cpp | 6 +- .../Formats/Impl/ProtobufListInputFormat.h | 2 +- .../Formats/Impl/ProtobufListOutputFormat.cpp | 7 +- .../Formats/Impl/ProtobufListOutputFormat.h | 4 +- .../Formats/Impl/ProtobufRowInputFormat.cpp | 6 +- .../Formats/Impl/ProtobufRowInputFormat.h | 2 +- .../Formats/Impl/ProtobufRowOutputFormat.cpp | 7 +- .../Formats/Impl/ProtobufRowOutputFormat.h | 4 +- src/Storages/StorageGenerateRandom.cpp | 4 +- .../TableFunctionGenerateRandom.cpp | 6 +- src/TableFunctions/TableFunctionS3.cpp | 1 + 38 files changed, 3029 insertions(+), 1289 deletions(-) create mode 100644 src/Formats/CapnProtoSchema.cpp rename src/Formats/{CapnProtoUtils.h => CapnProtoSchema.h} (59%) create mode 100644 src/Formats/CapnProtoSerializer.cpp create mode 100644 src/Formats/CapnProtoSerializer.h delete mode 100644 src/Formats/CapnProtoUtils.cpp create mode 100644 src/Formats/StructureToCapnProtoSchema.cpp create mode 100644 src/Formats/StructureToCapnProtoSchema.h create mode 100644 src/Formats/StructureToFormatSchemaUtils.cpp create mode 100644 src/Formats/StructureToFormatSchemaUtils.h create mode 100644 src/Formats/StructureToProtobufSchema.cpp create mode 100644 src/Formats/StructureToProtobufSchema.h create mode 100644 src/Functions/generateRandomStructure.cpp create mode 100644 src/Functions/structureToFormatSchema.cpp diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 607be1522db..c4a5fe02855 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -965,7 +965,11 @@ class IColumn; M(Bool, output_format_orc_string_as_string, false, "Use ORC String type instead of Binary for String columns", 0) \ M(ORCCompression, output_format_orc_compression_method, "lz4", "Compression method for ORC output format. Supported codecs: lz4, snappy, zlib, zstd, none (uncompressed)", 0) \ \ - M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ + M(CapnProtoEnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::CapnProtoEnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ + \ + M(Bool, format_capn_proto_use_autogenerated_schema, true, "Use autogenerated CapnProto schema when format_schema is not set", 0) \ + M(Bool, format_protobuf_use_autogenerated_schema, true, "Use autogenerated Protobuf when format_schema is not set", 0) \ + M(String, output_format_schema, "", "The path to the file where the automatically generated schema will be saved", 0) \ \ M(String, input_format_mysql_dump_table_name, "", "Name of the table in MySQL dump from which to read data", 0) \ M(Bool, input_format_mysql_dump_map_column_names, true, "Match columns from table in MySQL dump and columns from ClickHouse table by names", 0) \ diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index e0f16ea00db..a291a23c140 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -144,10 +144,10 @@ IMPLEMENT_SETTING_ENUM(TransactionsWaitCSNMode, ErrorCodes::BAD_ARGUMENTS, {"wait", TransactionsWaitCSNMode::WAIT}, {"wait_unknown", TransactionsWaitCSNMode::WAIT_UNKNOWN}}) -IMPLEMENT_SETTING_ENUM(EnumComparingMode, ErrorCodes::BAD_ARGUMENTS, - {{"by_names", FormatSettings::EnumComparingMode::BY_NAMES}, - {"by_values", FormatSettings::EnumComparingMode::BY_VALUES}, - {"by_names_case_insensitive", FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE}}) +IMPLEMENT_SETTING_ENUM(CapnProtoEnumComparingMode, ErrorCodes::BAD_ARGUMENTS, + {{"by_names", FormatSettings::CapnProtoEnumComparingMode::BY_NAMES}, + {"by_values", FormatSettings::CapnProtoEnumComparingMode::BY_VALUES}, + {"by_names_case_insensitive", FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE}}) IMPLEMENT_SETTING_ENUM(EscapingRule, ErrorCodes::BAD_ARGUMENTS, {{"None", FormatSettings::EscapingRule::None}, diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 3ae7bfaa673..1c5be910ef7 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -188,7 +188,7 @@ enum class TransactionsWaitCSNMode DECLARE_SETTING_ENUM(TransactionsWaitCSNMode) -DECLARE_SETTING_ENUM_WITH_RENAME(EnumComparingMode, FormatSettings::EnumComparingMode) +DECLARE_SETTING_ENUM_WITH_RENAME(CapnProtoEnumComparingMode, FormatSettings::CapnProtoEnumComparingMode) DECLARE_SETTING_ENUM_WITH_RENAME(EscapingRule, FormatSettings::EscapingRule) diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index f029ac6ba27..9ee803c4235 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -71,7 +71,7 @@ std::string extractTableName(const std::string & nested_name) } -Block flatten(const Block & block) +static Block flattenImpl(const Block & block, bool flatten_named_tuple) { Block res; @@ -114,7 +114,7 @@ Block flatten(const Block & block) else res.insert(elem); } - else if (const DataTypeTuple * type_tuple = typeid_cast(elem.type.get())) + else if (const DataTypeTuple * type_tuple = typeid_cast(elem.type.get()); type_tuple && flatten_named_tuple) { if (type_tuple->haveExplicitNames()) { @@ -143,6 +143,17 @@ Block flatten(const Block & block) return res; } +Block flatten(const Block & block) +{ + return flattenImpl(block, true); +} + + +Block flattenArrayOfTuples(const Block & block) +{ + return flattenImpl(block, false); +} + namespace { diff --git a/src/DataTypes/NestedUtils.h b/src/DataTypes/NestedUtils.h index 90fdd683493..e009ceb18fe 100644 --- a/src/DataTypes/NestedUtils.h +++ b/src/DataTypes/NestedUtils.h @@ -23,6 +23,9 @@ namespace Nested /// 2) For an Array with named Tuple element column, a Array(Tuple(x ..., y ..., ...)), replace it with multiple Array Columns, a.x ..., a.y ..., ... Block flatten(const Block & block); + /// Same as flatten but only for Array with named Tuple element column. + Block flattenArrayOfTuples(const Block & block); + /// Collect Array columns in a form of `column_name.element_name` to single Array(Tuple(...)) column. NamesAndTypesList collect(const NamesAndTypesList & names_and_types); diff --git a/src/Formats/CapnProtoSchema.cpp b/src/Formats/CapnProtoSchema.cpp new file mode 100644 index 00000000000..22518d5061a --- /dev/null +++ b/src/Formats/CapnProtoSchema.cpp @@ -0,0 +1,298 @@ +#include + +#if USE_CAPNP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_CAPN_PROTO_SCHEMA; + extern const int BAD_TYPE_OF_FIELD; + extern const int FILE_DOESNT_EXIST; + extern const int UNKNOWN_EXCEPTION; + extern const int CAPN_PROTO_BAD_TYPE; + extern const int BAD_ARGUMENTS; +} + +capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info) +{ + capnp::ParsedSchema schema; + try + { + int fd; + KJ_SYSCALL(fd = open(schema_info.schemaDirectory().data(), O_RDONLY)); // NOLINT(bugprone-suspicious-semicolon) + auto schema_dir = kj::newDiskDirectory(kj::OsFileHandle(fd)); + schema = impl.parseFromDirectory(*schema_dir, kj::Path::parse(schema_info.schemaPath()), {}); + } + catch (const kj::Exception & e) + { + /// That's not good to determine the type of error by its description, but + /// this is the only way to do it here, because kj doesn't specify the type of error. + auto description = std::string_view(e.getDescription().cStr()); + if (description.find("No such file or directory") != String::npos || description.find("no such directory") != String::npos) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot open CapnProto schema, file {} doesn't exists", schema_info.absoluteSchemaPath()); + + if (description.find("Parse error") != String::npos) + throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, "Cannot parse CapnProto schema {}:{}", schema_info.schemaPath(), e.getLine()); + + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, + "Unknown exception while parsing CapnProto schema: {}, schema dir and file: {}, {}", + description, schema_info.schemaDirectory(), schema_info.schemaPath()); + } + + auto message_maybe = schema.findNested(schema_info.messageName()); + auto * message_schema = kj::_::readMaybe(message_maybe); + if (!message_schema) + throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, + "CapnProto schema doesn't contain message with name {}", schema_info.messageName()); + return message_schema->asStruct(); +} + +bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema) +{ + return struct_schema.getFields().size() != struct_schema.getNonUnionFields().size(); +} + +bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema) +{ + return struct_schema.getFields().size() == struct_schema.getUnionFields().size(); +} + +/// Get full name of type for better exception messages. +String getCapnProtoFullTypeName(const capnp::Type & type) +{ + static const std::map capnp_simple_type_names = + { + {capnp::schema::Type::Which::BOOL, "Bool"}, + {capnp::schema::Type::Which::VOID, "Void"}, + {capnp::schema::Type::Which::INT8, "Int8"}, + {capnp::schema::Type::Which::INT16, "Int16"}, + {capnp::schema::Type::Which::INT32, "Int32"}, + {capnp::schema::Type::Which::INT64, "Int64"}, + {capnp::schema::Type::Which::UINT8, "UInt8"}, + {capnp::schema::Type::Which::UINT16, "UInt16"}, + {capnp::schema::Type::Which::UINT32, "UInt32"}, + {capnp::schema::Type::Which::UINT64, "UInt64"}, + {capnp::schema::Type::Which::FLOAT32, "Float32"}, + {capnp::schema::Type::Which::FLOAT64, "Float64"}, + {capnp::schema::Type::Which::TEXT, "Text"}, + {capnp::schema::Type::Which::DATA, "Data"}, + {capnp::schema::Type::Which::INTERFACE, "Interface"}, + {capnp::schema::Type::Which::ANY_POINTER, "AnyPointer"}, + }; + + switch (type.which()) + { + case capnp::schema::Type::Which::STRUCT: + { + auto struct_schema = type.asStruct(); + + auto non_union_fields = struct_schema.getNonUnionFields(); + std::vector non_union_field_names; + for (auto nested_field : non_union_fields) + non_union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); + + auto union_fields = struct_schema.getUnionFields(); + std::vector union_field_names; + for (auto nested_field : union_fields) + union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); + + String union_name = "Union(" + boost::algorithm::join(union_field_names, ", ") + ")"; + /// Check if the struct is a named union. + if (non_union_field_names.empty()) + return union_name; + + String type_name = "Struct(" + boost::algorithm::join(non_union_field_names, ", "); + /// Check if the struct contains unnamed union. + if (!union_field_names.empty()) + type_name += ", " + union_name; + type_name += ")"; + return type_name; + } + case capnp::schema::Type::Which::LIST: + return "List(" + getCapnProtoFullTypeName(type.asList().getElementType()) + ")"; + case capnp::schema::Type::Which::ENUM: + { + auto enum_schema = type.asEnum(); + String enum_name = "Enum("; + auto enumerants = enum_schema.getEnumerants(); + for (unsigned i = 0; i != enumerants.size(); ++i) + { + enum_name += String(enumerants[i].getProto().getName()) + " = " + std::to_string(enumerants[i].getOrdinal()); + if (i + 1 != enumerants.size()) + enum_name += ", "; + } + enum_name += ")"; + return enum_name; + } + default: + auto it = capnp_simple_type_names.find(type.which()); + if (it == capnp_simple_type_names.end()) + throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unknown CapnProto type"); + return it->second; + } +} + +namespace +{ + + template + static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) + { + std::vector> values; + for (auto enumerant : enumerants) + values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal())); + return std::make_shared>(std::move(values)); + } + + static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) + { + auto enumerants = enum_schema.getEnumerants(); + if (enumerants.size() < 128) + return getEnumDataTypeFromEnumerants(enumerants); + if (enumerants.size() < 32768) + return getEnumDataTypeFromEnumerants(enumerants); + + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums"); + } + + static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::INT8: + return std::make_shared(); + case capnp::schema::Type::INT16: + return std::make_shared(); + case capnp::schema::Type::INT32: + return std::make_shared(); + case capnp::schema::Type::INT64: + return std::make_shared(); + case capnp::schema::Type::BOOL: [[fallthrough]]; + case capnp::schema::Type::UINT8: + return std::make_shared(); + case capnp::schema::Type::UINT16: + return std::make_shared(); + case capnp::schema::Type::UINT32: + return std::make_shared(); + case capnp::schema::Type::UINT64: + return std::make_shared(); + case capnp::schema::Type::FLOAT32: + return std::make_shared(); + case capnp::schema::Type::FLOAT64: + return std::make_shared(); + case capnp::schema::Type::DATA: [[fallthrough]]; + case capnp::schema::Type::TEXT: + return std::make_shared(); + case capnp::schema::Type::ENUM: + return getEnumDataTypeFromEnumSchema(capnp_type.asEnum()); + case capnp::schema::Type::LIST: + { + auto list_schema = capnp_type.asList(); + auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType(), skip_unsupported_fields); + if (!nested_type) + return nullptr; + return std::make_shared(nested_type); + } + case capnp::schema::Type::STRUCT: + { + auto struct_schema = capnp_type.asStruct(); + + + if (struct_schema.getFields().size() == 0) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported"); + } + + /// Check if it can be Nullable. + if (checkIfStructIsNamedUnion(struct_schema)) + { + auto fields = struct_schema.getUnionFields(); + if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid())) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unions are not supported"); + } + auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType(); + if (value_type.isStruct() || value_type.isList()) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Tuples and Lists cannot be inside Nullable"); + } + + auto nested_type = getDataTypeFromCapnProtoType(value_type, skip_unsupported_fields); + if (!nested_type) + return nullptr; + return std::make_shared(nested_type); + } + + if (checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); + + /// Treat Struct as Tuple. + DataTypes nested_types; + Names nested_names; + for (auto field : struct_schema.getNonUnionFields()) + { + auto nested_type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); + if (!nested_type) + continue; + nested_names.push_back(field.getProto().getName()); + nested_types.push_back(nested_type); + } + if (nested_types.empty()) + return nullptr; + return std::make_shared(std::move(nested_types), std::move(nested_names)); + } + default: + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type)); + } + } +} + +} + +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields) +{ + if (checkIfStructContainsUnnamedUnion(schema)) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); + + NamesAndTypesList names_and_types; + for (auto field : schema.getNonUnionFields()) + { + auto name = field.getProto().getName(); + auto type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); + if (type) + names_and_types.emplace_back(name, type); + } + if (names_and_types.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types"); + + return names_and_types; +} + +} + +#endif diff --git a/src/Formats/CapnProtoUtils.h b/src/Formats/CapnProtoSchema.h similarity index 59% rename from src/Formats/CapnProtoUtils.h rename to src/Formats/CapnProtoSchema.h index 2d8cdb418d7..225f6f56207 100644 --- a/src/Formats/CapnProtoUtils.h +++ b/src/Formats/CapnProtoSchema.h @@ -30,17 +30,14 @@ public: capnp::StructSchema getMessageSchema(const FormatSchemaInfo & schema_info); }; -std::pair splitCapnProtoFieldName(const String & name); +bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema); +bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema); -bool compareEnumNames(const String & first, const String & second, FormatSettings::EnumComparingMode mode); - -std::pair getStructBuilderAndFieldByColumnName(capnp::DynamicStruct::Builder struct_builder, const String & name); - -capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Reader & struct_reader, const String & name); - -void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode); +/// Get full name of type for better exception messages. +String getCapnProtoFullTypeName(const capnp::Type & type); NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields); + } #endif diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp new file mode 100644 index 00000000000..1a9a51756ba --- /dev/null +++ b/src/Formats/CapnProtoSerializer.cpp @@ -0,0 +1,1220 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int THERE_IS_NO_COLUMN; + extern const int BAD_TYPE_OF_FIELD; + extern const int CAPN_PROTO_BAD_CAST; + extern const int INCORRECT_DATA; + extern const int ILLEGAL_COLUMN; +} + +namespace +{ + std::pair splitFieldName(const String & name) + { + const auto * begin = name.data(); + const auto * end = name.data() + name.size(); + const auto * it = find_first_symbols<'_', '.'>(begin, end); + String first = String(begin, it); + String second = it == end ? "" : String(it + 1, end); + return {first, second}; + } + + std::optional findFieldByName(const capnp::StructSchema & struct_schema, const String & name) + { + const auto & fields = struct_schema.getFields(); + for (auto field : fields) + { + auto field_name = String(field.getProto().getName()); + if (boost::to_lower_copy(name) == boost::to_lower_copy(field_name)) + return field; + } + return std::nullopt; + } + + [[noreturn]] void throwCannotConvert(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type) + { + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto type {}", + name, + type->getName(), + getCapnProtoFullTypeName(capnp_type)); + } + + struct FieldBuilder + { + virtual ~FieldBuilder() = default; + }; + + struct ListBuilder : public FieldBuilder + { + explicit ListBuilder(capnp::DynamicValue::Builder builder) : impl(builder.as()) + { + } + + capnp::DynamicList::Builder impl; + std::vector> nested_builders; + }; + + struct StructBuilder : public FieldBuilder + { + explicit StructBuilder(capnp::DynamicValue::Builder builder, size_t fields_size) : impl(builder.as()), field_builders(fields_size) + { + } + + explicit StructBuilder(capnp::DynamicStruct::Builder struct_builder, size_t fields_size) : impl(std::move(struct_builder)), field_builders(fields_size) + { + } + + capnp::DynamicStruct::Builder impl; + std::vector> field_builders; + }; + + std::unique_ptr initStructFieldBuilderIfNeeded(const ColumnPtr & column, size_t row_num, capnp::DynamicStruct::Builder & struct_builder, const capnp::StructSchema::Field & field, const capnp::Type & capnp_type, size_t nested_fields_size) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::LIST: + { + const auto * array_column = assert_cast(column.get()); + size_t size = array_column->getOffsets()[row_num] - array_column->getOffsets()[row_num - 1]; + return std::make_unique(struct_builder.init(field, static_cast(size))); + } + case capnp::schema::Type::STRUCT: + { + return std::make_unique(struct_builder.init(field), nested_fields_size); + } + default: + return nullptr; + } + } + + class ICapnProtoSerializer + { + public: + virtual std::optional writeRow(const ColumnPtr & column, FieldBuilder * builder, size_t row_num) = 0; + virtual void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) = 0; + + virtual ~ICapnProtoSerializer() = default; + }; + + template + class CapnProtoIntegerSerializer : public ICapnProtoSerializer + { + public: + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::INT) + return capnp::DynamicValue::Reader(column->getInt(row_num)); + if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::UINT) + return capnp::DynamicValue::Reader(column->getUInt(row_num)); + return capnp::DynamicValue::Reader(column->getBool(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + NumericType value; + if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::INT) + value = static_cast(reader.as()); + else if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::UINT) + value = static_cast(reader.as()); + else if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::BOOL) + value = static_cast(reader.as()); + + if constexpr (is_bool_data_type) + assert_cast(column).insertValue(static_cast(value)); + else + assert_cast &>(column).insertValue(value); + } + }; + + template + static std::unique_ptr createIntegerSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::INT8: [[fallthrough]]; + case capnp::schema::Type::INT16: [[fallthrough]]; + case capnp::schema::Type::INT32: [[fallthrough]]; + case capnp::schema::Type::INT64: + return std::make_unique>(); + case capnp::schema::Type::UINT8: [[fallthrough]]; + case capnp::schema::Type::UINT16: [[fallthrough]]; + case capnp::schema::Type::UINT32: [[fallthrough]]; + case capnp::schema::Type::UINT64: + return std::make_unique>(); + case capnp::schema::Type::BOOL: + return std::make_unique>(); + default: + throwCannotConvert(data_type, column_name, capnp_type); + } + } + + template + class CapnProtoBigIntegerSerializer : public ICapnProtoSerializer + { + public: + CapnProtoBigIntegerSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(NumericType)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + + private: + DataTypePtr data_type; + }; + + template + class CapnProtoFloatSerializer : public ICapnProtoSerializer + { + public: + CapnProtoFloatSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isFloat32() && !capnp_type.isFloat64()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getFloat64(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast &>(column).insertValue(reader.as()); + } + }; + + template + class CapnProtoEnumSerializer : public ICapnProtoSerializer + { + public: + CapnProtoEnumSerializer( + const DataTypePtr & data_type_, + const String & column_name, + const capnp::Type & capnp_type, + const FormatSettings::CapnProtoEnumComparingMode enum_comparing_mode_) : data_type(data_type_), enum_schema(capnp_type.asEnum()), enum_comparing_mode(enum_comparing_mode_) + { + if (!capnp_type.isEnum()) + throwCannotConvert(data_type, column_name, capnp_type); + + bool to_lower = enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE; + const auto * enum_type = assert_cast *>(data_type.get()); + const auto & enum_values = dynamic_cast &>(*enum_type); + + auto enumerants = enum_schema.getEnumerants(); + constexpr auto max_value = std::is_same_v ? INT8_MAX : INT16_MAX; + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + { + /// In CapnProto Enum fields are numbered sequentially starting from zero. + if (enumerants.size() > max_value) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Enum from CapnProto schema contains values that are out of range for Clickhouse enum type {}", + data_type->getName()); + + auto values = enum_values.getSetOfAllValues(); + std::unordered_set capn_enum_values; + for (auto enumerant : enumerants) + capn_enum_values.insert(EnumType(enumerant.getOrdinal())); + if (values != capn_enum_values) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "The set of values in Enum from CapnProto schema is different from the set of values in ClickHouse Enum"); + } + else + { + auto names = enum_values.getSetOfAllNames(to_lower); + std::unordered_set capn_enum_names; + + for (auto enumerant : enumerants) + { + String name = enumerant.getProto().getName(); + capn_enum_names.insert(to_lower ? boost::algorithm::to_lower_copy(name) : name); + } + + if (names != capn_enum_names) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "The set of names in Enum from CapnProto schema is different from the set of names in ClickHouse Enum"); + } + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + const auto * enum_data_type = assert_cast *>(data_type.get()); + EnumType enum_value = assert_cast &>(*column).getElement(row_num); + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + return capnp::DynamicValue::Reader(capnp::DynamicEnum(enum_schema, enum_value)); + + auto enum_name = enum_data_type->getNameForValue(enum_value); + for (const auto enumerant : enum_schema.getEnumerants()) + { + if (compareEnumNames(String(enum_name), enumerant.getProto().getName(), enum_comparing_mode)) + return capnp::DynamicValue::Reader(capnp::DynamicEnum(enumerant)); + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert CLickHouse Enum value to CapnProto Enum"); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto enum_value = reader.as(); + auto enumerant = *kj::_::readMaybe(enum_value.getEnumerant()); + auto enum_type = assert_cast *>(data_type.get()); + DataTypePtr nested_type = std::make_shared>(); + switch (enum_comparing_mode) + { + case FormatSettings::CapnProtoEnumComparingMode::BY_VALUES: + { + assert_cast &>(column).insertValue(static_cast(enumerant.getOrdinal())); + return; + } + case FormatSettings::CapnProtoEnumComparingMode::BY_NAMES: + { + auto value = enum_type->getValue(String(enumerant.getProto().getName())); + assert_cast &>(column).insertValue(value); + return; + } + case FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE: + { + /// Find the same enum name case insensitive. + String enum_name = enumerant.getProto().getName(); + for (auto & name : enum_type->getAllRegisteredNames()) + { + if (compareEnumNames(name, enum_name, enum_comparing_mode)) + { + assert_cast &>(column).insertValue(enum_type->getValue(name)); + break; + } + } + return; + } + } + } + + private: + bool compareEnumNames(const String & first, const String & second, const FormatSettings::CapnProtoEnumComparingMode mode) + { + if (mode == FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE) + return boost::algorithm::to_lower_copy(first) == boost::algorithm::to_lower_copy(second); + return first == second; + } + + DataTypePtr data_type; + capnp::EnumSchema enum_schema; + const FormatSettings::CapnProtoEnumComparingMode enum_comparing_mode; + }; + + class CapnProtoDateSerializer : public ICapnProtoSerializer + { + public: + CapnProtoDateSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt16()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getUInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + class CapnProtoDate32Serializer : public ICapnProtoSerializer + { + public: + CapnProtoDate32Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + class CapnProtoDateTimeSerializer : public ICapnProtoSerializer + { + public: + CapnProtoDateTimeSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + class CapnProtoDateTime64Serializer : public ICapnProtoSerializer + { + public: + CapnProtoDateTime64Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isInt64()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + template + class CapnProtoDecimalSerializer : public ICapnProtoSerializer + { + public: + CapnProtoDecimalSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + auto which = WhichDataType(data_type); + if ((!capnp_type.isInt32() && which.isDecimal32()) || (!capnp_type.isInt64() && which.isDecimal64())) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast &>(column).insertValue(reader.as()); + } + }; + + template + class CapnProtoBigDecimalSerializer : public ICapnProtoSerializer + { + public: + CapnProtoBigDecimalSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(DecimalType)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + + private: + DataTypePtr data_type; + }; + + template + class CapnProtoStringSerializer : public ICapnProtoSerializer + { + public: + CapnProtoStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type_) : capnp_type(capnp_type_) + { + if (!capnp_type.isData() && !capnp_type.isText()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + if constexpr (is_binary) + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + /// For type TEXT 0 byte must be at the end, so we cannot use getDataAt method, because it cuts last 0 byte. + const auto & string_column = assert_cast(*column); + const auto & chars = string_column.getChars(); + const auto & offsets = string_column.getOffsets(); + size_t start = offsets[ssize_t(row_num) - 1]; + size_t size = offsets[row_num] - start; + return capnp::DynamicValue::Reader(capnp::Text::Reader(reinterpret_cast(&chars[start]), size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + if constexpr (is_binary) + { + auto value = reader.as(); + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + else + { + auto value = reader.as(); + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + } + + private: + capnp::Type capnp_type; + }; + + template + class CapnProtoFixedStringSerializer : public ICapnProtoSerializer + { + public: + CapnProtoFixedStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type_) : capnp_type(capnp_type_) + { + if (!capnp_type.isData() && !capnp_type.isText()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + if constexpr (is_binary) + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + + if (data.data[data.size - 1] == 0) + return capnp::DynamicValue::Reader(capnp::Text::Reader(reinterpret_cast(data.data), data.size)); + + /// In TEXT type data should be null-terminated, but ClickHouse FixedString data could not be. + /// To make data null-terminated we should copy it to temporary String object and use it in capnp::Text::Reader. + /// Note that capnp::Text::Reader works only with pointer to the data and it's size, so we should + /// guarantee that new String object life time is longer than capnp::Text::Reader life time. + tmp_string = data.toString(); + return capnp::DynamicValue::Reader(capnp::Text::Reader(reinterpret_cast(tmp_string.data()), tmp_string.size())); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto & fixed_string_column = assert_cast(column); + if constexpr (is_binary) + { + auto value = reader.as(); + if (value.size() > fixed_string_column.getN()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); + + fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); + } + else + { + auto value = reader.as(); + if (value.size() > fixed_string_column.getN()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); + + fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); + } + } + + private: + String tmp_string; + capnp::Type capnp_type; + }; + + class CapnProtoIPv4Serializer : public ICapnProtoSerializer + { + public: + CapnProtoIPv4Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(assert_cast(*column).getElement(row_num).toUnderType()); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(IPv4(reader.as())); + } + }; + + class CapnProtoIPv6Serializer : public ICapnProtoSerializer + { + public: + CapnProtoIPv6Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(IPv6)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of IPv6 value: {}", value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + }; + + class CapnProtoUUIDSerializer : public ICapnProtoSerializer + { + public: + CapnProtoUUIDSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(UUID)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of UUID value: {}", value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + }; + + std::unique_ptr createSerializer(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings); + + class CapnProtoLowCardinalitySerializer : public ICapnProtoSerializer + { + public: + CapnProtoLowCardinalitySerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + nested_serializer = createSerializer(assert_cast(*data_type).getDictionaryType(), column_name, capnp_type, settings); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + const auto & low_cardinality_column = assert_cast(*column); + size_t index = low_cardinality_column.getIndexAt(row_num); + const auto & dict_column = low_cardinality_column.getDictionary().getNestedColumn(); + return nested_serializer->writeRow(dict_column, field_builder, index); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto & low_cardinality_column = assert_cast(column); + auto tmp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); + nested_serializer->readRow(*tmp_column, reader); + low_cardinality_column.insertFromFullColumn(*tmp_column, 0); + } + + private: + std::unique_ptr nested_serializer; + }; + + class CapnProtoNullableSerializer : public ICapnProtoSerializer + { + public: + CapnProtoNullableSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isStruct()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type, got CapnProto type {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + /// Check that struct is a named union of type VOID and one arbitrary type. + auto struct_schema = capnp_type.asStruct(); + if (!checkIfStructIsNamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." + "Given CapnProto struct is not a named union: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + auto union_fields = struct_schema.getUnionFields(); + if (union_fields.size() != 2) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." + "Given CapnProto union have more than 2 fields: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + auto first = union_fields[0]; + auto second = union_fields[1]; + auto nested_type = assert_cast(data_type.get())->getNestedType(); + if (first.getType().isVoid()) + { + null_field = first; + nested_field = second; + nested_capnp_type = second.getType(); + if (nested_capnp_type.isStruct()) + nested_fields_size = nested_capnp_type.asStruct().getFields().size(); + nested_serializer = createSerializer(nested_type, column_name, nested_capnp_type, settings); + } + else if (second.getType().isVoid()) + { + null_field = second; + nested_field = first; + nested_capnp_type = first.getType(); + if (nested_capnp_type.isStruct()) + nested_fields_size = nested_capnp_type.asStruct().getFields().size(); + nested_serializer = createSerializer(nested_type, column_name, nested_capnp_type, settings); + } + else + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." + "Given CapnProto union doesn't have field with type Void: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + assert(field_builder); + auto & struct_builder = assert_cast(*field_builder); + const auto & nullable_column = assert_cast(*column); + if (nullable_column.isNullAt(row_num)) + { + struct_builder.impl.set(null_field, capnp::Void()); + } + else + { + struct_builder.impl.clear(nested_field); + const auto & nested_column = nullable_column.getNestedColumnPtr(); + auto nested_field_builder = initStructFieldBuilderIfNeeded(nested_column, row_num, struct_builder.impl, nested_field, nested_capnp_type, nested_fields_size); + auto value = nested_serializer->writeRow(nested_column, nested_field_builder.get(), row_num); + if (value) + struct_builder.impl.set(nested_field, *value); + } + + return std::nullopt; + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto struct_reader = reader.as(); + auto & nullable_column = assert_cast(column); + auto field = *kj::_::readMaybe(struct_reader.which()); + if (field.getType().isVoid()) + nullable_column.insertDefault(); + else + { + auto & nested_column = nullable_column.getNestedColumn(); + auto nested_reader = struct_reader.get(field); + nested_serializer->readRow(nested_column, nested_reader); + nullable_column.getNullMapData().push_back(0); + } + } + + private: + std::unique_ptr nested_serializer; + capnp::StructSchema::Field null_field; + capnp::StructSchema::Field nested_field; + size_t nested_fields_size = 0; + capnp::Type nested_capnp_type; + }; + + class CapnProtoArraySerializer : public ICapnProtoSerializer + { + public: + CapnProtoArraySerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isList()) + throwCannotConvert(data_type, column_name, capnp_type); + + auto nested_type = assert_cast(data_type.get())->getNestedType(); + element_type = capnp_type.asList().getElementType(); + if (element_type.isStruct()) + element_struct_fields = element_type.asStruct().getFields().size(); + nested_serializer = createSerializer(nested_type, column_name, capnp_type.asList().getElementType(), settings); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + assert(field_builder); + auto & list_builder = assert_cast(*field_builder); + const auto * array_column = assert_cast(column.get()); + const auto & nested_column = array_column->getDataPtr(); + const auto & offsets = array_column->getOffsets(); + auto offset = offsets[row_num - 1]; + size_t size = offsets[row_num] - offset; + bool need_nested_builders = list_builder.nested_builders.empty(); + for (unsigned i = 0; i != static_cast(size); ++i) + { + if (need_nested_builders) + { + /// For nested lists we need to initialize nested list builder. + if (element_type.isList()) + { + const auto & nested_offset = checkAndGetColumn(*nested_column)->getOffsets(); + size_t nested_array_size = nested_offset[offset + i] - nested_offset[offset + i - 1]; + list_builder.nested_builders.emplace_back(std::make_unique(list_builder.impl.init(i, static_cast(nested_array_size)))); + } + else if (element_type.isStruct()) + { + list_builder.nested_builders.emplace_back(std::make_unique(list_builder.impl[i], element_struct_fields)); + } + else + { + list_builder.nested_builders.emplace_back(); + } + } + + auto value = nested_serializer->writeRow(nested_column, list_builder.nested_builders[i].get(), offset + i); + if (value) + list_builder.impl.set(i, *value); + } + + return std::nullopt; + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto list_reader = reader.as(); + auto & column_array = assert_cast(column); + auto & offsets = column_array.getOffsets(); + offsets.push_back(offsets.back() + list_reader.size()); + + auto & nested_column = column_array.getData(); + for (const auto & nested_reader : list_reader) + nested_serializer->readRow(nested_column, nested_reader); + } + + private: + std::unique_ptr nested_serializer; + capnp::Type element_type; + size_t element_struct_fields; + }; + + class CapnProtoMapSerializer : public ICapnProtoSerializer + { + public: + CapnProtoMapSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + /// We output/input Map type as follow CapnProto schema + /// + /// struct Map { + /// struct Entry { + /// key @0: Key; + /// value @1: Value; + /// } + /// entries @0 :List(Entry); + /// } + + if (!capnp_type.isStruct()) + throwCannotConvert(data_type, column_name, capnp_type); + + auto struct_schema = capnp_type.asStruct(); + + if (checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto Struct with unnamed union {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type)); + + if (struct_schema.getFields().size() != 1) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Map type can be represented as a Struct with one list field, got struct: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + const auto & field_type = struct_schema.getFields()[0].getType(); + if (!field_type.isList()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Map type can be represented as a Struct with one list field, got field: {}", + column_name, + getCapnProtoFullTypeName(field_type)); + + auto list_element_type = field_type.asList().getElementType(); + if (!list_element_type.isStruct()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Field of struct that represents Map should be a list of structs, got list of {}", + column_name, + getCapnProtoFullTypeName(list_element_type)); + + auto key_value_struct = list_element_type.asStruct(); + if (checkIfStructContainsUnnamedUnion(key_value_struct)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": struct that represents Map entries is unnamed union: {}", + column_name, + getCapnProtoFullTypeName(list_element_type)); + + if (key_value_struct.getFields().size() != 2) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": struct that represents Map entries should contain only 2 fields, got struct {}", + column_name, + getCapnProtoFullTypeName(list_element_type)); + + const auto & map_type = assert_cast(*data_type); + DataTypes types = {map_type.getKeyType(), map_type.getValueType()}; + Names names = {"key", "value"}; + auto entries_type = std::make_shared(std::make_shared(types, names)); + entries_field = struct_schema.getFields()[0]; + entries_capnp_type = entries_field.getType(); + nested_serializer = createSerializer(entries_type, column_name, field_type, settings); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + assert(field_builder); + auto & struct_builder = assert_cast(*field_builder); + const auto & entries_column = assert_cast(column.get())->getNestedColumnPtr(); + auto entries_builder = initStructFieldBuilderIfNeeded(entries_column, row_num, struct_builder.impl, entries_field, entries_capnp_type, 0); + nested_serializer->writeRow(entries_column, entries_builder.get(), row_num); + return std::nullopt; + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto struct_reader = reader.as(); + auto & entries_column = assert_cast(column).getNestedColumn(); + nested_serializer->readRow(entries_column, struct_reader.get(entries_field)); + } + + private: + std::unique_ptr nested_serializer; + capnp::StructSchema::Field entries_field; + capnp::Type entries_capnp_type; + }; + + class CapnProtoStructureSerializer : public ICapnProtoSerializer + { + public: + CapnProtoStructureSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + { + initialize(data_types, names, schema, settings); + } + + CapnProtoStructureSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isStruct()) + throwCannotConvert(data_type, column_name, capnp_type); + + auto struct_schema = capnp_type.asStruct(); + + if (checkIfStructIsNamedUnion(struct_schema) || checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto named union/struct with unnamed union {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type)); + + const auto * tuple_data_type = assert_cast(data_type.get()); + auto nested_types = tuple_data_type->getElements(); + Names nested_names; + bool have_explicit_names = tuple_data_type->haveExplicitNames(); + auto structure_fields = struct_schema.getFields(); + if (!have_explicit_names) + { + if (nested_types.size() != structure_fields.size()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto type {}: Tuple and Struct have different sizes {} != {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type), + nested_types.size(), + structure_fields.size()); + nested_names.reserve(structure_fields.size()); + for (auto field : structure_fields) + nested_names.push_back(field.getProto().getName()); + } + else + { + nested_names = tuple_data_type->getElementNames(); + } + + try + { + initialize(nested_types, nested_names, struct_schema, settings); + } + catch (Exception & e) + { + e.addMessage("(while converting column {})", column_name); + throw e; + } + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * builder, size_t row_num) override + { + assert(builder); + auto & struct_builder = assert_cast(*builder); + if (auto tuple_column = typeid_cast(column.get())) + writeRow(tuple_column->getColumnsCopy(), struct_builder, row_num); + else + writeRow(Columns{column}, struct_builder, row_num); + return std::nullopt; + } + + void writeRow(const Columns & columns, StructBuilder & struct_builder, size_t row_num) + { + for (size_t i = 0; i != columns.size(); ++i) + { + const auto & field = fields[i]; + size_t field_index = field.getIndex(); + if (likely(!struct_builder.field_builders[field_index])) + struct_builder.field_builders[field_index] = initStructFieldBuilderIfNeeded( + columns[i], row_num, struct_builder.impl, field, fields_types[i], nested_field_sizes[i]); + + auto value = field_serializers[i]->writeRow(columns[i], struct_builder.field_builders[field_index].get(), row_num); + if (value) + struct_builder.impl.set(field, *value); + } + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto struct_reader = reader.as(); + if (auto tuple_column = typeid_cast(&column)) + { + for (size_t i = 0; i != tuple_column->tupleSize(); ++i) + field_serializers[i]->readRow(tuple_column->getColumn(i), struct_reader.get(fields[i])); + } + else + field_serializers[0]->readRow(column, struct_reader.get(fields[0])); + } + + void readRow(MutableColumns & columns, const capnp::DynamicStruct::Reader & reader) + { + for (size_t i = 0; i != columns.size(); ++i) + field_serializers[i]->readRow(*columns[i], reader.get(fields[i])); + } + + private: + void initialize(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + { + field_serializers.reserve(data_types.size()); + fields.reserve(data_types.size()); + fields_types.reserve(data_types.size()); + nested_field_sizes.reserve(data_types.size()); + for (size_t i = 0; i != data_types.size(); ++i) + { + auto [field_name, _] = splitFieldName(names[i]); + auto field = findFieldByName(schema, field_name); + if (!field) + throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto schema doesn't contain field with name {}", field_name); + + fields.push_back(*field); + auto capnp_type = field->getType(); + fields_types.push_back(capnp_type); + nested_field_sizes.push_back(capnp_type.isStruct() ? capnp_type.asStruct().getFields().size() : 0); + field_serializers.push_back(createSerializer(data_types[i], names[i], capnp_type, settings)); + } + } + + std::vector> field_serializers; + std::vector fields; + std::vector nested_field_sizes; + std::vector fields_types; + }; + + std::unique_ptr createSerializer(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + auto [field_name, nested_name] = splitFieldName(name); + if (!nested_name.empty() && !capnp_type.isList()) + { + if (!capnp_type.isStruct()) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); + + return std::make_unique(DataTypes{type}, Names{nested_name}, capnp_type.asStruct(), settings); + } + + switch (type->getTypeId()) + { + case TypeIndex::Int8: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt8: + if (isBool(type)) + return createIntegerSerializer(type, name, capnp_type); + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int16: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt16: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int32: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt32: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int64: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt64: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int128: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::UInt128: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Int256: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::UInt256: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Float32: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Float64: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Date: + return std::make_unique(type, name, capnp_type); + case TypeIndex::Date32: + return std::make_unique(type, name, capnp_type); + case TypeIndex::DateTime: + return std::make_unique(type, name, capnp_type); + case TypeIndex::DateTime64: + return std::make_unique(type, name, capnp_type); + case TypeIndex::Decimal32: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Decimal64: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Decimal128: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Decimal256: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::IPv4: + return std::make_unique(type, name, capnp_type); + case TypeIndex::IPv6: + return std::make_unique(type, name, capnp_type); + case TypeIndex::UUID: + return std::make_unique(type, name, capnp_type); + case TypeIndex::Enum8: + return std::make_unique>(type, name, capnp_type, settings.enum_comparing_mode); + case TypeIndex::Enum16: + return std::make_unique>(type, name, capnp_type, settings.enum_comparing_mode); + case TypeIndex::String: + if (capnp_type.isData()) + return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); + case TypeIndex::FixedString: + if (capnp_type.isData()) + return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); + case TypeIndex::LowCardinality: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Nullable: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Array: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Map: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Tuple: + return std::make_unique(type, name, capnp_type, settings); + default: + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not supported in CapnProto format", type->getName()); + } + } +} + +class CapnProtoSerializer::Impl +{ +public: + Impl(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + : struct_serializer(std::make_unique(data_types, names, schema, settings)) + , fields_size(schema.getFields().size()) + { + } + + void writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num) + { + StructBuilder struct_builder(std::move(builder), fields_size); + struct_serializer->writeRow(columns, struct_builder, row_num); + } + + void readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader) + { + struct_serializer->readRow(columns, reader); + } + +private: + std::unique_ptr struct_serializer; + size_t fields_size; +}; + +CapnProtoSerializer::CapnProtoSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + : serializer_impl(std::make_unique(data_types, names, schema, settings)) +{ +} + +void CapnProtoSerializer::writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num) +{ + serializer_impl->writeRow(columns, std::move(builder), row_num); +} + +void CapnProtoSerializer::readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader) +{ + serializer_impl->readRow(columns, reader); +} + +CapnProtoSerializer::~CapnProtoSerializer() = default; + +} diff --git a/src/Formats/CapnProtoSerializer.h b/src/Formats/CapnProtoSerializer.h new file mode 100644 index 00000000000..efae797875b --- /dev/null +++ b/src/Formats/CapnProtoSerializer.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class CapnProtoSerializer +{ +public: + CapnProtoSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings); + + void writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num); + + void readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader); + + ~CapnProtoSerializer(); + +private: + class Impl; + std::unique_ptr serializer_impl; +}; + +} diff --git a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp deleted file mode 100644 index d6c032408bb..00000000000 --- a/src/Formats/CapnProtoUtils.cpp +++ /dev/null @@ -1,734 +0,0 @@ -#include - -#if USE_CAPNP - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int CANNOT_PARSE_CAPN_PROTO_SCHEMA; - extern const int THERE_IS_NO_COLUMN; - extern const int BAD_TYPE_OF_FIELD; - extern const int CAPN_PROTO_BAD_CAST; - extern const int FILE_DOESNT_EXIST; - extern const int UNKNOWN_EXCEPTION; - extern const int INCORRECT_DATA; - extern const int CAPN_PROTO_BAD_TYPE; - extern const int BAD_ARGUMENTS; -} - -std::pair splitCapnProtoFieldName(const String & name) -{ - const auto * begin = name.data(); - const auto * end = name.data() + name.size(); - const auto * it = find_first_symbols<'_', '.'>(begin, end); - String first = String(begin, it); - String second = it == end ? "" : String(it + 1, end); - return {first, second}; -} - -capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info) -{ - capnp::ParsedSchema schema; - try - { - int fd; - KJ_SYSCALL(fd = open(schema_info.schemaDirectory().data(), O_RDONLY)); // NOLINT(bugprone-suspicious-semicolon) - auto schema_dir = kj::newDiskDirectory(kj::OsFileHandle(fd)); - schema = impl.parseFromDirectory(*schema_dir, kj::Path::parse(schema_info.schemaPath()), {}); - } - catch (const kj::Exception & e) - { - /// That's not good to determine the type of error by its description, but - /// this is the only way to do it here, because kj doesn't specify the type of error. - auto description = std::string_view(e.getDescription().cStr()); - if (description.find("No such file or directory") != String::npos || description.find("no such directory") != String::npos) - throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot open CapnProto schema, file {} doesn't exists", schema_info.absoluteSchemaPath()); - - if (description.find("Parse error") != String::npos) - throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, "Cannot parse CapnProto schema {}:{}", schema_info.schemaPath(), e.getLine()); - - throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, - "Unknown exception while parsing CapnProto schema: {}, schema dir and file: {}, {}", - description, schema_info.schemaDirectory(), schema_info.schemaPath()); - } - - auto message_maybe = schema.findNested(schema_info.messageName()); - auto * message_schema = kj::_::readMaybe(message_maybe); - if (!message_schema) - throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, - "CapnProto schema doesn't contain message with name {}", schema_info.messageName()); - return message_schema->asStruct(); -} - -bool compareEnumNames(const String & first, const String & second, FormatSettings::EnumComparingMode mode) -{ - if (mode == FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE) - return boost::algorithm::to_lower_copy(first) == boost::algorithm::to_lower_copy(second); - return first == second; -} - -static const std::map capnp_simple_type_names = -{ - {capnp::schema::Type::Which::BOOL, "Bool"}, - {capnp::schema::Type::Which::VOID, "Void"}, - {capnp::schema::Type::Which::INT8, "Int8"}, - {capnp::schema::Type::Which::INT16, "Int16"}, - {capnp::schema::Type::Which::INT32, "Int32"}, - {capnp::schema::Type::Which::INT64, "Int64"}, - {capnp::schema::Type::Which::UINT8, "UInt8"}, - {capnp::schema::Type::Which::UINT16, "UInt16"}, - {capnp::schema::Type::Which::UINT32, "UInt32"}, - {capnp::schema::Type::Which::UINT64, "UInt64"}, - {capnp::schema::Type::Which::FLOAT32, "Float32"}, - {capnp::schema::Type::Which::FLOAT64, "Float64"}, - {capnp::schema::Type::Which::TEXT, "Text"}, - {capnp::schema::Type::Which::DATA, "Data"}, - {capnp::schema::Type::Which::INTERFACE, "Interface"}, - {capnp::schema::Type::Which::ANY_POINTER, "AnyPointer"}, -}; - -static bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema) -{ - return struct_schema.getFields().size() != struct_schema.getNonUnionFields().size(); -} - -static bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema) -{ - return struct_schema.getFields().size() == struct_schema.getUnionFields().size(); -} - -/// Get full name of type for better exception messages. -static String getCapnProtoFullTypeName(const capnp::Type & type) -{ - switch (type.which()) - { - case capnp::schema::Type::Which::STRUCT: - { - auto struct_schema = type.asStruct(); - - auto non_union_fields = struct_schema.getNonUnionFields(); - std::vector non_union_field_names; - for (auto nested_field : non_union_fields) - non_union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); - - auto union_fields = struct_schema.getUnionFields(); - std::vector union_field_names; - for (auto nested_field : union_fields) - union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); - - String union_name = "Union(" + boost::algorithm::join(union_field_names, ", ") + ")"; - /// Check if the struct is a named union. - if (non_union_field_names.empty()) - return union_name; - - String type_name = "Struct(" + boost::algorithm::join(non_union_field_names, ", "); - /// Check if the struct contains unnamed union. - if (!union_field_names.empty()) - type_name += ", " + union_name; - type_name += ")"; - return type_name; - } - case capnp::schema::Type::Which::LIST: - return "List(" + getCapnProtoFullTypeName(type.asList().getElementType()) + ")"; - case capnp::schema::Type::Which::ENUM: - { - auto enum_schema = type.asEnum(); - String enum_name = "Enum("; - auto enumerants = enum_schema.getEnumerants(); - for (unsigned i = 0; i != enumerants.size(); ++i) - { - enum_name += String(enumerants[i].getProto().getName()) + " = " + std::to_string(enumerants[i].getOrdinal()); - if (i + 1 != enumerants.size()) - enum_name += ", "; - } - enum_name += ")"; - return enum_name; - } - default: - auto it = capnp_simple_type_names.find(type.which()); - if (it == capnp_simple_type_names.end()) - throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unknown CapnProto type"); - return it->second; - } -} - -template -static bool checkEnums(const capnp::Type & capnp_type, const DataTypePtr column_type, FormatSettings::EnumComparingMode mode, UInt64 max_value, String & error_message) -{ - if (!capnp_type.isEnum()) - return false; - - auto enum_schema = capnp_type.asEnum(); - bool to_lower = mode == FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE; - const auto * enum_type = assert_cast *>(column_type.get()); - const auto & enum_values = dynamic_cast &>(*enum_type); - - auto enumerants = enum_schema.getEnumerants(); - if (mode == FormatSettings::EnumComparingMode::BY_VALUES) - { - /// In CapnProto Enum fields are numbered sequentially starting from zero. - if (enumerants.size() > max_value) - { - error_message += "Enum from CapnProto schema contains values that is out of range for Clickhouse Enum"; - return false; - } - - auto values = enum_values.getSetOfAllValues(); - std::unordered_set capn_enum_values; - for (auto enumerant : enumerants) - capn_enum_values.insert(Type(enumerant.getOrdinal())); - auto result = values == capn_enum_values; - if (!result) - error_message += "The set of values in Enum from CapnProto schema is different from the set of values in ClickHouse Enum"; - return result; - } - - auto names = enum_values.getSetOfAllNames(to_lower); - std::unordered_set capn_enum_names; - - for (auto enumerant : enumerants) - { - String name = enumerant.getProto().getName(); - capn_enum_names.insert(to_lower ? boost::algorithm::to_lower_copy(name) : name); - } - - auto result = names == capn_enum_names; - if (!result) - error_message += "The set of names in Enum from CapnProto schema is different from the set of names in ClickHouse Enum"; - return result; -} - -static bool checkCapnProtoType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name); - -static bool checkNullableType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name) -{ - if (!capnp_type.isStruct()) - return false; - - /// Check that struct is a named union of type VOID and one arbitrary type. - auto struct_schema = capnp_type.asStruct(); - if (!checkIfStructIsNamedUnion(struct_schema)) - return false; - - auto union_fields = struct_schema.getUnionFields(); - if (union_fields.size() != 2) - return false; - - auto first = union_fields[0]; - auto second = union_fields[1]; - - auto nested_type = assert_cast(data_type.get())->getNestedType(); - if (first.getType().isVoid()) - return checkCapnProtoType(second.getType(), nested_type, mode, error_message, column_name); - if (second.getType().isVoid()) - return checkCapnProtoType(first.getType(), nested_type, mode, error_message, column_name); - return false; -} - -static bool checkTupleType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message) -{ - if (!capnp_type.isStruct()) - return false; - auto struct_schema = capnp_type.asStruct(); - - if (checkIfStructIsNamedUnion(struct_schema)) - return false; - - if (checkIfStructContainsUnnamedUnion(struct_schema)) - { - error_message += "CapnProto struct contains unnamed union"; - return false; - } - - const auto * tuple_data_type = assert_cast(data_type.get()); - auto nested_types = tuple_data_type->getElements(); - if (nested_types.size() != struct_schema.getFields().size()) - { - error_message += "Tuple and Struct types have different sizes"; - return false; - } - - bool have_explicit_names = tuple_data_type->haveExplicitNames(); - const auto & nested_names = tuple_data_type->getElementNames(); - for (uint32_t i = 0; i != nested_names.size(); ++i) - { - if (have_explicit_names) - { - KJ_IF_MAYBE (field, struct_schema.findFieldByName(nested_names[i])) - { - if (!checkCapnProtoType(field->getType(), nested_types[tuple_data_type->getPositionByName(nested_names[i])], mode, error_message, nested_names[i])) - return false; - } - else - { - error_message += "CapnProto struct doesn't contain a field with name " + nested_names[i]; - return false; - } - } - else if (!checkCapnProtoType(struct_schema.getFields()[i].getType(), nested_types[tuple_data_type->getPositionByName(nested_names[i])], mode, error_message, nested_names[i])) - return false; - } - - return true; -} - -static bool checkArrayType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name) -{ - if (!capnp_type.isList()) - return false; - auto list_schema = capnp_type.asList(); - auto nested_type = assert_cast(data_type.get())->getNestedType(); - - auto [field_name, nested_name] = splitCapnProtoFieldName(column_name); - if (!nested_name.empty() && list_schema.getElementType().isStruct()) - { - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(field, struct_schema.findFieldByName(nested_name)) - return checkCapnProtoType(field->getType(), nested_type, mode, error_message, nested_name); - - error_message += "Element type of List {} doesn't contain field with name " + nested_name; - return false; - } - - return checkCapnProtoType(list_schema.getElementType(), nested_type, mode, error_message, column_name); -} - -static bool checkMapType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message) -{ - /// We output/input Map type as follow CapnProto schema - /// - /// struct Map { - /// struct Entry { - /// key @0: Key; - /// value @1: Value; - /// } - /// entries @0 :List(Entry); - /// } - - if (!capnp_type.isStruct()) - return false; - auto struct_schema = capnp_type.asStruct(); - - if (checkIfStructContainsUnnamedUnion(struct_schema)) - { - error_message += "CapnProto struct contains unnamed union"; - return false; - } - - if (struct_schema.getFields().size() != 1) - { - error_message += "CapnProto struct that represents Map type can contain only one field"; - return false; - } - - const auto & field_type = struct_schema.getFields()[0].getType(); - if (!field_type.isList()) - { - error_message += "Field of CapnProto struct that represents Map is not a list"; - return false; - } - - auto list_element_type = field_type.asList().getElementType(); - if (!list_element_type.isStruct()) - { - error_message += "Field of CapnProto struct that represents Map is not a list of structs"; - return false; - } - - auto key_value_struct = list_element_type.asStruct(); - if (checkIfStructContainsUnnamedUnion(key_value_struct)) - { - error_message += "CapnProto struct contains unnamed union"; - return false; - } - - if (key_value_struct.getFields().size() != 2) - { - error_message += "Key-value structure for Map struct should have exactly 2 fields"; - return false; - } - - const auto & map_type = assert_cast(*data_type); - DataTypes types = {map_type.getKeyType(), map_type.getValueType()}; - Names names = {"key", "value"}; - - for (size_t i = 0; i != types.size(); ++i) - { - KJ_IF_MAYBE(field, key_value_struct.findFieldByName(names[i])) - { - if (!checkCapnProtoType(field->getType(), types[i], mode, error_message, names[i])) - return false; - } - else - { - error_message += R"(Key-value structure for Map struct should have exactly 2 fields with names "key" and "value")"; - return false; - } - } - - return true; -} - -static bool isCapnInteger(const capnp::Type & capnp_type) -{ - return capnp_type.isInt8() || capnp_type.isUInt8() || capnp_type.isInt16() || capnp_type.isUInt16() || capnp_type.isInt32() - || capnp_type.isUInt32() || capnp_type.isInt64() || capnp_type.isUInt64(); -} - -static bool checkCapnProtoType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name) -{ - switch (data_type->getTypeId()) - { - case TypeIndex::UInt8: - return capnp_type.isBool() || isCapnInteger(capnp_type); - case TypeIndex::Int8: [[fallthrough]]; - case TypeIndex::Int16: [[fallthrough]]; - case TypeIndex::UInt16: [[fallthrough]]; - case TypeIndex::Int32: [[fallthrough]]; - case TypeIndex::UInt32: [[fallthrough]]; - case TypeIndex::Int64: [[fallthrough]]; - case TypeIndex::UInt64: - /// Allow integer conversions durin input/output. - return isCapnInteger(capnp_type); - case TypeIndex::Date: - return capnp_type.isUInt16(); - case TypeIndex::DateTime: [[fallthrough]]; - case TypeIndex::IPv4: - return capnp_type.isUInt32(); - case TypeIndex::Date32: [[fallthrough]]; - case TypeIndex::Decimal32: - return capnp_type.isInt32() || capnp_type.isUInt32(); - case TypeIndex::DateTime64: [[fallthrough]]; - case TypeIndex::Decimal64: - return capnp_type.isInt64() || capnp_type.isUInt64(); - case TypeIndex::Float32:[[fallthrough]]; - case TypeIndex::Float64: - /// Allow converting between Float32 and isFloat64 - return capnp_type.isFloat32() || capnp_type.isFloat64(); - case TypeIndex::Enum8: - return checkEnums(capnp_type, data_type, mode, INT8_MAX, error_message); - case TypeIndex::Enum16: - return checkEnums(capnp_type, data_type, mode, INT16_MAX, error_message); - case TypeIndex::Int128: [[fallthrough]]; - case TypeIndex::UInt128: [[fallthrough]]; - case TypeIndex::Int256: [[fallthrough]]; - case TypeIndex::UInt256: [[fallthrough]]; - case TypeIndex::Decimal128: [[fallthrough]]; - case TypeIndex::Decimal256: - return capnp_type.isData(); - case TypeIndex::Tuple: - return checkTupleType(capnp_type, data_type, mode, error_message); - case TypeIndex::Nullable: - { - auto result = checkNullableType(capnp_type, data_type, mode, error_message, column_name); - if (!result) - error_message += "Nullable can be represented only as a named union of type Void and nested type"; - return result; - } - case TypeIndex::Array: - return checkArrayType(capnp_type, data_type, mode, error_message, column_name); - case TypeIndex::LowCardinality: - return checkCapnProtoType(capnp_type, assert_cast(data_type.get())->getDictionaryType(), mode, error_message, column_name); - case TypeIndex::FixedString: [[fallthrough]]; - case TypeIndex::IPv6: [[fallthrough]]; - case TypeIndex::String: - return capnp_type.isText() || capnp_type.isData(); - case TypeIndex::Map: - return checkMapType(capnp_type, data_type, mode, error_message); - default: - return false; - } -} - -capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Reader & struct_reader, const String & name) -{ - auto [field_name, nested_name] = splitCapnProtoFieldName(name); - KJ_IF_MAYBE(field, struct_reader.getSchema().findFieldByName(field_name)) - { - capnp::DynamicValue::Reader field_reader; - try - { - field_reader = struct_reader.get(*field); - } - catch (const kj::Exception & e) - { - throw Exception(ErrorCodes::INCORRECT_DATA, - "Cannot extract field value from struct by provided schema, error: " - "{} Perhaps the data was generated by another schema", String(e.getDescription().cStr())); - } - - if (nested_name.empty()) - return field_reader; - - /// Support reading Nested as List of Structs. - if (field_reader.getType() == capnp::DynamicValue::LIST) - { - auto list_schema = field->getType().asList(); - if (!list_schema.getElementType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name); - - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name)) - return field_reader; - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name); - } - - if (field_reader.getType() != capnp::DynamicValue::STRUCT) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); - - return getReaderByColumnName(field_reader.as(), nested_name); - } - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto struct doesn't contain field with name {}", field_name); -} - -std::pair getStructBuilderAndFieldByColumnName(capnp::DynamicStruct::Builder struct_builder, const String & name) -{ - auto [field_name, nested_name] = splitCapnProtoFieldName(name); - KJ_IF_MAYBE(field, struct_builder.getSchema().findFieldByName(field_name)) - { - if (nested_name.empty()) - return {struct_builder, *field}; - - auto field_builder = struct_builder.get(*field); - - /// Support reading Nested as List of Structs. - if (field_builder.getType() == capnp::DynamicValue::LIST) - { - auto list_schema = field->getType().asList(); - if (!list_schema.getElementType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name); - - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name)) - return {struct_builder, *field}; - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name); - } - - if (field_builder.getType() != capnp::DynamicValue::STRUCT) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); - - return getStructBuilderAndFieldByColumnName(field_builder.as(), nested_name); - } - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto struct doesn't contain field with name {}", field_name); -} - -static std::pair getFieldByName(const capnp::StructSchema & schema, const String & name) -{ - auto [field_name, nested_name] = splitCapnProtoFieldName(name); - KJ_IF_MAYBE(field, schema.findFieldByName(field_name)) - { - if (nested_name.empty()) - return {*field, name}; - - /// Support reading Nested as List of Structs. - if (field->getType().isList()) - { - auto list_schema = field->getType().asList(); - if (!list_schema.getElementType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name); - - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name)) - return {*field, name}; - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name); - } - - if (!field->getType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); - - return getFieldByName(field->getType().asStruct(), nested_name); - } - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto schema doesn't contain field with name {}", field_name); -} - -void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode) -{ - /// Firstly check that struct doesn't contain unnamed union, because we don't support it. - if (checkIfStructContainsUnnamedUnion(schema)) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Schema contains unnamed union that is not supported"); - auto names_and_types = header.getNamesAndTypesList(); - String additional_error_message; - for (auto & [name, type] : names_and_types) - { - auto [field, field_name] = getFieldByName(schema, name); - if (!checkCapnProtoType(field.getType(), type, mode, additional_error_message, field_name)) - { - auto e = Exception( - ErrorCodes::CAPN_PROTO_BAD_CAST, - "Cannot convert ClickHouse type {} to CapnProto type {}", - type->getName(), - getCapnProtoFullTypeName(field.getType())); - if (!additional_error_message.empty()) - e.addMessage(additional_error_message); - throw std::move(e); - } - } -} - -template -static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) -{ - std::vector> values; - for (auto enumerant : enumerants) - values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal())); - return std::make_shared>(std::move(values)); -} - -static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) -{ - auto enumerants = enum_schema.getEnumerants(); - if (enumerants.size() < 128) - return getEnumDataTypeFromEnumerants(enumerants); - if (enumerants.size() < 32768) - return getEnumDataTypeFromEnumerants(enumerants); - - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums"); -} - -static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) -{ - switch (capnp_type.which()) - { - case capnp::schema::Type::INT8: - return std::make_shared(); - case capnp::schema::Type::INT16: - return std::make_shared(); - case capnp::schema::Type::INT32: - return std::make_shared(); - case capnp::schema::Type::INT64: - return std::make_shared(); - case capnp::schema::Type::BOOL: [[fallthrough]]; - case capnp::schema::Type::UINT8: - return std::make_shared(); - case capnp::schema::Type::UINT16: - return std::make_shared(); - case capnp::schema::Type::UINT32: - return std::make_shared(); - case capnp::schema::Type::UINT64: - return std::make_shared(); - case capnp::schema::Type::FLOAT32: - return std::make_shared(); - case capnp::schema::Type::FLOAT64: - return std::make_shared(); - case capnp::schema::Type::DATA: [[fallthrough]]; - case capnp::schema::Type::TEXT: - return std::make_shared(); - case capnp::schema::Type::ENUM: - return getEnumDataTypeFromEnumSchema(capnp_type.asEnum()); - case capnp::schema::Type::LIST: - { - auto list_schema = capnp_type.asList(); - auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType(), skip_unsupported_fields); - if (!nested_type) - return nullptr; - return std::make_shared(nested_type); - } - case capnp::schema::Type::STRUCT: - { - auto struct_schema = capnp_type.asStruct(); - - - if (struct_schema.getFields().size() == 0) - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported"); - } - - /// Check if it can be Nullable. - if (checkIfStructIsNamedUnion(struct_schema)) - { - auto fields = struct_schema.getUnionFields(); - if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid())) - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unions are not supported"); - } - auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType(); - if (value_type.isStruct() || value_type.isList()) - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Tuples and Lists cannot be inside Nullable"); - } - - auto nested_type = getDataTypeFromCapnProtoType(value_type, skip_unsupported_fields); - if (!nested_type) - return nullptr; - return std::make_shared(nested_type); - } - - if (checkIfStructContainsUnnamedUnion(struct_schema)) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); - - /// Treat Struct as Tuple. - DataTypes nested_types; - Names nested_names; - for (auto field : struct_schema.getNonUnionFields()) - { - auto nested_type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); - if (!nested_type) - continue; - nested_names.push_back(field.getProto().getName()); - nested_types.push_back(nested_type); - } - if (nested_types.empty()) - return nullptr; - return std::make_shared(std::move(nested_types), std::move(nested_names)); - } - default: - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type)); - } - } -} - -NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields) -{ - if (checkIfStructContainsUnnamedUnion(schema)) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); - - NamesAndTypesList names_and_types; - for (auto field : schema.getNonUnionFields()) - { - auto name = field.getProto().getName(); - auto type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); - if (type) - names_and_types.emplace_back(name, type); - } - if (names_and_types.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types"); - - return names_and_types; -} - -} - -#endif diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index dd6252b96f1..52f9098fbcc 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -132,12 +132,14 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.protobuf.input_flatten_google_wrappers = settings.input_format_protobuf_flatten_google_wrappers; format_settings.protobuf.output_nullables_with_google_wrappers = settings.output_format_protobuf_nullables_with_google_wrappers; format_settings.protobuf.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference; + format_settings.protobuf.use_autogenerated_schema = settings.format_protobuf_use_autogenerated_schema; format_settings.regexp.escaping_rule = settings.format_regexp_escaping_rule; format_settings.regexp.regexp = settings.format_regexp; format_settings.regexp.skip_unmatched = settings.format_regexp_skip_unmatched; format_settings.schema.format_schema = settings.format_schema; format_settings.schema.format_schema_path = context->getFormatSchemaPath(); format_settings.schema.is_server = context->hasGlobalContext() && (context->getGlobalContext()->getApplicationType() == Context::ApplicationType::SERVER); + format_settings.schema.output_format_schema = settings.output_format_schema; format_settings.skip_unknown_fields = settings.input_format_skip_unknown_fields; format_settings.template_settings.resultset_format = settings.format_template_resultset; format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; @@ -178,6 +180,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference; + format_settings.capn_proto.use_autogenerated_schema = settings.format_capn_proto_use_autogenerated_schema; format_settings.seekable_read = settings.input_format_allow_seeks; format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns; format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation; diff --git a/src/Formats/FormatSchemaInfo.cpp b/src/Formats/FormatSchemaInfo.cpp index 8e2afe1b2c9..c0f0aec6fd3 100644 --- a/src/Formats/FormatSchemaInfo.cpp +++ b/src/Formats/FormatSchemaInfo.cpp @@ -1,6 +1,8 @@ #include #include #include +#include +#include #include @@ -105,4 +107,84 @@ FormatSchemaInfo::FormatSchemaInfo(const FormatSettings & settings, const String { } +template +MaybeAutogeneratedFormatSchemaInfo::MaybeAutogeneratedFormatSchemaInfo( + const FormatSettings & settings, const String & format, const Block & header, bool use_autogenerated_schema) +{ + if (!use_autogenerated_schema || !settings.schema.format_schema.empty()) + { + schema_info = std::make_unique(settings, format, true); + return; + } + + String schema_path; + fs::path default_schema_directory_path(fs::canonical(settings.schema.format_schema_path) / ""); + fs::path path; + if (!settings.schema.output_format_schema.empty()) + { + schema_path = settings.schema.output_format_schema; + path = schema_path; + if (path.is_absolute()) + { + if (settings.schema.is_server) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Absolute path in the 'output_format_schema' setting is prohibited: {}", path.string()); + } + else if (path.has_parent_path() && !fs::weakly_canonical(default_schema_directory_path / path).string().starts_with(fs::weakly_canonical(default_schema_directory_path).string())) + { + if (settings.schema.is_server) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Path in the 'format_schema' setting shouldn't go outside the 'format_schema_path' directory: {} ({} not in {})", + default_schema_directory_path.string(), + path.string(), + default_schema_directory_path.string()); + path = default_schema_directory_path / path; + } + else + { + path = default_schema_directory_path / path; + } + } + else + { + if (settings.schema.is_server) + { + tmp_file_path = PocoTemporaryFile::tempName(default_schema_directory_path.string()) + '.' + getFormatSchemaDefaultFileExtension(format); + schema_path = fs::path(tmp_file_path).filename(); + } + else + { + tmp_file_path = PocoTemporaryFile::tempName() + '.' + getFormatSchemaDefaultFileExtension(format); + schema_path = tmp_file_path; + } + + path = tmp_file_path; + } + + WriteBufferFromFile buf(path.string()); + SchemaGenerator::writeSchema(buf, "Message", header.getNamesAndTypesList()); + buf.finalize(); + + schema_info = std::make_unique(schema_path + ":Message", format, true, settings.schema.is_server, settings.schema.format_schema_path); +} + +template +MaybeAutogeneratedFormatSchemaInfo::~MaybeAutogeneratedFormatSchemaInfo() +{ + if (!tmp_file_path.empty()) + { + try + { + fs::remove(tmp_file_path); + } + catch (...) + { + tryLogCurrentException("MaybeAutogeneratedFormatSchemaInfo", "Cannot delete temporary schema file"); + } + } +} + +template class MaybeAutogeneratedFormatSchemaInfo; +template class MaybeAutogeneratedFormatSchemaInfo; + } diff --git a/src/Formats/FormatSchemaInfo.h b/src/Formats/FormatSchemaInfo.h index 8c430218af0..e8758c3f761 100644 --- a/src/Formats/FormatSchemaInfo.h +++ b/src/Formats/FormatSchemaInfo.h @@ -2,6 +2,8 @@ #include #include +#include +#include namespace DB { @@ -30,4 +32,23 @@ private: String message_name; }; + +template +class MaybeAutogeneratedFormatSchemaInfo +{ +public: + MaybeAutogeneratedFormatSchemaInfo(const FormatSettings & settings, const String & format, const Block & header, bool use_autogenerated_schema); + + ~MaybeAutogeneratedFormatSchemaInfo(); + + const FormatSchemaInfo & getSchemaInfo() const { return *schema_info; } +private: + + std::unique_ptr schema_info; + String tmp_file_path; +}; + +using CapnProtoSchemaInfo = MaybeAutogeneratedFormatSchemaInfo; +using ProtobufSchemaInfo = MaybeAutogeneratedFormatSchemaInfo; + } diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index c88af650671..a21668f9642 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -254,6 +254,7 @@ struct FormatSettings */ bool allow_multiple_rows_without_delimiter = false; bool skip_fields_with_unsupported_types_in_schema_inference = false; + bool use_autogenerated_schema = true; } protobuf; struct @@ -275,6 +276,7 @@ struct FormatSettings std::string format_schema; std::string format_schema_path; bool is_server = false; + std::string output_format_schema; } schema; struct @@ -325,17 +327,18 @@ struct FormatSettings /// For capnProto format we should determine how to /// compare ClickHouse Enum and Enum from schema. - enum class EnumComparingMode + enum class CapnProtoEnumComparingMode { BY_NAMES, // Names in enums should be the same, values can be different. BY_NAMES_CASE_INSENSITIVE, // Case-insensitive name comparison. BY_VALUES, // Values should be the same, names can be different. }; - struct + struct CapnProto { - EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES; + CapnProtoEnumComparingMode enum_comparing_mode = CapnProtoEnumComparingMode::BY_VALUES; bool skip_fields_with_unsupported_types_in_schema_inference = false; + bool use_autogenerated_schema = true; } capn_proto; enum class MsgPackUUIDRepresentation diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp index f690800d145..dd37c25719c 100644 --- a/src/Formats/ProtobufSerializer.cpp +++ b/src/Formats/ProtobufSerializer.cpp @@ -3029,7 +3029,7 @@ namespace if (!message_serializer) { throw Exception(ErrorCodes::NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS, - "Not found matches between the names of the columns {{}} and the fields {{}} of the message {} in the protobuf schema", + "Not found matches between the names of the columns ({}) and the fields ({}) of the message {} in the protobuf schema", boost::algorithm::join(column_names, ", "), boost::algorithm::join(getFieldNames(message_descriptor), ", "), quoteString(message_descriptor.full_name())); } @@ -3647,7 +3647,7 @@ namespace if (!message_serializer) { throw Exception(ErrorCodes::NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS, - "Not found matches between the names of the tuple's elements {{}} and the fields {{}} " + "Not found matches between the names of the tuple's elements ({}) and the fields ({}) " "of the message {} in the protobuf schema", boost::algorithm::join(tuple_data_type.getElementNames(), ", "), boost::algorithm::join(getFieldNames(*field_descriptor.message_type()), ", "), diff --git a/src/Formats/StructureToCapnProtoSchema.cpp b/src/Formats/StructureToCapnProtoSchema.cpp new file mode 100644 index 00000000000..56e2155e5bd --- /dev/null +++ b/src/Formats/StructureToCapnProtoSchema.cpp @@ -0,0 +1,236 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +using namespace StructureToFormatSchemaUtils; + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +namespace +{ + +const std::unordered_map capn_proto_simple_type_names = +{ + {TypeIndex::Int8, "Int8"}, + {TypeIndex::UInt8, "UInt8"}, + {TypeIndex::Int16, "Int16"}, + {TypeIndex::UInt16, "UInt16"}, + {TypeIndex::Int32, "Int32"}, + {TypeIndex::UInt32, "UInt32"}, + {TypeIndex::Int64, "Int64"}, + {TypeIndex::UInt64, "UInt64"}, + {TypeIndex::Int128, "Data"}, + {TypeIndex::UInt128, "Data"}, + {TypeIndex::Int256, "Data"}, + {TypeIndex::UInt256, "Data"}, + {TypeIndex::Float32, "Float32"}, + {TypeIndex::Float64, "Float64"}, + {TypeIndex::Decimal32, "Int32"}, + {TypeIndex::Decimal64, "Int64"}, + {TypeIndex::Decimal128, "Data"}, + {TypeIndex::Decimal256, "Data"}, + {TypeIndex::String, "Data"}, + {TypeIndex::FixedString, "Data"}, + {TypeIndex::UUID, "Data"}, + {TypeIndex::Date, "UInt16"}, + {TypeIndex::Date32, "Int32"}, + {TypeIndex::DateTime, "UInt32"}, + {TypeIndex::DateTime64, "Int64"}, + {TypeIndex::IPv4, "UInt32"}, + {TypeIndex::IPv6, "Data"}, +}; + +void writeCapnProtoHeader(WriteBuffer & buf) +{ + pcg64 rng(randomSeed()); + size_t id = rng() | (1ull << 63); /// First bit should be 1 + writeString(fmt::format("@0x{};\n\n", getHexUIntLowercase(id)), buf); +} + +void writeFieldDefinition(WriteBuffer & buf, const String & type_name, const String & column_name, size_t & field_index, size_t indent) +{ + writeIndent(buf, indent); + writeString(fmt::format("{} @{} : {};\n", getSchemaFieldName(column_name), field_index++, type_name), buf); +} + +void startEnum(WriteBuffer & buf, const String & enum_name, size_t indent) +{ + startNested(buf, enum_name, "enum", indent); +} + +void startUnion(WriteBuffer & buf, size_t indent) +{ + startNested(buf, "", "union", indent); +} + +void startStruct(WriteBuffer & buf, const String & struct_name, size_t indent) +{ + startNested(buf, struct_name, "struct", indent); +} + +String prepareAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent); + +void writeField(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t & field_index, size_t indent) +{ + auto field_type_name = prepareAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + writeFieldDefinition(buf, field_type_name, column_name, field_index, indent); +} + +String prepareArrayAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & nested_type = assert_cast(*data_type).getNestedType(); + auto nested_type_name = prepareAndGetCapnProtoTypeName(buf, nested_type, column_name, indent); + return "List(" + nested_type_name + ")"; +} + +String prepareNullableAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + /// Nullable is represented as a struct with union with 2 fields: + /// + /// struct Nullable + /// { + /// union + /// { + /// value @0 : Value; + /// null @1 : Void; + /// } + /// } + auto struct_name = getSchemaMessageName(column_name); + startStruct(buf, struct_name, indent); + auto nested_type_name = prepareAndGetCapnProtoTypeName(buf, assert_cast(*data_type).getNestedType(), column_name, indent); + startUnion(buf, indent + 1); + size_t field_index = 0; + writeFieldDefinition(buf, nested_type_name, "value", field_index, indent + 2); + writeFieldDefinition(buf, "Void", "null", field_index, indent + 2); + endNested(buf, indent + 1); + endNested(buf, indent); + return struct_name; +} + +String prepareTupleAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & tuple_type = assert_cast(*data_type); + auto nested_names_and_types = getCollectedTupleElements(tuple_type); + + String struct_name = getSchemaMessageName(column_name); + startStruct(buf, struct_name, indent); + size_t nested_field_index = 0; + for (const auto & [name, type] : nested_names_and_types) + writeField(buf, type, name, nested_field_index, indent + 1); + endNested(buf, indent); + return struct_name; +} + +String prepareMapAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + /// We output/input Map type as follow CapnProto schema + /// + /// struct Map + /// { + /// struct Entry + /// { + /// key @0: Key; + /// value @1: Value; + /// } + /// entries @0 :List(Entry); + /// } + const auto & map_type = assert_cast(*data_type); + const auto & key_type = map_type.getKeyType(); + const auto & value_type = map_type.getValueType(); + + String struct_name = getSchemaMessageName(column_name); + startStruct(buf, struct_name, indent); + startStruct(buf, "Entry", indent + 1); + auto key_type_name = prepareAndGetCapnProtoTypeName(buf, key_type, "key", indent + 2); + auto value_type_name = prepareAndGetCapnProtoTypeName(buf, value_type, "value", indent + 2); + size_t field_index = 0; + writeFieldDefinition(buf, key_type_name, "key", field_index, indent + 2); + writeFieldDefinition(buf, value_type_name, "value", field_index, indent + 2); + endNested(buf, indent + 1); + field_index = 0; + writeFieldDefinition(buf, "List(Entry)", "entries", field_index, indent + 1); + endNested(buf, indent); + return struct_name; +} + +template +String prepareEnumAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & enum_type = assert_cast &>(*data_type); + String enum_name = getSchemaMessageName(column_name); + startEnum(buf, enum_name, indent); + const auto & names = enum_type.getAllRegisteredNames(); + for (size_t i = 0; i != names.size(); ++i) + { + writeIndent(buf, indent + 1); + writeString(fmt::format("{} @{};\n", names[i], std::to_string(i)), buf); + } + endNested(buf, indent); + return enum_name; +} + +String prepareAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + TypeIndex type_id = data_type->getTypeId(); + + switch (data_type->getTypeId()) + { + case TypeIndex::Nullable: + return prepareNullableAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + case TypeIndex::LowCardinality: + return prepareAndGetCapnProtoTypeName(buf, assert_cast(*data_type).getDictionaryType(), column_name, indent); + case TypeIndex::Array: + return prepareArrayAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + case TypeIndex::Tuple: + return prepareTupleAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + case TypeIndex::Map: + return prepareMapAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + case TypeIndex::Enum8: + return prepareEnumAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + case TypeIndex::Enum16: + return prepareEnumAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + default: + { + if (isBool(data_type)) + return "Bool"; + + auto it = capn_proto_simple_type_names.find(type_id); + if (it == capn_proto_simple_type_names.end()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "CapnProto type name is not found for type {}", data_type->getName()); + return it->second; + } + } +} + +} + +void StructureToCapnProtoSchema::writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_) +{ + auto names_and_types = collectNested(names_and_types_); + writeCapnProtoHeader(buf); + startStruct(buf, getSchemaMessageName(message_name), 0); + + size_t field_index = 0; + for (const auto & [column_name, data_type] : names_and_types) + writeField(buf, data_type, column_name, field_index, 1); + + endNested(buf, 0); +} + +} diff --git a/src/Formats/StructureToCapnProtoSchema.h b/src/Formats/StructureToCapnProtoSchema.h new file mode 100644 index 00000000000..b2a0a8a8cf9 --- /dev/null +++ b/src/Formats/StructureToCapnProtoSchema.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace DB +{ + +struct StructureToCapnProtoSchema +{ + static constexpr auto name = "structureToCapnProtoSchema"; + + static void writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_); +}; + +} diff --git a/src/Formats/StructureToFormatSchemaUtils.cpp b/src/Formats/StructureToFormatSchemaUtils.cpp new file mode 100644 index 00000000000..02c9af2c65f --- /dev/null +++ b/src/Formats/StructureToFormatSchemaUtils.cpp @@ -0,0 +1,113 @@ +#include + +namespace DB +{ + +namespace StructureToFormatSchemaUtils +{ + +void writeIndent(WriteBuffer & buf, size_t indent) +{ + writeChar(' ', indent * 4, buf); +} + +void startNested(WriteBuffer & buf, const String & nested_name, const String & nested_type, size_t indent) +{ + writeIndent(buf, indent); + writeString(nested_type, buf); + if (!nested_name.empty()) + { + writeChar(' ', buf); + writeString(nested_name, buf); + } + writeChar('\n', buf); + writeIndent(buf, indent); + writeCString("{\n", buf); +} + +void endNested(WriteBuffer & buf, size_t indent) +{ + writeIndent(buf, indent); + writeCString("}\n", buf); +} + +String getSchemaFieldName(const String & column_name) +{ + String result = column_name; + size_t i = 0; + while (i < result.size() && isupper(result[i])) + { + result[i] = tolower(result[i]); + ++i; + } + return result; +} + +String getSchemaMessageName(const String & column_name) +{ + String result = column_name; + if (!column_name.empty() && isalpha(column_name[0])) + result[0] = toupper(column_name[0]); + return result; +} + +namespace +{ + std::pair splitName(const String & name) + { + const auto * begin = name.data(); + const auto * end = name.data() + name.size(); + const auto * it = find_first_symbols<'_', '.'>(begin, end); + String first = String(begin, it); + String second = it == end ? "" : String(it + 1, end); + return {first, second}; + } +} + +NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types) +{ + /// Find all columns with dots '.' or underscores '_' and move them into a tuple. + /// For example if we have columns 'a.b UInt32, a.c UInt32, x_y String' we will + /// change it to 'a Tuple(b UInt32, c UInt32), x Tuple(y String)' + NamesAndTypesList result; + std::unordered_map nested; + for (const auto & [name, type] : names_and_types) + { + auto [field_name, nested_name] = splitName(name); + if (nested_name.empty()) + result.emplace_back(name, type); + else + nested[field_name].emplace_back(nested_name, type); + } + + for (const auto & [field_name, elements]: nested) + result.emplace_back(field_name, std::make_shared(elements.getTypes(), elements.getNames())); + + return result; +} + +NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type) +{ + const auto & nested_types = tuple_type.getElements(); + Names nested_names; + if (tuple_type.haveExplicitNames()) + { + nested_names = tuple_type.getElementNames(); + } + else + { + nested_names.reserve(nested_types.size()); + for (size_t i = 0; i != nested_types.size(); ++i) + nested_names.push_back("e" + std::to_string(i + 1)); + } + + NamesAndTypesList result; + for (size_t i = 0; i != nested_names.size(); ++i) + result.emplace_back(nested_names[i], nested_types[i]); + + return collectNested(result); +} + +} + +} diff --git a/src/Formats/StructureToFormatSchemaUtils.h b/src/Formats/StructureToFormatSchemaUtils.h new file mode 100644 index 00000000000..c6b86501ac8 --- /dev/null +++ b/src/Formats/StructureToFormatSchemaUtils.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +namespace StructureToFormatSchemaUtils +{ + void writeIndent(WriteBuffer & buf, size_t indent); + + void startNested(WriteBuffer & buf, const String & nested_name, const String & nested_type, size_t indent); + + void endNested(WriteBuffer & buf, size_t indent); + + String getSchemaFieldName(const String & column_name); + + String getSchemaMessageName(const String & column_name); + + NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types); + + NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type); +} + +} diff --git a/src/Formats/StructureToProtobufSchema.cpp b/src/Formats/StructureToProtobufSchema.cpp new file mode 100644 index 00000000000..086a6ff6864 --- /dev/null +++ b/src/Formats/StructureToProtobufSchema.cpp @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +using namespace StructureToFormatSchemaUtils; + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +namespace +{ + +const std::unordered_map protobuf_simple_type_names = +{ + {TypeIndex::Int8, "int32"}, + {TypeIndex::UInt8, "uint32"}, + {TypeIndex::Int16, "int32"}, + {TypeIndex::UInt16, "uint32"}, + {TypeIndex::Int32, "int32"}, + {TypeIndex::UInt32, "uint32"}, + {TypeIndex::Int64, "int64"}, + {TypeIndex::UInt64, "uint64"}, + {TypeIndex::Int128, "bytes"}, + {TypeIndex::UInt128, "bytes"}, + {TypeIndex::Int256, "bytes"}, + {TypeIndex::UInt256, "bytes"}, + {TypeIndex::Float32, "float"}, + {TypeIndex::Float64, "double"}, + {TypeIndex::Decimal32, "bytes"}, + {TypeIndex::Decimal64, "bytes"}, + {TypeIndex::Decimal128, "bytes"}, + {TypeIndex::Decimal256, "bytes"}, + {TypeIndex::String, "bytes"}, + {TypeIndex::FixedString, "bytes"}, + {TypeIndex::UUID, "bytes"}, + {TypeIndex::Date, "uint32"}, + {TypeIndex::Date32, "int32"}, + {TypeIndex::DateTime, "uint32"}, + {TypeIndex::DateTime64, "uint64"}, + {TypeIndex::IPv4, "uint32"}, + {TypeIndex::IPv6, "bytes"}, +}; + +void writeProtobufHeader(WriteBuffer & buf) +{ + writeCString("syntax = \"proto3\";\n\n", buf); +} + +void startEnum(WriteBuffer & buf, const String & enum_name, size_t indent) +{ + startNested(buf, enum_name, "enum", indent); +} + +void startMessage(WriteBuffer & buf, const String & message_name, size_t indent) +{ + startNested(buf, message_name, "message", indent); +} + +void writeFieldDefinition(WriteBuffer & buf, const String & type_name, const String & column_name, size_t & field_index, size_t indent) +{ + writeIndent(buf, indent); + writeString(fmt::format("{} {} = {};\n", type_name, getSchemaFieldName(column_name), field_index++), buf); +} + +String prepareAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent); + +void writeProtobufField(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t & field_index, size_t indent) +{ + auto field_type_name = prepareAndGetProtobufTypeName(buf, data_type, column_name, indent); + writeFieldDefinition(buf, field_type_name, column_name, field_index, indent); +} + +String prepareArrayAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & nested_type = assert_cast(*data_type).getNestedType(); + /// Simple case when we can just use 'repeated '. + if (!isArray(nested_type) && !isMap(nested_type)) + { + auto nested_type_name = prepareAndGetProtobufTypeName(buf, nested_type, column_name, indent); + return "repeated " + nested_type_name; + } + + /// Protobuf doesn't support multidimensional repeated fields and repeated maps. + /// When we have Array(Array(...)) or Array(Map(...)) we should place nested type into a nested Message with one field. + String message_name = getSchemaMessageName(column_name); + startMessage(buf, message_name, indent); + size_t nested_field_index = 1; + writeProtobufField(buf, nested_type, column_name, nested_field_index, indent + 1); + endNested(buf, indent); + return "repeated " + message_name; +} + +String prepareTupleAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & tuple_type = assert_cast(*data_type); + auto nested_names_and_types = getCollectedTupleElements(tuple_type); + + String message_name = getSchemaMessageName(column_name); + startMessage(buf, message_name, indent); + size_t nested_field_index = 1; + for (const auto & [name, type] : nested_names_and_types) + writeProtobufField(buf, type, name, nested_field_index, indent + 1); + endNested(buf, indent); + return message_name; +} + +String prepareMapAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & map_type = assert_cast(*data_type); + const auto & key_type = map_type.getKeyType(); + const auto & value_type = map_type.getValueType(); + auto it = protobuf_simple_type_names.find(key_type->getTypeId()); + if (it == protobuf_simple_type_names.end()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Type {} is not supported for conversion into Map key in Protobuf schema", data_type->getName()); + auto key_type_name = it->second; + /// Protobuf map type doesn't support "bytes" type as a key. Change it to "string" + if (key_type_name == "bytes") + key_type_name = "string"; + + /// Special cases when value type is Array or Map, because Protobuf + /// doesn't support syntax "map" and "map>" + /// In this case we should place it into a nested Message with one field. + String value_type_name; + if (isArray(value_type) || isMap(value_type)) + { + value_type_name = getSchemaMessageName(column_name) + "Value"; + startMessage(buf, value_type_name, indent); + size_t nested_field_index = 1; + writeProtobufField(buf, value_type, column_name + "Value", nested_field_index, indent + 1); + endNested(buf, indent); + } + else + { + value_type_name = prepareAndGetProtobufTypeName(buf, value_type, column_name + "Value", indent); + } + + return fmt::format("map<{}, {}>", key_type_name, value_type_name); +} + +template +String prepareEnumAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & enum_type = assert_cast &>(*data_type); + String enum_name = getSchemaMessageName(column_name); + startEnum(buf, enum_name, indent); + const auto & names = enum_type.getAllRegisteredNames(); + for (size_t i = 0; i != names.size(); ++i) + { + writeIndent(buf, indent + 1); + writeString(fmt::format("{} = {};\n", names[i], std::to_string(i)), buf); + } + endNested(buf, indent); + return enum_name; +} + +String prepareAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + TypeIndex type_id = data_type->getTypeId(); + + switch (data_type->getTypeId()) + { + case TypeIndex::Nullable: + return prepareAndGetProtobufTypeName(buf, assert_cast(*data_type).getNestedType(), column_name, indent); + case TypeIndex::LowCardinality: + return prepareAndGetProtobufTypeName(buf, assert_cast(*data_type).getDictionaryType(), column_name, indent); + case TypeIndex::Array: + return prepareArrayAndGetProtobufTypeName(buf, data_type, column_name, indent); + case TypeIndex::Tuple: + return prepareTupleAndGetProtobufTypeName(buf, data_type, column_name, indent); + case TypeIndex::Map: + return prepareMapAndGetProtobufTypeName(buf, data_type, column_name, indent); + case TypeIndex::Enum8: + return prepareEnumAndGetProtobufTypeName(buf, data_type, column_name, indent); + case TypeIndex::Enum16: + return prepareEnumAndGetProtobufTypeName(buf, data_type, column_name, indent); + default: + { + if (isBool(data_type)) + return "bool"; + + auto it = protobuf_simple_type_names.find(type_id); + if (it == protobuf_simple_type_names.end()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Type {} is not supported for conversion into Protobuf schema", data_type->getName()); + return it->second; + } + } +} + +} + +void StructureToProtobufSchema::writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_) +{ + auto names_and_types = collectNested(names_and_types_); + writeProtobufHeader(buf); + startMessage(buf, getSchemaMessageName(message_name), 0); + size_t field_index = 1; + for (const auto & [column_name, data_type] : names_and_types) + writeProtobufField(buf, data_type, column_name, field_index, 1); + endNested(buf, 0); +} + +} diff --git a/src/Formats/StructureToProtobufSchema.h b/src/Formats/StructureToProtobufSchema.h new file mode 100644 index 00000000000..f4dfb0ae0c2 --- /dev/null +++ b/src/Formats/StructureToProtobufSchema.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace DB +{ + +struct StructureToProtobufSchema +{ + static constexpr auto name = "structureToProtobufSchema"; + + static void writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_); +}; + +} diff --git a/src/Functions/generateRandomStructure.cpp b/src/Functions/generateRandomStructure.cpp new file mode 100644 index 00000000000..5f2153ff89f --- /dev/null +++ b/src/Functions/generateRandomStructure.cpp @@ -0,0 +1,507 @@ +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; +} + +class FunctionGenerateRandomStructure : public IFunction +{ +private: + enum class Type + { + Int8, + UInt8, + Bool, + Int16, + UInt16, + Int32, + UInt32, + Int64, + UInt64, + Float32, + Float64, + DateTime64, + Decimal32, + Decimal64, + Date, + Date32, + DateTime, + String, + FixedString, + IPv4, + IPv6, + Int128, + UInt128, + Int256, + UInt256, + Decimal128, + Decimal256, + Enum8, + Enum16, + Nullable, + LowCardinality, + Array, + Tuple, + Map, + Nested, + }; + + static constexpr std::array simple_types + { + Type::Int8, + Type::UInt8, + Type::Bool, + Type::Int16, + Type::UInt16, + Type::Int32, + Type::UInt32, + Type::Int64, + Type::UInt64, + Type::Float32, + Type::Float64, + Type::Date, + Type::Date32, + Type::DateTime, + Type::String, + Type::FixedString, + }; + + static constexpr std::array big_integer_types + { + Type::Int128, + Type::UInt128, + Type::Int256, + Type::UInt256, + }; + + static constexpr std::array decimal_types + { + Type::DateTime64, + Type::Decimal32, + Type::Decimal64, + }; + + static constexpr std::array big_decimal_types + { + Type::Decimal128, + Type::Decimal256, + }; + + static constexpr std::array enum_types + { + Type::Enum8, + Type::Enum16, + }; + + static constexpr std::array ip_types + { + Type::IPv4, + Type::IPv6, + }; + + static constexpr std::array complex_types + { + Type::Nullable, + Type::LowCardinality, + Type::Array, + Type::Tuple, + Type::Map, + Type::Nested, + }; + + static constexpr std::array map_key_types + { + Type::Int8, + Type::UInt8, + Type::Bool, + Type::Int16, + Type::UInt16, + Type::Int32, + Type::UInt32, + Type::Int64, + Type::UInt64, + Type::Date, + Type::Date32, + Type::DateTime, + Type::String, + Type::FixedString, + }; + + static constexpr std::array map_key_string_types + { + Type::String, + Type::FixedString + }; + + static constexpr size_t MAX_NUMBER_OF_COLUMNS = 128; + static constexpr size_t MAX_TUPLE_ELEMENTS = 16; + static constexpr size_t MAX_DATETIME64_PRECISION = 9; + static constexpr size_t MAX_DECIMAL32_PRECISION = 9; + static constexpr size_t MAX_DECIMAL64_PRECISION = 18; + static constexpr size_t MAX_DECIMAL128_PRECISION = 38; + static constexpr size_t MAX_DECIMAL256_PRECISION = 76; + static constexpr size_t MAX_DEPTH = 32; + +public: + static constexpr auto name = "generateRandomStructure"; + + static FunctionPtr create(ContextPtr /*context*/) + { + return std::make_shared(); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 0; } + + bool isVariadic() const override { return true; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1, 2, 3, 4, 5, 6}; } + bool useDefaultImplementationForConstants() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (arguments.size() > 7) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 7", + getName(), arguments.size()); + + for (size_t i = 0; i != 2; ++i) + { + if (arguments.size() == i) + break; + + if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull()) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the {} argument of function {}, expected unsigned integer or Null", + i + 1, + arguments[i]->getName(), + getName()); + } + } + + for (size_t i = 2; i != 7; ++i) + { + if (arguments.size() <= i) + break; + + if (!isUInt8(arguments[i])) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the {} argument of function {}, expected UInt8", + i + 1, + arguments[i]->getName(), + getName()); + } + } + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + size_t seed = randomSeed(); + size_t number_of_columns = 0; + + if (!arguments.empty() && !arguments[0].column->onlyNull()) + { + number_of_columns = arguments[0].column->getUInt(0); + if (number_of_columns > MAX_NUMBER_OF_COLUMNS) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Maximum allowed number of columns is {}, got {}", + MAX_NUMBER_OF_COLUMNS, + number_of_columns); + } + + if (arguments.size() > 1 && !arguments[1].column->onlyNull()) + seed = arguments[1].column->getUInt(0); + + bool allow_big_numbers = true; + if (arguments.size() > 2) + allow_big_numbers = arguments[2].column->getBool(0); + + bool allow_enums = true; + if (arguments.size() > 3) + allow_enums = arguments[3].column->getBool(0); + + bool allow_decimals = true; + if (arguments.size() > 4) + allow_decimals = arguments[4].column->getBool(0); + + bool allow_ip = true; + if (arguments.size() > 5) + allow_ip = arguments[5].column->getBool(0); + + bool only_string_map_key = false; + if (arguments.size() > 6) + only_string_map_key = arguments[6].column->getBool(0); + + pcg64 rng(seed); + if (number_of_columns == 0) + number_of_columns = generateNumberOfColumns(rng); + + auto col_res = ColumnString::create(); + String generated_structure; + for (size_t i = 0; i != number_of_columns; ++i) + { + if (i != 0) + generated_structure += ", "; + String column_name = "c" + std::to_string(i + 1); + auto type = generateRandomType(column_name, rng, allow_big_numbers, allow_enums, allow_decimals, allow_ip, only_string_map_key); + generated_structure += column_name + " " + type; + } + col_res->insert(generated_structure); + return ColumnConst::create(std::move(col_res), input_rows_count); + } + +private: + + size_t generateNumberOfColumns(pcg64 & rng) const + { + return rng() % MAX_NUMBER_OF_COLUMNS + 1; + } + + /// Helper struct to call generateRandomTypeImpl with lots of bool template arguments without writing big if/else over all bool variables. + template + struct Dispatcher + { + static auto call(const FunctionGenerateRandomStructure * f, const String & column_name, pcg64 & rng) + { + return f->generateRandomTypeImpl(column_name, rng); + } + + template + static auto call(const FunctionGenerateRandomStructure * f, const String & column_name, pcg64 & rng, bool b, Args1... ar1) + { + if (b) + return Dispatcher::call(f, column_name, rng, ar1...); + else + return Dispatcher::call(f, column_name, rng, ar1...); + } + + friend FunctionGenerateRandomStructure; + }; + + String generateRandomType(const String & column_name, pcg64 & rng, bool allow_big_numbers, bool allow_enums, bool allow_decimals, bool allow_ip, bool allow_only_string_map_keys) const + { + return Dispatcher<>::call(this, column_name, rng, allow_big_numbers, allow_enums, allow_decimals, allow_ip, allow_only_string_map_keys, true); + } + + template + String generateRandomTypeImpl(const String & column_name, pcg64 & rng, size_t depth = 0) const + { + constexpr auto all_types = getAllTypes(); + auto type = all_types[rng() % all_types.size()]; + + switch (type) + { + case Type::FixedString: + return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; + case Type::DateTime64: + return "DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION) + ")"; + case Type::Decimal32: + return "Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION) + ")"; + case Type::Decimal64: + return "Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION) + ")"; + case Type::Decimal128: + return "Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION) + ")"; + case Type::Decimal256: + return "Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION) + ")"; + case Type::Enum8: + return "Enum8(" + generateEnumValues(column_name, rng) + ")"; + case Type::Enum16: + return "Enum16(" + generateEnumValues(column_name, rng) + ")"; + case Type::LowCardinality: + return "LowCardinality(" + generateLowCardinalityNestedType(rng) + ")"; + case Type::Nullable: + { + auto nested_type = generateRandomTypeImpl(column_name, rng, depth + 1); + return "Nullable(" + nested_type + ")"; + } + case Type::Array: + { + auto nested_type = generateRandomTypeImpl(column_name, rng, depth + 1); + return "Array(" + nested_type + ")"; + } + case Type::Map: + { + auto key_type = generateMapKeyType(rng); + auto value_type = generateRandomTypeImpl(column_name, rng, depth + 1); + return "Map(" + key_type + ", " + value_type + ")"; + } + case Type::Tuple: + { + size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; + bool named_tuple = rng() % 2; + String tuple_type = "Tuple("; + for (size_t i = 0; i != elements; ++i) + { + if (i != 0) + tuple_type += ", "; + + String element_name = "e" + std::to_string(i + 1); + if (named_tuple) + tuple_type += element_name + " "; + tuple_type += generateRandomTypeImpl(element_name, rng, depth + 1); + } + return tuple_type + ")"; + } + case Type::Nested: + { + size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; + String nested_type = "Nested("; + for (size_t i = 0; i != elements; ++i) + { + if (i != 0) + nested_type += ", "; + String element_name = "e" + std::to_string(i + 1); + auto element_type = generateRandomTypeImpl(element_name, rng, depth + 1); + nested_type += element_name + " " + element_type; + } + return nested_type + ")"; + } + default: + return String(magic_enum::enum_name(type)); + } + } + + template + String generateMapKeyType(pcg64 & rng) const + { + Type type; + if constexpr (allow_only_string_map_keys) + type = map_key_string_types[rng() % map_key_string_types.size()]; + else + type = map_key_types[rng() % map_key_types.size()]; + + if (type == Type::FixedString) + return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; + return String(magic_enum::enum_name(type)); + } + + String generateLowCardinalityNestedType(pcg64 & rng) const + { + /// Support only String and FixedString. + String nested_type; + if (rng() % 2) + nested_type = "String"; + else + nested_type = "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; + return rng() % 2 ? nested_type : "Nullable(" + nested_type + ")"; + } + + String generateEnumValues(const String & column_name, pcg64 & rng) const + { + /// Don't generate big enums, because it will lead to really big strings + /// and slowness of this function, and it can lead to `Max query size exceeded` + /// while using this function with generateRandom. + ssize_t num_values = rng() % 16 + 1; + String result; + for (ssize_t i = 0; i != num_values; ++i) + { + if (i != 0) + result += ", "; + result += "'" + column_name + "V" + std::to_string(i) + "' = " + std::to_string(i); + } + return result; + } + + template + static constexpr auto getAllTypes() + { + constexpr size_t big_integer_types_size = big_integer_types.size() * allow_big_numbers; + constexpr size_t enum_types_size = enum_types.size() * allow_enums; + constexpr size_t decimal_types_size = decimal_types.size() * allow_decimals; + constexpr size_t big_decimal_types_size = big_decimal_types.size() * allow_big_numbers * allow_decimals; + constexpr size_t ip_types_size = ip_types.size() * allow_ip; + constexpr size_t complex_types_size = complex_types.size() * allow_complex_types; + + constexpr size_t result_size = simple_types.size() + big_integer_types_size + enum_types_size + decimal_types_size + + big_decimal_types_size + ip_types_size + complex_types_size; + std::array result; + size_t index = 0; + + for (size_t i = 0; i != simple_types.size(); ++i, ++index) + result[index] = simple_types[i]; + + for (size_t i = 0; i != big_integer_types_size; ++i, ++index) + result[index] = big_integer_types[i]; + + for (size_t i = 0; i != enum_types_size; ++i, ++index) + result[index] = enum_types[i]; + + for (size_t i = 0; i != decimal_types_size; ++i, ++index) + result[index] = decimal_types[i]; + + for (size_t i = 0; i != big_decimal_types_size; ++i, ++index) + result[index] = big_decimal_types[i]; + + for (size_t i = 0; i != ip_types_size; ++i, ++index) + result[index] = ip_types[i]; + + for (size_t i = 0; i != complex_types_size; ++i, ++index) + result[index] = complex_types[i]; + + return result; + } +}; + + +REGISTER_FUNCTION(GenerateRandomStructure) +{ + factory.registerFunction( + { + R"( +Generates a random table structure. +This function takes 4 optional constant arguments: +1) the number of column in the result structure (random by default) +2) random seed (random by default) +3) flag that indicates if big number types can be used (true by default) +4) flag that indicates if enum types can be used (true by default) +5) flag that indicates if decimal types can be used (true by default) +6) flag that indicates if ip types (IPv4, IPv6) can be used (true by default) +7) flag that indicates if map keys should be only String or FixedString (false by default) +The maximum number of columns is 128. +The function returns a value of type String. +)", + Documentation::Examples{ + {"random", "SELECT generateRandomStructure()"}, + {"with specified number of arguments", "SELECT generateRandomStructure(10)"}, + {"with specified seed", "SELECT generateRandomStructure(10, 42)"}, + {"without big number types", "SELECT generateRandomStructure(10, NULL, false)"}, + {"without enum types", "SELECT generateRandomStructure(10, NULL, true, false)"}, + {"without decimal types", "SELECT generateRandomStructure(10, NULL, true, true, false)"}, + {"without ip types", "SELECT generateRandomStructure(10, NULL, true, true, true, false)"}, + {"with only string mak key types", "SELECT generateRandomStructure(10, NULL, true, true, true, true, true)"}, + }, + Documentation::Categories{"Random"} + }, + FunctionFactory::CaseSensitive); +} + +} diff --git a/src/Functions/structureToFormatSchema.cpp b/src/Functions/structureToFormatSchema.cpp new file mode 100644 index 00000000000..332db44e06e --- /dev/null +++ b/src/Functions/structureToFormatSchema.cpp @@ -0,0 +1,137 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; +} + +template +class FunctionStructureToFormatSchema : public IFunction +{ +public: + + static constexpr auto name = Impl::name; + FunctionStructureToFormatSchema(ContextPtr context_) : context(std::move(context_)) + { + } + + static FunctionPtr create(ContextPtr ctx) + { + return std::make_shared(std::move(ctx)); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 0; } + bool isVariadic() const override { return true; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } + bool useDefaultImplementationForConstants() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (arguments.empty() || arguments.size() > 2) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, expected 1 or 2", + getName(), arguments.size()); + + if (!isString(arguments[0])) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the first argument of function {}, expected constant string", + arguments[0]->getName(), + getName()); + } + + if (arguments.size() > 1 && !isString(arguments[1])) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the second argument of function {}, expected constant string", + arguments[1]->getName(), + getName()); + } + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + if (arguments.empty() || arguments.size() > 2) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, expected 1 or 2", + getName(), arguments.size()); + + String structure = arguments[0].column->getDataAt(0).toString(); + String message_name = arguments.size() == 2 ? arguments[1].column->getDataAt(0).toString() : "Message"; + auto columns_list = parseColumnsListFromString(structure, context); + auto col_res = ColumnString::create(); + auto & data = assert_cast(*col_res).getChars(); + WriteBufferFromVector buf(data); + Impl::writeSchema(buf, message_name, columns_list.getAll()); + buf.finalize(); + auto & offsets = assert_cast(*col_res).getOffsets(); + offsets.push_back(data.size()); + return ColumnConst::create(std::move(col_res), input_rows_count); + } + +private: + ContextPtr context; +}; + + +REGISTER_FUNCTION(StructureToCapnProtoSchema) +{ + factory.registerFunction>( + { + R"( + +)", + Documentation::Examples{ + {"random", "SELECT structureToCapnProtoSchema()"}, + }, + Documentation::Categories{"Other"} + }, + FunctionFactory::CaseSensitive); +} + + +REGISTER_FUNCTION(StructureToProtobufSchema) +{ + factory.registerFunction>( + { + R"( + +)", + Documentation::Examples{ + {"random", "SELECT structureToCapnProtoSchema()"}, + }, + Documentation::Categories{"Other"} + }, + FunctionFactory::CaseSensitive); +} + +} diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index 2f84e9bde3c..ce7c48d1f37 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -9,23 +9,6 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - namespace DB { @@ -35,16 +18,14 @@ namespace ErrorCodes extern const int INCORRECT_DATA; } -CapnProtoRowInputFormat::CapnProtoRowInputFormat(ReadBuffer & in_, Block header, Params params_, const FormatSchemaInfo & info, const FormatSettings & format_settings_) - : IRowInputFormat(std::move(header), in_, std::move(params_)) +CapnProtoRowInputFormat::CapnProtoRowInputFormat(ReadBuffer & in_, Block header_, Params params_, const CapnProtoSchemaInfo & info, const FormatSettings & format_settings) + : IRowInputFormat(std::move(header_), in_, std::move(params_)) , parser(std::make_shared()) - , format_settings(format_settings_) - , column_types(getPort().getHeader().getDataTypes()) - , column_names(getPort().getHeader().getNames()) { // Parse the schema and fetch the root object - root = parser->getMessageSchema(info); - checkCapnProtoSchemaStructure(root, getPort().getHeader(), format_settings.capn_proto.enum_comparing_mode); + schema = parser->getMessageSchema(info.getSchemaInfo()); + const auto & header = getPort().getHeader(); + serializer = std::make_unique(header.getDataTypes(), header.getNames(), schema, format_settings.capn_proto); } kj::Array CapnProtoRowInputFormat::readMessage() @@ -82,213 +63,6 @@ kj::Array CapnProtoRowInputFormat::readMessage() return msg; } -static void insertInteger(IColumn & column, const DataTypePtr & column_type, UInt64 value) -{ - switch (column_type->getTypeId()) - { - case TypeIndex::Int8: - assert_cast(column).insertValue(value); - break; - case TypeIndex::UInt8: - assert_cast(column).insertValue(value); - break; - case TypeIndex::Int16: - assert_cast(column).insertValue(value); - break; - case TypeIndex::Date: [[fallthrough]]; - case TypeIndex::UInt16: - assert_cast(column).insertValue(value); - break; - case TypeIndex::Int32: - assert_cast(column).insertValue(static_cast(value)); - break; - case TypeIndex::DateTime: [[fallthrough]]; - case TypeIndex::UInt32: - assert_cast(column).insertValue(static_cast(value)); - break; - case TypeIndex::IPv4: - assert_cast(column).insertValue(IPv4(static_cast(value))); - break; - case TypeIndex::Int64: - assert_cast(column).insertValue(value); - break; - case TypeIndex::UInt64: - assert_cast(column).insertValue(value); - break; - case TypeIndex::DateTime64: - assert_cast &>(column).insertValue(value); - break; - case TypeIndex::Decimal32: - assert_cast &>(column).insertValue(static_cast(value)); - break; - case TypeIndex::Decimal64: - assert_cast &>(column).insertValue(value); - break; - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Column type {} cannot be parsed from integer", column_type->getName()); - } -} - -static void insertFloat(IColumn & column, const DataTypePtr & column_type, Float64 value) -{ - switch (column_type->getTypeId()) - { - case TypeIndex::Float32: - assert_cast(column).insertValue(static_cast(value)); - break; - case TypeIndex::Float64: - assert_cast(column).insertValue(value); - break; - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Column type is not a float."); - } -} - -template -static void insertData(IColumn & column, const DataTypePtr & column_type, Value value) -{ - if (column_type->haveMaximumSizeOfValue() && value.size() != column_type->getSizeOfValueInMemory()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", column_type->getName(), value.size()); - - column.insertData(reinterpret_cast(value.begin()), value.size()); -} - -template -static void insertEnum(IColumn & column, const DataTypePtr & column_type, const capnp::DynamicEnum & enum_value, FormatSettings::EnumComparingMode enum_comparing_mode) -{ - auto enumerant = *kj::_::readMaybe(enum_value.getEnumerant()); - auto enum_type = assert_cast *>(column_type.get()); - DataTypePtr nested_type = std::make_shared>(); - switch (enum_comparing_mode) - { - case FormatSettings::EnumComparingMode::BY_VALUES: - insertInteger(column, nested_type, Int64(enumerant.getOrdinal())); - return; - case FormatSettings::EnumComparingMode::BY_NAMES: - insertInteger(column, nested_type, Int64(enum_type->getValue(String(enumerant.getProto().getName())))); - return; - case FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE: - { - /// Find the same enum name case insensitive. - String enum_name = enumerant.getProto().getName(); - for (auto & name : enum_type->getAllRegisteredNames()) - { - if (compareEnumNames(name, enum_name, enum_comparing_mode)) - { - insertInteger(column, nested_type, Int64(enum_type->getValue(name))); - break; - } - } - } - } -} - -static void insertValue(IColumn & column, const DataTypePtr & column_type, const String & column_name, const capnp::DynamicValue::Reader & value, FormatSettings::EnumComparingMode enum_comparing_mode) -{ - if (column_type->lowCardinality()) - { - auto & lc_column = assert_cast(column); - auto tmp_column = lc_column.getDictionary().getNestedColumn()->cloneEmpty(); - auto dict_type = assert_cast(column_type.get())->getDictionaryType(); - insertValue(*tmp_column, dict_type, column_name, value, enum_comparing_mode); - lc_column.insertFromFullColumn(*tmp_column, 0); - return; - } - - switch (value.getType()) - { - case capnp::DynamicValue::Type::INT: - insertInteger(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::UINT: - insertInteger(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::FLOAT: - insertFloat(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::BOOL: - insertInteger(column, column_type, UInt64(value.as())); - break; - case capnp::DynamicValue::Type::DATA: - insertData(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::TEXT: - insertData(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::ENUM: - if (column_type->getTypeId() == TypeIndex::Enum8) - insertEnum(column, column_type, value.as(), enum_comparing_mode); - else - insertEnum(column, column_type, value.as(), enum_comparing_mode); - break; - case capnp::DynamicValue::LIST: - { - auto list_value = value.as(); - auto & column_array = assert_cast(column); - auto & offsets = column_array.getOffsets(); - offsets.push_back(offsets.back() + list_value.size()); - - auto & nested_column = column_array.getData(); - auto nested_type = assert_cast(column_type.get())->getNestedType(); - for (const auto & nested_value : list_value) - insertValue(nested_column, nested_type, column_name, nested_value, enum_comparing_mode); - break; - } - case capnp::DynamicValue::Type::STRUCT: - { - auto struct_value = value.as(); - if (column_type->isNullable()) - { - auto & nullable_column = assert_cast(column); - auto field = *kj::_::readMaybe(struct_value.which()); - if (field.getType().isVoid()) - nullable_column.insertDefault(); - else - { - auto & nested_column = nullable_column.getNestedColumn(); - auto nested_type = assert_cast(column_type.get())->getNestedType(); - auto nested_value = struct_value.get(field); - insertValue(nested_column, nested_type, column_name, nested_value, enum_comparing_mode); - nullable_column.getNullMapData().push_back(0); - } - } - else if (isTuple(column_type)) - { - auto & tuple_column = assert_cast(column); - const auto * tuple_type = assert_cast(column_type.get()); - bool have_explicit_names = tuple_type->haveExplicitNames(); - auto struct_schema = struct_value.getSchema(); - for (uint32_t i = 0; i != tuple_column.tupleSize(); ++i) - insertValue( - tuple_column.getColumn(i), - tuple_type->getElements()[i], - tuple_type->getElementNames()[i], - struct_value.get(have_explicit_names ? struct_schema.getFieldByName(tuple_type->getElementNames()[i]) : struct_schema.getFields()[i]), - enum_comparing_mode); - } - else if (isMap(column_type)) - { - const auto & map_type = assert_cast(*column_type); - DataTypes key_value_types = {map_type.getKeyType(), map_type.getValueType()}; - Names key_value_names = {"key", "value"}; - auto entries_type = std::make_shared(std::make_shared(key_value_types, key_value_names)); - auto & entries_column = assert_cast(column).getNestedColumn(); - auto entries_field = struct_value.getSchema().getFields()[0]; - insertValue(entries_column, entries_type, column_name, struct_value.get(entries_field), enum_comparing_mode); - } - else - { - /// It can be nested column from Nested type. - auto [field_name, nested_name] = splitCapnProtoFieldName(column_name); - insertValue(column, column_type, nested_name, struct_value.get(nested_name), enum_comparing_mode); - } - break; - } - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected CapnProto value type."); - } -} - bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) { if (in->eof()) @@ -298,12 +72,8 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension { auto array = readMessage(); capnp::FlatArrayMessageReader msg(array); - auto root_reader = msg.getRoot(root); - for (size_t i = 0; i != columns.size(); ++i) - { - auto value = getReaderByColumnName(root_reader, column_names[i]); - insertValue(*columns[i], column_types[i], column_names[i], value, format_settings.capn_proto.enum_comparing_mode); - } + auto root_reader = msg.getRoot(schema); + serializer->readRow(columns, root_reader); } catch (const kj::Exception & e) { @@ -337,8 +107,12 @@ void registerInputFormatCapnProto(FormatFactory & factory) "CapnProto", [](ReadBuffer & buf, const Block & sample, IRowInputFormat::Params params, const FormatSettings & settings) { - return std::make_shared(buf, sample, std::move(params), - FormatSchemaInfo(settings, "CapnProto", true), settings); + return std::make_shared( + buf, + sample, + std::move(params), + CapnProtoSchemaInfo(settings, "CapnProto", sample, settings.capn_proto.use_autogenerated_schema), + settings); }); factory.markFormatSupportsSubsetOfColumns("CapnProto"); factory.registerFileExtension("capnp", "CapnProto"); diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h index cf23f22b643..88f91b5eb11 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h @@ -4,7 +4,8 @@ #if USE_CAPNP #include -#include +#include +#include #include #include @@ -23,7 +24,7 @@ class ReadBuffer; class CapnProtoRowInputFormat final : public IRowInputFormat { public: - CapnProtoRowInputFormat(ReadBuffer & in_, Block header, Params params_, const FormatSchemaInfo & info, const FormatSettings & format_settings_); + CapnProtoRowInputFormat(ReadBuffer & in_, Block header, Params params_, const CapnProtoSchemaInfo & info, const FormatSettings & format_settings); String getName() const override { return "CapnProtoRowInputFormat"; } @@ -33,10 +34,9 @@ private: kj::Array readMessage(); std::shared_ptr parser; - capnp::StructSchema root; - const FormatSettings format_settings; - DataTypes column_types; - Names column_names; + capnp::StructSchema schema; + std::unique_ptr serializer; + }; class CapnProtoSchemaReader : public IExternalSchemaReader diff --git a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp index 0225680b396..0919c93ed28 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp @@ -1,28 +1,13 @@ #include #if USE_CAPNP -#include +#include #include +#include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - namespace DB { @@ -44,252 +29,24 @@ void CapnProtoOutputStream::write(const void * buffer, size_t size) CapnProtoRowOutputFormat::CapnProtoRowOutputFormat( WriteBuffer & out_, const Block & header_, - const FormatSchemaInfo & info, - const FormatSettings & format_settings_) - : IRowOutputFormat(header_, out_), column_names(header_.getNames()), column_types(header_.getDataTypes()), output_stream(std::make_unique(out_)), format_settings(format_settings_) + const CapnProtoSchemaInfo & info, + const FormatSettings & format_settings) + : IRowOutputFormat(header_, out_) + , column_names(header_.getNames()) + , column_types(header_.getDataTypes()) + , output_stream(std::make_unique(out_)) { - schema = schema_parser.getMessageSchema(info); - checkCapnProtoSchemaStructure(schema, getPort(PortKind::Main).getHeader(), format_settings.capn_proto.enum_comparing_mode); -} - -template -static capnp::DynamicEnum getDynamicEnum( - const ColumnPtr & column, - const DataTypePtr & data_type, - size_t row_num, - const capnp::EnumSchema & enum_schema, - FormatSettings::EnumComparingMode mode) -{ - const auto * enum_data_type = assert_cast *>(data_type.get()); - EnumValue enum_value = column->getInt(row_num); - if (mode == FormatSettings::EnumComparingMode::BY_VALUES) - return capnp::DynamicEnum(enum_schema, enum_value); - - auto enum_name = enum_data_type->getNameForValue(enum_value); - for (const auto enumerant : enum_schema.getEnumerants()) - { - if (compareEnumNames(String(enum_name), enumerant.getProto().getName(), mode)) - return capnp::DynamicEnum(enumerant); - } - - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert CLickHouse Enum value to CapnProto Enum"); -} - -static capnp::DynamicValue::Builder initStructFieldBuilder(const ColumnPtr & column, size_t row_num, capnp::DynamicStruct::Builder & struct_builder, capnp::StructSchema::Field field) -{ - if (const auto * array_column = checkAndGetColumn(*column)) - { - size_t size = array_column->getOffsets()[row_num] - array_column->getOffsets()[row_num - 1]; - return struct_builder.init(field, static_cast(size)); - } - - if (field.getType().isStruct()) - return struct_builder.init(field); - - return struct_builder.get(field); -} - -static std::optional convertToDynamicValue( - const ColumnPtr & column, - const DataTypePtr & data_type, - size_t row_num, - const String & column_name, - capnp::DynamicValue::Builder builder, - FormatSettings::EnumComparingMode enum_comparing_mode, - std::vector> & temporary_text_data_storage) -{ - /// Here we don't do any types validation, because we did it in CapnProtoRowOutputFormat constructor. - - if (data_type->lowCardinality()) - { - const auto * lc_column = assert_cast(column.get()); - const auto & dict_type = assert_cast(data_type.get())->getDictionaryType(); - size_t index = lc_column->getIndexAt(row_num); - return convertToDynamicValue(lc_column->getDictionary().getNestedColumn(), dict_type, index, column_name, builder, enum_comparing_mode, temporary_text_data_storage); - } - - switch (builder.getType()) - { - case capnp::DynamicValue::Type::INT: - return capnp::DynamicValue::Reader(column->getInt(row_num)); - case capnp::DynamicValue::Type::UINT: - { - /// IPv4 column doesn't support getUInt method. - if (isIPv4(data_type)) - return capnp::DynamicValue::Reader(assert_cast(column.get())->getElement(row_num)); - return capnp::DynamicValue::Reader(column->getUInt(row_num)); - } - case capnp::DynamicValue::Type::BOOL: - return capnp::DynamicValue::Reader(column->getBool(row_num)); - case capnp::DynamicValue::Type::FLOAT: - return capnp::DynamicValue::Reader(column->getFloat64(row_num)); - case capnp::DynamicValue::Type::ENUM: - { - auto enum_schema = builder.as().getSchema(); - if (data_type->getTypeId() == TypeIndex::Enum8) - return capnp::DynamicValue::Reader( - getDynamicEnum(column, data_type, row_num, enum_schema, enum_comparing_mode)); - return capnp::DynamicValue::Reader( - getDynamicEnum(column, data_type, row_num, enum_schema, enum_comparing_mode)); - } - case capnp::DynamicValue::Type::DATA: - { - auto data = column->getDataAt(row_num); - return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); - } - case capnp::DynamicValue::Type::TEXT: - { - /// In TEXT type data should be null-terminated, but ClickHouse String data could not be. - /// To make data null-terminated we should copy it to temporary String object, but - /// capnp::Text::Reader works only with pointer to the data and it's size, so we should - /// guarantee that new String object life time is longer than capnp::Text::Reader life time. - /// To do this we store new String object in a temporary storage, passed in this function - /// by reference. We use unique_ptr instead of just String to avoid pointers - /// invalidation on vector reallocation. - temporary_text_data_storage.push_back(std::make_unique(column->getDataAt(row_num))); - auto & data = temporary_text_data_storage.back(); - return capnp::DynamicValue::Reader(capnp::Text::Reader(data->data(), data->size())); - } - case capnp::DynamicValue::Type::STRUCT: - { - auto struct_builder = builder.as(); - auto nested_struct_schema = struct_builder.getSchema(); - /// Struct can represent Tuple, Nullable (named union with two fields) or single column when it contains one nested column. - if (data_type->isNullable()) - { - const auto * nullable_type = assert_cast(data_type.get()); - const auto * nullable_column = assert_cast(column.get()); - auto fields = nested_struct_schema.getUnionFields(); - if (nullable_column->isNullAt(row_num)) - { - auto null_field = fields[0].getType().isVoid() ? fields[0] : fields[1]; - struct_builder.set(null_field, capnp::Void()); - } - else - { - auto value_field = fields[0].getType().isVoid() ? fields[1] : fields[0]; - struct_builder.clear(value_field); - const auto & nested_column = nullable_column->getNestedColumnPtr(); - auto value_builder = initStructFieldBuilder(nested_column, row_num, struct_builder, value_field); - auto value = convertToDynamicValue(nested_column, nullable_type->getNestedType(), row_num, column_name, value_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(value_field, *value); - } - } - else if (isTuple(data_type)) - { - const auto * tuple_data_type = assert_cast(data_type.get()); - const auto & nested_types = tuple_data_type->getElements(); - const auto & nested_names = tuple_data_type->getElementNames(); - const auto & nested_columns = assert_cast(column.get())->getColumns(); - bool have_explicit_names = tuple_data_type->haveExplicitNames(); - for (uint32_t i = 0; i != nested_names.size(); ++i) - { - capnp::StructSchema::Field nested_field = have_explicit_names ? nested_struct_schema.getFieldByName(nested_names[i]) : nested_struct_schema.getFields()[i]; - auto field_builder = initStructFieldBuilder(nested_columns[i], row_num, struct_builder, nested_field); - auto value = convertToDynamicValue(nested_columns[i], nested_types[i], row_num, nested_names[i], field_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(nested_field, *value); - } - } - else if (isMap(data_type)) - { - /// We output Map type as follow CapnProto schema - /// - /// struct Map { - /// struct Entry { - /// key @0: Key; - /// value @1: Value; - /// } - /// entries @0 :List(Entry); - /// } - /// - /// And we don't need to check that struct have this form here because we checked it before. - const auto & map_type = assert_cast(*data_type); - DataTypes key_value_types = {map_type.getKeyType(), map_type.getValueType()}; - Names key_value_names = {"key", "value"}; - auto entries_type = std::make_shared(std::make_shared(key_value_types, key_value_names)); - - /// Nested column in Map is actually Array(Tuple), so we can output it according to "entries" field schema. - const auto & entries_column = assert_cast(column.get())->getNestedColumnPtr(); - - auto entries_field = nested_struct_schema.getFields()[0]; - auto field_builder = initStructFieldBuilder(entries_column, row_num, struct_builder, entries_field); - auto entries_value = convertToDynamicValue(entries_column, entries_type, row_num, column_name, field_builder, enum_comparing_mode, temporary_text_data_storage); - if (entries_value) - struct_builder.set(entries_field, *entries_value); - } - else - { - /// It can be nested column from Nested type. - auto [field_name, nested_name] = splitCapnProtoFieldName(column_name); - auto nested_field = nested_struct_schema.getFieldByName(nested_name); - auto field_builder = initStructFieldBuilder(column, row_num, struct_builder, nested_field); - auto value = convertToDynamicValue(column, data_type, row_num, nested_name, field_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(nested_field, *value); - } - return std::nullopt; - } - case capnp::DynamicValue::Type::LIST: - { - auto list_builder = builder.as(); - const auto * array_column = assert_cast(column.get()); - const auto & nested_column = array_column->getDataPtr(); - const auto & nested_type = assert_cast(data_type.get())->getNestedType(); - const auto & offsets = array_column->getOffsets(); - auto offset = offsets[row_num - 1]; - size_t size = offsets[row_num] - offset; - - const auto * nested_array_column = checkAndGetColumn(*nested_column); - for (unsigned i = 0; i != static_cast(size); ++i) - { - capnp::DynamicValue::Builder value_builder; - /// For nested arrays we need to initialize nested list builder. - if (nested_array_column) - { - const auto & nested_offset = nested_array_column->getOffsets(); - size_t nested_array_size = nested_offset[offset + i] - nested_offset[offset + i - 1]; - value_builder = list_builder.init(i, static_cast(nested_array_size)); - } - else - value_builder = list_builder[i]; - - auto value = convertToDynamicValue(nested_column, nested_type, offset + i, column_name, value_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - list_builder.set(i, *value); - } - return std::nullopt; - } - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected CapnProto type."); - } + schema = schema_parser.getMessageSchema(info.getSchemaInfo()); + const auto & header = getPort(PortKind::Main).getHeader(); + serializer = std::make_unique(header.getDataTypes(), header.getNames(), schema, format_settings.capn_proto); + capnp::MallocMessageBuilder message; } void CapnProtoRowOutputFormat::write(const Columns & columns, size_t row_num) { capnp::MallocMessageBuilder message; - /// Temporary storage for data that will be outputted in fields with CapnProto type TEXT. - /// See comment in convertToDynamicValue() for more details. - std::vector> temporary_text_data_storage; capnp::DynamicStruct::Builder root = message.initRoot(schema); - - /// Some columns can share same field builder. For example when we have - /// column with Nested type that was flattened into several columns. - std::unordered_map field_builders; - for (size_t i = 0; i != columns.size(); ++i) - { - auto [struct_builder, field] = getStructBuilderAndFieldByColumnName(root, column_names[i]); - if (!field_builders.contains(field.getIndex())) - { - auto field_builder = initStructFieldBuilder(columns[i], row_num, struct_builder, field); - field_builders[field.getIndex()] = field_builder; - } - auto value = convertToDynamicValue(columns[i], column_types[i], row_num, column_names[i], field_builders[field.getIndex()], format_settings.capn_proto.enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(field, *value); - } - + serializer->writeRow(columns, std::move(root), row_num); capnp::writeMessage(*output_stream, message); } @@ -300,7 +57,11 @@ void registerOutputFormatCapnProto(FormatFactory & factory) const Block & sample, const FormatSettings & format_settings) { - return std::make_shared(buf, sample, FormatSchemaInfo(format_settings, "CapnProto", true), format_settings); + return std::make_shared( + buf, + sample, + CapnProtoSchemaInfo(format_settings, "CapnProto", sample, format_settings.capn_proto.use_autogenerated_schema), + format_settings); }); } diff --git a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h index 5cc7099d4c7..c00dceb498e 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h +++ b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h @@ -3,15 +3,17 @@ #include "config.h" #if USE_CAPNP -#include -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include +# include +# include namespace DB { + class CapnProtoOutputStream : public kj::OutputStream { public: @@ -29,8 +31,8 @@ public: CapnProtoRowOutputFormat( WriteBuffer & out_, const Block & header_, - const FormatSchemaInfo & info, - const FormatSettings & format_settings_); + const CapnProtoSchemaInfo & info, + const FormatSettings & format_settings); String getName() const override { return "CapnProtoRowOutputFormat"; } @@ -43,8 +45,9 @@ private: DataTypes column_types; capnp::StructSchema schema; std::unique_ptr output_stream; - const FormatSettings format_settings; CapnProtoSchemaParser schema_parser; + std::unique_ptr serializer; + }; } diff --git a/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp index 9777f2361a2..8dc5e6fd5d1 100644 --- a/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp @@ -14,7 +14,7 @@ ProtobufListInputFormat::ProtobufListInputFormat( ReadBuffer & in_, const Block & header_, const Params & params_, - const FormatSchemaInfo & schema_info_, + const ProtobufSchemaInfo & schema_info_, bool flatten_google_wrappers_) : IRowInputFormat(header_, in_, params_) , reader(std::make_unique(in_)) @@ -22,7 +22,7 @@ ProtobufListInputFormat::ProtobufListInputFormat( header_.getNames(), header_.getDataTypes(), missing_column_indices, - *ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info_, ProtobufSchemas::WithEnvelope::Yes), + *ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info_.getSchemaInfo(), ProtobufSchemas::WithEnvelope::Yes), /* with_length_delimiter = */ true, /* with_envelope = */ true, flatten_google_wrappers_, @@ -84,7 +84,7 @@ void registerInputFormatProtobufList(FormatFactory & factory) const FormatSettings & settings) { return std::make_shared(buf, sample, std::move(params), - FormatSchemaInfo(settings, "Protobuf", true), settings.protobuf.input_flatten_google_wrappers); + ProtobufSchemaInfo(settings, "Protobuf", sample, settings.protobuf.use_autogenerated_schema), settings.protobuf.input_flatten_google_wrappers); }); factory.markFormatSupportsSubsetOfColumns("ProtobufList"); factory.registerAdditionalInfoForSchemaCacheGetter( diff --git a/src/Processors/Formats/Impl/ProtobufListInputFormat.h b/src/Processors/Formats/Impl/ProtobufListInputFormat.h index ba2e8014878..09eabb0ee59 100644 --- a/src/Processors/Formats/Impl/ProtobufListInputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufListInputFormat.h @@ -28,7 +28,7 @@ public: ReadBuffer & in_, const Block & header_, const Params & params_, - const FormatSchemaInfo & schema_info_, + const ProtobufSchemaInfo & schema_info_, bool flatten_google_wrappers_); String getName() const override { return "ProtobufListInputFormat"; } diff --git a/src/Processors/Formats/Impl/ProtobufListOutputFormat.cpp b/src/Processors/Formats/Impl/ProtobufListOutputFormat.cpp index 815b9ebb61d..ae0b9db7357 100644 --- a/src/Processors/Formats/Impl/ProtobufListOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufListOutputFormat.cpp @@ -2,7 +2,6 @@ #if USE_PROTOBUF # include -# include # include # include # include @@ -13,14 +12,14 @@ namespace DB ProtobufListOutputFormat::ProtobufListOutputFormat( WriteBuffer & out_, const Block & header_, - const FormatSchemaInfo & schema_info_, + const ProtobufSchemaInfo & schema_info_, bool defaults_for_nullable_google_wrappers_) : IRowOutputFormat(header_, out_) , writer(std::make_unique(out)) , serializer(ProtobufSerializer::create( header_.getNames(), header_.getDataTypes(), - *ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info_, ProtobufSchemas::WithEnvelope::Yes), + *ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info_.getSchemaInfo(), ProtobufSchemas::WithEnvelope::Yes), /* with_length_delimiter = */ true, /* with_envelope = */ true, defaults_for_nullable_google_wrappers_, @@ -55,7 +54,7 @@ void registerOutputFormatProtobufList(FormatFactory & factory) const FormatSettings & settings) { return std::make_shared( - buf, header, FormatSchemaInfo(settings, "Protobuf", true), + buf, header, ProtobufSchemaInfo(settings, "Protobuf", header, settings.protobuf.use_autogenerated_schema), settings.protobuf.output_nullables_with_google_wrappers); }); } diff --git a/src/Processors/Formats/Impl/ProtobufListOutputFormat.h b/src/Processors/Formats/Impl/ProtobufListOutputFormat.h index d85018c0351..e7765590d51 100644 --- a/src/Processors/Formats/Impl/ProtobufListOutputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufListOutputFormat.h @@ -4,10 +4,10 @@ #if USE_PROTOBUF # include +# include namespace DB { -class FormatSchemaInfo; class ProtobufWriter; class ProtobufSerializer; @@ -26,7 +26,7 @@ public: ProtobufListOutputFormat( WriteBuffer & out_, const Block & header_, - const FormatSchemaInfo & schema_info_, + const ProtobufSchemaInfo & schema_info_, bool defaults_for_nullable_google_wrappers_); String getName() const override { return "ProtobufListOutputFormat"; } diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index ee60501dba5..94d81fc34c8 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -11,9 +11,9 @@ namespace DB { ProtobufRowInputFormat::ProtobufRowInputFormat(ReadBuffer & in_, const Block & header_, const Params & params_, - const FormatSchemaInfo & schema_info_, bool with_length_delimiter_, bool flatten_google_wrappers_) + const ProtobufSchemaInfo & schema_info_, bool with_length_delimiter_, bool flatten_google_wrappers_) : IRowInputFormat(header_, in_, params_) - , message_descriptor(ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info_, ProtobufSchemas::WithEnvelope::No)) + , message_descriptor(ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info_.getSchemaInfo(), ProtobufSchemas::WithEnvelope::No)) , with_length_delimiter(with_length_delimiter_) , flatten_google_wrappers(flatten_google_wrappers_) { @@ -89,7 +89,7 @@ void registerInputFormatProtobuf(FormatFactory & factory) const FormatSettings & settings) { return std::make_shared(buf, sample, std::move(params), - FormatSchemaInfo(settings, "Protobuf", true), + ProtobufSchemaInfo(settings, "Protobuf", sample, settings.protobuf.use_autogenerated_schema), with_length_delimiter, settings.protobuf.input_flatten_google_wrappers); }); diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h index 5c042f7c5ab..9d7cecd7146 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h @@ -33,7 +33,7 @@ public: ReadBuffer & in_, const Block & header_, const Params & params_, - const FormatSchemaInfo & schema_info_, + const ProtobufSchemaInfo & schema_info_, bool with_length_delimiter_, bool flatten_google_wrappers_); diff --git a/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp index 129c9ca3156..7b4cc1bf0be 100644 --- a/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp @@ -3,7 +3,6 @@ #if USE_PROTOBUF # include # include -# include # include # include # include @@ -20,7 +19,7 @@ namespace ErrorCodes ProtobufRowOutputFormat::ProtobufRowOutputFormat( WriteBuffer & out_, const Block & header_, - const FormatSchemaInfo & schema_info_, + const ProtobufSchemaInfo & schema_info_, const FormatSettings & settings_, bool with_length_delimiter_) : IRowOutputFormat(header_, out_) @@ -28,7 +27,7 @@ ProtobufRowOutputFormat::ProtobufRowOutputFormat( , serializer(ProtobufSerializer::create( header_.getNames(), header_.getDataTypes(), - *ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info_, ProtobufSchemas::WithEnvelope::No), + *ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info_.getSchemaInfo(), ProtobufSchemas::WithEnvelope::No), with_length_delimiter_, /* with_envelope = */ false, settings_.protobuf.output_nullables_with_google_wrappers, @@ -61,7 +60,7 @@ void registerOutputFormatProtobuf(FormatFactory & factory) const FormatSettings & settings) { return std::make_shared( - buf, header, FormatSchemaInfo(settings, "Protobuf", true), + buf, header, ProtobufSchemaInfo(settings, "Protobuf", header, settings.protobuf.use_autogenerated_schema), settings, with_length_delimiter); }); } diff --git a/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h b/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h index f6ff5bae999..213e1c785fd 100644 --- a/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h @@ -4,11 +4,11 @@ #if USE_PROTOBUF # include +# include namespace DB { class DB; -class FormatSchemaInfo; class ProtobufSerializer; class ProtobufWriter; class WriteBuffer; @@ -30,7 +30,7 @@ public: ProtobufRowOutputFormat( WriteBuffer & out_, const Block & header_, - const FormatSchemaInfo & schema_info_, + const ProtobufSchemaInfo & schema_info_, const FormatSettings & settings_, bool with_length_delimiter_); diff --git a/src/Storages/StorageGenerateRandom.cpp b/src/Storages/StorageGenerateRandom.cpp index 567c674ded9..e48d3187cb2 100644 --- a/src/Storages/StorageGenerateRandom.cpp +++ b/src/Storages/StorageGenerateRandom.cpp @@ -470,7 +470,7 @@ class GenerateSource : public ISource { public: GenerateSource(UInt64 block_size_, UInt64 max_array_length_, UInt64 max_string_length_, UInt64 random_seed_, Block block_header_, ContextPtr context_) - : ISource(Nested::flatten(prepareBlockToFill(block_header_))) + : ISource(Nested::flattenArrayOfTuples(prepareBlockToFill(block_header_))) , block_size(block_size_), max_array_length(max_array_length_), max_string_length(max_string_length_) , block_to_fill(std::move(block_header_)), rng(random_seed_), context(context_) {} @@ -485,7 +485,7 @@ protected: for (const auto & elem : block_to_fill) columns.emplace_back(fillColumnWithRandomData(elem.type, block_size, max_array_length, max_string_length, rng, context)); - columns = Nested::flatten(block_to_fill.cloneWithColumns(columns)).getColumns(); + columns = Nested::flattenArrayOfTuples(block_to_fill.cloneWithColumns(columns)).getColumns(); return {std::move(columns), block_size}; } diff --git a/src/TableFunctions/TableFunctionGenerateRandom.cpp b/src/TableFunctions/TableFunctionGenerateRandom.cpp index 5f1a13d8857..12cbda334a3 100644 --- a/src/TableFunctions/TableFunctionGenerateRandom.cpp +++ b/src/TableFunctions/TableFunctionGenerateRandom.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include "registerTableFunctions.h" @@ -28,7 +29,7 @@ namespace ErrorCodes extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } -void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, ContextPtr /*context*/) +void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, ContextPtr context) { ASTs & args_func = ast_function->children; @@ -45,6 +46,9 @@ void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, Co "Table function '{}' requires at most four arguments: " " structure, [random_seed, max_string_length, max_array_length].", getName()); + /// Allow constant expression for structure argument, it can be generated using generateRandomStructure function. + args[0] = evaluateConstantExpressionAsLiteral(args[0], context); + // All the arguments must be literals. for (const auto & arg : args) { diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index c8cc0cddd30..5b81aa72914 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -306,6 +306,7 @@ ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) bool TableFunctionS3::supportsReadingSubsetOfColumns() { + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format); } From 56c73cab0d4105c41606d1d036cba4b9f89735fe Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 26 Jun 2023 13:30:22 +0200 Subject: [PATCH 05/81] Fix azure test fails and updated to work without create privileges --- src/Storages/StorageAzureBlob.cpp | 68 +++++++++++++++++-------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 4eb60db0b99..6971094556b 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -316,16 +316,19 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co if (configuration.is_connection_string) { result = std::make_unique(BlobContainerClient::CreateFromConnectionString(configuration.connection_url, configuration.container)); - result->CreateIfNotExists(); } else { std::shared_ptr storage_shared_key_credential; - std::unique_ptr blob_service_client; if (configuration.account_name.has_value() && configuration.account_key.has_value()) { storage_shared_key_credential = std::make_shared(*configuration.account_name, *configuration.account_key); + } + + std::unique_ptr blob_service_client; + if (storage_shared_key_credential) + { blob_service_client = std::make_unique(configuration.connection_url, storage_shared_key_credential); } else @@ -333,40 +336,45 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co blob_service_client = std::make_unique(configuration.connection_url); } + Azure::Storage::Blobs::ListBlobContainersOptions options; + options.Prefix = configuration.container; + options.PageSizeHint = 1; - try + auto containers_list_response = blob_service_client->ListBlobContainers(options); + auto containers_list = containers_list_response.BlobContainers; + + bool container_exists = false; + for (const auto & container : containers_list) { - blob_service_client = std::make_unique(configuration.connection_url); - result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); - } - catch (const Azure::Storage::StorageException & e) - { - if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) + if (configuration.container == container.Name) { - std::string final_url; - size_t pos = configuration.connection_url.find('?'); - if (pos != std::string::npos) - { - auto url_without_sas = configuration.connection_url.substr(0, pos); - final_url = url_without_sas - + (url_without_sas.back() == '/' ? "" : "/") - + configuration.container - + configuration.connection_url.substr(pos); - } - else - final_url = configuration.connection_url - + (configuration.connection_url.back() == '/' ? "" : "/") - + configuration.container; + container_exists = true; + break; + } + } - if (storage_shared_key_credential) - result = std::make_unique(final_url, storage_shared_key_credential); - else - result = std::make_unique(final_url); + if (container_exists) + { + std::string final_url; + size_t pos = configuration.connection_url.find('?'); + if (pos != std::string::npos) + { + auto url_without_sas = configuration.connection_url.substr(0, pos); + final_url = url_without_sas + (url_without_sas.back() == '/' ? "" : "/") + configuration.container + + configuration.connection_url.substr(pos); } else - { - throw; - } + final_url + = configuration.connection_url + (configuration.connection_url.back() == '/' ? "" : "/") + configuration.container; + + if (storage_shared_key_credential) + result = std::make_unique(final_url, storage_shared_key_credential); + else + result = std::make_unique(final_url); + } + else + { + result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); } } From 8dafd66c8dca5892b3dbdd1387b4c3e0b46e572b Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 26 Jun 2023 16:35:16 +0200 Subject: [PATCH 06/81] Updated creating container using connection string --- src/Storages/StorageAzureBlob.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 6971094556b..8222940ce53 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -315,7 +315,31 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co if (configuration.is_connection_string) { + std::unique_ptr blob_service_client = std::make_unique(BlobServiceClient::CreateFromConnectionString(configuration.connection_url)); + + Azure::Storage::Blobs::ListBlobContainersOptions options; + options.Prefix = configuration.container; + options.PageSizeHint = 1; + + auto containers_list_response = blob_service_client->ListBlobContainers(options); + auto containers_list = containers_list_response.BlobContainers; + + bool container_exists = false; + for (const auto & container : containers_list) + { + if (configuration.container == container.Name) + { + container_exists = true; + break; + } + } + result = std::make_unique(BlobContainerClient::CreateFromConnectionString(configuration.connection_url, configuration.container)); + + if (!container_exists) + { + result->CreateIfNotExists(); + } } else { From 2d3a148ffd48d683276859b29cf952a985b6fb5e Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 6 Jul 2023 10:56:07 +0200 Subject: [PATCH 07/81] Added option to check if container exists before attempting to read/create it, added this flag for all table functions, only used in azureBlobStorage --- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- src/Interpreters/InterpreterDescribeQuery.cpp | 2 +- src/Storages/StorageAzureBlob.cpp | 72 +++++++++---------- src/Storages/StorageAzureBlob.h | 2 +- src/Storages/getStructureOfRemoteTable.cpp | 2 +- src/TableFunctions/Hive/TableFunctionHive.cpp | 5 +- src/TableFunctions/Hive/TableFunctionHive.h | 4 +- src/TableFunctions/ITableFunction.cpp | 8 +-- src/TableFunctions/ITableFunction.h | 4 +- src/TableFunctions/ITableFunctionDataLake.h | 5 +- src/TableFunctions/ITableFunctionFileLike.cpp | 2 +- src/TableFunctions/ITableFunctionFileLike.h | 2 +- src/TableFunctions/ITableFunctionXDBC.cpp | 6 +- src/TableFunctions/ITableFunctionXDBC.h | 4 +- .../TableFunctionAzureBlobStorage.cpp | 8 +-- .../TableFunctionAzureBlobStorage.h | 5 +- .../TableFunctionDictionary.cpp | 6 +- src/TableFunctions/TableFunctionDictionary.h | 4 +- .../TableFunctionExecutable.cpp | 6 +- src/TableFunctions/TableFunctionExecutable.h | 4 +- src/TableFunctions/TableFunctionExplain.cpp | 6 +- src/TableFunctions/TableFunctionExplain.h | 4 +- src/TableFunctions/TableFunctionFile.cpp | 2 +- src/TableFunctions/TableFunctionFile.h | 2 +- src/TableFunctions/TableFunctionFormat.cpp | 6 +- src/TableFunctions/TableFunctionFormat.h | 4 +- .../TableFunctionGenerateRandom.cpp | 6 +- .../TableFunctionGenerateRandom.h | 4 +- src/TableFunctions/TableFunctionHDFS.cpp | 2 +- src/TableFunctions/TableFunctionHDFS.h | 2 +- src/TableFunctions/TableFunctionInput.cpp | 6 +- src/TableFunctions/TableFunctionInput.h | 4 +- .../TableFunctionMeiliSearch.cpp | 4 +- src/TableFunctions/TableFunctionMeiliSearch.h | 4 +- src/TableFunctions/TableFunctionMerge.cpp | 6 +- src/TableFunctions/TableFunctionMerge.h | 4 +- src/TableFunctions/TableFunctionMongoDB.cpp | 6 +- src/TableFunctions/TableFunctionMongoDB.h | 4 +- src/TableFunctions/TableFunctionMySQL.cpp | 5 +- src/TableFunctions/TableFunctionMySQL.h | 4 +- src/TableFunctions/TableFunctionNull.cpp | 4 +- src/TableFunctions/TableFunctionNull.h | 4 +- src/TableFunctions/TableFunctionNumbers.cpp | 4 +- src/TableFunctions/TableFunctionNumbers.h | 4 +- .../TableFunctionPostgreSQL.cpp | 4 +- src/TableFunctions/TableFunctionPostgreSQL.h | 4 +- src/TableFunctions/TableFunctionRedis.cpp | 6 +- src/TableFunctions/TableFunctionRedis.h | 4 +- src/TableFunctions/TableFunctionRemote.cpp | 6 +- src/TableFunctions/TableFunctionRemote.h | 4 +- src/TableFunctions/TableFunctionS3.cpp | 4 +- src/TableFunctions/TableFunctionS3.h | 5 +- src/TableFunctions/TableFunctionS3Cluster.cpp | 2 +- src/TableFunctions/TableFunctionS3Cluster.h | 3 +- src/TableFunctions/TableFunctionSQLite.cpp | 4 +- src/TableFunctions/TableFunctionSQLite.h | 4 +- src/TableFunctions/TableFunctionURL.cpp | 2 +- src/TableFunctions/TableFunctionURL.h | 2 +- .../TableFunctionURLCluster.cpp | 2 +- src/TableFunctions/TableFunctionValues.cpp | 6 +- src/TableFunctions/TableFunctionValues.h | 4 +- src/TableFunctions/TableFunctionView.cpp | 6 +- src/TableFunctions/TableFunctionView.h | 4 +- .../TableFunctionViewIfPermitted.cpp | 8 +-- .../TableFunctionViewIfPermitted.h | 4 +- src/TableFunctions/TableFunctionZeros.cpp | 4 +- src/TableFunctions/TableFunctionZeros.h | 4 +- 67 files changed, 176 insertions(+), 174 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index d0bb3dd389f..35a399b3ce7 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -764,7 +764,7 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti /// Table function without columns list. auto table_function_ast = create.as_table_function->ptr(); auto table_function = TableFunctionFactory::instance().get(table_function_ast, getContext()); - properties.columns = table_function->getActualTableStructure(getContext()); + properties.columns = table_function->getActualTableStructure(getContext(), /*is_insert_query*/ true); } else if (create.is_dictionary) { diff --git a/src/Interpreters/InterpreterDescribeQuery.cpp b/src/Interpreters/InterpreterDescribeQuery.cpp index 0b9eea86b46..26d9e5254f3 100644 --- a/src/Interpreters/InterpreterDescribeQuery.cpp +++ b/src/Interpreters/InterpreterDescribeQuery.cpp @@ -96,7 +96,7 @@ BlockIO InterpreterDescribeQuery::execute() else if (table_expression.table_function) { TableFunctionPtr table_function_ptr = TableFunctionFactory::instance().get(table_expression.table_function, getContext()); - auto table_function_column_descriptions = table_function_ptr->getActualTableStructure(getContext()); + auto table_function_column_descriptions = table_function_ptr->getActualTableStructure(getContext(), /*is_insert_query*/ true); for (const auto & table_function_column_description : table_function_column_descriptions) columns.emplace_back(table_function_column_description); } diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 2485f032808..c2c7700a62f 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -257,7 +257,7 @@ void registerStorageAzureBlob(StorageFactory & factory) throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); auto configuration = StorageAzureBlob::getConfiguration(engine_args, args.getLocalContext()); - auto client = StorageAzureBlob::createClient(configuration); + auto client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); // Use format settings from global server context + settings from // the SETTINGS clause of the create query. Settings from current // session and user are ignored. @@ -309,35 +309,41 @@ void registerStorageAzureBlob(StorageFactory & factory) }); } -AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration configuration) +static bool containerExists(std::unique_ptr &blob_service_client, std::string container_name) +{ + Azure::Storage::Blobs::ListBlobContainersOptions options; + options.Prefix = container_name; + options.PageSizeHint = 1; + + auto containers_list_response = blob_service_client->ListBlobContainers(options); + auto containers_list = containers_list_response.BlobContainers; + + for (const auto & container : containers_list) + { + if (container_name == container.Name) + return true; + } + return false; +} + +AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration configuration, bool is_read_only) { AzureClientPtr result; if (configuration.is_connection_string) { std::unique_ptr blob_service_client = std::make_unique(BlobServiceClient::CreateFromConnectionString(configuration.connection_url)); - - Azure::Storage::Blobs::ListBlobContainersOptions options; - options.Prefix = configuration.container; - options.PageSizeHint = 1; - - auto containers_list_response = blob_service_client->ListBlobContainers(options); - auto containers_list = containers_list_response.BlobContainers; - - bool container_exists = false; - for (const auto & container : containers_list) - { - if (configuration.container == container.Name) - { - container_exists = true; - break; - } - } - result = std::make_unique(BlobContainerClient::CreateFromConnectionString(configuration.connection_url, configuration.container)); + bool container_exists = containerExists(blob_service_client,configuration.container); if (!container_exists) { + if (is_read_only) + throw Exception( + ErrorCodes::DATABASE_ACCESS_DENIED, + "AzureBlobStorage container does not exist '{}'", + configuration.blob_path); + result->CreateIfNotExists(); } } @@ -360,22 +366,7 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co blob_service_client = std::make_unique(configuration.connection_url); } - Azure::Storage::Blobs::ListBlobContainersOptions options; - options.Prefix = configuration.container; - options.PageSizeHint = 1; - - auto containers_list_response = blob_service_client->ListBlobContainers(options); - auto containers_list = containers_list_response.BlobContainers; - - bool container_exists = false; - for (const auto & container : containers_list) - { - if (configuration.container == container.Name) - { - container_exists = true; - break; - } - } + bool container_exists = containerExists(blob_service_client,configuration.container); if (container_exists) { @@ -398,6 +389,11 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co } else { + if (is_read_only) + throw Exception( + ErrorCodes::DATABASE_ACCESS_DENIED, + "AzureBlobStorage container does not exist '{}'", + configuration.blob_path); result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); } } @@ -470,7 +466,7 @@ void StorageAzureBlob::truncate(const ASTPtr &, const StorageMetadataPtr &, Cont { throw Exception( ErrorCodes::DATABASE_ACCESS_DENIED, - "S3 key '{}' contains globs, so the table is in readonly mode", + "AzureBlobStorage key '{}' contains globs, so the table is in readonly mode", configuration.blob_path); } @@ -1259,7 +1255,7 @@ ColumnsDescription StorageAzureBlob::getTableStructureFromData( return nullptr; } - /// S3 file iterator could get new keys after new iteration, check them in schema cache. + ///AzureBlobStorage file iterator could get new keys after new iteration, check them in schema cache. if (ctx->getSettingsRef().schema_inference_use_cache_for_azure && read_keys.size() > prev_read_keys_size) { columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end(), configuration, format_settings, ctx); diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index 3d6b0c64998..9c5541dac38 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -65,7 +65,7 @@ public: ASTPtr partition_by_); static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context); - static AzureClientPtr createClient(StorageAzureBlob::Configuration configuration); + static AzureClientPtr createClient(StorageAzureBlob::Configuration configuration, bool is_read_only); static AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); diff --git a/src/Storages/getStructureOfRemoteTable.cpp b/src/Storages/getStructureOfRemoteTable.cpp index e5fc01be9f4..604da0df567 100644 --- a/src/Storages/getStructureOfRemoteTable.cpp +++ b/src/Storages/getStructureOfRemoteTable.cpp @@ -39,7 +39,7 @@ ColumnsDescription getStructureOfRemoteTableInShard( if (shard_info.isLocal()) { TableFunctionPtr table_function_ptr = TableFunctionFactory::instance().get(table_func_ptr, context); - return table_function_ptr->getActualTableStructure(context); + return table_function_ptr->getActualTableStructure(context, /*is_insert_query*/ true); } auto table_func_name = queryToString(table_func_ptr); diff --git a/src/TableFunctions/Hive/TableFunctionHive.cpp b/src/TableFunctions/Hive/TableFunctionHive.cpp index fb7635181dc..ebebee13092 100644 --- a/src/TableFunctions/Hive/TableFunctionHive.cpp +++ b/src/TableFunctions/Hive/TableFunctionHive.cpp @@ -49,13 +49,14 @@ namespace DB actual_columns = parseColumnsListFromString(table_structure, context_); } - ColumnsDescription TableFunctionHive::getActualTableStructure(ContextPtr /*context_*/) const { return actual_columns; } + ColumnsDescription TableFunctionHive::getActualTableStructure(ContextPtr /*context_*/, bool /*is_insert_query*/) const { return actual_columns; } StoragePtr TableFunctionHive::executeImpl( const ASTPtr & /*ast_function_*/, ContextPtr context_, const std::string & table_name_, - ColumnsDescription /*cached_columns_*/) const + ColumnsDescription /*cached_columns_*/, + bool /*is_insert_query*/) const { const Settings & settings = context_->getSettings(); ParserExpression partition_by_parser; diff --git a/src/TableFunctions/Hive/TableFunctionHive.h b/src/TableFunctions/Hive/TableFunctionHive.h index ec09a87a876..5e48be46ce1 100644 --- a/src/TableFunctions/Hive/TableFunctionHive.h +++ b/src/TableFunctions/Hive/TableFunctionHive.h @@ -17,10 +17,10 @@ public: bool hasStaticStructure() const override { return true; } StoragePtr executeImpl( - const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return storage_type_name; } - ColumnsDescription getActualTableStructure(ContextPtr) const override; + ColumnsDescription getActualTableStructure(ContextPtr, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function_, ContextPtr context_) override; private: diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index df19e0ebad3..137e1dc27fe 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -34,15 +34,15 @@ StoragePtr ITableFunction::execute(const ASTPtr & ast_function, ContextPtr conte auto context_to_use = use_global_context ? context->getGlobalContext() : context; if (cached_columns.empty()) - return executeImpl(ast_function, context, table_name, std::move(cached_columns)); + return executeImpl(ast_function, context, table_name, std::move(cached_columns), is_insert_query); - if (hasStaticStructure() && cached_columns == getActualTableStructure(context)) - return executeImpl(ast_function, context_to_use, table_name, std::move(cached_columns)); + if (hasStaticStructure() && cached_columns == getActualTableStructure(context,is_insert_query)) + return executeImpl(ast_function, context_to_use, table_name, std::move(cached_columns), is_insert_query); auto this_table_function = shared_from_this(); auto get_storage = [=]() -> StoragePtr { - return this_table_function->executeImpl(ast_function, context_to_use, table_name, cached_columns); + return this_table_function->executeImpl(ast_function, context_to_use, table_name, cached_columns, is_insert_query); }; /// It will request actual table structure and create underlying storage lazily diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index fe71005cb9c..028bbb99c15 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -58,7 +58,7 @@ public: virtual void parseArguments(const ASTPtr & /*ast_function*/, ContextPtr /*context*/) {} /// Returns actual table structure probably requested from remote server, may fail - virtual ColumnsDescription getActualTableStructure(ContextPtr /*context*/) const = 0; + virtual ColumnsDescription getActualTableStructure(ContextPtr /*context*/, bool is_insert_query) const = 0; /// Check if table function needs a structure hint from SELECT query in case of /// INSERT INTO FUNCTION ... SELECT ... and INSERT INTO ... SELECT ... FROM table_function(...) @@ -89,7 +89,7 @@ protected: private: virtual StoragePtr executeImpl( - const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const = 0; + const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const = 0; virtual const char * getStorageTypeName() const = 0; }; diff --git a/src/TableFunctions/ITableFunctionDataLake.h b/src/TableFunctions/ITableFunctionDataLake.h index f87838cfb56..6d50e9138ff 100644 --- a/src/TableFunctions/ITableFunctionDataLake.h +++ b/src/TableFunctions/ITableFunctionDataLake.h @@ -26,7 +26,8 @@ protected: const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, - ColumnsDescription /*cached_columns*/) const override + ColumnsDescription /*cached_columns*/, + bool /*is_insert_query*/) const override { ColumnsDescription columns; if (TableFunction::configuration.structure != "auto") @@ -42,7 +43,7 @@ protected: const char * getStorageTypeName() const override { return Storage::name; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override + ColumnsDescription getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const override { if (TableFunction::configuration.structure == "auto") { diff --git a/src/TableFunctions/ITableFunctionFileLike.cpp b/src/TableFunctions/ITableFunctionFileLike.cpp index a60ab70d570..487826dc363 100644 --- a/src/TableFunctions/ITableFunctionFileLike.cpp +++ b/src/TableFunctions/ITableFunctionFileLike.cpp @@ -110,7 +110,7 @@ void ITableFunctionFileLike::addColumnsStructureToArguments(ASTs & args, const S } } -StoragePtr ITableFunctionFileLike::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr ITableFunctionFileLike::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { ColumnsDescription columns; if (structure != "auto") diff --git a/src/TableFunctions/ITableFunctionFileLike.h b/src/TableFunctions/ITableFunctionFileLike.h index 8300cc27591..49a144c17fc 100644 --- a/src/TableFunctions/ITableFunctionFileLike.h +++ b/src/TableFunctions/ITableFunctionFileLike.h @@ -48,7 +48,7 @@ protected: ColumnsDescription structure_hint; private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; virtual StoragePtr getStorage( const String & source, const String & format, const ColumnsDescription & columns, ContextPtr global_context, diff --git a/src/TableFunctions/ITableFunctionXDBC.cpp b/src/TableFunctions/ITableFunctionXDBC.cpp index 1fb0f392e33..59702259b35 100644 --- a/src/TableFunctions/ITableFunctionXDBC.cpp +++ b/src/TableFunctions/ITableFunctionXDBC.cpp @@ -61,7 +61,7 @@ void ITableFunctionXDBC::startBridgeIfNot(ContextPtr context) const } } -ColumnsDescription ITableFunctionXDBC::getActualTableStructure(ContextPtr context) const +ColumnsDescription ITableFunctionXDBC::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { startBridgeIfNot(context); @@ -92,10 +92,10 @@ ColumnsDescription ITableFunctionXDBC::getActualTableStructure(ContextPtr contex return ColumnsDescription{columns}; } -StoragePtr ITableFunctionXDBC::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr ITableFunctionXDBC::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { startBridgeIfNot(context); - auto columns = getActualTableStructure(context); + auto columns = getActualTableStructure(context, is_insert_query); auto result = std::make_shared( StorageID(getDatabaseName(), table_name), schema_name, remote_table_name, columns, ConstraintsDescription{}, String{}, context, helper); result->startup(); diff --git a/src/TableFunctions/ITableFunctionXDBC.h b/src/TableFunctions/ITableFunctionXDBC.h index 984a6a1957f..da0fa83033b 100644 --- a/src/TableFunctions/ITableFunctionXDBC.h +++ b/src/TableFunctions/ITableFunctionXDBC.h @@ -16,7 +16,7 @@ namespace DB class ITableFunctionXDBC : public ITableFunction { private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; /* A factory method to create bridge helper, that will assist in remote interaction */ virtual BridgeHelperPtr createBridgeHelper(ContextPtr context, @@ -24,7 +24,7 @@ private: const std::string & connection_string_, bool use_connection_pooling_) const = 0; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index d51942d133d..278f602d5ff 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -193,12 +193,12 @@ void TableFunctionAzureBlobStorage::parseArguments(const ASTPtr & ast_function, configuration = parseArgumentsImpl(args, context); } -ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(ContextPtr context, bool is_insert_query) const { if (configuration.structure == "auto") { context->checkAccess(getSourceAccessType()); - auto client = StorageAzureBlob::createClient(configuration); + auto client = StorageAzureBlob::createClient(configuration, !is_insert_query); auto settings = StorageAzureBlob::createSettings(context); auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings)); @@ -213,9 +213,9 @@ bool TableFunctionAzureBlobStorage::supportsReadingSubsetOfColumns() return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format); } -StoragePtr TableFunctionAzureBlobStorage::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionAzureBlobStorage::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { - auto client = StorageAzureBlob::createClient(configuration); + auto client = StorageAzureBlob::createClient(configuration, !is_insert_query); auto settings = StorageAzureBlob::createSettings(context); ColumnsDescription columns; diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.h b/src/TableFunctions/TableFunctionAzureBlobStorage.h index 0ac3f9771c7..e4fcd4dda31 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.h +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.h @@ -54,11 +54,12 @@ protected: const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, - ColumnsDescription cached_columns) const override; + ColumnsDescription cached_columns, + bool is_insert_query) const override; const char * getStorageTypeName() const override { return "Azure"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; mutable StorageAzureBlob::Configuration configuration; diff --git a/src/TableFunctions/TableFunctionDictionary.cpp b/src/TableFunctions/TableFunctionDictionary.cpp index 90db9550a72..f0060acb411 100644 --- a/src/TableFunctions/TableFunctionDictionary.cpp +++ b/src/TableFunctions/TableFunctionDictionary.cpp @@ -43,7 +43,7 @@ void TableFunctionDictionary::parseArguments(const ASTPtr & ast_function, Contex dictionary_name = checkAndGetLiteralArgument(args[0], "dictionary_name"); } -ColumnsDescription TableFunctionDictionary::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionDictionary::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { const ExternalDictionariesLoader & external_loader = context->getExternalDictionariesLoader(); std::string resolved_name = external_loader.resolveDictionaryName(dictionary_name, context->getCurrentDatabase()); @@ -76,10 +76,10 @@ ColumnsDescription TableFunctionDictionary::getActualTableStructure(ContextPtr c } StoragePtr TableFunctionDictionary::executeImpl( - const ASTPtr &, ContextPtr context, const std::string & table_name, ColumnsDescription) const + const ASTPtr &, ContextPtr context, const std::string & table_name, ColumnsDescription, bool is_insert_query) const { StorageID dict_id(getDatabaseName(), table_name); - auto dictionary_table_structure = getActualTableStructure(context); + auto dictionary_table_structure = getActualTableStructure(context, is_insert_query); auto result = std::make_shared( dict_id, dictionary_name, std::move(dictionary_table_structure), String{}, StorageDictionary::Location::Custom, context); diff --git a/src/TableFunctions/TableFunctionDictionary.h b/src/TableFunctions/TableFunctionDictionary.h index cc184a32a17..d0beb292fe1 100644 --- a/src/TableFunctions/TableFunctionDictionary.h +++ b/src/TableFunctions/TableFunctionDictionary.h @@ -18,9 +18,9 @@ public: void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "Dictionary"; } diff --git a/src/TableFunctions/TableFunctionExecutable.cpp b/src/TableFunctions/TableFunctionExecutable.cpp index 66e7ce7592c..5a64a988156 100644 --- a/src/TableFunctions/TableFunctionExecutable.cpp +++ b/src/TableFunctions/TableFunctionExecutable.cpp @@ -120,12 +120,12 @@ void TableFunctionExecutable::parseArguments(const ASTPtr & ast_function, Contex } } -ColumnsDescription TableFunctionExecutable::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionExecutable::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { return parseColumnsListFromString(structure, context); } -StoragePtr TableFunctionExecutable::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionExecutable::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { auto storage_id = StorageID(getDatabaseName(), table_name); auto global_context = context->getGlobalContext(); @@ -135,7 +135,7 @@ StoragePtr TableFunctionExecutable::executeImpl(const ASTPtr & /*ast_function*/, if (settings_query != nullptr) settings.applyChanges(settings_query->as()->changes); - auto storage = std::make_shared(storage_id, format, settings, input_queries, getActualTableStructure(context), ConstraintsDescription{}); + auto storage = std::make_shared(storage_id, format, settings, input_queries, getActualTableStructure(context, is_insert_query), ConstraintsDescription{}); storage->startup(); return storage; } diff --git a/src/TableFunctions/TableFunctionExecutable.h b/src/TableFunctions/TableFunctionExecutable.h index 2d9f86e14e6..aa595312fe4 100644 --- a/src/TableFunctions/TableFunctionExecutable.h +++ b/src/TableFunctions/TableFunctionExecutable.h @@ -24,11 +24,11 @@ public: bool hasStaticStructure() const override { return true; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "Executable"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; std::vector skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr context) const override; diff --git a/src/TableFunctions/TableFunctionExplain.cpp b/src/TableFunctions/TableFunctionExplain.cpp index aae93c8b596..f127979d92a 100644 --- a/src/TableFunctions/TableFunctionExplain.cpp +++ b/src/TableFunctions/TableFunctionExplain.cpp @@ -91,7 +91,7 @@ void TableFunctionExplain::parseArguments(const ASTPtr & ast_function, ContextPt query = std::move(explain_query); } -ColumnsDescription TableFunctionExplain::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionExplain::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { Block sample_block = getInterpreter(context).getSampleBlock(query->as()->getKind()); ColumnsDescription columns_description; @@ -123,7 +123,7 @@ static Block executeMonoBlock(QueryPipeline & pipeline) } StoragePtr TableFunctionExplain::executeImpl( - const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const + const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { /// To support settings inside explain subquery. auto mutable_context = Context::createCopy(context); @@ -132,7 +132,7 @@ StoragePtr TableFunctionExplain::executeImpl( Block block = executeMonoBlock(blockio.pipeline); StorageID storage_id(getDatabaseName(), table_name); - auto storage = std::make_shared(storage_id, getActualTableStructure(context), std::move(block)); + auto storage = std::make_shared(storage_id, getActualTableStructure(context, is_insert_query), std::move(block)); storage->startup(); return storage; } diff --git a/src/TableFunctions/TableFunctionExplain.h b/src/TableFunctions/TableFunctionExplain.h index 99d3e52ee68..2eb7e35d0b5 100644 --- a/src/TableFunctions/TableFunctionExplain.h +++ b/src/TableFunctions/TableFunctionExplain.h @@ -17,7 +17,7 @@ public: std::string getName() const override { return name; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const String & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const String & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "Explain"; } @@ -25,7 +25,7 @@ private: void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; InterpreterExplainQuery getInterpreter(ContextPtr context) const; diff --git a/src/TableFunctions/TableFunctionFile.cpp b/src/TableFunctions/TableFunctionFile.cpp index 0e49f26db40..806ea332636 100644 --- a/src/TableFunctions/TableFunctionFile.cpp +++ b/src/TableFunctions/TableFunctionFile.cpp @@ -83,7 +83,7 @@ StoragePtr TableFunctionFile::getStorage(const String & source, return std::make_shared(source, global_context->getUserFilesPath(), args); } -ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { if (structure == "auto") { diff --git a/src/TableFunctions/TableFunctionFile.h b/src/TableFunctions/TableFunctionFile.h index 797948cad03..439ae87b4ae 100644 --- a/src/TableFunctions/TableFunctionFile.h +++ b/src/TableFunctions/TableFunctionFile.h @@ -20,7 +20,7 @@ public: return name; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override { diff --git a/src/TableFunctions/TableFunctionFormat.cpp b/src/TableFunctions/TableFunctionFormat.cpp index f5aff4bd098..8d788a7974e 100644 --- a/src/TableFunctions/TableFunctionFormat.cpp +++ b/src/TableFunctions/TableFunctionFormat.cpp @@ -51,7 +51,7 @@ void TableFunctionFormat::parseArguments(const ASTPtr & ast_function, ContextPtr structure = checkAndGetLiteralArgument(args[1], "structure"); } -ColumnsDescription TableFunctionFormat::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionFormat::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { if (structure == "auto") { @@ -87,9 +87,9 @@ Block TableFunctionFormat::parseData(ColumnsDescription columns, ContextPtr cont return concatenateBlocks(blocks); } -StoragePtr TableFunctionFormat::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionFormat::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { - auto columns = getActualTableStructure(context); + auto columns = getActualTableStructure(context, is_insert_query); Block res_block = parseData(columns, context); auto res = std::make_shared(StorageID(getDatabaseName(), table_name), columns, res_block); res->startup(); diff --git a/src/TableFunctions/TableFunctionFormat.h b/src/TableFunctions/TableFunctionFormat.h index d64ab14cb64..e20e8b6ea4b 100644 --- a/src/TableFunctions/TableFunctionFormat.h +++ b/src/TableFunctions/TableFunctionFormat.h @@ -18,10 +18,10 @@ public: bool hasStaticStructure() const override { return false; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "Values"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; Block parseData(ColumnsDescription columns, ContextPtr context) const; diff --git a/src/TableFunctions/TableFunctionGenerateRandom.cpp b/src/TableFunctions/TableFunctionGenerateRandom.cpp index 08059796660..c6a9154cc66 100644 --- a/src/TableFunctions/TableFunctionGenerateRandom.cpp +++ b/src/TableFunctions/TableFunctionGenerateRandom.cpp @@ -97,7 +97,7 @@ void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, Co } } -ColumnsDescription TableFunctionGenerateRandom::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionGenerateRandom::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { if (structure == "auto") { @@ -113,9 +113,9 @@ ColumnsDescription TableFunctionGenerateRandom::getActualTableStructure(ContextP return parseColumnsListFromString(structure, context); } -StoragePtr TableFunctionGenerateRandom::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionGenerateRandom::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { - ColumnsDescription columns = getActualTableStructure(context); + ColumnsDescription columns = getActualTableStructure(context, is_insert_query); auto res = std::make_shared( StorageID(getDatabaseName(), table_name), columns, String{}, max_array_length, max_string_length, random_seed); res->startup(); diff --git a/src/TableFunctions/TableFunctionGenerateRandom.h b/src/TableFunctions/TableFunctionGenerateRandom.h index 584d65311f4..a5d11ce0af6 100644 --- a/src/TableFunctions/TableFunctionGenerateRandom.h +++ b/src/TableFunctions/TableFunctionGenerateRandom.h @@ -19,10 +19,10 @@ public: void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "GenerateRandom"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; String structure = "auto"; diff --git a/src/TableFunctions/TableFunctionHDFS.cpp b/src/TableFunctions/TableFunctionHDFS.cpp index 7aab55b48c9..a8e2108fda8 100644 --- a/src/TableFunctions/TableFunctionHDFS.cpp +++ b/src/TableFunctions/TableFunctionHDFS.cpp @@ -28,7 +28,7 @@ StoragePtr TableFunctionHDFS::getStorage( compression_method_); } -ColumnsDescription TableFunctionHDFS::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionHDFS::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { if (structure == "auto") { diff --git a/src/TableFunctions/TableFunctionHDFS.h b/src/TableFunctions/TableFunctionHDFS.h index eec0a05fe8d..a7eb5daa440 100644 --- a/src/TableFunctions/TableFunctionHDFS.h +++ b/src/TableFunctions/TableFunctionHDFS.h @@ -34,7 +34,7 @@ public: return signature; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override { diff --git a/src/TableFunctions/TableFunctionInput.cpp b/src/TableFunctions/TableFunctionInput.cpp index 4941241acae..658a55c6fc4 100644 --- a/src/TableFunctions/TableFunctionInput.cpp +++ b/src/TableFunctions/TableFunctionInput.cpp @@ -43,7 +43,7 @@ void TableFunctionInput::parseArguments(const ASTPtr & ast_function, ContextPtr structure = checkAndGetLiteralArgument(evaluateConstantExpressionOrIdentifierAsLiteral(args[0], context), "structure"); } -ColumnsDescription TableFunctionInput::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionInput::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { if (structure == "auto") { @@ -58,9 +58,9 @@ ColumnsDescription TableFunctionInput::getActualTableStructure(ContextPtr contex return parseColumnsListFromString(structure, context); } -StoragePtr TableFunctionInput::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionInput::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { - auto storage = std::make_shared(StorageID(getDatabaseName(), table_name), getActualTableStructure(context)); + auto storage = std::make_shared(StorageID(getDatabaseName(), table_name), getActualTableStructure(context, is_insert_query)); storage->startup(); return storage; } diff --git a/src/TableFunctions/TableFunctionInput.h b/src/TableFunctions/TableFunctionInput.h index 8e7b34cb829..3164ce43eef 100644 --- a/src/TableFunctions/TableFunctionInput.h +++ b/src/TableFunctions/TableFunctionInput.h @@ -20,10 +20,10 @@ public: void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "Input"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; String structure; diff --git a/src/TableFunctions/TableFunctionMeiliSearch.cpp b/src/TableFunctions/TableFunctionMeiliSearch.cpp index 41ae5bb9ee2..01840a80262 100644 --- a/src/TableFunctions/TableFunctionMeiliSearch.cpp +++ b/src/TableFunctions/TableFunctionMeiliSearch.cpp @@ -8,13 +8,13 @@ namespace DB { StoragePtr TableFunctionMeiliSearch::executeImpl( - const ASTPtr & /* ast_function */, ContextPtr /*context*/, const String & table_name, ColumnsDescription /*cached_columns*/) const + const ASTPtr & /* ast_function */, ContextPtr /*context*/, const String & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { return std::make_shared( StorageID(getDatabaseName(), table_name), configuration.value(), ColumnsDescription{}, ConstraintsDescription{}, String{}); } -ColumnsDescription TableFunctionMeiliSearch::getActualTableStructure(ContextPtr /* context */) const +ColumnsDescription TableFunctionMeiliSearch::getActualTableStructure(ContextPtr /* context */, bool /*is_insert_query*/) const { return StorageMeiliSearch::getTableStructureFromData(configuration.value()); } diff --git a/src/TableFunctions/TableFunctionMeiliSearch.h b/src/TableFunctions/TableFunctionMeiliSearch.h index 86be944ab12..a127809a9c5 100644 --- a/src/TableFunctions/TableFunctionMeiliSearch.h +++ b/src/TableFunctions/TableFunctionMeiliSearch.h @@ -13,11 +13,11 @@ public: private: StoragePtr executeImpl( - const ASTPtr & ast_function, ContextPtr context, const String & table_name, ColumnsDescription cached_columns) const override; + const ASTPtr & ast_function, ContextPtr context, const String & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "meilisearch"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; std::optional configuration; diff --git a/src/TableFunctions/TableFunctionMerge.cpp b/src/TableFunctions/TableFunctionMerge.cpp index 586cee54085..599953a1add 100644 --- a/src/TableFunctions/TableFunctionMerge.cpp +++ b/src/TableFunctions/TableFunctionMerge.cpp @@ -118,7 +118,7 @@ const TableFunctionMerge::DBToTableSetMap & TableFunctionMerge::getSourceDatabas return *source_databases_and_tables; } -ColumnsDescription TableFunctionMerge::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionMerge::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { for (const auto & db_with_tables : getSourceDatabasesAndTables(context)) { @@ -134,11 +134,11 @@ ColumnsDescription TableFunctionMerge::getActualTableStructure(ContextPtr contex } -StoragePtr TableFunctionMerge::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionMerge::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { auto res = std::make_shared( StorageID(getDatabaseName(), table_name), - getActualTableStructure(context), + getActualTableStructure(context, is_insert_query), String{}, source_database_name_or_regexp, database_is_regexp, diff --git a/src/TableFunctions/TableFunctionMerge.h b/src/TableFunctions/TableFunctionMerge.h index 3439056deda..8cc5119978a 100644 --- a/src/TableFunctions/TableFunctionMerge.h +++ b/src/TableFunctions/TableFunctionMerge.h @@ -17,13 +17,13 @@ public: std::string getName() const override { return name; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "Merge"; } using TableSet = std::set; using DBToTableSetMap = std::map; const DBToTableSetMap & getSourceDatabasesAndTables(ContextPtr context) const; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; std::vector skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr context) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; static TableSet getMatchedTablesWithAccess(const String & database_name, const String & table_regexp, const ContextPtr & context); diff --git a/src/TableFunctions/TableFunctionMongoDB.cpp b/src/TableFunctions/TableFunctionMongoDB.cpp index 31dd64f8254..5c7c1d98cdf 100644 --- a/src/TableFunctions/TableFunctionMongoDB.cpp +++ b/src/TableFunctions/TableFunctionMongoDB.cpp @@ -27,9 +27,9 @@ namespace ErrorCodes StoragePtr TableFunctionMongoDB::executeImpl(const ASTPtr & /*ast_function*/, - ContextPtr context, const String & table_name, ColumnsDescription /*cached_columns*/) const + ContextPtr context, const String & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { - auto columns = getActualTableStructure(context); + auto columns = getActualTableStructure(context, is_insert_query); auto storage = std::make_shared( StorageID(configuration->database, table_name), configuration->host, @@ -46,7 +46,7 @@ StoragePtr TableFunctionMongoDB::executeImpl(const ASTPtr & /*ast_function*/, return storage; } -ColumnsDescription TableFunctionMongoDB::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionMongoDB::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { return parseColumnsListFromString(structure, context); } diff --git a/src/TableFunctions/TableFunctionMongoDB.h b/src/TableFunctions/TableFunctionMongoDB.h index b5033b2d654..c2c15cabe5a 100644 --- a/src/TableFunctions/TableFunctionMongoDB.h +++ b/src/TableFunctions/TableFunctionMongoDB.h @@ -17,11 +17,11 @@ public: private: StoragePtr executeImpl( const ASTPtr & ast_function, ContextPtr context, - const std::string & table_name, ColumnsDescription cached_columns) const override; + const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "MongoDB"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; std::optional configuration; diff --git a/src/TableFunctions/TableFunctionMySQL.cpp b/src/TableFunctions/TableFunctionMySQL.cpp index 1705dfcbfa5..03bd2264551 100644 --- a/src/TableFunctions/TableFunctionMySQL.cpp +++ b/src/TableFunctions/TableFunctionMySQL.cpp @@ -57,7 +57,7 @@ void TableFunctionMySQL::parseArguments(const ASTPtr & ast_function, ContextPtr pool.emplace(createMySQLPoolWithFailover(*configuration, mysql_settings)); } -ColumnsDescription TableFunctionMySQL::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionMySQL::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { return StorageMySQL::getTableStructureFromData(*pool, configuration->database, configuration->table, context); } @@ -66,7 +66,8 @@ StoragePtr TableFunctionMySQL::executeImpl( const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, - ColumnsDescription /*cached_columns*/) const + ColumnsDescription /*cached_columns*/, + bool /*is_insert_query*/) const { auto res = std::make_shared( StorageID(getDatabaseName(), table_name), diff --git a/src/TableFunctions/TableFunctionMySQL.h b/src/TableFunctions/TableFunctionMySQL.h index 5a230530bc4..04f619f5f4b 100644 --- a/src/TableFunctions/TableFunctionMySQL.h +++ b/src/TableFunctions/TableFunctionMySQL.h @@ -23,10 +23,10 @@ public: return name; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "MySQL"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; mutable std::optional pool; diff --git a/src/TableFunctions/TableFunctionNull.cpp b/src/TableFunctions/TableFunctionNull.cpp index d25b9e15aa7..57911e16d4b 100644 --- a/src/TableFunctions/TableFunctionNull.cpp +++ b/src/TableFunctions/TableFunctionNull.cpp @@ -32,14 +32,14 @@ void TableFunctionNull::parseArguments(const ASTPtr & ast_function, ContextPtr c structure = checkAndGetLiteralArgument(evaluateConstantExpressionOrIdentifierAsLiteral(arguments[0], context), "structure"); } -ColumnsDescription TableFunctionNull::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionNull::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { if (structure != "auto") return parseColumnsListFromString(structure, context); return default_structure; } -StoragePtr TableFunctionNull::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionNull::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { ColumnsDescription columns; if (structure != "auto") diff --git a/src/TableFunctions/TableFunctionNull.h b/src/TableFunctions/TableFunctionNull.h index 4fece9e6da9..e80552d4cff 100644 --- a/src/TableFunctions/TableFunctionNull.h +++ b/src/TableFunctions/TableFunctionNull.h @@ -23,11 +23,11 @@ public: void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const String & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const String & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "Null"; } void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; String structure = "auto"; ColumnsDescription structure_hint; diff --git a/src/TableFunctions/TableFunctionNumbers.cpp b/src/TableFunctions/TableFunctionNumbers.cpp index ba7a4dc4b36..d6cf50bc7d6 100644 --- a/src/TableFunctions/TableFunctionNumbers.cpp +++ b/src/TableFunctions/TableFunctionNumbers.cpp @@ -23,14 +23,14 @@ namespace ErrorCodes template -ColumnsDescription TableFunctionNumbers::getActualTableStructure(ContextPtr /*context*/) const +ColumnsDescription TableFunctionNumbers::getActualTableStructure(ContextPtr /*context*/, bool /*is_insert_query*/) const { /// NOTE: https://bugs.llvm.org/show_bug.cgi?id=47418 return ColumnsDescription{{{"number", std::make_shared()}}}; } template -StoragePtr TableFunctionNumbers::executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionNumbers::executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { if (const auto * function = ast_function->as()) { diff --git a/src/TableFunctions/TableFunctionNumbers.h b/src/TableFunctions/TableFunctionNumbers.h index 0a2f3eb863e..e380f40f7b2 100644 --- a/src/TableFunctions/TableFunctionNumbers.h +++ b/src/TableFunctions/TableFunctionNumbers.h @@ -19,12 +19,12 @@ public: std::string getName() const override { return name; } bool hasStaticStructure() const override { return true; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "SystemNumbers"; } UInt64 evaluateArgument(ContextPtr context, ASTPtr & argument) const; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; }; diff --git a/src/TableFunctions/TableFunctionPostgreSQL.cpp b/src/TableFunctions/TableFunctionPostgreSQL.cpp index 2b17a1b2c2b..322e0df7c15 100644 --- a/src/TableFunctions/TableFunctionPostgreSQL.cpp +++ b/src/TableFunctions/TableFunctionPostgreSQL.cpp @@ -20,7 +20,7 @@ namespace ErrorCodes StoragePtr TableFunctionPostgreSQL::executeImpl(const ASTPtr & /*ast_function*/, - ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const + ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { auto result = std::make_shared( StorageID(getDatabaseName(), table_name), @@ -38,7 +38,7 @@ StoragePtr TableFunctionPostgreSQL::executeImpl(const ASTPtr & /*ast_function*/, } -ColumnsDescription TableFunctionPostgreSQL::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionPostgreSQL::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { return StoragePostgreSQL::getTableStructureFromData(connection_pool, configuration->table, configuration->schema, context); } diff --git a/src/TableFunctions/TableFunctionPostgreSQL.h b/src/TableFunctions/TableFunctionPostgreSQL.h index 9f10e1c180e..f7d77567dd4 100644 --- a/src/TableFunctions/TableFunctionPostgreSQL.h +++ b/src/TableFunctions/TableFunctionPostgreSQL.h @@ -20,11 +20,11 @@ public: private: StoragePtr executeImpl( const ASTPtr & ast_function, ContextPtr context, - const std::string & table_name, ColumnsDescription cached_columns) const override; + const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "PostgreSQL"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; postgres::PoolWithFailoverPtr connection_pool; diff --git a/src/TableFunctions/TableFunctionRedis.cpp b/src/TableFunctions/TableFunctionRedis.cpp index ec659ae61e0..0b7433845b4 100644 --- a/src/TableFunctions/TableFunctionRedis.cpp +++ b/src/TableFunctions/TableFunctionRedis.cpp @@ -25,9 +25,9 @@ namespace ErrorCodes } StoragePtr TableFunctionRedis::executeImpl( - const ASTPtr & /*ast_function*/, ContextPtr context, const String & table_name, ColumnsDescription /*cached_columns*/) const + const ASTPtr & /*ast_function*/, ContextPtr context, const String & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { - auto columns = getActualTableStructure(context); + auto columns = getActualTableStructure(context, is_insert_query); StorageInMemoryMetadata metadata; metadata.setColumns(columns); @@ -39,7 +39,7 @@ StoragePtr TableFunctionRedis::executeImpl( return storage; } -ColumnsDescription TableFunctionRedis::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionRedis::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { return parseColumnsListFromString(structure, context); } diff --git a/src/TableFunctions/TableFunctionRedis.h b/src/TableFunctions/TableFunctionRedis.h index b985a89e3d7..a7fc0df0a15 100644 --- a/src/TableFunctions/TableFunctionRedis.h +++ b/src/TableFunctions/TableFunctionRedis.h @@ -19,11 +19,11 @@ public: private: StoragePtr executeImpl( const ASTPtr & ast_function, ContextPtr context, - const String & table_name, ColumnsDescription cached_columns) const override; + const String & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "Redis"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; RedisConfiguration configuration; diff --git a/src/TableFunctions/TableFunctionRemote.cpp b/src/TableFunctions/TableFunctionRemote.cpp index 4143014a7b3..72b5fb9b40e 100644 --- a/src/TableFunctions/TableFunctionRemote.cpp +++ b/src/TableFunctions/TableFunctionRemote.cpp @@ -276,12 +276,12 @@ void TableFunctionRemote::parseArguments(const ASTPtr & ast_function, ContextPtr remote_table_id.table_name = table; } -StoragePtr TableFunctionRemote::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const +StoragePtr TableFunctionRemote::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const { /// StorageDistributed supports mismatching structure of remote table, so we can use outdated structure for CREATE ... AS remote(...) /// without additional conversion in StorageTableFunctionProxy if (cached_columns.empty()) - cached_columns = getActualTableStructure(context); + cached_columns = getActualTableStructure(context, is_insert_query); assert(cluster); StoragePtr res = remote_table_function_ptr @@ -318,7 +318,7 @@ StoragePtr TableFunctionRemote::executeImpl(const ASTPtr & /*ast_function*/, Con return res; } -ColumnsDescription TableFunctionRemote::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionRemote::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { assert(cluster); return getStructureOfRemoteTable(*cluster, remote_table_id, context, remote_table_function_ptr); diff --git a/src/TableFunctions/TableFunctionRemote.h b/src/TableFunctions/TableFunctionRemote.h index 32039d1e6a8..0f75bf2b854 100644 --- a/src/TableFunctions/TableFunctionRemote.h +++ b/src/TableFunctions/TableFunctionRemote.h @@ -22,13 +22,13 @@ public: std::string getName() const override { return name; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; bool needStructureConversion() const override { return false; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "Distributed"; } void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index c8cc0cddd30..d1955e5028b 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -292,7 +292,7 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & } } -ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { if (configuration.structure == "auto") { @@ -309,7 +309,7 @@ bool TableFunctionS3::supportsReadingSubsetOfColumns() return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format); } -StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { S3::URI s3_uri (configuration.url); diff --git a/src/TableFunctions/TableFunctionS3.h b/src/TableFunctions/TableFunctionS3.h index c983bec9bf4..4a469923b14 100644 --- a/src/TableFunctions/TableFunctionS3.h +++ b/src/TableFunctions/TableFunctionS3.h @@ -64,11 +64,12 @@ protected: const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, - ColumnsDescription cached_columns) const override; + ColumnsDescription cached_columns, + bool is_insert_query) const override; const char * getStorageTypeName() const override { return "S3"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; mutable StorageS3::Configuration configuration; diff --git a/src/TableFunctions/TableFunctionS3Cluster.cpp b/src/TableFunctions/TableFunctionS3Cluster.cpp index 1d93132c411..ce96f7f580b 100644 --- a/src/TableFunctions/TableFunctionS3Cluster.cpp +++ b/src/TableFunctions/TableFunctionS3Cluster.cpp @@ -17,7 +17,7 @@ namespace DB StoragePtr TableFunctionS3Cluster::executeImpl( const ASTPtr & /*function*/, ContextPtr context, - const std::string & table_name, ColumnsDescription /*cached_columns*/) const + const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { StoragePtr storage; ColumnsDescription columns; diff --git a/src/TableFunctions/TableFunctionS3Cluster.h b/src/TableFunctions/TableFunctionS3Cluster.h index 459ff144f02..4fe25079cf4 100644 --- a/src/TableFunctions/TableFunctionS3Cluster.h +++ b/src/TableFunctions/TableFunctionS3Cluster.h @@ -52,7 +52,8 @@ protected: const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, - ColumnsDescription cached_columns) const override; + ColumnsDescription cached_columns, + bool is_insert_query) const override; const char * getStorageTypeName() const override { return "S3Cluster"; } }; diff --git a/src/TableFunctions/TableFunctionSQLite.cpp b/src/TableFunctions/TableFunctionSQLite.cpp index a9831363bd9..27e6fcf1fd1 100644 --- a/src/TableFunctions/TableFunctionSQLite.cpp +++ b/src/TableFunctions/TableFunctionSQLite.cpp @@ -29,7 +29,7 @@ namespace ErrorCodes StoragePtr TableFunctionSQLite::executeImpl(const ASTPtr & /*ast_function*/, - ContextPtr context, const String & table_name, ColumnsDescription /*cached_columns*/) const + ContextPtr context, const String & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { auto storage = std::make_shared(StorageID(getDatabaseName(), table_name), sqlite_db, @@ -42,7 +42,7 @@ StoragePtr TableFunctionSQLite::executeImpl(const ASTPtr & /*ast_function*/, } -ColumnsDescription TableFunctionSQLite::getActualTableStructure(ContextPtr /* context */) const +ColumnsDescription TableFunctionSQLite::getActualTableStructure(ContextPtr /* context */, bool /*is_insert_query*/) const { return StorageSQLite::getTableStructureFromData(sqlite_db, remote_table_name); } diff --git a/src/TableFunctions/TableFunctionSQLite.h b/src/TableFunctions/TableFunctionSQLite.h index fded5646b39..74318f058a9 100644 --- a/src/TableFunctions/TableFunctionSQLite.h +++ b/src/TableFunctions/TableFunctionSQLite.h @@ -18,11 +18,11 @@ public: private: StoragePtr executeImpl( const ASTPtr & ast_function, ContextPtr context, - const std::string & table_name, ColumnsDescription cached_columns) const override; + const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "SQLite"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; String database_path, remote_table_name; diff --git a/src/TableFunctions/TableFunctionURL.cpp b/src/TableFunctions/TableFunctionURL.cpp index 4ed204a2af3..8d5a023fc3b 100644 --- a/src/TableFunctions/TableFunctionURL.cpp +++ b/src/TableFunctions/TableFunctionURL.cpp @@ -118,7 +118,7 @@ StoragePtr TableFunctionURL::getStorage( configuration.http_method); } -ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { if (structure == "auto") { diff --git a/src/TableFunctions/TableFunctionURL.h b/src/TableFunctions/TableFunctionURL.h index 021eb71df53..5e58a36dde9 100644 --- a/src/TableFunctions/TableFunctionURL.h +++ b/src/TableFunctions/TableFunctionURL.h @@ -32,7 +32,7 @@ public: return signature; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; static void addColumnsStructureToArguments(ASTs & args, const String & desired_structure, const ContextPtr & context); diff --git a/src/TableFunctions/TableFunctionURLCluster.cpp b/src/TableFunctions/TableFunctionURLCluster.cpp index c94943db758..a2949278155 100644 --- a/src/TableFunctions/TableFunctionURLCluster.cpp +++ b/src/TableFunctions/TableFunctionURLCluster.cpp @@ -38,7 +38,7 @@ StoragePtr TableFunctionURLCluster::getStorage( format, compression_method, StorageID(getDatabaseName(), table_name), - getActualTableStructure(context), + getActualTableStructure(context, /* is_insert_query */ true), ConstraintsDescription{}, configuration, structure != "auto"); diff --git a/src/TableFunctions/TableFunctionValues.cpp b/src/TableFunctions/TableFunctionValues.cpp index cf0e20c624c..42a19874704 100644 --- a/src/TableFunctions/TableFunctionValues.cpp +++ b/src/TableFunctions/TableFunctionValues.cpp @@ -119,14 +119,14 @@ void TableFunctionValues::parseArguments(const ASTPtr & ast_function, ContextPtr structure = ColumnsDescription(names_and_types); } -ColumnsDescription TableFunctionValues::getActualTableStructure(ContextPtr /*context*/) const +ColumnsDescription TableFunctionValues::getActualTableStructure(ContextPtr /*context*/, bool /*is_insert_query*/) const { return structure; } -StoragePtr TableFunctionValues::executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionValues::executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { - auto columns = getActualTableStructure(context); + auto columns = getActualTableStructure(context, is_insert_query); Block sample_block; for (const auto & name_type : columns.getOrdinary()) diff --git a/src/TableFunctions/TableFunctionValues.h b/src/TableFunctions/TableFunctionValues.h index 61ce5158086..7c87bff835e 100644 --- a/src/TableFunctions/TableFunctionValues.h +++ b/src/TableFunctions/TableFunctionValues.h @@ -14,10 +14,10 @@ public: std::string getName() const override { return name; } bool hasStaticStructure() const override { return true; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "Values"; } - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; static DataTypes getTypesFromArgument(const ASTPtr & arg, ContextPtr context); diff --git a/src/TableFunctions/TableFunctionView.cpp b/src/TableFunctions/TableFunctionView.cpp index 6b50e7e0611..2a50fb2d006 100644 --- a/src/TableFunctions/TableFunctionView.cpp +++ b/src/TableFunctions/TableFunctionView.cpp @@ -41,7 +41,7 @@ void TableFunctionView::parseArguments(const ASTPtr & ast_function, ContextPtr / throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table function '{}' requires a query argument.", getName()); } -ColumnsDescription TableFunctionView::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionView::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const { assert(create.select); assert(create.children.size() == 1); @@ -58,9 +58,9 @@ ColumnsDescription TableFunctionView::getActualTableStructure(ContextPtr context } StoragePtr TableFunctionView::executeImpl( - const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const + const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const { - auto columns = getActualTableStructure(context); + auto columns = getActualTableStructure(context, is_insert_query); auto res = std::make_shared(StorageID(getDatabaseName(), table_name), create, columns, ""); res->startup(); return res; diff --git a/src/TableFunctions/TableFunctionView.h b/src/TableFunctions/TableFunctionView.h index bbf072655ed..c679a1f315d 100644 --- a/src/TableFunctions/TableFunctionView.h +++ b/src/TableFunctions/TableFunctionView.h @@ -21,7 +21,7 @@ public: const ASTSelectWithUnionQuery & getSelectQuery() const; private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const String & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const String & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "View"; } @@ -29,7 +29,7 @@ private: void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; ASTCreateQuery create; }; diff --git a/src/TableFunctions/TableFunctionViewIfPermitted.cpp b/src/TableFunctions/TableFunctionViewIfPermitted.cpp index 12762e24f7e..d7944df1b28 100644 --- a/src/TableFunctions/TableFunctionViewIfPermitted.cpp +++ b/src/TableFunctions/TableFunctionViewIfPermitted.cpp @@ -55,16 +55,16 @@ void TableFunctionViewIfPermitted::parseArguments(const ASTPtr & ast_function, C else_table_function = TableFunctionFactory::instance().get(else_ast, context); } -ColumnsDescription TableFunctionViewIfPermitted::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionViewIfPermitted::getActualTableStructure(ContextPtr context, bool is_insert_query) const { - return else_table_function->getActualTableStructure(context); + return else_table_function->getActualTableStructure(context, is_insert_query); } StoragePtr TableFunctionViewIfPermitted::executeImpl( - const ASTPtr & /* ast_function */, ContextPtr context, const std::string & table_name, ColumnsDescription /* cached_columns */) const + const ASTPtr & /* ast_function */, ContextPtr context, const std::string & table_name, ColumnsDescription /* cached_columns */, bool is_insert_query) const { StoragePtr storage; - auto columns = getActualTableStructure(context); + auto columns = getActualTableStructure(context, is_insert_query); if (isPermitted(context, columns)) { diff --git a/src/TableFunctions/TableFunctionViewIfPermitted.h b/src/TableFunctions/TableFunctionViewIfPermitted.h index 9fdb34f30ab..bee4e15bfa5 100644 --- a/src/TableFunctions/TableFunctionViewIfPermitted.h +++ b/src/TableFunctions/TableFunctionViewIfPermitted.h @@ -20,7 +20,7 @@ public: const ASTSelectWithUnionQuery & getSelectQuery() const; private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const String & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const String & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "ViewIfPermitted"; } @@ -28,7 +28,7 @@ private: void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; bool isPermitted(const ContextPtr & context, const ColumnsDescription & else_columns) const; diff --git a/src/TableFunctions/TableFunctionZeros.cpp b/src/TableFunctions/TableFunctionZeros.cpp index 3c487362e1f..eb93626590e 100644 --- a/src/TableFunctions/TableFunctionZeros.cpp +++ b/src/TableFunctions/TableFunctionZeros.cpp @@ -20,14 +20,14 @@ extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; template -ColumnsDescription TableFunctionZeros::getActualTableStructure(ContextPtr /*context*/) const +ColumnsDescription TableFunctionZeros::getActualTableStructure(ContextPtr /*context*/, bool /*is_insert_query*/) const { /// NOTE: https://bugs.llvm.org/show_bug.cgi?id=47418 return ColumnsDescription{{{"zero", std::make_shared()}}}; } template -StoragePtr TableFunctionZeros::executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionZeros::executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { if (const auto * function = ast_function->as()) { diff --git a/src/TableFunctions/TableFunctionZeros.h b/src/TableFunctions/TableFunctionZeros.h index eef1577673e..07d523ee37c 100644 --- a/src/TableFunctions/TableFunctionZeros.h +++ b/src/TableFunctions/TableFunctionZeros.h @@ -19,12 +19,12 @@ public: std::string getName() const override { return name; } bool hasStaticStructure() const override { return true; } private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; const char * getStorageTypeName() const override { return "SystemZeros"; } UInt64 evaluateArgument(ContextPtr context, ASTPtr & argument) const; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; }; From 56e45ba384e69ecabcaf1f147ef29a9b2e2dc872 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 6 Jul 2023 11:17:04 +0200 Subject: [PATCH 08/81] Added test for select query when container does not exist --- src/Storages/StorageAzureBlob.cpp | 4 ++-- tests/integration/test_storage_azure_blob_storage/test.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index c2c7700a62f..d0c51423376 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -342,7 +342,7 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co throw Exception( ErrorCodes::DATABASE_ACCESS_DENIED, "AzureBlobStorage container does not exist '{}'", - configuration.blob_path); + configuration.container); result->CreateIfNotExists(); } @@ -393,7 +393,7 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co throw Exception( ErrorCodes::DATABASE_ACCESS_DENIED, "AzureBlobStorage container does not exist '{}'", - configuration.blob_path); + configuration.container); result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); } } diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 6089466ff5d..87ed6e34116 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -611,3 +611,9 @@ def test_filter_using_file(cluster): query = f"select count(*) from azureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_partition_tf_*.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', '{table_format}') WHERE _file='test_partition_tf_3.csv'" assert azure_query(node, query) == "1\n" + +def test_read_from_not_existing_container(cluster): + node = cluster.instances["node"] + query = f"select * from azureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont_not_exists', 'test_table.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto')" + expected_err_msg = "container does not exist" + assert expected_err_msg in node.query_and_get_error(query) From 69691fd28cebe0d6304bfd7b2fc3607656e816d1 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 6 Jul 2023 09:38:16 +0000 Subject: [PATCH 09/81] Automatic style fix --- tests/integration/test_storage_azure_blob_storage/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 87ed6e34116..4042e41c054 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -612,6 +612,7 @@ def test_filter_using_file(cluster): query = f"select count(*) from azureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_partition_tf_*.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', '{table_format}') WHERE _file='test_partition_tf_3.csv'" assert azure_query(node, query) == "1\n" + def test_read_from_not_existing_container(cluster): node = cluster.instances["node"] query = f"select * from azureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont_not_exists', 'test_table.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto')" From 342af6442822386adff831cc6565c088f6f884e4 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 18 Jul 2023 18:20:03 +0200 Subject: [PATCH 10/81] Fixed issue when multiple requests attempt to create same container --- src/Storages/StorageAzureBlob.cpp | 55 +++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 6855585291a..7db95a25712 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -345,7 +345,17 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co "AzureBlobStorage container does not exist '{}'", configuration.container); - result->CreateIfNotExists(); + try + { + result->CreateIfNotExists(); + } catch (const Azure::Storage::StorageException & e) + { + if (!(e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict + && e.ReasonPhrase == "The specified container already exists.")) + { + throw; + } + } } } else @@ -369,20 +379,20 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co bool container_exists = containerExists(blob_service_client,configuration.container); + std::string final_url; + size_t pos = configuration.connection_url.find('?'); + if (pos != std::string::npos) + { + auto url_without_sas = configuration.connection_url.substr(0, pos); + final_url = url_without_sas + (url_without_sas.back() == '/' ? "" : "/") + configuration.container + + configuration.connection_url.substr(pos); + } + else + final_url + = configuration.connection_url + (configuration.connection_url.back() == '/' ? "" : "/") + configuration.container; + if (container_exists) { - std::string final_url; - size_t pos = configuration.connection_url.find('?'); - if (pos != std::string::npos) - { - auto url_without_sas = configuration.connection_url.substr(0, pos); - final_url = url_without_sas + (url_without_sas.back() == '/' ? "" : "/") + configuration.container - + configuration.connection_url.substr(pos); - } - else - final_url - = configuration.connection_url + (configuration.connection_url.back() == '/' ? "" : "/") + configuration.container; - if (storage_shared_key_credential) result = std::make_unique(final_url, storage_shared_key_credential); else @@ -395,7 +405,24 @@ AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration co ErrorCodes::DATABASE_ACCESS_DENIED, "AzureBlobStorage container does not exist '{}'", configuration.container); - result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); + try + { + result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); + } catch (const Azure::Storage::StorageException & e) + { + if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict + && e.ReasonPhrase == "The specified container already exists.") + { + if (storage_shared_key_credential) + result = std::make_unique(final_url, storage_shared_key_credential); + else + result = std::make_unique(final_url); + } + else + { + throw; + } + } } } From b300781fd8995a4e8feac0e58a9b756c17fe93f7 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 18 Jul 2023 17:48:39 +0000 Subject: [PATCH 11/81] Make better, add tests --- src/Formats/StructureToFormatSchemaUtils.cpp | 1 + src/Functions/generateRandomStructure.cpp | 507 ------------------ src/Functions/structureToFormatSchema.cpp | 20 +- .../02817_structure_to_schema.reference | 488 +++++++++++++++++ .../0_stateless/02817_structure_to_schema.sh | 89 +++ 5 files changed, 588 insertions(+), 517 deletions(-) delete mode 100644 src/Functions/generateRandomStructure.cpp create mode 100644 tests/queries/0_stateless/02817_structure_to_schema.reference create mode 100755 tests/queries/0_stateless/02817_structure_to_schema.sh diff --git a/src/Formats/StructureToFormatSchemaUtils.cpp b/src/Formats/StructureToFormatSchemaUtils.cpp index 02c9af2c65f..fd187c033ec 100644 --- a/src/Formats/StructureToFormatSchemaUtils.cpp +++ b/src/Formats/StructureToFormatSchemaUtils.cpp @@ -1,4 +1,5 @@ #include +#include namespace DB { diff --git a/src/Functions/generateRandomStructure.cpp b/src/Functions/generateRandomStructure.cpp deleted file mode 100644 index 5f2153ff89f..00000000000 --- a/src/Functions/generateRandomStructure.cpp +++ /dev/null @@ -1,507 +0,0 @@ -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int BAD_ARGUMENTS; -} - -class FunctionGenerateRandomStructure : public IFunction -{ -private: - enum class Type - { - Int8, - UInt8, - Bool, - Int16, - UInt16, - Int32, - UInt32, - Int64, - UInt64, - Float32, - Float64, - DateTime64, - Decimal32, - Decimal64, - Date, - Date32, - DateTime, - String, - FixedString, - IPv4, - IPv6, - Int128, - UInt128, - Int256, - UInt256, - Decimal128, - Decimal256, - Enum8, - Enum16, - Nullable, - LowCardinality, - Array, - Tuple, - Map, - Nested, - }; - - static constexpr std::array simple_types - { - Type::Int8, - Type::UInt8, - Type::Bool, - Type::Int16, - Type::UInt16, - Type::Int32, - Type::UInt32, - Type::Int64, - Type::UInt64, - Type::Float32, - Type::Float64, - Type::Date, - Type::Date32, - Type::DateTime, - Type::String, - Type::FixedString, - }; - - static constexpr std::array big_integer_types - { - Type::Int128, - Type::UInt128, - Type::Int256, - Type::UInt256, - }; - - static constexpr std::array decimal_types - { - Type::DateTime64, - Type::Decimal32, - Type::Decimal64, - }; - - static constexpr std::array big_decimal_types - { - Type::Decimal128, - Type::Decimal256, - }; - - static constexpr std::array enum_types - { - Type::Enum8, - Type::Enum16, - }; - - static constexpr std::array ip_types - { - Type::IPv4, - Type::IPv6, - }; - - static constexpr std::array complex_types - { - Type::Nullable, - Type::LowCardinality, - Type::Array, - Type::Tuple, - Type::Map, - Type::Nested, - }; - - static constexpr std::array map_key_types - { - Type::Int8, - Type::UInt8, - Type::Bool, - Type::Int16, - Type::UInt16, - Type::Int32, - Type::UInt32, - Type::Int64, - Type::UInt64, - Type::Date, - Type::Date32, - Type::DateTime, - Type::String, - Type::FixedString, - }; - - static constexpr std::array map_key_string_types - { - Type::String, - Type::FixedString - }; - - static constexpr size_t MAX_NUMBER_OF_COLUMNS = 128; - static constexpr size_t MAX_TUPLE_ELEMENTS = 16; - static constexpr size_t MAX_DATETIME64_PRECISION = 9; - static constexpr size_t MAX_DECIMAL32_PRECISION = 9; - static constexpr size_t MAX_DECIMAL64_PRECISION = 18; - static constexpr size_t MAX_DECIMAL128_PRECISION = 38; - static constexpr size_t MAX_DECIMAL256_PRECISION = 76; - static constexpr size_t MAX_DEPTH = 32; - -public: - static constexpr auto name = "generateRandomStructure"; - - static FunctionPtr create(ContextPtr /*context*/) - { - return std::make_shared(); - } - - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 0; } - - bool isVariadic() const override { return true; } - bool isDeterministic() const override { return false; } - bool isDeterministicInScopeOfQuery() const override { return false; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1, 2, 3, 4, 5, 6}; } - bool useDefaultImplementationForConstants() const override { return false; } - bool useDefaultImplementationForNulls() const override { return false; } - - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override - { - if (arguments.size() > 7) - throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 7", - getName(), arguments.size()); - - for (size_t i = 0; i != 2; ++i) - { - if (arguments.size() == i) - break; - - if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull()) - { - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the {} argument of function {}, expected unsigned integer or Null", - i + 1, - arguments[i]->getName(), - getName()); - } - } - - for (size_t i = 2; i != 7; ++i) - { - if (arguments.size() <= i) - break; - - if (!isUInt8(arguments[i])) - { - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the {} argument of function {}, expected UInt8", - i + 1, - arguments[i]->getName(), - getName()); - } - } - - return std::make_shared(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override - { - size_t seed = randomSeed(); - size_t number_of_columns = 0; - - if (!arguments.empty() && !arguments[0].column->onlyNull()) - { - number_of_columns = arguments[0].column->getUInt(0); - if (number_of_columns > MAX_NUMBER_OF_COLUMNS) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Maximum allowed number of columns is {}, got {}", - MAX_NUMBER_OF_COLUMNS, - number_of_columns); - } - - if (arguments.size() > 1 && !arguments[1].column->onlyNull()) - seed = arguments[1].column->getUInt(0); - - bool allow_big_numbers = true; - if (arguments.size() > 2) - allow_big_numbers = arguments[2].column->getBool(0); - - bool allow_enums = true; - if (arguments.size() > 3) - allow_enums = arguments[3].column->getBool(0); - - bool allow_decimals = true; - if (arguments.size() > 4) - allow_decimals = arguments[4].column->getBool(0); - - bool allow_ip = true; - if (arguments.size() > 5) - allow_ip = arguments[5].column->getBool(0); - - bool only_string_map_key = false; - if (arguments.size() > 6) - only_string_map_key = arguments[6].column->getBool(0); - - pcg64 rng(seed); - if (number_of_columns == 0) - number_of_columns = generateNumberOfColumns(rng); - - auto col_res = ColumnString::create(); - String generated_structure; - for (size_t i = 0; i != number_of_columns; ++i) - { - if (i != 0) - generated_structure += ", "; - String column_name = "c" + std::to_string(i + 1); - auto type = generateRandomType(column_name, rng, allow_big_numbers, allow_enums, allow_decimals, allow_ip, only_string_map_key); - generated_structure += column_name + " " + type; - } - col_res->insert(generated_structure); - return ColumnConst::create(std::move(col_res), input_rows_count); - } - -private: - - size_t generateNumberOfColumns(pcg64 & rng) const - { - return rng() % MAX_NUMBER_OF_COLUMNS + 1; - } - - /// Helper struct to call generateRandomTypeImpl with lots of bool template arguments without writing big if/else over all bool variables. - template - struct Dispatcher - { - static auto call(const FunctionGenerateRandomStructure * f, const String & column_name, pcg64 & rng) - { - return f->generateRandomTypeImpl(column_name, rng); - } - - template - static auto call(const FunctionGenerateRandomStructure * f, const String & column_name, pcg64 & rng, bool b, Args1... ar1) - { - if (b) - return Dispatcher::call(f, column_name, rng, ar1...); - else - return Dispatcher::call(f, column_name, rng, ar1...); - } - - friend FunctionGenerateRandomStructure; - }; - - String generateRandomType(const String & column_name, pcg64 & rng, bool allow_big_numbers, bool allow_enums, bool allow_decimals, bool allow_ip, bool allow_only_string_map_keys) const - { - return Dispatcher<>::call(this, column_name, rng, allow_big_numbers, allow_enums, allow_decimals, allow_ip, allow_only_string_map_keys, true); - } - - template - String generateRandomTypeImpl(const String & column_name, pcg64 & rng, size_t depth = 0) const - { - constexpr auto all_types = getAllTypes(); - auto type = all_types[rng() % all_types.size()]; - - switch (type) - { - case Type::FixedString: - return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; - case Type::DateTime64: - return "DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION) + ")"; - case Type::Decimal32: - return "Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION) + ")"; - case Type::Decimal64: - return "Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION) + ")"; - case Type::Decimal128: - return "Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION) + ")"; - case Type::Decimal256: - return "Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION) + ")"; - case Type::Enum8: - return "Enum8(" + generateEnumValues(column_name, rng) + ")"; - case Type::Enum16: - return "Enum16(" + generateEnumValues(column_name, rng) + ")"; - case Type::LowCardinality: - return "LowCardinality(" + generateLowCardinalityNestedType(rng) + ")"; - case Type::Nullable: - { - auto nested_type = generateRandomTypeImpl(column_name, rng, depth + 1); - return "Nullable(" + nested_type + ")"; - } - case Type::Array: - { - auto nested_type = generateRandomTypeImpl(column_name, rng, depth + 1); - return "Array(" + nested_type + ")"; - } - case Type::Map: - { - auto key_type = generateMapKeyType(rng); - auto value_type = generateRandomTypeImpl(column_name, rng, depth + 1); - return "Map(" + key_type + ", " + value_type + ")"; - } - case Type::Tuple: - { - size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; - bool named_tuple = rng() % 2; - String tuple_type = "Tuple("; - for (size_t i = 0; i != elements; ++i) - { - if (i != 0) - tuple_type += ", "; - - String element_name = "e" + std::to_string(i + 1); - if (named_tuple) - tuple_type += element_name + " "; - tuple_type += generateRandomTypeImpl(element_name, rng, depth + 1); - } - return tuple_type + ")"; - } - case Type::Nested: - { - size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; - String nested_type = "Nested("; - for (size_t i = 0; i != elements; ++i) - { - if (i != 0) - nested_type += ", "; - String element_name = "e" + std::to_string(i + 1); - auto element_type = generateRandomTypeImpl(element_name, rng, depth + 1); - nested_type += element_name + " " + element_type; - } - return nested_type + ")"; - } - default: - return String(magic_enum::enum_name(type)); - } - } - - template - String generateMapKeyType(pcg64 & rng) const - { - Type type; - if constexpr (allow_only_string_map_keys) - type = map_key_string_types[rng() % map_key_string_types.size()]; - else - type = map_key_types[rng() % map_key_types.size()]; - - if (type == Type::FixedString) - return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; - return String(magic_enum::enum_name(type)); - } - - String generateLowCardinalityNestedType(pcg64 & rng) const - { - /// Support only String and FixedString. - String nested_type; - if (rng() % 2) - nested_type = "String"; - else - nested_type = "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; - return rng() % 2 ? nested_type : "Nullable(" + nested_type + ")"; - } - - String generateEnumValues(const String & column_name, pcg64 & rng) const - { - /// Don't generate big enums, because it will lead to really big strings - /// and slowness of this function, and it can lead to `Max query size exceeded` - /// while using this function with generateRandom. - ssize_t num_values = rng() % 16 + 1; - String result; - for (ssize_t i = 0; i != num_values; ++i) - { - if (i != 0) - result += ", "; - result += "'" + column_name + "V" + std::to_string(i) + "' = " + std::to_string(i); - } - return result; - } - - template - static constexpr auto getAllTypes() - { - constexpr size_t big_integer_types_size = big_integer_types.size() * allow_big_numbers; - constexpr size_t enum_types_size = enum_types.size() * allow_enums; - constexpr size_t decimal_types_size = decimal_types.size() * allow_decimals; - constexpr size_t big_decimal_types_size = big_decimal_types.size() * allow_big_numbers * allow_decimals; - constexpr size_t ip_types_size = ip_types.size() * allow_ip; - constexpr size_t complex_types_size = complex_types.size() * allow_complex_types; - - constexpr size_t result_size = simple_types.size() + big_integer_types_size + enum_types_size + decimal_types_size - + big_decimal_types_size + ip_types_size + complex_types_size; - std::array result; - size_t index = 0; - - for (size_t i = 0; i != simple_types.size(); ++i, ++index) - result[index] = simple_types[i]; - - for (size_t i = 0; i != big_integer_types_size; ++i, ++index) - result[index] = big_integer_types[i]; - - for (size_t i = 0; i != enum_types_size; ++i, ++index) - result[index] = enum_types[i]; - - for (size_t i = 0; i != decimal_types_size; ++i, ++index) - result[index] = decimal_types[i]; - - for (size_t i = 0; i != big_decimal_types_size; ++i, ++index) - result[index] = big_decimal_types[i]; - - for (size_t i = 0; i != ip_types_size; ++i, ++index) - result[index] = ip_types[i]; - - for (size_t i = 0; i != complex_types_size; ++i, ++index) - result[index] = complex_types[i]; - - return result; - } -}; - - -REGISTER_FUNCTION(GenerateRandomStructure) -{ - factory.registerFunction( - { - R"( -Generates a random table structure. -This function takes 4 optional constant arguments: -1) the number of column in the result structure (random by default) -2) random seed (random by default) -3) flag that indicates if big number types can be used (true by default) -4) flag that indicates if enum types can be used (true by default) -5) flag that indicates if decimal types can be used (true by default) -6) flag that indicates if ip types (IPv4, IPv6) can be used (true by default) -7) flag that indicates if map keys should be only String or FixedString (false by default) -The maximum number of columns is 128. -The function returns a value of type String. -)", - Documentation::Examples{ - {"random", "SELECT generateRandomStructure()"}, - {"with specified number of arguments", "SELECT generateRandomStructure(10)"}, - {"with specified seed", "SELECT generateRandomStructure(10, 42)"}, - {"without big number types", "SELECT generateRandomStructure(10, NULL, false)"}, - {"without enum types", "SELECT generateRandomStructure(10, NULL, true, false)"}, - {"without decimal types", "SELECT generateRandomStructure(10, NULL, true, true, false)"}, - {"without ip types", "SELECT generateRandomStructure(10, NULL, true, true, true, false)"}, - {"with only string mak key types", "SELECT generateRandomStructure(10, NULL, true, true, true, true, true)"}, - }, - Documentation::Categories{"Random"} - }, - FunctionFactory::CaseSensitive); -} - -} diff --git a/src/Functions/structureToFormatSchema.cpp b/src/Functions/structureToFormatSchema.cpp index 332db44e06e..8c561595504 100644 --- a/src/Functions/structureToFormatSchema.cpp +++ b/src/Functions/structureToFormatSchema.cpp @@ -105,15 +105,15 @@ private: REGISTER_FUNCTION(StructureToCapnProtoSchema) { - factory.registerFunction>( + factory.registerFunction>(FunctionDocumentation { - R"( + .description=R"( )", - Documentation::Examples{ - {"random", "SELECT structureToCapnProtoSchema()"}, + .examples{ + {"random", "SELECT structureToCapnProtoSchema('s String, x UInt32', 'MessageName')", ""}, }, - Documentation::Categories{"Other"} + .categories{"Other"} }, FunctionFactory::CaseSensitive); } @@ -121,15 +121,15 @@ REGISTER_FUNCTION(StructureToCapnProtoSchema) REGISTER_FUNCTION(StructureToProtobufSchema) { - factory.registerFunction>( + factory.registerFunction>(FunctionDocumentation { - R"( + .description=R"( )", - Documentation::Examples{ - {"random", "SELECT structureToCapnProtoSchema()"}, + .examples{ + {"random", "SELECT structureToCapnProtoSchema()", ""}, }, - Documentation::Categories{"Other"} + .categories{"Other"} }, FunctionFactory::CaseSensitive); } diff --git a/tests/queries/0_stateless/02817_structure_to_schema.reference b/tests/queries/0_stateless/02817_structure_to_schema.reference new file mode 100644 index 00000000000..a1aed3f171f --- /dev/null +++ b/tests/queries/0_stateless/02817_structure_to_schema.reference @@ -0,0 +1,488 @@ +CapnProto +Numbers +@0xfcfddd851150d4a7; + +struct Message +{ + int8 @0 : Int8; + uint8 @1 : UInt8; + int16 @2 : Int16; + uint16 @3 : UInt16; + int32 @4 : Int32; + uint32 @5 : UInt32; + int64 @6 : Int64; + uint64 @7 : UInt64; + int128 @8 : Data; + uint128 @9 : Data; + int256 @10 : Data; + uint256 @11 : Data; + float32 @12 : Float32; + float64 @13 : Float64; + decimal32 @14 : Int32; + decimal64 @15 : Int64; + decimal128 @16 : Data; + decimal256 @17 : Data; +} +Dates +@0xf6c4d408fe41b545; + +struct Message +{ + data @0 : UInt16; + date32 @1 : Int32; + datetime @2 : UInt32; + datatime64 @3 : Int64; +} +Strings +@0xaa8865faa0622d35; + +struct Message +{ + string @0 : Data; + fixedstring @1 : Data; +} +Special +@0xc4a88da5454bdc56; + +struct Message +{ + ipv4 @0 : UInt32; + ipv6 @1 : Data; + uuid @2 : Data; +} +Nullable +@0xec817ea81ba08bcf; + +struct Message +{ + struct Nullable + { + union + { + value @0 : UInt32; + null @1 : Void; + } + } + nullable @0 : Nullable; +} +Enums +@0x9d57ed1a12d87f28; + +struct Message +{ + enum Enum8 + { + v1 @0; + v2 @1; + v3 @2; + v4 @3; + } + enum8 @0 : Enum8; + enum Enum16 + { + v5 @0; + v6 @1; + v7 @2; + v8 @3; + v9 @4; + } + enum16 @1 : Enum16; +} +Arrays +@0xc10c035fde2e533a; + +struct Message +{ + arr1 @0 : List(UInt32); + arr2 @1 : List(List(List(UInt32))); +} +Tuples +@0xfbb3d48432e54b68; + +struct Message +{ + struct Tuple1 + { + e1 @0 : UInt32; + e2 @1 : Data; + e3 @2 : UInt32; + } + tuple1 @0 : Tuple1; + struct Tuple2 + { + struct E1 + { + e1 @0 : UInt32; + struct E2 + { + e1 @0 : Data; + e2 @1 : UInt32; + } + e2 @1 : E2; + e3 @2 : Data; + } + e1 @0 : E1; + struct E2 + { + e1 @0 : Data; + e2 @1 : UInt32; + } + e2 @1 : E2; + } + tuple2 @1 : Tuple2; +} +Maps +@0xfdb6e3129839665e; + +struct Message +{ + struct Map1 + { + struct Entry + { + key @0 : Data; + value @1 : UInt32; + } + entries @0 : List(Entry); + } + map1 @0 : Map1; + struct Map2 + { + struct Entry + { + struct Value + { + struct Entry + { + struct Value + { + struct Entry + { + key @0 : Data; + value @1 : UInt32; + } + entries @0 : List(Entry); + } + key @0 : Data; + value @1 : Value; + } + entries @0 : List(Entry); + } + key @0 : Data; + value @1 : Value; + } + entries @0 : List(Entry); + } + map2 @1 : Map2; +} +Complex +@0x8bfcfe1141c8e85c; + +struct Message +{ + struct C1 + { + struct E1 + { + struct Entry + { + struct Value + { + union + { + value @0 : UInt32; + null @1 : Void; + } + } + key @0 : Data; + value @1 : List(List(Value)); + } + entries @0 : List(Entry); + } + e1 @0 : List(E1); + struct E2 + { + struct Entry + { + struct Value + { + struct E1 + { + union + { + value @0 : Data; + null @1 : Void; + } + } + e1 @0 : List(List(E1)); + struct E2 + { + e1 @0 : UInt32; + struct E2 + { + struct E1 + { + union + { + value @0 : Data; + null @1 : Void; + } + } + e1 @0 : List(List(E1)); + e2 @1 : UInt32; + } + e2 @1 : E2; + } + e2 @1 : List(E2); + } + key @0 : Data; + value @1 : Value; + } + entries @0 : List(Entry); + } + e2 @1 : List(E2); + } + c1 @0 : C1; +} +Read/write with no schema +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +Output schema +@0xa43428a60b94a646; + +struct Message +{ + number @0 : UInt64; +} +Bad output schema path +2 +2 +Protobuf +Numbers +syntax = "proto3"; + +message Message +{ + int32 int8 = 1; + uint32 uint8 = 2; + int32 int16 = 3; + uint32 uint16 = 4; + int32 int32 = 5; + uint32 uint32 = 6; + int64 int64 = 7; + uint64 uint64 = 8; + bytes int128 = 9; + bytes uint128 = 10; + bytes int256 = 11; + bytes uint256 = 12; + float float32 = 13; + double float64 = 14; + bytes decimal32 = 15; + bytes decimal64 = 16; + bytes decimal128 = 17; + bytes decimal256 = 18; +} +Dates +syntax = "proto3"; + +message Message +{ + uint32 data = 1; + int32 date32 = 2; + uint32 datetime = 3; + uint64 datatime64 = 4; +} +Strings +syntax = "proto3"; + +message Message +{ + bytes string = 1; + bytes fixedstring = 2; +} +Special +syntax = "proto3"; + +message Message +{ + uint32 ipv4 = 1; + bytes ipv6 = 2; + bytes uuid = 3; +} +Nullable +syntax = "proto3"; + +message Message +{ + uint32 nullable = 1; +} +Enums +syntax = "proto3"; + +message Message +{ + enum Enum8 + { + v1 = 0; + v2 = 1; + v3 = 2; + v4 = 3; + } + Enum8 enum8 = 1; + enum Enum16 + { + v5 = 0; + v6 = 1; + v7 = 2; + v8 = 3; + v9 = 4; + } + Enum16 enum16 = 2; +} +Arrays +syntax = "proto3"; + +message Message +{ + repeated uint32 arr1 = 1; + message Arr2 + { + message Arr2 + { + repeated uint32 arr2 = 1; + } + repeated Arr2 arr2 = 1; + } + repeated Arr2 arr2 = 2; +} +Tuples +syntax = "proto3"; + +message Message +{ + message Tuple1 + { + uint32 e1 = 1; + bytes e2 = 2; + uint32 e3 = 3; + } + Tuple1 tuple1 = 1; + message Tuple2 + { + message E1 + { + uint32 e1 = 1; + message E2 + { + bytes e1 = 1; + uint32 e2 = 2; + } + E2 e2 = 2; + bytes e3 = 3; + } + E1 e1 = 1; + message E2 + { + bytes e1 = 1; + uint32 e2 = 2; + } + E2 e2 = 2; + } + Tuple2 tuple2 = 2; +} +Maps +syntax = "proto3"; + +message Message +{ + map map1 = 1; + message Map2Value + { + message Map2ValueValue + { + map map2ValueValue = 1; + } + map map2Value = 1; + } + map map2 = 2; +} +Complex +syntax = "proto3"; + +message Message +{ + message C1 + { + message E1 + { + message E1Value + { + message E1Value + { + repeated uint32 e1Value = 1; + } + repeated E1Value e1Value = 1; + } + map e1 = 1; + } + repeated E1 e1 = 1; + message E2 + { + message E2Value + { + message E1 + { + repeated bytes e1 = 1; + } + repeated E1 e1 = 1; + message E2 + { + uint32 e1 = 1; + message E2 + { + message E1 + { + repeated bytes e1 = 1; + } + repeated E1 e1 = 1; + uint32 e2 = 2; + } + E2 e2 = 2; + } + repeated E2 e2 = 2; + } + map e2 = 1; + } + repeated E2 e2 = 2; + } + C1 c1 = 1; +} +Read/write with no schema +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +Output schema +syntax = "proto3"; + +message Message +{ + uint64 number = 1; +} +Bad output schema path +2 +2 diff --git a/tests/queries/0_stateless/02817_structure_to_schema.sh b/tests/queries/0_stateless/02817_structure_to_schema.sh new file mode 100755 index 00000000000..8aaf35acb33 --- /dev/null +++ b/tests/queries/0_stateless/02817_structure_to_schema.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME-data +SCHEMA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME-schema + +function test_structure() +{ + format=$1 + ext=$2 + structure=$3 + + $CLICKHOUSE_LOCAL -q "select structureTo${format}Schema('$structure') format TSVRaw" > $SCHEMA_FILE.$ext + cat $SCHEMA_FILE.$ext + + $CLICKHOUSE_LOCAL -q "select * from generateRandom('$structure', 42) limit 10 format $format settings format_schema='$SCHEMA_FILE:Message', format_capn_proto_enum_comparising_mode='by_names'" > $DATA_FILE + $CLICKHOUSE_LOCAL -q "select * from file('$DATA_FILE', $format, '$structure') format Null settings format_schema='$SCHEMA_FILE:Message', format_capn_proto_enum_comparising_mode='by_names'" + +} + +function test_format() +{ + format=$1 + ext=$2 + + echo $format + + echo Numbers + numbers='int8 Int8, uint8 UInt8, int16 Int16, uint16 UInt16, int32 Int32, uint32 UInt32, int64 Int64, uint64 UInt64, int128 Int128, uint128 UInt128, int256 Int256, uint256 UInt256, float32 Float32, float64 Float64, decimal32 Decimal32(3), decimal64 Decimal64(10), decimal128 Decimal128(20), decimal256 Decimal256(40)' + test_structure $format $ext "$numbers" + + echo Dates + dates='data Date, date32 Date32, datetime DateTime, datatime64 DateTime64(9)' + test_structure $format $ext "$dates" + + echo Strings + strings='string String, fixedstring FixedString(42)' + test_structure $format $ext "$strings" + + echo Special + special='ipv4 IPv4, ipv6 IPv6, uuid UUID' + test_structure $format $ext "$special" + + echo Nullable + nullable='nullable Nullable(UInt32)' + test_structure $format $ext "$nullable" + + echo Enums + enums="enum8 Enum8(''v1'' = -100, ''v2'' = -10, ''v3'' = 0, ''v4'' = 42), enum16 Enum16(''v5'' = -2000, ''v6'' = -1000, ''v7'' = 0, ''v8'' = 1000, ''v9'' = 2000)" + test_structure $format $ext "$enums" + + echo Arrays + arrays='arr1 Array(UInt32), arr2 Array(Array(Array(UInt32)))' + test_structure $format $ext "$arrays" + + echo Tuples + tuples='tuple1 Tuple(e1 UInt32, e2 String, e3 DateTime), tuple2 Tuple(e1 Tuple(e1 UInt32, e2 Tuple(e1 String, e2 DateTime), e3 String), e2 Tuple(e1 String, e2 UInt32))' + test_structure $format $ext "$tuples" + + echo Maps + maps='map1 Map(String, UInt32), map2 Map(String, Map(String, Map(String, UInt32)))' + test_structure $format $ext "$maps" + + echo Complex + complex='c1 Array(Tuple(e1 Map(String, Array(Array(Nullable(UInt32)))), e2 Map(String, Tuple(e1 Array(Array(Nullable(String))), e2 Nested(e1 UInt32, e2 Tuple(e1 Array(Array(Nullable(String))), e2 UInt32))))))' + test_structure $format $ext "$complex" + + echo "Read/write with no schema" + $CLICKHOUSE_LOCAL -q "select * from numbers(10) format $format" > $DATA_FILE + $CLICKHOUSE_LOCAL -q "select * from file('$DATA_FILE', $format, 'number UInt64')" + + echo "Output schema" + $CLICKHOUSE_LOCAL -q "select * from numbers(10) format $format settings output_format_schema='$SCHEMA_FILE.$ext'" > $DATA_FILE + cat $SCHEMA_FILE.$ext + + echo "Bad output schema path" + $CLICKHOUSE_CLIENT -q "insert into function file('$DATA_FILE', $format) select * from numbers(10) settings output_format_schema='/tmp/schema.$ext'" 2>&1 | grep "BAD_ARGUMENTS" -c + $CLICKHOUSE_CLIENT -q "insert into function file('$DATA_FILE', $format) select * from numbers(10) settings output_format_schema='../../schema.$ext'" 2>&1 | grep "BAD_ARGUMENTS" -c +} + +test_format CapnProto capnp +test_format Protobuf proto + +rm $DATA_FILE +rm $SCHEMA_FILE* + From 7c69449a0371a907404c263f90fa0414ea6dcb78 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 18 Jul 2023 17:58:42 +0000 Subject: [PATCH 12/81] Clean --- src/TableFunctions/TableFunctionGenerateRandom.cpp | 3 --- src/TableFunctions/TableFunctionS3.cpp | 1 - 2 files changed, 4 deletions(-) diff --git a/src/TableFunctions/TableFunctionGenerateRandom.cpp b/src/TableFunctions/TableFunctionGenerateRandom.cpp index 8d02b9bca51..08059796660 100644 --- a/src/TableFunctions/TableFunctionGenerateRandom.cpp +++ b/src/TableFunctions/TableFunctionGenerateRandom.cpp @@ -56,9 +56,6 @@ void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, Co args[0] = evaluateConstantExpressionAsLiteral(args[0], context); } - /// Allow constant expression for structure argument, it can be generated using generateRandomStructure function. - args[0] = evaluateConstantExpressionAsLiteral(args[0], context); - // All the arguments must be literals. for (const auto & arg : args) { diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index 5b81aa72914..c8cc0cddd30 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -306,7 +306,6 @@ ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) bool TableFunctionS3::supportsReadingSubsetOfColumns() { - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format); } From da6a31bb628de3f038fbf786b2102cbb3a7fee2d Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 19 Jul 2023 13:26:09 +0000 Subject: [PATCH 13/81] Fix tests and style --- src/Formats/StructureToCapnProtoSchema.cpp | 2 +- src/Formats/StructureToProtobufSchema.cpp | 2 +- src/Functions/structureToFormatSchema.cpp | 20 +++++++++++++------ .../0_stateless/02817_structure_to_schema.sh | 1 + 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/Formats/StructureToCapnProtoSchema.cpp b/src/Formats/StructureToCapnProtoSchema.cpp index 56e2155e5bd..9f4d96b7c8a 100644 --- a/src/Formats/StructureToCapnProtoSchema.cpp +++ b/src/Formats/StructureToCapnProtoSchema.cpp @@ -188,7 +188,7 @@ String prepareEnumAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & String prepareAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) { TypeIndex type_id = data_type->getTypeId(); - + switch (data_type->getTypeId()) { case TypeIndex::Nullable: diff --git a/src/Formats/StructureToProtobufSchema.cpp b/src/Formats/StructureToProtobufSchema.cpp index 086a6ff6864..4a704e8d428 100644 --- a/src/Formats/StructureToProtobufSchema.cpp +++ b/src/Formats/StructureToProtobufSchema.cpp @@ -168,7 +168,7 @@ String prepareEnumAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & String prepareAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) { TypeIndex type_id = data_type->getTypeId(); - + switch (data_type->getTypeId()) { case TypeIndex::Nullable: diff --git a/src/Functions/structureToFormatSchema.cpp b/src/Functions/structureToFormatSchema.cpp index 8c561595504..f2bd78e4fb7 100644 --- a/src/Functions/structureToFormatSchema.cpp +++ b/src/Functions/structureToFormatSchema.cpp @@ -19,8 +19,6 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int BAD_ARGUMENTS; - extern const int LOGICAL_ERROR; } template @@ -108,10 +106,14 @@ REGISTER_FUNCTION(StructureToCapnProtoSchema) factory.registerFunction>(FunctionDocumentation { .description=R"( - +Function that converts ClickHouse table structure to CapnProto format schema )", .examples{ - {"random", "SELECT structureToCapnProtoSchema('s String, x UInt32', 'MessageName')", ""}, + {"random", "SELECT structureToCapnProtoSchema('s String, x UInt32', 'MessageName') format TSVRaw", "struct MessageName\n" +"{\n" +" s @0 : Data;\n" +" x @1 : UInt32;\n" +"}"}, }, .categories{"Other"} }, @@ -124,10 +126,16 @@ REGISTER_FUNCTION(StructureToProtobufSchema) factory.registerFunction>(FunctionDocumentation { .description=R"( - +Function that converts ClickHouse table structure to Protobuf format schema )", .examples{ - {"random", "SELECT structureToCapnProtoSchema()", ""}, + {"random", "SELECT structureToCapnProtoSchema('s String, x UInt32', 'MessageName') format TSVRaw", "syntax = \"proto3\";\n" +"\n" +"message MessageName\n" +"{\n" +" bytes s = 1;\n" +" uint32 x = 2;\n" +"}"}, }, .categories{"Other"} }, diff --git a/tests/queries/0_stateless/02817_structure_to_schema.sh b/tests/queries/0_stateless/02817_structure_to_schema.sh index 8aaf35acb33..76c5b1a1d85 100755 --- a/tests/queries/0_stateless/02817_structure_to_schema.sh +++ b/tests/queries/0_stateless/02817_structure_to_schema.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-fasttest CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 10cdaac32f64c6346625ab2ee7e8af89fb2e0b22 Mon Sep 17 00:00:00 2001 From: zvonand Date: Wed, 19 Jul 2023 16:32:09 +0200 Subject: [PATCH 14/81] update Settings.h --- src/Core/Settings.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f1a314fed37..97e297c6feb 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1011,7 +1011,8 @@ class IColumn; M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ \ M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \ - M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \ + M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \ + M(Bool, precise_float_parsing, false, "Prefer more precise (but slower) float parsing algorithm", 0) \ // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. From 710e9a817de5c5b0f47ea1a0ae0ac29ac8e48cb4 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 20 Jul 2023 12:59:05 +0000 Subject: [PATCH 15/81] Fix tests --- src/Functions/structureToFormatSchema.cpp | 2 +- .../0_stateless/02817_structure_to_schema.out | 468 ++++++++++++++++++ .../02817_structure_to_schema.reference | 22 - .../0_stateless/02817_structure_to_schema.sh | 4 +- 4 files changed, 471 insertions(+), 25 deletions(-) create mode 100644 tests/queries/0_stateless/02817_structure_to_schema.out diff --git a/src/Functions/structureToFormatSchema.cpp b/src/Functions/structureToFormatSchema.cpp index f2bd78e4fb7..406da372c04 100644 --- a/src/Functions/structureToFormatSchema.cpp +++ b/src/Functions/structureToFormatSchema.cpp @@ -27,7 +27,7 @@ class FunctionStructureToFormatSchema : public IFunction public: static constexpr auto name = Impl::name; - FunctionStructureToFormatSchema(ContextPtr context_) : context(std::move(context_)) + explicit FunctionStructureToFormatSchema(ContextPtr context_) : context(std::move(context_)) { } diff --git a/tests/queries/0_stateless/02817_structure_to_schema.out b/tests/queries/0_stateless/02817_structure_to_schema.out new file mode 100644 index 00000000000..ad45343139b --- /dev/null +++ b/tests/queries/0_stateless/02817_structure_to_schema.out @@ -0,0 +1,468 @@ +CapnProto +Numbers + +struct Message +{ + int8 @0 : Int8; + uint8 @1 : UInt8; + int16 @2 : Int16; + uint16 @3 : UInt16; + int32 @4 : Int32; + uint32 @5 : UInt32; + int64 @6 : Int64; + uint64 @7 : UInt64; + int128 @8 : Data; + uint128 @9 : Data; + int256 @10 : Data; + uint256 @11 : Data; + float32 @12 : Float32; + float64 @13 : Float64; + decimal32 @14 : Int32; + decimal64 @15 : Int64; + decimal128 @16 : Data; + decimal256 @17 : Data; +} +Dates + +struct Message +{ + data @0 : UInt16; + date32 @1 : Int32; + datetime @2 : UInt32; + datatime64 @3 : Int64; +} +Strings + +struct Message +{ + string @0 : Data; + fixedstring @1 : Data; +} +Special + +struct Message +{ + ipv4 @0 : UInt32; + ipv6 @1 : Data; + uuid @2 : Data; +} +Nullable + +struct Message +{ + struct Nullable + { + union + { + value @0 : UInt32; + null @1 : Void; + } + } + nullable @0 : Nullable; +} +Enums + +struct Message +{ + enum Enum8 + { + v1 @0; + v2 @1; + v3 @2; + v4 @3; + } + enum8 @0 : Enum8; + enum Enum16 + { + v5 @0; + v6 @1; + v7 @2; + v8 @3; + v9 @4; + } + enum16 @1 : Enum16; +} +Arrays + +struct Message +{ + arr1 @0 : List(UInt32); + arr2 @1 : List(List(List(UInt32))); +} +Tuples + +struct Message +{ + struct Tuple1 + { + e1 @0 : UInt32; + e2 @1 : Data; + e3 @2 : UInt32; + } + tuple1 @0 : Tuple1; + struct Tuple2 + { + struct E1 + { + e1 @0 : UInt32; + struct E2 + { + e1 @0 : Data; + e2 @1 : UInt32; + } + e2 @1 : E2; + e3 @2 : Data; + } + e1 @0 : E1; + struct E2 + { + e1 @0 : Data; + e2 @1 : UInt32; + } + e2 @1 : E2; + } + tuple2 @1 : Tuple2; +} +Maps + +struct Message +{ + struct Map1 + { + struct Entry + { + key @0 : Data; + value @1 : UInt32; + } + entries @0 : List(Entry); + } + map1 @0 : Map1; + struct Map2 + { + struct Entry + { + struct Value + { + struct Entry + { + struct Value + { + struct Entry + { + key @0 : Data; + value @1 : UInt32; + } + entries @0 : List(Entry); + } + key @0 : Data; + value @1 : Value; + } + entries @0 : List(Entry); + } + key @0 : Data; + value @1 : Value; + } + entries @0 : List(Entry); + } + map2 @1 : Map2; +} +Complex + +struct Message +{ + struct C1 + { + struct E1 + { + struct Entry + { + struct Value + { + union + { + value @0 : UInt32; + null @1 : Void; + } + } + key @0 : Data; + value @1 : List(List(Value)); + } + entries @0 : List(Entry); + } + e1 @0 : List(E1); + struct E2 + { + struct Entry + { + struct Value + { + struct E1 + { + union + { + value @0 : Data; + null @1 : Void; + } + } + e1 @0 : List(List(E1)); + struct E2 + { + e1 @0 : UInt32; + struct E2 + { + struct E1 + { + union + { + value @0 : Data; + null @1 : Void; + } + } + e1 @0 : List(List(E1)); + e2 @1 : UInt32; + } + e2 @1 : E2; + } + e2 @1 : List(E2); + } + key @0 : Data; + value @1 : Value; + } + entries @0 : List(Entry); + } + e2 @1 : List(E2); + } + c1 @0 : C1; +} +Read/write with no schema +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +Output schema +@0xda5297bc0d5b501a; + +struct Message +{ + number @0 : UInt64; +} +Bad output schema path +2 +2 +Protobuf +Numbers + +message Message +{ + int32 int8 = 1; + uint32 uint8 = 2; + int32 int16 = 3; + uint32 uint16 = 4; + int32 int32 = 5; + uint32 uint32 = 6; + int64 int64 = 7; + uint64 uint64 = 8; + bytes int128 = 9; + bytes uint128 = 10; + bytes int256 = 11; + bytes uint256 = 12; + float float32 = 13; + double float64 = 14; + bytes decimal32 = 15; + bytes decimal64 = 16; + bytes decimal128 = 17; + bytes decimal256 = 18; +} +Dates + +message Message +{ + uint32 data = 1; + int32 date32 = 2; + uint32 datetime = 3; + uint64 datatime64 = 4; +} +Strings + +message Message +{ + bytes string = 1; + bytes fixedstring = 2; +} +Special + +message Message +{ + uint32 ipv4 = 1; + bytes ipv6 = 2; + bytes uuid = 3; +} +Nullable + +message Message +{ + uint32 nullable = 1; +} +Enums + +message Message +{ + enum Enum8 + { + v1 = 0; + v2 = 1; + v3 = 2; + v4 = 3; + } + Enum8 enum8 = 1; + enum Enum16 + { + v5 = 0; + v6 = 1; + v7 = 2; + v8 = 3; + v9 = 4; + } + Enum16 enum16 = 2; +} +Arrays + +message Message +{ + repeated uint32 arr1 = 1; + message Arr2 + { + message Arr2 + { + repeated uint32 arr2 = 1; + } + repeated Arr2 arr2 = 1; + } + repeated Arr2 arr2 = 2; +} +Tuples + +message Message +{ + message Tuple1 + { + uint32 e1 = 1; + bytes e2 = 2; + uint32 e3 = 3; + } + Tuple1 tuple1 = 1; + message Tuple2 + { + message E1 + { + uint32 e1 = 1; + message E2 + { + bytes e1 = 1; + uint32 e2 = 2; + } + E2 e2 = 2; + bytes e3 = 3; + } + E1 e1 = 1; + message E2 + { + bytes e1 = 1; + uint32 e2 = 2; + } + E2 e2 = 2; + } + Tuple2 tuple2 = 2; +} +Maps + +message Message +{ + map map1 = 1; + message Map2Value + { + message Map2ValueValue + { + map map2ValueValue = 1; + } + map map2Value = 1; + } + map map2 = 2; +} +Complex + +message Message +{ + message C1 + { + message E1 + { + message E1Value + { + message E1Value + { + repeated uint32 e1Value = 1; + } + repeated E1Value e1Value = 1; + } + map e1 = 1; + } + repeated E1 e1 = 1; + message E2 + { + message E2Value + { + message E1 + { + repeated bytes e1 = 1; + } + repeated E1 e1 = 1; + message E2 + { + uint32 e1 = 1; + message E2 + { + message E1 + { + repeated bytes e1 = 1; + } + repeated E1 e1 = 1; + uint32 e2 = 2; + } + E2 e2 = 2; + } + repeated E2 e2 = 2; + } + map e2 = 1; + } + repeated E2 e2 = 2; + } + C1 c1 = 1; +} +Read/write with no schema +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +Output schema +syntax = "proto3"; + +message Message +{ + uint64 number = 1; +} +Bad output schema path +2 +2 diff --git a/tests/queries/0_stateless/02817_structure_to_schema.reference b/tests/queries/0_stateless/02817_structure_to_schema.reference index a1aed3f171f..1f39a8ed50e 100644 --- a/tests/queries/0_stateless/02817_structure_to_schema.reference +++ b/tests/queries/0_stateless/02817_structure_to_schema.reference @@ -1,6 +1,5 @@ CapnProto Numbers -@0xfcfddd851150d4a7; struct Message { @@ -24,7 +23,6 @@ struct Message decimal256 @17 : Data; } Dates -@0xf6c4d408fe41b545; struct Message { @@ -34,7 +32,6 @@ struct Message datatime64 @3 : Int64; } Strings -@0xaa8865faa0622d35; struct Message { @@ -42,7 +39,6 @@ struct Message fixedstring @1 : Data; } Special -@0xc4a88da5454bdc56; struct Message { @@ -51,7 +47,6 @@ struct Message uuid @2 : Data; } Nullable -@0xec817ea81ba08bcf; struct Message { @@ -66,7 +61,6 @@ struct Message nullable @0 : Nullable; } Enums -@0x9d57ed1a12d87f28; struct Message { @@ -89,7 +83,6 @@ struct Message enum16 @1 : Enum16; } Arrays -@0xc10c035fde2e533a; struct Message { @@ -97,7 +90,6 @@ struct Message arr2 @1 : List(List(List(UInt32))); } Tuples -@0xfbb3d48432e54b68; struct Message { @@ -132,7 +124,6 @@ struct Message tuple2 @1 : Tuple2; } Maps -@0xfdb6e3129839665e; struct Message { @@ -176,7 +167,6 @@ struct Message map2 @1 : Map2; } Complex -@0x8bfcfe1141c8e85c; struct Message { @@ -256,7 +246,6 @@ Read/write with no schema 8 9 Output schema -@0xa43428a60b94a646; struct Message { @@ -267,7 +256,6 @@ Bad output schema path 2 Protobuf Numbers -syntax = "proto3"; message Message { @@ -291,7 +279,6 @@ message Message bytes decimal256 = 18; } Dates -syntax = "proto3"; message Message { @@ -301,7 +288,6 @@ message Message uint64 datatime64 = 4; } Strings -syntax = "proto3"; message Message { @@ -309,7 +295,6 @@ message Message bytes fixedstring = 2; } Special -syntax = "proto3"; message Message { @@ -318,14 +303,12 @@ message Message bytes uuid = 3; } Nullable -syntax = "proto3"; message Message { uint32 nullable = 1; } Enums -syntax = "proto3"; message Message { @@ -348,7 +331,6 @@ message Message Enum16 enum16 = 2; } Arrays -syntax = "proto3"; message Message { @@ -364,7 +346,6 @@ message Message repeated Arr2 arr2 = 2; } Tuples -syntax = "proto3"; message Message { @@ -399,7 +380,6 @@ message Message Tuple2 tuple2 = 2; } Maps -syntax = "proto3"; message Message { @@ -415,7 +395,6 @@ message Message map map2 = 2; } Complex -syntax = "proto3"; message Message { @@ -477,7 +456,6 @@ Read/write with no schema 8 9 Output schema -syntax = "proto3"; message Message { diff --git a/tests/queries/0_stateless/02817_structure_to_schema.sh b/tests/queries/0_stateless/02817_structure_to_schema.sh index 76c5b1a1d85..3b79fa099a8 100755 --- a/tests/queries/0_stateless/02817_structure_to_schema.sh +++ b/tests/queries/0_stateless/02817_structure_to_schema.sh @@ -15,7 +15,7 @@ function test_structure() structure=$3 $CLICKHOUSE_LOCAL -q "select structureTo${format}Schema('$structure') format TSVRaw" > $SCHEMA_FILE.$ext - cat $SCHEMA_FILE.$ext + tail -n +2 $SCHEMA_FILE.$ext $CLICKHOUSE_LOCAL -q "select * from generateRandom('$structure', 42) limit 10 format $format settings format_schema='$SCHEMA_FILE:Message', format_capn_proto_enum_comparising_mode='by_names'" > $DATA_FILE $CLICKHOUSE_LOCAL -q "select * from file('$DATA_FILE', $format, '$structure') format Null settings format_schema='$SCHEMA_FILE:Message', format_capn_proto_enum_comparising_mode='by_names'" @@ -75,7 +75,7 @@ function test_format() echo "Output schema" $CLICKHOUSE_LOCAL -q "select * from numbers(10) format $format settings output_format_schema='$SCHEMA_FILE.$ext'" > $DATA_FILE - cat $SCHEMA_FILE.$ext + tail -n +2 $SCHEMA_FILE.$ext echo "Bad output schema path" $CLICKHOUSE_CLIENT -q "insert into function file('$DATA_FILE', $format) select * from numbers(10) settings output_format_schema='/tmp/schema.$ext'" 2>&1 | grep "BAD_ARGUMENTS" -c From c8b128aad442c763f1b85c20b8389d240466f97f Mon Sep 17 00:00:00 2001 From: Yury Bogomolov Date: Fri, 21 Jul 2023 01:06:49 +0400 Subject: [PATCH 16/81] add predefined macros support in kafka engine settings --- src/Storages/Kafka/StorageKafka.cpp | 15 ++++--- src/Storages/Kafka/StorageKafka.h | 2 + tests/integration/test_storage_kafka/test.py | 42 ++++++++++++++++++++ 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 2aba76c1a3f..8cc4fd90f8d 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -250,15 +250,20 @@ StorageKafka::StorageKafka( : IStorage(table_id_) , WithContext(context_->getGlobalContext()) , kafka_settings(std::move(kafka_settings_)) - , topics(parseTopics(getContext()->getMacros()->expand(kafka_settings->kafka_topic_list.value))) - , brokers(getContext()->getMacros()->expand(kafka_settings->kafka_broker_list.value)) - , group(getContext()->getMacros()->expand(kafka_settings->kafka_group_name.value)) + , macros_info([&table_id_](){ + Macros::MacroExpansionInfo info; + info.table_id = table_id_; + return info; + }()) + , topics(parseTopics(getContext()->getMacros()->expand(kafka_settings->kafka_topic_list.value, macros_info))) + , brokers(getContext()->getMacros()->expand(kafka_settings->kafka_broker_list.value, macros_info)) + , group(getContext()->getMacros()->expand(kafka_settings->kafka_group_name.value, macros_info)) , client_id( kafka_settings->kafka_client_id.value.empty() ? getDefaultClientId(table_id_) - : getContext()->getMacros()->expand(kafka_settings->kafka_client_id.value)) + : getContext()->getMacros()->expand(kafka_settings->kafka_client_id.value, macros_info)) , format_name(getContext()->getMacros()->expand(kafka_settings->kafka_format.value)) , max_rows_per_message(kafka_settings->kafka_max_rows_per_message.value) - , schema_name(getContext()->getMacros()->expand(kafka_settings->kafka_schema.value)) + , schema_name(getContext()->getMacros()->expand(kafka_settings->kafka_schema.value, macros_info)) , num_consumers(kafka_settings->kafka_num_consumers.value) , log(&Poco::Logger::get("StorageKafka (" + table_id_.table_name + ")")) , semaphore(0, static_cast(num_consumers)) diff --git a/src/Storages/Kafka/StorageKafka.h b/src/Storages/Kafka/StorageKafka.h index 09aa091ef18..72875e77b12 100644 --- a/src/Storages/Kafka/StorageKafka.h +++ b/src/Storages/Kafka/StorageKafka.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -79,6 +80,7 @@ public: private: // Configuration and state std::unique_ptr kafka_settings; + Macros::MacroExpansionInfo macros_info; const Names topics; const String brokers; const String group; diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 9a6d3e0513c..621fc72a607 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -444,6 +444,48 @@ def test_kafka_settings_new_syntax(kafka_cluster): assert members[0]["client_id"] == "instance test 1234" +def test_kafka_settings_predefined_macros(kafka_cluster): + instance.query( + """ + CREATE TABLE test.kafka (key UInt64, value UInt64) + ENGINE = Kafka + SETTINGS kafka_broker_list = '{kafka_broker}:19092', + kafka_topic_list = '{database}_{table}_topic', + kafka_group_name = '{database}_{table}_group', + kafka_format = '{kafka_format_json_each_row}', + kafka_row_delimiter = '\\n', + kafka_commit_on_select = 1, + kafka_client_id = '{database}_{table} test 1234', + kafka_skip_broken_messages = 1; + """ + ) + + messages = [] + for i in range(25): + messages.append(json.dumps({"key": i, "value": i})) + kafka_produce(kafka_cluster, "test_kafka", messages) + + # Insert couple of malformed messages. + kafka_produce(kafka_cluster, "test_kafka", ["}{very_broken_message,"]) + kafka_produce(kafka_cluster, "test_kafka", ["}another{very_broken_message,"]) + + messages = [] + for i in range(25, 50): + messages.append(json.dumps({"key": i, "value": i})) + kafka_produce(kafka_cluster, "test_kafka", messages) + + result = "" + while True: + result += instance.query("SELECT * FROM test.kafka", ignore_error=True) + if kafka_check_result(result): + break + + kafka_check_result(result, True) + + members = describe_consumer_group(kafka_cluster, "new") + assert members[0]["client_id"] == "test_kafka test 1234" + + def test_kafka_json_as_string(kafka_cluster): kafka_produce( kafka_cluster, From 538d1f7c173cae793c65e48dca039d2460de1e4a Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 18 Jul 2023 18:55:34 +0200 Subject: [PATCH 17/81] Kill the runner process with all subprocesses --- tests/ci/worker/init_runner.sh | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/ci/worker/init_runner.sh b/tests/ci/worker/init_runner.sh index 171dd743f90..5939ae8ced2 100644 --- a/tests/ci/worker/init_runner.sh +++ b/tests/ci/worker/init_runner.sh @@ -102,7 +102,8 @@ check_proceed_spot_termination() { runner_pid=$(pgrep Runner.Listener) if [ -n "$runner_pid" ]; then # Kill the runner to not allow it cancelling the job - kill -9 "$runner_pid" + # shellcheck disable=SC2046 + kill -9 $(list_children "$runner_pid") fi sudo -u ubuntu ./config.sh remove --token "$(get_runner_token)" terminate_and_exit @@ -234,6 +235,19 @@ is_job_assigned() { || return 1 } +list_children () { + local children + children=$(ps --ppid "$1" -o pid=) + if [ -z "$children" ]; then + return + fi + + for pid in $children; do + list_children "$pid" + done + echo "$children" +} + while true; do runner_pid=$(pgrep Runner.Listener) echo "Got runner pid '$runner_pid'" @@ -275,7 +289,8 @@ while true; do echo "During the metadata check the job was assigned, continue" continue fi - kill -9 "$runner_pid" + # shellcheck disable=SC2046 + kill -9 $(list_children "$runner_pid") sudo -u ubuntu ./config.sh remove --token "$(get_runner_token)" terminate_on_event fi From da37f32d4b4ed31eb740ed21ee910dcccba16fa0 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 19 Jul 2023 15:11:14 +0200 Subject: [PATCH 18/81] Attempt to delete the runner before shutting down --- tests/ci/worker/init_runner.sh | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/tests/ci/worker/init_runner.sh b/tests/ci/worker/init_runner.sh index 5939ae8ced2..4fbfa748d16 100644 --- a/tests/ci/worker/init_runner.sh +++ b/tests/ci/worker/init_runner.sh @@ -12,7 +12,8 @@ echo "Running init script" export DEBIAN_FRONTEND=noninteractive export RUNNER_HOME=/home/ubuntu/actions-runner -export RUNNER_URL="https://github.com/ClickHouse" +export RUNNER_ORG="ClickHouse" +export RUNNER_URL="https://github.com/${RUNNER_ORG}" # Funny fact, but metadata service has fixed IP INSTANCE_ID=$(ec2metadata --instance-id) export INSTANCE_ID @@ -282,18 +283,11 @@ while true; do RUNNER_AGE=$(( $(date +%s) - $(stat -c +%Y /proc/"$runner_pid" 2>/dev/null || date +%s) )) echo "The runner is launched $RUNNER_AGE seconds ago and still has hot received the job" if (( 60 < RUNNER_AGE )); then - echo "Check if the instance should tear down" - if ! no_terminating_metadata; then - # Another check if the worker still didn't start - if is_job_assigned; then - echo "During the metadata check the job was assigned, continue" - continue - fi - # shellcheck disable=SC2046 - kill -9 $(list_children "$runner_pid") - sudo -u ubuntu ./config.sh remove --token "$(get_runner_token)" - terminate_on_event - fi + echo "Attempt to delete the runner for a graceful shutdown" + sudo -u ubuntu ./config.sh remove --token "$(get_runner_token)" \ + || continue + echo "Runner didn't launch or have assigned jobs after ${RUNNER_AGE} seconds, shutting down" + terminate_and_exit fi fi sleep 5 From a117997d5b9b1386963b7f1d76e2d0a34155198c Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Fri, 21 Jul 2023 16:12:04 +0200 Subject: [PATCH 19/81] Show the runner type in the job logs --- tests/ci/worker/init_runner.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ci/worker/init_runner.sh b/tests/ci/worker/init_runner.sh index 4fbfa748d16..d452c985407 100644 --- a/tests/ci/worker/init_runner.sh +++ b/tests/ci/worker/init_runner.sh @@ -173,6 +173,7 @@ set -uo pipefail echo "Runner's public DNS: $(ec2metadata --public-hostname)" echo "Runner's labels: ${LABELS}" +echo "Runner's instance type: $(ec2metadata --instance-type)" EOF # Create a post-run script that will restart docker daemon before the job started From 083be24a74d34b9e33ae5d3751cdbbb958d9f6fb Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Fri, 21 Jul 2023 17:11:06 +0200 Subject: [PATCH 20/81] Improve logic of checking valid cached values in lambdas --- .../lambda_shared_package/lambda_shared/__init__.py | 9 +++++++++ .../ci/lambda_shared_package/lambda_shared/token.py | 11 +++++------ tests/ci/terminate_runner_lambda/app.py | 13 ++++--------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/tests/ci/lambda_shared_package/lambda_shared/__init__.py b/tests/ci/lambda_shared_package/lambda_shared/__init__.py index c56994cc86a..aa88342fcc3 100644 --- a/tests/ci/lambda_shared_package/lambda_shared/__init__.py +++ b/tests/ci/lambda_shared_package/lambda_shared/__init__.py @@ -219,3 +219,12 @@ def list_runners(access_token: str) -> RunnerDescriptions: result.append(desc) return result + + +def cached_value_is_valid(updated_at: float, ttl: float) -> bool: + "a common function to identify if cachable value is still valid" + if updated_at == 0: + return False + if time.time() - ttl < updated_at: + return True + return False diff --git a/tests/ci/lambda_shared_package/lambda_shared/token.py b/tests/ci/lambda_shared_package/lambda_shared/token.py index d3bf15ab259..6d5653f6a58 100644 --- a/tests/ci/lambda_shared_package/lambda_shared/token.py +++ b/tests/ci/lambda_shared_package/lambda_shared/token.py @@ -8,6 +8,8 @@ import boto3 # type: ignore import jwt import requests # type: ignore +from . import cached_value_is_valid + def get_key_and_app_from_aws() -> Tuple[str, int]: secret_name = "clickhouse_github_secret_key" @@ -68,7 +70,7 @@ def get_access_token_by_key_app(private_key: str, app_id: int) -> str: @dataclass class CachedToken: - time: int + time: float value: str updating: bool = False @@ -81,12 +83,9 @@ def get_cached_access_token() -> str: return _cached_token.value # Indicate that the value is updating now, so the cached value can be # used. The first setting and close-to-ttl are not counted as update - if _cached_token.time != 0 or time.time() - 590 < _cached_token.time: - _cached_token.updating = True - else: - _cached_token.updating = False + _cached_token.updating = cached_value_is_valid(_cached_token.time, 590) private_key, app_id = get_key_and_app_from_aws() - _cached_token.time = int(time.time()) + _cached_token.time = time.time() _cached_token.value = get_access_token_by_key_app(private_key, app_id) _cached_token.updating = False return _cached_token.value diff --git a/tests/ci/terminate_runner_lambda/app.py b/tests/ci/terminate_runner_lambda/app.py index ed198d855b9..01d1f80ad29 100644 --- a/tests/ci/terminate_runner_lambda/app.py +++ b/tests/ci/terminate_runner_lambda/app.py @@ -9,13 +9,13 @@ from typing import Any, Dict, List import boto3 # type: ignore -from lambda_shared import RunnerDescriptions, list_runners +from lambda_shared import RunnerDescriptions, list_runners, cached_value_is_valid from lambda_shared.token import get_access_token_by_key_app, get_cached_access_token @dataclass class CachedInstances: - time: int + time: float value: dict updating: bool = False @@ -27,17 +27,12 @@ def get_cached_instances() -> dict: """return cached instances description with updating it once per five minutes""" if time.time() - 250 < cached_instances.time or cached_instances.updating: return cached_instances.value - # Indicate that the value is updating now, so the cached value can be - # used. The first setting and close-to-ttl are not counted as update - if cached_instances.time != 0 or time.time() - 300 < cached_instances.time: - cached_instances.updating = True - else: - cached_instances.updating = False + cached_instances.updating = cached_value_is_valid(cached_instances.time, 300) ec2_client = boto3.client("ec2") instances_response = ec2_client.describe_instances( Filters=[{"Name": "instance-state-name", "Values": ["running"]}] ) - cached_instances.time = int(time.time()) + cached_instances.time = time.time() cached_instances.value = { instance["InstanceId"]: instance for reservation in instances_response["Reservations"] From 101214cb8f6b3f9d3e9eed9858436259fe411976 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Fri, 21 Jul 2023 17:11:36 +0200 Subject: [PATCH 21/81] Fix broken termination lambda by caching runners for at most 15 seconds --- tests/ci/terminate_runner_lambda/app.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/ci/terminate_runner_lambda/app.py b/tests/ci/terminate_runner_lambda/app.py index 01d1f80ad29..010f7dd6734 100644 --- a/tests/ci/terminate_runner_lambda/app.py +++ b/tests/ci/terminate_runner_lambda/app.py @@ -42,6 +42,28 @@ def get_cached_instances() -> dict: return cached_instances.value +@dataclass +class CachedRunners: + time: float + value: RunnerDescriptions + updating: bool = False + + +cached_runners = CachedRunners(0, []) + + +def get_cached_runners(access_token: str) -> RunnerDescriptions: + """From time to time request to GH api costs up to 3 seconds, and + it's a disaster from the termination lambda perspective""" + if time.time() - 5 < cached_runners.time or cached_instances.updating: + return cached_runners.value + cached_runners.updating = cached_value_is_valid(cached_runners.time, 15) + cached_runners.value = list_runners(access_token) + cached_runners.time = time.time() + cached_runners.updating = False + return cached_runners.value + + def how_many_instances_to_kill(event_data: dict) -> Dict[str, int]: data_array = event_data["CapacityToTerminate"] to_kill_by_zone = {} # type: Dict[str, int] @@ -99,7 +121,7 @@ def main(access_token: str, event: dict) -> Dict[str, List[str]]: ) print("Time spent on the requests to AWS: ", time.time() - start) - runners = list_runners(access_token) + runners = get_cached_runners(access_token) runner_ids = set(runner.name for runner in runners) # We used to delete potential hosts to terminate from GitHub runners pool, # but the documentation states: From eb8e7f49f7579016069ea1b4ca37c78eef1fb831 Mon Sep 17 00:00:00 2001 From: Zhiguo Zhou Date: Thu, 13 Jul 2023 22:44:31 +0800 Subject: [PATCH 22/81] Rename: extend the tests to QueryTree validation --- ...date_predicate_optimizations_ast_query_tree_rewrite.reference} | 0 ...02785_date_predicate_optimizations_ast_query_tree_rewrite.sql} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{02785_date_predicate_optimizations_ast_rewrite.reference => 02785_date_predicate_optimizations_ast_query_tree_rewrite.reference} (100%) rename tests/queries/0_stateless/{02785_date_predicate_optimizations_ast_rewrite.sql => 02785_date_predicate_optimizations_ast_query_tree_rewrite.sql} (100%) diff --git a/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_rewrite.reference b/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.reference similarity index 100% rename from tests/queries/0_stateless/02785_date_predicate_optimizations_ast_rewrite.reference rename to tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.reference diff --git a/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_rewrite.sql b/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.sql similarity index 100% rename from tests/queries/0_stateless/02785_date_predicate_optimizations_ast_rewrite.sql rename to tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.sql From b88fc4277a0a85b1b6d590d26e1b796cc5d6c138 Mon Sep 17 00:00:00 2001 From: Zhiguo Zhou Date: Thu, 13 Jul 2023 22:48:05 +0800 Subject: [PATCH 23/81] Add a pass in Analyzer for time filter optimization with preimage The preimage has been identified as the general solution to the optimization of predicates with time converters (#15257). PR #50951 implemented this solution via the AST rewrite. As a follow-up, this commit extends the optimization to the experi- mental analyzer by replacing the sub-QueryTree of the time filter with its preimage-transformed one. The optimization is implemented as a new pass in the Analyzer. --- ...ateOrDateTimeConverterWithPreimagePass.cpp | 218 ++++ ...eDateOrDateTimeConverterWithPreimagePass.h | 24 + src/Analyzer/QueryTreePassManager.cpp | 2 + ...783_date_predicate_optimizations.reference | 50 + .../02783_date_predicate_optimizations.sql | 50 + ...mizations_ast_query_tree_rewrite.reference | 1035 +++++++++++++++++ ...e_optimizations_ast_query_tree_rewrite.sql | 28 + 7 files changed, 1407 insertions(+) create mode 100644 src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp create mode 100644 src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.h diff --git a/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp b/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp new file mode 100644 index 00000000000..f7f69598c4b --- /dev/null +++ b/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp @@ -0,0 +1,218 @@ +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace +{ + +class OptimizeDateOrDateTimeConverterWithPreimageVisitor : public InDepthQueryTreeVisitorWithContext +{ +public: + using Base = InDepthQueryTreeVisitorWithContext; + + explicit OptimizeDateOrDateTimeConverterWithPreimageVisitor(ContextPtr context) + : Base(std::move(context)) + {} + + static bool needChildVisit(QueryTreeNodePtr & node, QueryTreeNodePtr & /*child*/) + { + const static std::unordered_set relations = { + "equals", + "notEquals", + "less", + "greater", + "lessOrEquals", + "greaterOrEquals", + }; + + if (const auto * function = node->as()) + { + return !relations.contains(function->getFunctionName()); + } + + return true; + } + + void visitImpl(QueryTreeNodePtr & node) const + { + const static std::unordered_map swap_relations = { + {"equals", "equals"}, + {"notEquals", "notEquals"}, + {"less", "greater"}, + {"greater", "less"}, + {"lessOrEquals", "greaterOrEquals"}, + {"greaterOrEquals", "lessOrEquals"}, + }; + + const auto * function = node->as(); + + if (!function || !swap_relations.contains(function->getFunctionName())) return; + + if (function->getArguments().getNodes().size() != 2) return; + + size_t func_id = function->getArguments().getNodes().size(); + + for (size_t i = 0; i < function->getArguments().getNodes().size(); i++) + { + if (const auto * func = function->getArguments().getNodes()[i]->as()) + { + func_id = i; + } + } + + if (func_id == function->getArguments().getNodes().size()) return; + + size_t literal_id = 1 - func_id; + const auto * literal = function->getArguments().getNodes()[literal_id]->as(); + + if (!literal || literal->getValue().getType() != Field::Types::UInt64) return; + + String comparator = literal_id > func_id ? function->getFunctionName(): swap_relations.at(function->getFunctionName()); + + const auto * func_node = function->getArguments().getNodes()[func_id]->as(); + /// Currently we only handle single-argument functions. + if (!func_node || func_node->getArguments().getNodes().size() != 1) return; + + const auto * column_id = func_node->getArguments().getNodes()[0]->as(); + if (!column_id) return; + + const auto & converter = FunctionFactory::instance().tryGet(func_node->getFunctionName(), getContext()); + if (!converter) return; + + ColumnsWithTypeAndName args; + args.emplace_back(column_id->getColumnType(), "tmp"); + auto converter_base = converter->build(args); + if (!converter_base || !converter_base->hasInformationAboutPreimage()) return; + + auto preimage_range = converter_base->getPreimage(*(column_id->getColumnType()), literal->getValue()); + if (!preimage_range) return; + + const auto new_node = generateOptimizedDateFilter(comparator, *column_id, *preimage_range); + + if (!new_node) return; + + node = new_node; + } + +private: + QueryTreeNodePtr generateOptimizedDateFilter(const String & comparator, const ColumnNode & column_node, const std::pair& range) const + { + const DateLUTImpl & date_lut = DateLUT::instance("UTC"); + + String start_date_or_date_time; + String end_date_or_date_time; + + if (isDateOrDate32(column_node.getColumnType().get())) + { + start_date_or_date_time = date_lut.dateToString(range.first.get()); + end_date_or_date_time = date_lut.dateToString(range.second.get()); + } + else if (isDateTime(column_node.getColumnType().get()) || isDateTime64(column_node.getColumnType().get())) + { + start_date_or_date_time = date_lut.timeToString(range.first.get()); + end_date_or_date_time = date_lut.timeToString(range.second.get()); + } + else [[unlikely]] return {}; + + if (comparator == "equals") + { + const auto lhs = std::make_shared("greaterOrEquals"); + lhs->getArguments().getNodes().push_back(std::make_shared(column_node.getColumn(), column_node.getColumnSource())); + lhs->getArguments().getNodes().push_back(std::make_shared(start_date_or_date_time)); + resolveOrdinaryFunctionNode(*lhs, lhs->getFunctionName()); + + const auto rhs = std::make_shared("less"); + rhs->getArguments().getNodes().push_back(std::make_shared(column_node.getColumn(), column_node.getColumnSource())); + rhs->getArguments().getNodes().push_back(std::make_shared(end_date_or_date_time)); + resolveOrdinaryFunctionNode(*rhs, rhs->getFunctionName()); + + const auto new_date_filter = std::make_shared("and"); + new_date_filter->getArguments().getNodes() = {lhs, rhs}; + resolveOrdinaryFunctionNode(*new_date_filter, new_date_filter->getFunctionName()); + + return new_date_filter; + } + else if (comparator == "notEquals") + { + const auto lhs = std::make_shared("less"); + lhs->getArguments().getNodes().push_back(std::make_shared(column_node.getColumn(), column_node.getColumnSource())); + lhs->getArguments().getNodes().push_back(std::make_shared(start_date_or_date_time)); + resolveOrdinaryFunctionNode(*lhs, lhs->getFunctionName()); + + const auto rhs = std::make_shared("greaterOrEquals"); + rhs->getArguments().getNodes().push_back(std::make_shared(column_node.getColumn(), column_node.getColumnSource())); + rhs->getArguments().getNodes().push_back(std::make_shared(end_date_or_date_time)); + resolveOrdinaryFunctionNode(*rhs, rhs->getFunctionName()); + + const auto new_date_filter = std::make_shared("or"); + new_date_filter->getArguments().getNodes() = {lhs, rhs}; + resolveOrdinaryFunctionNode(*new_date_filter, new_date_filter->getFunctionName()); + + return new_date_filter; + } + else if (comparator == "greater") + { + const auto new_date_filter = std::make_shared("greaterOrEquals"); + new_date_filter->getArguments().getNodes().push_back(std::make_shared(column_node.getColumn(), column_node.getColumnSource())); + new_date_filter->getArguments().getNodes().push_back(std::make_shared(end_date_or_date_time)); + resolveOrdinaryFunctionNode(*new_date_filter, new_date_filter->getFunctionName()); + + return new_date_filter; + } + else if (comparator == "lessOrEquals") + { + const auto new_date_filter = std::make_shared("less"); + new_date_filter->getArguments().getNodes().push_back(std::make_shared(column_node.getColumn(), column_node.getColumnSource())); + new_date_filter->getArguments().getNodes().push_back(std::make_shared(end_date_or_date_time)); + resolveOrdinaryFunctionNode(*new_date_filter, new_date_filter->getFunctionName()); + + return new_date_filter; + } + else if (comparator == "less" || comparator == "greaterOrEquals") + { + const auto new_date_filter = std::make_shared(comparator); + new_date_filter->getArguments().getNodes().push_back(std::make_shared(column_node.getColumn(), column_node.getColumnSource())); + new_date_filter->getArguments().getNodes().push_back(std::make_shared(start_date_or_date_time)); + resolveOrdinaryFunctionNode(*new_date_filter, new_date_filter->getFunctionName()); + + return new_date_filter; + } + else [[unlikely]] + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Expected equals, notEquals, less, lessOrEquals, greater, greaterOrEquals. Actual {}", + comparator); + } + } + + void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const + { + auto function = FunctionFactory::instance().get(function_name, getContext()); + function_node.resolveAsFunction(function->build(function_node.getArgumentColumns())); + } +}; + +} + +void OptimizeDateOrDateTimeConverterWithPreimagePass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +{ + OptimizeDateOrDateTimeConverterWithPreimageVisitor visitor(std::move(context)); + visitor.visit(query_tree_node); +} + +} diff --git a/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.h b/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.h new file mode 100644 index 00000000000..c29f18e330b --- /dev/null +++ b/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.h @@ -0,0 +1,24 @@ +#pragma once + +#include + +namespace DB +{ + +/** Replace predicate having Date/DateTime converters with their preimages to improve performance. + * Given a Date column c, toYear(c) = 2023 -> c >= '2023-01-01' AND c < '2024-01-01' + * Or if c is a DateTime column, toYear(c) = 2023 -> c >= '2023-01-01 00:00:00' AND c < '2024-01-01 00:00:00'. + * The similar optimization also applies to other converters. + */ +class OptimizeDateOrDateTimeConverterWithPreimagePass final : public IQueryTreePass +{ +public: + String getName() override { return "OptimizeDateOrDateTimeConverterWithPreimagePass"; } + + String getDescription() override { return "Replace predicate having Date/DateTime converters with their preimages"; } + + void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + +}; + +} diff --git a/src/Analyzer/QueryTreePassManager.cpp b/src/Analyzer/QueryTreePassManager.cpp index a6da2a66615..cd3abd9593e 100644 --- a/src/Analyzer/QueryTreePassManager.cpp +++ b/src/Analyzer/QueryTreePassManager.cpp @@ -42,6 +42,7 @@ #include #include #include +#include namespace DB { @@ -278,6 +279,7 @@ void addQueryTreePasses(QueryTreePassManager & manager) manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); + manager.addPass(std::make_unique()); } } diff --git a/tests/queries/0_stateless/02783_date_predicate_optimizations.reference b/tests/queries/0_stateless/02783_date_predicate_optimizations.reference index 872a5dd1d7d..5c3cab9bf4a 100644 --- a/tests/queries/0_stateless/02783_date_predicate_optimizations.reference +++ b/tests/queries/0_stateless/02783_date_predicate_optimizations.reference @@ -1,5 +1,7 @@ 2021-12-31 23:00:00 0 2021-12-31 23:00:00 0 +2021-12-31 23:00:00 0 +2021-12-31 23:00:00 0 Date 2 3 @@ -13,6 +15,18 @@ Date 4 1 4 +2 +3 +2 +4 +1 +3 +3 +2 +1 +4 +1 +4 DateTime 2 3 @@ -26,6 +40,18 @@ DateTime 4 1 4 +2 +3 +2 +4 +1 +3 +3 +2 +1 +4 +1 +4 Date32 2 3 @@ -39,6 +65,18 @@ Date32 4 1 4 +2 +3 +2 +4 +1 +3 +3 +2 +1 +4 +1 +4 DateTime64 2 3 @@ -52,3 +90,15 @@ DateTime64 4 1 4 +2 +3 +2 +4 +1 +3 +3 +2 +1 +4 +1 +4 diff --git a/tests/queries/0_stateless/02783_date_predicate_optimizations.sql b/tests/queries/0_stateless/02783_date_predicate_optimizations.sql index 0a2fa6cc93b..4da8cebff1c 100644 --- a/tests/queries/0_stateless/02783_date_predicate_optimizations.sql +++ b/tests/queries/0_stateless/02783_date_predicate_optimizations.sql @@ -11,6 +11,8 @@ INSERT INTO source values ('2021-12-31 23:00:00', 0); SELECT * FROM source WHERE toYYYYMM(ts) = 202112; SELECT * FROM source WHERE toYear(ts) = 2021; +SELECT * FROM source WHERE toYYYYMM(ts) = 202112 SETTINGS allow_experimental_analyzer=1; +SELECT * FROM source WHERE toYear(ts) = 2021 SETTINGS allow_experimental_analyzer=1; DROP TABLE IF EXISTS source; CREATE TABLE source @@ -44,6 +46,18 @@ SELECT count(*) FROM source WHERE toYear(dt) < 2023; SELECT count(*) FROM source WHERE toYear(dt) <= 2023; SELECT count(*) FROM source WHERE toYear(dt) > 2023; SELECT count(*) FROM source WHERE toYear(dt) >= 2023; +SELECT count(*) FROM source WHERE toYYYYMM(dt) = 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(dt) <> 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(dt) < 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(dt) <= 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(dt) > 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(dt) >= 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt) = 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt) <> 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt) < 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt) <= 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt) > 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt) >= 2023 SETTINGS allow_experimental_analyzer=1; SELECT 'DateTime'; SELECT count(*) FROM source WHERE toYYYYMM(ts) = 202312; @@ -58,6 +72,18 @@ SELECT count(*) FROM source WHERE toYear(ts) < 2023; SELECT count(*) FROM source WHERE toYear(ts) <= 2023; SELECT count(*) FROM source WHERE toYear(ts) > 2023; SELECT count(*) FROM source WHERE toYear(ts) >= 2023; +SELECT count(*) FROM source WHERE toYYYYMM(ts) = 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(ts) <> 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(ts) < 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(ts) <= 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(ts) > 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(ts) >= 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts) = 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts) <> 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts) < 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts) <= 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts) > 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts) >= 2023 SETTINGS allow_experimental_analyzer=1; SELECT 'Date32'; SELECT count(*) FROM source WHERE toYYYYMM(dt_32) = 202312; @@ -72,6 +98,18 @@ SELECT count(*) FROM source WHERE toYear(dt_32) < 2023; SELECT count(*) FROM source WHERE toYear(dt_32) <= 2023; SELECT count(*) FROM source WHERE toYear(dt_32) > 2023; SELECT count(*) FROM source WHERE toYear(dt_32) >= 2023; +SELECT count(*) FROM source WHERE toYYYYMM(dt_32) = 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(dt_32) <> 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(dt_32) < 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(dt_32) <= 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(dt_32) > 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(dt_32) >= 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt_32) = 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt_32) <> 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt_32) < 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt_32) <= 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt_32) > 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(dt_32) >= 2023 SETTINGS allow_experimental_analyzer=1; SELECT 'DateTime64'; SELECT count(*) FROM source WHERE toYYYYMM(ts_64) = 202312; @@ -86,4 +124,16 @@ SELECT count(*) FROM source WHERE toYear(ts_64) < 2023; SELECT count(*) FROM source WHERE toYear(ts_64) <= 2023; SELECT count(*) FROM source WHERE toYear(ts_64) > 2023; SELECT count(*) FROM source WHERE toYear(ts_64) >= 2023; +SELECT count(*) FROM source WHERE toYYYYMM(ts_64) = 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(ts_64) <> 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(ts_64) < 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(ts_64) <= 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(ts_64) > 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYYYYMM(ts_64) >= 202312 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts_64) = 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts_64) <> 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts_64) < 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts_64) <= 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts_64) > 2023 SETTINGS allow_experimental_analyzer=1; +SELECT count(*) FROM source WHERE toYear(ts_64) >= 2023 SETTINGS allow_experimental_analyzer=1; DROP TABLE source; diff --git a/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.reference b/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.reference index 9235e7e106a..0fd2f694aeb 100644 --- a/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.reference +++ b/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.reference @@ -1,87 +1,1122 @@ SELECT value1 FROM date_t WHERE ((date1 >= \'1993-01-01\') AND (date1 < \'1994-01-01\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 11, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 12, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 15, constant_value: \'1994-01-01\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE ((date1 < \'1993-01-01\') OR (date1 >= \'1994-01-01\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: or, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 11, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 12, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 15, constant_value: \'1994-01-01\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (date1 < \'1993-01-01\') AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 8, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 9, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 10, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + FUNCTION id: 12, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 15, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 16, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 18, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (date1 >= \'1994-01-01\') AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 8, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 9, constant_value: \'1994-01-01\', constant_value_type: String + FUNCTION id: 10, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + FUNCTION id: 12, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 15, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 16, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 18, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (date1 < \'1994-01-01\') AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 8, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 9, constant_value: \'1994-01-01\', constant_value_type: String + FUNCTION id: 10, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + FUNCTION id: 12, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 15, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 16, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 18, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (date1 >= \'1993-01-01\') AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 8, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 9, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 10, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + FUNCTION id: 12, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 15, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 16, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 18, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE ((date1 >= \'1993-01-01\') AND (date1 < \'1998-01-01\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 11, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 12, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 15, constant_value: \'1998-01-01\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (((date1 >= \'1993-01-01\') AND (date1 < \'1994-01-01\')) OR ((date1 >= \'1994-01-01\') AND (date1 < \'1995-01-01\'))) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: or, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + FUNCTION id: 10, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + COLUMN id: 12, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 13, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 14, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 15, nodes: 2 + COLUMN id: 16, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 17, constant_value: \'1994-01-01\', constant_value_type: String + FUNCTION id: 18, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + FUNCTION id: 20, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 21, nodes: 2 + COLUMN id: 22, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 23, constant_value: \'1994-01-01\', constant_value_type: String + FUNCTION id: 24, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 25, nodes: 2 + COLUMN id: 26, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 27, constant_value: \'1995-01-01\', constant_value_type: String + FUNCTION id: 28, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 29, nodes: 2 + FUNCTION id: 30, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 31, nodes: 2 + COLUMN id: 32, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 33, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 34, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 35, nodes: 2 + COLUMN id: 32, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 36, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1, toYear(date1) AS year1 FROM date_t WHERE ((date1 >= \'1993-01-01\') AND (date1 < \'1994-01-01\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + year1 UInt16 + PROJECTION + LIST id: 1, nodes: 2 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + FUNCTION id: 4, function_name: toYear, function_type: ordinary, result_type: UInt16 + ARGUMENTS + LIST id: 5, nodes: 1 + COLUMN id: 6, column_name: date1, result_type: Date, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 7, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 8, nodes: 2 + FUNCTION id: 9, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 10, nodes: 2 + FUNCTION id: 11, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 12, nodes: 2 + COLUMN id: 13, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 14, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 15, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 16, nodes: 2 + COLUMN id: 17, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 18, constant_value: \'1994-01-01\', constant_value_type: String + FUNCTION id: 19, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 20, nodes: 2 + FUNCTION id: 21, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 22, nodes: 2 + COLUMN id: 23, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 25, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 26, nodes: 2 + COLUMN id: 23, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 27, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (date1 < \'1993-01-01\') AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 8, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 9, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 10, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + FUNCTION id: 12, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 15, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 16, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 18, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t PREWHERE (date1 >= \'1993-01-01\') AND (date1 < \'1994-01-01\') WHERE ((date1 >= \'1993-01-01\') AND (date1 < \'1994-01-01\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + PREWHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 8, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 9, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 10, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + COLUMN id: 12, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 13, constant_value: \'1994-01-01\', constant_value_type: String + WHERE + FUNCTION id: 14, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 15, nodes: 2 + FUNCTION id: 16, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + COLUMN id: 18, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 19, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 20, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 21, nodes: 2 + COLUMN id: 18, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 22, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE ((id >= 1) AND (id <= 3)) AND ((date1 >= \'1993-01-01\') AND (date1 < \'1994-01-01\')) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 8, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 9, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 10, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + COLUMN id: 8, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 12, constant_value: UInt64_3, constant_value_type: UInt8 + HAVING + FUNCTION id: 13, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 14, nodes: 2 + FUNCTION id: 15, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 16, nodes: 2 + COLUMN id: 17, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 18, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 19, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 20, nodes: 2 + COLUMN id: 21, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 22, constant_value: \'1994-01-01\', constant_value_type: String + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (toYYYYMM(date1) = 199300) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: equals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: toYYYYMM, function_type: ordinary, result_type: UInt32 + ARGUMENTS + LIST id: 9, nodes: 1 + COLUMN id: 10, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 11, constant_value: UInt64_199300, constant_value_type: UInt32 + FUNCTION id: 12, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + FUNCTION id: 14, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 15, nodes: 2 + COLUMN id: 16, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 17, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 18, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 16, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 20, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (toYYYYMM(date1) = 199313) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: equals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: toYYYYMM, function_type: ordinary, result_type: UInt32 + ARGUMENTS + LIST id: 9, nodes: 1 + COLUMN id: 10, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 11, constant_value: UInt64_199313, constant_value_type: UInt32 + FUNCTION id: 12, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + FUNCTION id: 14, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 15, nodes: 2 + COLUMN id: 16, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 17, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 18, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 16, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 20, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE ((date1 >= \'1993-12-01\') AND (date1 < \'1994-01-01\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 11, constant_value: \'1993-12-01\', constant_value_type: String + FUNCTION id: 12, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 15, constant_value: \'1994-01-01\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE ((date1 >= \'1992-03-01\') AND (date1 < \'1992-04-01\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 11, constant_value: \'1992-03-01\', constant_value_type: String + FUNCTION id: 12, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 15, constant_value: \'1992-04-01\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE ((date1 < \'1992-03-01\') OR (date1 >= \'1992-04-01\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: or, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 11, constant_value: \'1992-03-01\', constant_value_type: String + FUNCTION id: 12, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 15, constant_value: \'1992-04-01\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (date1 < \'1992-03-01\') AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 8, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 9, constant_value: \'1992-03-01\', constant_value_type: String + FUNCTION id: 10, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + FUNCTION id: 12, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 15, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 16, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 18, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (date1 >= \'1992-04-01\') AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 8, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 9, constant_value: \'1992-04-01\', constant_value_type: String + FUNCTION id: 10, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + FUNCTION id: 12, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 15, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 16, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 18, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (date1 < \'1992-04-01\') AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 8, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 9, constant_value: \'1992-04-01\', constant_value_type: String + FUNCTION id: 10, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + FUNCTION id: 12, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 15, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 16, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 18, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE (date1 >= \'1992-03-01\') AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 8, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 9, constant_value: \'1992-03-01\', constant_value_type: String + FUNCTION id: 10, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 11, nodes: 2 + FUNCTION id: 12, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 15, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 16, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + COLUMN id: 14, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 18, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date_t WHERE ((date1 >= \'1992-03-01\') OR ((date1 >= \'1993-01-01\') AND (date1 < \'1994-01-01\'))) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: or, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 11, constant_value: \'1992-03-01\', constant_value_type: String + FUNCTION id: 12, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + FUNCTION id: 14, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 15, nodes: 2 + COLUMN id: 16, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 17, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 18, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: date1, result_type: Date, source_id: 3 + CONSTANT id: 21, constant_value: \'1994-01-01\', constant_value_type: String + FUNCTION id: 22, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + FUNCTION id: 24, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 25, nodes: 2 + COLUMN id: 26, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 27, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 28, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 29, nodes: 2 + COLUMN id: 26, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 30, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM datetime_t WHERE ((date1 >= \'1993-01-01 00:00:00\') AND (date1 < \'1994-01-01 00:00:00\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.datetime_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: DateTime, source_id: 3 + CONSTANT id: 11, constant_value: \'1993-01-01 00:00:00\', constant_value_type: String + FUNCTION id: 12, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: DateTime, source_id: 3 + CONSTANT id: 15, constant_value: \'1994-01-01 00:00:00\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM datetime_t WHERE ((date1 >= \'1993-12-01 00:00:00\') AND (date1 < \'1994-01-01 00:00:00\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.datetime_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: DateTime, source_id: 3 + CONSTANT id: 11, constant_value: \'1993-12-01 00:00:00\', constant_value_type: String + FUNCTION id: 12, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: DateTime, source_id: 3 + CONSTANT id: 15, constant_value: \'1994-01-01 00:00:00\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date32_t WHERE ((date1 >= \'1993-01-01\') AND (date1 < \'1994-01-01\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date32_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: Date32, source_id: 3 + CONSTANT id: 11, constant_value: \'1993-01-01\', constant_value_type: String + FUNCTION id: 12, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: Date32, source_id: 3 + CONSTANT id: 15, constant_value: \'1994-01-01\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM date32_t WHERE ((date1 >= \'1993-12-01\') AND (date1 < \'1994-01-01\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.date32_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: Date32, source_id: 3 + CONSTANT id: 11, constant_value: \'1993-12-01\', constant_value_type: String + FUNCTION id: 12, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: Date32, source_id: 3 + CONSTANT id: 15, constant_value: \'1994-01-01\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM datetime64_t WHERE ((date1 >= \'1993-01-01 00:00:00\') AND (date1 < \'1994-01-01 00:00:00\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.datetime64_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: DateTime64(3), source_id: 3 + CONSTANT id: 11, constant_value: \'1993-01-01 00:00:00\', constant_value_type: String + FUNCTION id: 12, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: DateTime64(3), source_id: 3 + CONSTANT id: 15, constant_value: \'1994-01-01 00:00:00\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 SELECT value1 FROM datetime64_t WHERE ((date1 >= \'1993-12-01 00:00:00\') AND (date1 < \'1994-01-01 00:00:00\')) AND ((id >= 1) AND (id <= 3)) +QUERY id: 0 + PROJECTION COLUMNS + value1 String + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: value1, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.datetime64_t + WHERE + FUNCTION id: 4, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + FUNCTION id: 6, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 7, nodes: 2 + FUNCTION id: 8, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 9, nodes: 2 + COLUMN id: 10, column_name: date1, result_type: DateTime64(3), source_id: 3 + CONSTANT id: 11, constant_value: \'1993-12-01 00:00:00\', constant_value_type: String + FUNCTION id: 12, function_name: less, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 13, nodes: 2 + COLUMN id: 14, column_name: date1, result_type: DateTime64(3), source_id: 3 + CONSTANT id: 15, constant_value: \'1994-01-01 00:00:00\', constant_value_type: String + FUNCTION id: 16, function_name: and, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 17, nodes: 2 + FUNCTION id: 18, function_name: greaterOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 19, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 21, constant_value: UInt64_1, constant_value_type: UInt8 + FUNCTION id: 22, function_name: lessOrEquals, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 23, nodes: 2 + COLUMN id: 20, column_name: id, result_type: UInt32, source_id: 3 + CONSTANT id: 24, constant_value: UInt64_3, constant_value_type: UInt8 + SETTINGS allow_experimental_analyzer=1 diff --git a/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.sql b/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.sql index 266be59b0a3..9cc8dd74e5d 100644 --- a/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.sql +++ b/tests/queries/0_stateless/02785_date_predicate_optimizations_ast_query_tree_rewrite.sql @@ -2,46 +2,74 @@ DROP TABLE IF EXISTS date_t; CREATE TABLE date_t (id UInt32, value1 String, date1 Date) ENGINE ReplacingMergeTree() ORDER BY id; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYear(date1) = 1993 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYear(date1) = 1993 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYear(date1) <> 1993 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYear(date1) <> 1993 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYear(date1) < 1993 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYear(date1) < 1993 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYear(date1) > 1993 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYear(date1) > 1993 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYear(date1) <= 1993 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYear(date1) <= 1993 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYear(date1) >= 1993 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYear(date1) >= 1993 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYear(date1) BETWEEN 1993 AND 1997 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYear(date1) BETWEEN 1993 AND 1997 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE (toYear(date1) = 1993 OR toYear(date1) = 1994) AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE (toYear(date1) = 1993 OR toYear(date1) = 1994) AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1, toYear(date1) as year1 FROM date_t WHERE year1 = 1993 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1, toYear(date1) as year1 FROM date_t WHERE year1 = 1993 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE 1993 > toYear(date1) AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE 1993 > toYear(date1) AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t PREWHERE toYear(date1) = 1993 WHERE id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t PREWHERE toYear(date1) = 1993 WHERE id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE id BETWEEN 1 AND 3 HAVING toYear(date1) = 1993; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE id BETWEEN 1 AND 3 HAVING toYear(date1) = 1993 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYYYYMM(date1) = 199300 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYYYYMM(date1) = 199300 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYYYYMM(date1) = 199313 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYYYYMM(date1) = 199313 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYYYYMM(date1) = 199312 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYYYYMM(date1) = 199312 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYYYYMM(date1) = 199203 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYYYYMM(date1) = 199203 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYYYYMM(date1) <> 199203 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYYYYMM(date1) <> 199203 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYYYYMM(date1) < 199203 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYYYYMM(date1) < 199203 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYYYYMM(date1) > 199203 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYYYYMM(date1) > 199203 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYYYYMM(date1) <= 199203 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYYYYMM(date1) <= 199203 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE toYYYYMM(date1) >= 199203 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE toYYYYMM(date1) >= 199203 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date_t WHERE (toYYYYMM(date1) >= 199203 OR toYear(date1) = 1993) AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date_t WHERE (toYYYYMM(date1) >= 199203 OR toYear(date1) = 1993) AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; DROP TABLE date_t; DROP TABLE IF EXISTS datetime_t; CREATE TABLE datetime_t (id UInt32, value1 String, date1 Datetime) ENGINE ReplacingMergeTree() ORDER BY id; EXPLAIN SYNTAX SELECT value1 FROM datetime_t WHERE toYear(date1) = 1993 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM datetime_t WHERE toYear(date1) = 1993 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM datetime_t WHERE toYYYYMM(date1) = 199312 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM datetime_t WHERE toYYYYMM(date1) = 199312 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; DROP TABLE datetime_t; DROP TABLE IF EXISTS date32_t; CREATE TABLE date32_t (id UInt32, value1 String, date1 Date32) ENGINE ReplacingMergeTree() ORDER BY id; EXPLAIN SYNTAX SELECT value1 FROM date32_t WHERE toYear(date1) = 1993 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date32_t WHERE toYear(date1) = 1993 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM date32_t WHERE toYYYYMM(date1) = 199312 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM date32_t WHERE toYYYYMM(date1) = 199312 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; DROP TABLE date32_t; DROP TABLE IF EXISTS datetime64_t; CREATE TABLE datetime64_t (id UInt32, value1 String, date1 Datetime64) ENGINE ReplacingMergeTree() ORDER BY id; EXPLAIN SYNTAX SELECT value1 FROM datetime64_t WHERE toYear(date1) = 1993 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM datetime64_t WHERE toYear(date1) = 1993 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; EXPLAIN SYNTAX SELECT value1 FROM datetime64_t WHERE toYYYYMM(date1) = 199312 AND id BETWEEN 1 AND 3; +EXPLAIN QUERY TREE run_passes=1 SELECT value1 FROM datetime64_t WHERE toYYYYMM(date1) = 199312 AND id BETWEEN 1 AND 3 SETTINGS allow_experimental_analyzer=1; DROP TABLE datetime64_t; From 9ff409f16ed9ba875904651e353b80d1782d9b1d Mon Sep 17 00:00:00 2001 From: Zhiguo Zhou Date: Mon, 24 Jul 2023 13:40:06 +0800 Subject: [PATCH 24/81] Enforce type check for function arguments in preimage optimization Date/Date32/DateTime/DateTime64 columns are required for arguments of time converters, such as toYear and toYYYYMM. --- .../Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp b/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp index f7f69598c4b..7205ac299a9 100644 --- a/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp +++ b/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp @@ -91,6 +91,9 @@ public: const auto * column_id = func_node->getArguments().getNodes()[0]->as(); if (!column_id) return; + const auto * column_type = column_id->getColumnType().get(); + if (!isDateOrDate32(column_type) && !isDateTime(column_type) && !isDateTime64(column_type)) return; + const auto & converter = FunctionFactory::instance().tryGet(func_node->getFunctionName(), getContext()); if (!converter) return; From 8a753de296b5d6914b2cbeecd8b4cdda941db1d3 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 24 Jul 2023 19:57:29 +0200 Subject: [PATCH 25/81] Delete 02817_structure_to_schema.out --- .../0_stateless/02817_structure_to_schema.out | 468 ------------------ 1 file changed, 468 deletions(-) delete mode 100644 tests/queries/0_stateless/02817_structure_to_schema.out diff --git a/tests/queries/0_stateless/02817_structure_to_schema.out b/tests/queries/0_stateless/02817_structure_to_schema.out deleted file mode 100644 index ad45343139b..00000000000 --- a/tests/queries/0_stateless/02817_structure_to_schema.out +++ /dev/null @@ -1,468 +0,0 @@ -CapnProto -Numbers - -struct Message -{ - int8 @0 : Int8; - uint8 @1 : UInt8; - int16 @2 : Int16; - uint16 @3 : UInt16; - int32 @4 : Int32; - uint32 @5 : UInt32; - int64 @6 : Int64; - uint64 @7 : UInt64; - int128 @8 : Data; - uint128 @9 : Data; - int256 @10 : Data; - uint256 @11 : Data; - float32 @12 : Float32; - float64 @13 : Float64; - decimal32 @14 : Int32; - decimal64 @15 : Int64; - decimal128 @16 : Data; - decimal256 @17 : Data; -} -Dates - -struct Message -{ - data @0 : UInt16; - date32 @1 : Int32; - datetime @2 : UInt32; - datatime64 @3 : Int64; -} -Strings - -struct Message -{ - string @0 : Data; - fixedstring @1 : Data; -} -Special - -struct Message -{ - ipv4 @0 : UInt32; - ipv6 @1 : Data; - uuid @2 : Data; -} -Nullable - -struct Message -{ - struct Nullable - { - union - { - value @0 : UInt32; - null @1 : Void; - } - } - nullable @0 : Nullable; -} -Enums - -struct Message -{ - enum Enum8 - { - v1 @0; - v2 @1; - v3 @2; - v4 @3; - } - enum8 @0 : Enum8; - enum Enum16 - { - v5 @0; - v6 @1; - v7 @2; - v8 @3; - v9 @4; - } - enum16 @1 : Enum16; -} -Arrays - -struct Message -{ - arr1 @0 : List(UInt32); - arr2 @1 : List(List(List(UInt32))); -} -Tuples - -struct Message -{ - struct Tuple1 - { - e1 @0 : UInt32; - e2 @1 : Data; - e3 @2 : UInt32; - } - tuple1 @0 : Tuple1; - struct Tuple2 - { - struct E1 - { - e1 @0 : UInt32; - struct E2 - { - e1 @0 : Data; - e2 @1 : UInt32; - } - e2 @1 : E2; - e3 @2 : Data; - } - e1 @0 : E1; - struct E2 - { - e1 @0 : Data; - e2 @1 : UInt32; - } - e2 @1 : E2; - } - tuple2 @1 : Tuple2; -} -Maps - -struct Message -{ - struct Map1 - { - struct Entry - { - key @0 : Data; - value @1 : UInt32; - } - entries @0 : List(Entry); - } - map1 @0 : Map1; - struct Map2 - { - struct Entry - { - struct Value - { - struct Entry - { - struct Value - { - struct Entry - { - key @0 : Data; - value @1 : UInt32; - } - entries @0 : List(Entry); - } - key @0 : Data; - value @1 : Value; - } - entries @0 : List(Entry); - } - key @0 : Data; - value @1 : Value; - } - entries @0 : List(Entry); - } - map2 @1 : Map2; -} -Complex - -struct Message -{ - struct C1 - { - struct E1 - { - struct Entry - { - struct Value - { - union - { - value @0 : UInt32; - null @1 : Void; - } - } - key @0 : Data; - value @1 : List(List(Value)); - } - entries @0 : List(Entry); - } - e1 @0 : List(E1); - struct E2 - { - struct Entry - { - struct Value - { - struct E1 - { - union - { - value @0 : Data; - null @1 : Void; - } - } - e1 @0 : List(List(E1)); - struct E2 - { - e1 @0 : UInt32; - struct E2 - { - struct E1 - { - union - { - value @0 : Data; - null @1 : Void; - } - } - e1 @0 : List(List(E1)); - e2 @1 : UInt32; - } - e2 @1 : E2; - } - e2 @1 : List(E2); - } - key @0 : Data; - value @1 : Value; - } - entries @0 : List(Entry); - } - e2 @1 : List(E2); - } - c1 @0 : C1; -} -Read/write with no schema -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -Output schema -@0xda5297bc0d5b501a; - -struct Message -{ - number @0 : UInt64; -} -Bad output schema path -2 -2 -Protobuf -Numbers - -message Message -{ - int32 int8 = 1; - uint32 uint8 = 2; - int32 int16 = 3; - uint32 uint16 = 4; - int32 int32 = 5; - uint32 uint32 = 6; - int64 int64 = 7; - uint64 uint64 = 8; - bytes int128 = 9; - bytes uint128 = 10; - bytes int256 = 11; - bytes uint256 = 12; - float float32 = 13; - double float64 = 14; - bytes decimal32 = 15; - bytes decimal64 = 16; - bytes decimal128 = 17; - bytes decimal256 = 18; -} -Dates - -message Message -{ - uint32 data = 1; - int32 date32 = 2; - uint32 datetime = 3; - uint64 datatime64 = 4; -} -Strings - -message Message -{ - bytes string = 1; - bytes fixedstring = 2; -} -Special - -message Message -{ - uint32 ipv4 = 1; - bytes ipv6 = 2; - bytes uuid = 3; -} -Nullable - -message Message -{ - uint32 nullable = 1; -} -Enums - -message Message -{ - enum Enum8 - { - v1 = 0; - v2 = 1; - v3 = 2; - v4 = 3; - } - Enum8 enum8 = 1; - enum Enum16 - { - v5 = 0; - v6 = 1; - v7 = 2; - v8 = 3; - v9 = 4; - } - Enum16 enum16 = 2; -} -Arrays - -message Message -{ - repeated uint32 arr1 = 1; - message Arr2 - { - message Arr2 - { - repeated uint32 arr2 = 1; - } - repeated Arr2 arr2 = 1; - } - repeated Arr2 arr2 = 2; -} -Tuples - -message Message -{ - message Tuple1 - { - uint32 e1 = 1; - bytes e2 = 2; - uint32 e3 = 3; - } - Tuple1 tuple1 = 1; - message Tuple2 - { - message E1 - { - uint32 e1 = 1; - message E2 - { - bytes e1 = 1; - uint32 e2 = 2; - } - E2 e2 = 2; - bytes e3 = 3; - } - E1 e1 = 1; - message E2 - { - bytes e1 = 1; - uint32 e2 = 2; - } - E2 e2 = 2; - } - Tuple2 tuple2 = 2; -} -Maps - -message Message -{ - map map1 = 1; - message Map2Value - { - message Map2ValueValue - { - map map2ValueValue = 1; - } - map map2Value = 1; - } - map map2 = 2; -} -Complex - -message Message -{ - message C1 - { - message E1 - { - message E1Value - { - message E1Value - { - repeated uint32 e1Value = 1; - } - repeated E1Value e1Value = 1; - } - map e1 = 1; - } - repeated E1 e1 = 1; - message E2 - { - message E2Value - { - message E1 - { - repeated bytes e1 = 1; - } - repeated E1 e1 = 1; - message E2 - { - uint32 e1 = 1; - message E2 - { - message E1 - { - repeated bytes e1 = 1; - } - repeated E1 e1 = 1; - uint32 e2 = 2; - } - E2 e2 = 2; - } - repeated E2 e2 = 2; - } - map e2 = 1; - } - repeated E2 e2 = 2; - } - C1 c1 = 1; -} -Read/write with no schema -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -Output schema -syntax = "proto3"; - -message Message -{ - uint64 number = 1; -} -Bad output schema path -2 -2 From 10a8a8dd74acfa9d8ea161c60a44ac7c5abff562 Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 16 Jun 2023 23:24:10 -0400 Subject: [PATCH 26/81] Implementing new commands for keeper-client --- programs/keeper-client/Commands.cpp | 79 ++++++++++++++++++++++++- programs/keeper-client/Commands.h | 25 ++++++++ programs/keeper-client/KeeperClient.cpp | 2 + programs/keeper-client/Parser.cpp | 1 + 4 files changed, 106 insertions(+), 1 deletion(-) diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index 05928a0d20b..d63f87ac8a6 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -24,8 +24,11 @@ void LSCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) con else path = client->cwd; - for (const auto & child : client->zookeeper->getChildren(path)) + auto children = client->zookeeper->getChildren(path); + std::sort(children.begin(), children.end()); + for (const auto & child : children) std::cout << child << " "; + std::cout << "\n"; } @@ -130,6 +133,80 @@ void GetCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) co std::cout << client->zookeeper->get(client->getAbsolutePath(query->args[0].safeGet())) << "\n"; } +bool GetStatCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +{ + String arg; + if (!parseKeeperPath(pos, expected, arg)) + return true; + + node->args.push_back(std::move(arg)); + return true; +} + +void GetStatCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const +{ + Coordination::Stat stat; + String path; + if (!query->args.empty()) + path = client->getAbsolutePath(query->args[0].safeGet()); + else + path = client->cwd; + + client->zookeeper->get(path, &stat); + + std::cout << "cZxid = " << stat.czxid << "\n"; + std::cout << "mZxid = " << stat.mzxid << "\n"; + std::cout << "ctime = " << stat.ctime << "\n"; + std::cout << "mtime = " << stat.mtime << "\n"; + std::cout << "version = " << stat.version << "\n"; + std::cout << "cversion = " << stat.cversion << "\n"; + std::cout << "aversion = " << stat.aversion << "\n"; + std::cout << "ephemeralOwner = " << stat.ephemeralOwner << "\n"; + std::cout << "dataLength = " << stat.dataLength << "\n"; + std::cout << "numChildren = " << stat.numChildren << "\n"; + std::cout << "pzxid = " << stat.pzxid << "\n"; +} + +bool FindSupperNodes::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +{ + ASTPtr threshold; + if (!ParserUnsignedInteger{}.parse(pos, threshold, expected)) + return false; + + node->args.push_back(threshold->as().value); + + String path; + if (!parseKeeperPath(pos, expected, path)) + path = "."; + + node->args.push_back(std::move(path)); + return true; +} + +void FindSupperNodes::execute(const ASTKeeperQuery * query, KeeperClient * client) const +{ + auto threshold = query->args[0].safeGet(); + auto path = client->getAbsolutePath(query->args[1].safeGet()); + + Coordination::Stat stat; + client->zookeeper->get(path, &stat); + + if (stat.numChildren >= static_cast(threshold)) + { + std::cout << path << "\t" << stat.numChildren << "\n"; + return; + } + + auto children = client->zookeeper->getChildren(path); + std::sort(children.begin(), children.end()); + for (auto & child : children) + { + auto next_query = *query; + next_query.args[1] = DB::Field(path / child); + FindSupperNodes{}.execute(&next_query, client); + } +} + bool RMCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const { String arg; diff --git a/programs/keeper-client/Commands.h b/programs/keeper-client/Commands.h index e4debd53e42..c066aefdff1 100644 --- a/programs/keeper-client/Commands.h +++ b/programs/keeper-client/Commands.h @@ -84,6 +84,31 @@ class GetCommand : public IKeeperClientCommand String getHelpMessage() const override { return "get -- Returns the node's value"; } }; +class GetStatCommand : public IKeeperClientCommand +{ + String getName() const override { return "getstat"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override { return "getstat [path] -- Returns the node's stat (default `.`)"; } +}; + +class FindSupperNodes : public IKeeperClientCommand +{ + String getName() const override { return "findsupernodes"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override + { + return "findsupernodes [path] -- Finds nodes with number of children larger than some threshold for the given path (default `.`)"; + } +}; + class RMCommand : public IKeeperClientCommand { String getName() const override { return "rm"; } diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index f41dca1e27a..09156b5e226 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -177,6 +177,8 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */) std::make_shared(), std::make_shared(), std::make_shared(), + std::make_shared(), + std::make_shared(), std::make_shared(), std::make_shared(), std::make_shared(), diff --git a/programs/keeper-client/Parser.cpp b/programs/keeper-client/Parser.cpp index 3420ccb2219..fe46058fcc1 100644 --- a/programs/keeper-client/Parser.cpp +++ b/programs/keeper-client/Parser.cpp @@ -58,6 +58,7 @@ bool KeeperParser::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return false; String command_name(pos->begin, pos->end); + std::transform(command_name.begin(), command_name.end(), command_name.begin(), [](unsigned char c) { return std::tolower(c); }); Command command; auto iter = KeeperClient::commands.find(command_name); From 68b78f52ec170b4bcbea7b79090cae7a3bfc642d Mon Sep 17 00:00:00 2001 From: pufit Date: Tue, 20 Jun 2023 23:41:48 -0400 Subject: [PATCH 27/81] Add DeleteStableBackups, FindBigFamily --- programs/keeper-client/Commands.cpp | 97 ++++++++++++++++++++++++- programs/keeper-client/Commands.h | 59 ++++++++++++--- programs/keeper-client/KeeperClient.cpp | 2 + 3 files changed, 144 insertions(+), 14 deletions(-) diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index d63f87ac8a6..5c8c228097e 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -1,5 +1,6 @@ #include "Commands.h" +#include #include "KeeperClient.h" @@ -203,10 +204,102 @@ void FindSupperNodes::execute(const ASTKeeperQuery * query, KeeperClient * clien { auto next_query = *query; next_query.args[1] = DB::Field(path / child); - FindSupperNodes{}.execute(&next_query, client); + execute(&next_query, client); } } +bool DeleteStableBackups::parse(IParser::Pos & /* pos */, std::shared_ptr & /* node */, Expected & /* expected */) const +{ + return true; +} + +void DeleteStableBackups::execute(const ASTKeeperQuery * /* query */, KeeperClient * client) const +{ + client->askConfirmation( + "You are going to delete all inactive backups in /clickhouse/backups.", + [client] + { + String backup_root = "/clickhouse/backups"; + auto backups = client->zookeeper->getChildren(backup_root); + + for (auto & child : backups) + { + String backup_path = backup_root + "/" + child; + std::cout << "Found backup " << backup_path << ", checking if it's active\n"; + + String stage_path = backup_path + "/stage"; + auto stages = client->zookeeper->getChildren(stage_path); + + bool is_active = false; + for (auto & stage : stages) + { + if (startsWith(stage, "alive")) + { + is_active = true; + break; + } + } + + if (is_active) + { + std::cout << "Backup " << backup_path << " is active, not going to delete\n"; + continue; + } + + std::cout << "Backup " << backup_path << " is not active, deleting it\n"; + client->zookeeper->removeRecursive(backup_path); + } + }); +} + +bool FindBigFamily::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +{ + String path; + if (!parseKeeperPath(pos, expected, path)) + path = "."; + + node->args.push_back(std::move(path)); + + ASTPtr count; + if (ParserUnsignedInteger{}.parse(pos, count, expected)) + node->args.push_back(count->as().value); + else + node->args.push_back(UInt64(10)); + + return true; +} + +void FindBigFamily::execute(const ASTKeeperQuery * query, KeeperClient * client) const +{ + auto path = client->getAbsolutePath(query->args[0].safeGet()); + auto n = query->args[1].safeGet(); + + std::vector> result; + + std::queue queue; + queue.push(path); + while (!queue.empty()) + { + auto next_path = queue.front(); + queue.pop(); + + auto children = client->zookeeper->getChildren(next_path); + std::transform(children.cbegin(), children.cend(), children.begin(), [&](const String & child) { return next_path / child; }); + + auto response = client->zookeeper->get(children); + + for (size_t i = 0; i < response.size(); ++i) + { + result.emplace_back(response[i].stat.numChildren, children[i]); + queue.push(children[i]); + } + } + + std::sort(result.begin(), result.end(), std::greater()); + for (UInt64 i = 0; i < std::min(result.size(), static_cast(n)); ++i) + std::cout << std::get<1>(result[i]) << "\t" << std::get<0>(result[i]) << "\n"; +} + bool RMCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const { String arg; @@ -247,7 +340,7 @@ bool HelpCommand::parse(IParser::Pos & /* pos */, std::shared_ptrgetHelpMessage() << "\n"; + std::cout << pair.second->generateHelpString() << "\n"; } bool FourLetterWordCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const diff --git a/programs/keeper-client/Commands.h b/programs/keeper-client/Commands.h index c066aefdff1..f428191f694 100644 --- a/programs/keeper-client/Commands.h +++ b/programs/keeper-client/Commands.h @@ -21,6 +21,12 @@ public: virtual String getName() const = 0; virtual ~IKeeperClientCommand() = default; + + String generateHelpString() const + { + return fmt::vformat(getHelpMessage(), fmt::make_format_args(getName())); + } + }; using Command = std::shared_ptr; @@ -34,7 +40,7 @@ class LSCommand : public IKeeperClientCommand void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; - String getHelpMessage() const override { return "ls [path] -- Lists the nodes for the given path (default: cwd)"; } + String getHelpMessage() const override { return "{} [path] -- Lists the nodes for the given path (default: cwd)"; } }; class CDCommand : public IKeeperClientCommand @@ -45,7 +51,7 @@ class CDCommand : public IKeeperClientCommand void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; - String getHelpMessage() const override { return "cd [path] -- Change the working path (default `.`)"; } + String getHelpMessage() const override { return "{} [path] -- Change the working path (default `.`)"; } }; class SetCommand : public IKeeperClientCommand @@ -58,7 +64,7 @@ class SetCommand : public IKeeperClientCommand String getHelpMessage() const override { - return "set [version] -- Updates the node's value. Only update if version matches (default: -1)"; + return "{} [version] -- Updates the node's value. Only update if version matches (default: -1)"; } }; @@ -70,7 +76,7 @@ class CreateCommand : public IKeeperClientCommand void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; - String getHelpMessage() const override { return "create -- Creates new node"; } + String getHelpMessage() const override { return "{} -- Creates new node"; } }; class GetCommand : public IKeeperClientCommand @@ -81,7 +87,7 @@ class GetCommand : public IKeeperClientCommand void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; - String getHelpMessage() const override { return "get -- Returns the node's value"; } + String getHelpMessage() const override { return "{} -- Returns the node's value"; } }; class GetStatCommand : public IKeeperClientCommand @@ -92,12 +98,12 @@ class GetStatCommand : public IKeeperClientCommand void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; - String getHelpMessage() const override { return "getstat [path] -- Returns the node's stat (default `.`)"; } + String getHelpMessage() const override { return "{} [path] -- Returns the node's stat (default `.`)"; } }; class FindSupperNodes : public IKeeperClientCommand { - String getName() const override { return "findsupernodes"; } + String getName() const override { return "find_super_nodes"; } bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; @@ -105,10 +111,39 @@ class FindSupperNodes : public IKeeperClientCommand String getHelpMessage() const override { - return "findsupernodes [path] -- Finds nodes with number of children larger than some threshold for the given path (default `.`)"; + return "{} [path] -- Finds nodes with number of children larger than some threshold for the given path (default `.`)"; } }; +class DeleteStableBackups : public IKeeperClientCommand +{ + String getName() const override { return "delete_stable_backups"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override + { + return "{} -- Deletes ClickHouse nodes used for backups that are now inactive"; + } +}; + +class FindBigFamily : public IKeeperClientCommand +{ + String getName() const override { return "find_big_family"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override + { + return "{} [path] [n] -- Returns the top n nodes with the biggest family in the subtree (default path = `.` and n = 10)"; + } +}; + + class RMCommand : public IKeeperClientCommand { String getName() const override { return "rm"; } @@ -117,7 +152,7 @@ class RMCommand : public IKeeperClientCommand void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; - String getHelpMessage() const override { return "remove -- Remove the node"; } + String getHelpMessage() const override { return "{} -- Remove the node"; } }; class RMRCommand : public IKeeperClientCommand @@ -128,7 +163,7 @@ class RMRCommand : public IKeeperClientCommand void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; - String getHelpMessage() const override { return "rmr -- Recursively deletes path. Confirmation required"; } + String getHelpMessage() const override { return "{} -- Recursively deletes path. Confirmation required"; } }; class HelpCommand : public IKeeperClientCommand @@ -139,7 +174,7 @@ class HelpCommand : public IKeeperClientCommand void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; - String getHelpMessage() const override { return "help -- Prints this message"; } + String getHelpMessage() const override { return "{} -- Prints this message"; } }; class FourLetterWordCommand : public IKeeperClientCommand @@ -150,7 +185,7 @@ class FourLetterWordCommand : public IKeeperClientCommand void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; - String getHelpMessage() const override { return "flwc -- Executes four-letter-word command"; } + String getHelpMessage() const override { return "{} -- Executes four-letter-word command"; } }; } diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 09156b5e226..cac3e9d2996 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -179,6 +179,8 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */) std::make_shared(), std::make_shared(), std::make_shared(), + std::make_shared(), + std::make_shared(), std::make_shared(), std::make_shared(), std::make_shared(), From bb2a749b9478cf1fa1daccc6fb765d80ca456b4a Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 21 Jul 2023 18:12:26 -0400 Subject: [PATCH 28/81] Update programs/keeper-client/Commands.cpp Co-authored-by: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> --- programs/keeper-client/Commands.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index 5c8c228097e..9b2539506e5 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -157,6 +157,7 @@ void GetStatCommand::execute(const ASTKeeperQuery * query, KeeperClient * client std::cout << "cZxid = " << stat.czxid << "\n"; std::cout << "mZxid = " << stat.mzxid << "\n"; + std::cout << "pZxid = " << stat.pzxid << "\n"; std::cout << "ctime = " << stat.ctime << "\n"; std::cout << "mtime = " << stat.mtime << "\n"; std::cout << "version = " << stat.version << "\n"; @@ -165,7 +166,6 @@ void GetStatCommand::execute(const ASTKeeperQuery * query, KeeperClient * client std::cout << "ephemeralOwner = " << stat.ephemeralOwner << "\n"; std::cout << "dataLength = " << stat.dataLength << "\n"; std::cout << "numChildren = " << stat.numChildren << "\n"; - std::cout << "pzxid = " << stat.pzxid << "\n"; } bool FindSupperNodes::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const From 3a838f09d1dda7b9ab8d01afe4fab2c32a376884 Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 21 Jul 2023 18:14:30 -0400 Subject: [PATCH 29/81] Update programs/keeper-client/Commands.h Co-authored-by: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> --- programs/keeper-client/Commands.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/keeper-client/Commands.h b/programs/keeper-client/Commands.h index f428191f694..6b7f62474eb 100644 --- a/programs/keeper-client/Commands.h +++ b/programs/keeper-client/Commands.h @@ -92,7 +92,7 @@ class GetCommand : public IKeeperClientCommand class GetStatCommand : public IKeeperClientCommand { - String getName() const override { return "getstat"; } + String getName() const override { return "get_stat"; } bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; From ea672ba618a17d864932cc4a9a709ab186b0b27c Mon Sep 17 00:00:00 2001 From: pufit Date: Wed, 26 Jul 2023 01:57:16 -0400 Subject: [PATCH 30/81] Add docs, add tests, fix small issues. --- .../utilities/clickhouse-keeper-client.md | 4 + programs/keeper-client/Commands.cpp | 10 +- programs/keeper-client/Commands.h | 2 +- programs/keeper-client/KeeperClient.cpp | 2 +- tests/integration/test_keeper_client/test.py | 123 +++++++++++++++--- 5 files changed, 113 insertions(+), 28 deletions(-) diff --git a/docs/en/operations/utilities/clickhouse-keeper-client.md b/docs/en/operations/utilities/clickhouse-keeper-client.md index 77f816fe428..37eb0bb71ff 100644 --- a/docs/en/operations/utilities/clickhouse-keeper-client.md +++ b/docs/en/operations/utilities/clickhouse-keeper-client.md @@ -51,3 +51,7 @@ keeper foo bar - `rmr ` -- Recursively deletes path. Confirmation required - `flwc ` -- Executes four-letter-word command - `help` -- Prints this message +- `get_stat [path]` -- Returns the node's stat (default `.`) +- `find_super_nodes [path]` -- Finds nodes with number of children larger than some threshold for the given path (default `.`) +- `delete_stable_backups` -- Deletes ClickHouse nodes used for backups that are now inactive +- `find_big_family [path] [n]` -- Returns the top n nodes with the biggest family in the subtree (default path = `.` and n = 10) diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index 9b2539506e5..985e844afdf 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -168,7 +168,7 @@ void GetStatCommand::execute(const ASTKeeperQuery * query, KeeperClient * client std::cout << "numChildren = " << stat.numChildren << "\n"; } -bool FindSupperNodes::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +bool FindSuperNodes::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const { ASTPtr threshold; if (!ParserUnsignedInteger{}.parse(pos, threshold, expected)) @@ -184,7 +184,7 @@ bool FindSupperNodes::parse(IParser::Pos & pos, std::shared_ptr return true; } -void FindSupperNodes::execute(const ASTKeeperQuery * query, KeeperClient * client) const +void FindSuperNodes::execute(const ASTKeeperQuery * query, KeeperClient * client) const { auto threshold = query->args[0].safeGet(); auto path = client->getAbsolutePath(query->args[1].safeGet()); @@ -200,7 +200,7 @@ void FindSupperNodes::execute(const ASTKeeperQuery * query, KeeperClient * clien auto children = client->zookeeper->getChildren(path); std::sort(children.begin(), children.end()); - for (auto & child : children) + for (const auto & child : children) { auto next_query = *query; next_query.args[1] = DB::Field(path / child); @@ -222,7 +222,7 @@ void DeleteStableBackups::execute(const ASTKeeperQuery * /* query */, KeeperClie String backup_root = "/clickhouse/backups"; auto backups = client->zookeeper->getChildren(backup_root); - for (auto & child : backups) + for (const auto & child : backups) { String backup_path = backup_root + "/" + child; std::cout << "Found backup " << backup_path << ", checking if it's active\n"; @@ -231,7 +231,7 @@ void DeleteStableBackups::execute(const ASTKeeperQuery * /* query */, KeeperClie auto stages = client->zookeeper->getChildren(stage_path); bool is_active = false; - for (auto & stage : stages) + for (const auto & stage : stages) { if (startsWith(stage, "alive")) { diff --git a/programs/keeper-client/Commands.h b/programs/keeper-client/Commands.h index 6b7f62474eb..093920cb10d 100644 --- a/programs/keeper-client/Commands.h +++ b/programs/keeper-client/Commands.h @@ -101,7 +101,7 @@ class GetStatCommand : public IKeeperClientCommand String getHelpMessage() const override { return "{} [path] -- Returns the node's stat (default `.`)"; } }; -class FindSupperNodes : public IKeeperClientCommand +class FindSuperNodes : public IKeeperClientCommand { String getName() const override { return "find_super_nodes"; } diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index cac3e9d2996..561a1f41f7a 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -178,7 +178,7 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */) std::make_shared(), std::make_shared(), std::make_shared(), - std::make_shared(), + std::make_shared(), std::make_shared(), std::make_shared(), std::make_shared(), diff --git a/tests/integration/test_keeper_client/test.py b/tests/integration/test_keeper_client/test.py index 00c7908eeed..a7de8db4a8d 100644 --- a/tests/integration/test_keeper_client/test.py +++ b/tests/integration/test_keeper_client/test.py @@ -1,6 +1,7 @@ import pytest from helpers.client import CommandRequest from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV cluster = ClickHouseCluster(__file__) @@ -23,10 +24,8 @@ def started_cluster(): cluster.shutdown() -def test_base_commands(started_cluster): - _ = started_cluster - - command = CommandRequest( +def keeper_query(query): + return CommandRequest( [ started_cluster.server_bin_path, "keeper-client", @@ -35,29 +34,111 @@ def test_base_commands(started_cluster): "--port", str(cluster.zookeeper_port), "-q", - "create test_create_zk_node1 testvalue1;create test_create_zk_node_2 testvalue2;get test_create_zk_node1;", + query, ], stdin="", ) + +def test_big_family(): + command = keeper_query( + "create test_big_family foo;" + "create test_big_family/1 foo;" + "create test_big_family/1/1 foo;" + "create test_big_family/1/2 foo;" + "create test_big_family/1/3 foo;" + "create test_big_family/1/4 foo;" + "create test_big_family/1/5 foo;" + "create test_big_family/2/1 foo;" + "create test_big_family/2/2 foo;" + "create test_big_family/2/3 foo;" + "find_big_family test_big_family;" + ) + + assert command.get_answer() == TSV( + [ + ["/test_big_family/1", "5"], + ["/test_big_family/2", "3"], + ["/test_big_family/2/3", "0"], + ["/test_big_family/2/2", "0"], + ["/test_big_family/2/1", "0"], + ["/test_big_family/1/5", "0"], + ["/test_big_family/1/4", "0"], + ["/test_big_family/1/3", "0"], + ["/test_big_family/1/2", "0"], + ["/test_big_family/1/1", "0"], + ] + ) + + command = keeper_query("find_big_family test_big_family 1;") + + assert command.get_answer() == TSV( + [ + ["/test_big_family/1", "5"], + ] + ) + + +def test_find_super_nodes(): + command = keeper_query( + "create test_find_super_nodes/1 foo;" + "create test_find_super_nodes/1/1 foo;" + "create test_find_super_nodes/1/2 foo;" + "create test_find_super_nodes/1/3 foo;" + "create test_find_super_nodes/1/4 foo;" + "create test_find_super_nodes/1/5 foo;" + "create test_find_super_nodes/2/1 foo;" + "create test_find_super_nodes/2/2 foo;" + "create test_find_super_nodes/2/3 foo;" + "create test_find_super_nodes/2/4 foo;" + "cd test_find_super_nodes;" + "find_super_nodes 4;" + ) + + assert command.get_answer() == TSV( + [ + ["/test_find_super_nodes/1", "5"], + ["/test_find_super_nodes/2", "4"], + ] + ) + + +def test_delete_stable_backups(): + command = keeper_query( + "create /clickhouse foo;" + "create /clickhouse/backups foo;" + "create /clickhouse/backups/1 foo;" + "create /clickhouse/backups/1/stage foo;" + "create /clickhouse/backups/1/stage/alive123 foo;" + "create /clickhouse/backups/2 foo;" + "create /clickhouse/backups/2/stage foo;" + "create /clickhouse/backups/2/stage/dead123 foo;" + "delete_stable_backups;" + "y;" + "ls clickhouse/backups;" + ) + + assert command.get_answer() == ( + "You are going to delete all inactive backups in /clickhouse/backups. Continue?\n" + "Found backup /clickhouse/backups/1, checking if it's active\n" + "Backup /clickhouse/backups/1 is active, not going to delete\n" + "Found backup /clickhouse/backups/2, checking if it's active\n" + "Backup /clickhouse/backups/2 is not active, deleting it\n" + "1" + ) + + +def test_base_commands(): + command = keeper_query( + "create test_create_zk_node1 testvalue1;" + "create test_create_zk_node_2 testvalue2;" + "get test_create_zk_node1;" + ) + assert command.get_answer() == "testvalue1\n" -def test_four_letter_word_commands(started_cluster): - _ = started_cluster - - command = CommandRequest( - [ - started_cluster.server_bin_path, - "keeper-client", - "--host", - str(cluster.get_instance_ip("zoo1")), - "--port", - str(cluster.zookeeper_port), - "-q", - "ruok", - ], - stdin="", - ) +def test_four_letter_word_commands(): + command = keeper_query("ruok") assert command.get_answer() == "imok\n" From 6b845ae255c0b14371c879f912d7144b039337fe Mon Sep 17 00:00:00 2001 From: pufit Date: Wed, 26 Jul 2023 11:13:53 -0400 Subject: [PATCH 31/81] Fix tests --- tests/integration/test_keeper_client/test.py | 63 ++++++++++++-------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/tests/integration/test_keeper_client/test.py b/tests/integration/test_keeper_client/test.py index a7de8db4a8d..5115c7d3789 100644 --- a/tests/integration/test_keeper_client/test.py +++ b/tests/integration/test_keeper_client/test.py @@ -24,24 +24,30 @@ def started_cluster(): cluster.shutdown() -def keeper_query(query): - return CommandRequest( - [ - started_cluster.server_bin_path, - "keeper-client", - "--host", - str(cluster.get_instance_ip("zoo1")), - "--port", - str(cluster.zookeeper_port), - "-q", - query, - ], - stdin="", - ) +class KeeperClient: + def __init__(self, started_cluster: ClickHouseCluster): + self.cluster = started_cluster + + def query(self, query: str): + return CommandRequest( + [ + self.cluster.server_bin_path, + "keeper-client", + "--host", + str(cluster.get_instance_ip("zoo1")), + "--port", + str(cluster.zookeeper_port), + "-q", + query, + ], + stdin="", + ) -def test_big_family(): - command = keeper_query( +def test_big_family(started_cluster: ClickHouseCluster): + client = KeeperClient(started_cluster) + + command = client.query( "create test_big_family foo;" "create test_big_family/1 foo;" "create test_big_family/1/1 foo;" @@ -70,7 +76,7 @@ def test_big_family(): ] ) - command = keeper_query("find_big_family test_big_family 1;") + command = client.query("find_big_family test_big_family 1;") assert command.get_answer() == TSV( [ @@ -79,8 +85,10 @@ def test_big_family(): ) -def test_find_super_nodes(): - command = keeper_query( +def test_find_super_nodes(started_cluster: ClickHouseCluster): + client = KeeperClient(started_cluster) + + command = client.query( "create test_find_super_nodes/1 foo;" "create test_find_super_nodes/1/1 foo;" "create test_find_super_nodes/1/2 foo;" @@ -103,8 +111,10 @@ def test_find_super_nodes(): ) -def test_delete_stable_backups(): - command = keeper_query( +def test_delete_stable_backups(started_cluster: ClickHouseCluster): + client = KeeperClient(started_cluster) + + command = client.query( "create /clickhouse foo;" "create /clickhouse/backups foo;" "create /clickhouse/backups/1 foo;" @@ -128,8 +138,10 @@ def test_delete_stable_backups(): ) -def test_base_commands(): - command = keeper_query( +def test_base_commands(started_cluster: ClickHouseCluster): + client = KeeperClient(started_cluster) + + command = client.query( "create test_create_zk_node1 testvalue1;" "create test_create_zk_node_2 testvalue2;" "get test_create_zk_node1;" @@ -138,7 +150,8 @@ def test_base_commands(): assert command.get_answer() == "testvalue1\n" -def test_four_letter_word_commands(): - command = keeper_query("ruok") +def test_four_letter_word_commands(started_cluster: ClickHouseCluster): + client = KeeperClient(started_cluster) + command = client.query("ruok") assert command.get_answer() == "imok\n" From bc86c26e4e75172fa8cb0665c26ff7b1228372a3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 26 Jul 2023 23:37:20 +0300 Subject: [PATCH 32/81] Update src/Formats/StructureToFormatSchemaUtils.cpp Co-authored-by: Antonio Andelic --- src/Formats/StructureToFormatSchemaUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/StructureToFormatSchemaUtils.cpp b/src/Formats/StructureToFormatSchemaUtils.cpp index fd187c033ec..edd29d5f6ed 100644 --- a/src/Formats/StructureToFormatSchemaUtils.cpp +++ b/src/Formats/StructureToFormatSchemaUtils.cpp @@ -61,7 +61,7 @@ namespace const auto * it = find_first_symbols<'_', '.'>(begin, end); String first = String(begin, it); String second = it == end ? "" : String(it + 1, end); - return {first, second}; + return {std::move(first), std::move(second)}; } } From 83fd3d4c89c9e247816ed619eecf6cb66bdb0154 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 26 Jul 2023 21:59:13 +0000 Subject: [PATCH 33/81] Additional checks related to DistinctSortedChunkTransform In response to suspicious crash reports from sqlancer for DISTINCT queries --- base/base/StringRef.h | 3 +++ .../Transforms/DistinctSortedChunkTransform.cpp | 15 +++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/base/base/StringRef.h b/base/base/StringRef.h index f428b7c747f..448bc102b41 100644 --- a/base/base/StringRef.h +++ b/base/base/StringRef.h @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -274,6 +275,8 @@ struct CRC32Hash if (size == 0) return 0; + chassert(pos); + if (size < 8) { return static_cast(hashLessThan8(x.data, x.size)); diff --git a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp index 28a3260d742..720543991c5 100644 --- a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp @@ -126,9 +126,20 @@ bool DistinctSortedChunkTransform::isKey(const size_t key_pos, const size_t row_ bool DistinctSortedChunkTransform::isLatestKeyFromPrevChunk(const size_t row_pos) const { - for (size_t i = 0; i < sorted_columns.size(); ++i) + for (size_t i = 0, s = sorted_columns.size(); i < s; ++i) { - const int res = prev_chunk_latest_key[i]->compareAt(0, row_pos, *sorted_columns[i], sorted_columns_descr[i].nulls_direction); + const auto & sorted_column = *sorted_columns[i]; + /// temporary hardening due to suspious crashes in sqlancer tests + if (unlikely(sorted_column.size() <= row_pos)) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Unexpected size of a sorted column: size {}, row_pos {}, column position {}, type {}", + sorted_column.size(), + row_pos, + i, + sorted_column.getFamilyName()); + + const int res = prev_chunk_latest_key[i]->compareAt(0, row_pos, sorted_column, sorted_columns_descr[i].nulls_direction); if (res != 0) return false; } From 0cd2d7449b2057828fd653cc00155e7e4d4f6c09 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 27 Jul 2023 00:01:25 +0200 Subject: [PATCH 34/81] Use for-range loop, add comment --- src/Formats/StructureToFormatSchemaUtils.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Formats/StructureToFormatSchemaUtils.cpp b/src/Formats/StructureToFormatSchemaUtils.cpp index edd29d5f6ed..a9374647ebc 100644 --- a/src/Formats/StructureToFormatSchemaUtils.cpp +++ b/src/Formats/StructureToFormatSchemaUtils.cpp @@ -35,11 +35,14 @@ void endNested(WriteBuffer & buf, size_t indent) String getSchemaFieldName(const String & column_name) { String result = column_name; - size_t i = 0; - while (i < result.size() && isupper(result[i])) + /// Replace all first uppercase letters to lower-case, + /// because fields in CapnProto schema must begin with a lower-case letter. + /// Don't replace all letters to lower-case to remain camelCase field names. + for (auto & symbol : result) { - result[i] = tolower(result[i]); - ++i; + if (islower(symbol)) + break; + symbol = tolower(symbol); } return result; } From b5fc34b770cdbc555d1f9d45516af530da9ab5e5 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 27 Jul 2023 12:20:33 +0000 Subject: [PATCH 35/81] Rename setting disable_url_encoding to enable_url_encoding and add a test --- base/poco/Foundation/include/Poco/URI.h | 4 ++-- base/poco/Foundation/src/URI.cpp | 24 +++++++++---------- docs/en/engines/table-engines/special/url.md | 2 +- docs/en/operations/settings/settings.md | 6 ++--- docs/en/sql-reference/table-functions/url.md | 2 +- src/Core/Settings.h | 2 +- src/Storages/StorageURL.cpp | 2 +- .../02833_url_without_path_encoding.reference | 2 ++ .../02833_url_without_path_encoding.sh | 12 ++++++++++ 9 files changed, 35 insertions(+), 21 deletions(-) create mode 100644 tests/queries/0_stateless/02833_url_without_path_encoding.reference create mode 100755 tests/queries/0_stateless/02833_url_without_path_encoding.sh diff --git a/base/poco/Foundation/include/Poco/URI.h b/base/poco/Foundation/include/Poco/URI.h index eba8109253d..30654504e0b 100644 --- a/base/poco/Foundation/include/Poco/URI.h +++ b/base/poco/Foundation/include/Poco/URI.h @@ -57,7 +57,7 @@ public: URI(); /// Creates an empty URI. - explicit URI(const std::string & uri, bool disable_url_encoding = false); + explicit URI(const std::string & uri, bool enable_url_encoding = true); /// Parses an URI from the given string. Throws a /// SyntaxException if the uri is not valid. @@ -362,7 +362,7 @@ private: std::string _query; std::string _fragment; - bool _disable_url_encoding = false; + bool _enable_url_encoding = true; }; diff --git a/base/poco/Foundation/src/URI.cpp b/base/poco/Foundation/src/URI.cpp index 3354c69d188..41e331bb080 100644 --- a/base/poco/Foundation/src/URI.cpp +++ b/base/poco/Foundation/src/URI.cpp @@ -36,8 +36,8 @@ URI::URI(): } -URI::URI(const std::string& uri, bool decode_and_encode_path): - _port(0), _disable_url_encoding(decode_and_encode_path) +URI::URI(const std::string& uri, bool enable_url_encoding): + _port(0), _enable_url_encoding(enable_url_encoding) { parse(uri); } @@ -108,7 +108,7 @@ URI::URI(const URI& uri): _path(uri._path), _query(uri._query), _fragment(uri._fragment), - _disable_url_encoding(uri._disable_url_encoding) + _enable_url_encoding(uri._enable_url_encoding) { } @@ -121,7 +121,7 @@ URI::URI(const URI& baseURI, const std::string& relativeURI): _path(baseURI._path), _query(baseURI._query), _fragment(baseURI._fragment), - _disable_url_encoding(baseURI._disable_url_encoding) + _enable_url_encoding(baseURI._enable_url_encoding) { resolve(relativeURI); } @@ -153,7 +153,7 @@ URI& URI::operator = (const URI& uri) _path = uri._path; _query = uri._query; _fragment = uri._fragment; - _disable_url_encoding = uri._disable_url_encoding; + _enable_url_encoding = uri._enable_url_encoding; } return *this; } @@ -184,7 +184,7 @@ void URI::swap(URI& uri) std::swap(_path, uri._path); std::swap(_query, uri._query); std::swap(_fragment, uri._fragment); - std::swap(_disable_url_encoding, uri._disable_url_encoding); + std::swap(_enable_url_encoding, uri._enable_url_encoding); } @@ -687,18 +687,18 @@ void URI::decode(const std::string& str, std::string& decodedStr, bool plusAsSpa void URI::encodePath(std::string & encodedStr) const { - if (_disable_url_encoding) - encodedStr = _path; - else + if (_enable_url_encoding) encode(_path, RESERVED_PATH, encodedStr); + else + encodedStr = _path; } void URI::decodePath(const std::string & encodedStr) { - if (_disable_url_encoding) - _path = encodedStr; - else + if (_enable_url_encoding) decode(encodedStr, _path); + else + _path = encodedStr; } bool URI::isWellKnownPort() const diff --git a/docs/en/engines/table-engines/special/url.md b/docs/en/engines/table-engines/special/url.md index f556df0a088..5a5e1564180 100644 --- a/docs/en/engines/table-engines/special/url.md +++ b/docs/en/engines/table-engines/special/url.md @@ -106,4 +106,4 @@ For partitioning by month, use the `toYYYYMM(date_column)` expression, where `da ## Storage Settings {#storage-settings} - [engine_url_skip_empty_files](/docs/en/operations/settings/settings.md#engine_url_skip_empty_files) - allows to skip empty files while reading. Disabled by default. -- [disable_url_encoding](/docs/en/operations/settings/settings.md#disable_url_encoding) -allows to disable decoding/encoding path in uri. Disabled by default. +- [enable_url_encoding](/docs/en/operations/settings/settings.md#enable_url_encoding) - allows to enable/disable decoding/encoding path in uri. Enabled by default. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 8dfb6c0d225..db25ef10c91 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3468,11 +3468,11 @@ Possible values: Default value: `0`. -## disable_url_encoding {#disable_url_encoding} +## enable_url_encoding {#enable_url_encoding} -Allows to disable decoding/encoding path in uri in [URL](../../engines/table-engines/special/url.md) engine tables. +Allows to enable/disable decoding/encoding path in uri in [URL](../../engines/table-engines/special/url.md) engine tables. -Disabled by default. +Enabled by default. ## database_atomic_wait_for_drop_and_detach_synchronously {#database_atomic_wait_for_drop_and_detach_synchronously} diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index 677ed011960..859de86f019 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -56,7 +56,7 @@ Character `|` inside patterns is used to specify failover addresses. They are it ## Storage Settings {#storage-settings} - [engine_url_skip_empty_files](/docs/en/operations/settings/settings.md#engine_url_skip_empty_files) - allows to skip empty files while reading. Disabled by default. -- [disable_url_encoding](/docs/en/operations/settings/settings.md#disable_url_encoding) - allows to disable decoding/encoding path in uri. Disabled by default. +- [enable_url_encoding](/docs/en/operations/settings/settings.md#enable_url_encoding) - allows to enable/disable decoding/encoding path in uri. Enabled by default. **See Also** diff --git a/src/Core/Settings.h b/src/Core/Settings.h index c69d132ea25..9823f8e9c95 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -622,7 +622,7 @@ class IColumn; M(Bool, engine_file_allow_create_multiple_files, false, "Enables or disables creating a new file on each insert in file engine tables if format has suffix.", 0) \ M(Bool, engine_file_skip_empty_files, false, "Allows to skip empty files in file table engine", 0) \ M(Bool, engine_url_skip_empty_files, false, "Allows to skip empty files in url table engine", 0) \ - M(Bool, disable_url_encoding, false, " Allows to disable decoding/encoding path in uri in URL table engine", 0) \ + M(Bool, enable_url_encoding, false, " Allows to enable/disable decoding/encoding path in uri in URL table engine", 0) \ M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \ M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \ M(Bool, database_replicated_enforce_synchronous_settings, false, "Enforces synchronous waiting for some queries (see also database_atomic_wait_for_drop_and_detach_synchronously, mutation_sync, alter_sync). Not recommended to enable these settings.", 0) \ diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index db8cb6b42de..41eb18ab541 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -371,7 +371,7 @@ std::pair> StorageURLSource: for (; option != end; ++option) { bool skip_url_not_found_error = glob_url && read_settings.http_skip_not_found_url_for_globs && option == std::prev(end); - auto request_uri = Poco::URI(*option, context->getSettingsRef().disable_url_encoding); + auto request_uri = Poco::URI(*option, context->getSettingsRef().enable_url_encoding); for (const auto & [param, value] : params) request_uri.addQueryParameter(param, value); diff --git a/tests/queries/0_stateless/02833_url_without_path_encoding.reference b/tests/queries/0_stateless/02833_url_without_path_encoding.reference new file mode 100644 index 00000000000..d5626230d71 --- /dev/null +++ b/tests/queries/0_stateless/02833_url_without_path_encoding.reference @@ -0,0 +1,2 @@ +4 +test%2Fa.tsv diff --git a/tests/queries/0_stateless/02833_url_without_path_encoding.sh b/tests/queries/0_stateless/02833_url_without_path_encoding.sh new file mode 100755 index 00000000000..699b1c8e99b --- /dev/null +++ b/tests/queries/0_stateless/02833_url_without_path_encoding.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "select count() from url('http://localhost:11111/test%2Fa.tsv') settings enable_url_encoding=0" + +# Grep 'test%2Fa.tsv' to ensure that path wasn't encoded/decoded +$CLICKHOUSE_CLIENT -q "select count() from url('http://localhost:11111/test%2Fa.tsv') settings enable_url_encoding=1" 2>&1 | grep -o "test%2Fa.tsv" -m1 + From faca843ac0d2fd1b740d009a6cef1c9060346ff6 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 27 Jul 2023 18:24:40 +0200 Subject: [PATCH 36/81] Add utility to parse a backup metadata file and print information about the backup. --- utils/backup/print_backup_info.py | 208 ++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100755 utils/backup/print_backup_info.py diff --git a/utils/backup/print_backup_info.py b/utils/backup/print_backup_info.py new file mode 100755 index 00000000000..54e5c745a8c --- /dev/null +++ b/utils/backup/print_backup_info.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +""" +print_backup_info: Extract information about a backup from ".backup" file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Usage: print_backup_info +""" +import sys +import os +import xml.etree.ElementTree as ET + + +def main(): + if len(sys.argv) != 2: + print(__doc__) + sys.exit(1) + backup_xml = sys.argv[1] + + if not os.path.isfile(backup_xml): + print("error: {} does not exist".format(backup_xml)) + sys.exit(1) + + # Process the file line-by-line + tree = ET.parse(backup_xml) + root = tree.getroot() + contents = root.find("contents") + + version_node = root.find("version") + version = int(version_node.text) if (version_node != None) else None + + timestamp_node = root.find("timestamp") + timestamp = timestamp_node.text if (timestamp_node != None) else None + + base_backup_node = root.find("base_backup") + base_backup = base_backup_node.text if (base_backup_node != None) else None + + number_of_files = 0 + size_of_files = 0 + number_of_files_from_base_backup = 0 + size_of_files_from_base_backup = 0 + databases = set() + tables = {} + + for file in contents: + name = file.find("name").text + size = int(file.find("size").text) + + use_base_node = file.find("use_base") + use_base = (use_base_node.text == "true") if (use_base_node != None) else False + + if use_base: + base_size_node = file.find("base_size") + base_size = int(base_size_node.text) if (base_size_node != None) else size + else: + base_size = 0 + + data_file_node = file.find("data_file") + data_file = data_file_node.text if (data_file_node != None) else name + + has_data_file = name == data_file + + if has_data_file: + if size > base_size: + number_of_files += 1 + size_of_files += size - base_size + if base_size > 0: + number_of_files_from_base_backup += 1 + size_of_files_from_base_backup += base_size + + table_name = extract_table_name_from_path(name) + if table_name: + if table_name not in tables: + tables[table_name] = [0, 0, 0, 0] + if not name.endswith(".sql") and has_data_file: + table_info = tables[table_name] + if size > base_size: + table_info[0] += 1 + table_info[1] += size - base_size + if base_size > 0: + table_info[2] += 1 + table_info[3] += base_size + tables[table_name] = table_info + + database_name = extract_database_name_from_path(name) + if database_name: + databases.add(database_name) + + size_of_backup = size_of_files + os.path.getsize(backup_xml) + + print(f"version={version}") + print(f"timestamp={timestamp}") + print(f"base_backup={base_backup}") + print(f"size_of_backup={size_of_backup}") + print(f"number_of_files={number_of_files}") + print(f"size_of_files={size_of_files}") + print(f"number_of_files_from_base_backup={number_of_files_from_base_backup}") + print(f"size_of_files_from_base_backup={size_of_files_from_base_backup}") + print(f"number_of_databases={len(databases)}") + print(f"number_of_tables={len(tables)}") + + print() + + print(f"{len(databases)} database(s):") + for database_name in sorted(databases): + print(database_name) + + print() + + print(f"{len(tables)} table(s):") + table_info_format = "{:>70} | {:>20} | {:>20} | {:>26} | {:>30}" + table_info_separator_line = ( + "{:->70}-+-{:->20}-+-{:->20}-+-{:->26}-+-{:->30}".format("", "", "", "", "") + ) + table_info_title_line = table_info_format.format( + "table name", + "num_files", + "size_of_files", + "num_files_from_base_backup", + "size_of_files_from_base_backup", + ) + print(table_info_title_line) + print(table_info_separator_line) + for table_name in sorted(tables): + table_info = tables[table_name] + print( + table_info_format.format( + table_name, table_info[0], table_info[1], table_info[2], table_info[3] + ) + ) + + +# Extracts a table name from a path inside a backup. +# For example, extracts 'default.tbl' from 'shards/1/replicas/1/data/default/tbl/all_0_0_0/data.bin'. +def extract_table_name_from_path(path): + path = strip_shards_replicas_from_path(path) + if not path: + return None + if path.startswith("metadata/"): + path = path[len("metadata/") :] + sep = path.find("/") + if sep == -1: + return None + database_name = path[:sep] + path = path[sep + 1 :] + sep = path.find(".sql") + if sep == -1: + return None + table_name = path[:sep] + return database_name + "." + table_name + if path.startswith("data/"): + path = path[len("data/") :] + sep = path.find("/") + if sep == -1: + return None + database_name = path[:sep] + path = path[sep + 1 :] + sep = path.find("/") + if sep == -1: + return None + table_name = path[:sep] + return database_name + "." + table_name + return None + + +# Extracts a database name from a path inside a backup. +# For example, extracts 'default' from 'shards/1/replicas/1/data/default/tbl/all_0_0_0/data.bin'. +def extract_database_name_from_path(path): + path = strip_shards_replicas_from_path(path) + if not path: + return None + if path.startswith("metadata/"): + path = path[len("metadata/") :] + sep = path.find(".sql") + if sep == -1 or path.find("/") != -1: + return None + return path[:sep] + if path.startswith("data/"): + path = path[len("data/") :] + sep = path.find("/") + if sep == -1: + return None + return path[:sep] + return None + + +# Removes a prefix "shards//replicas//" from a path. +def strip_shards_replicas_from_path(path): + if path.startswith("shards"): + sep = path.find("/") + if sep == -1: + return None + sep = path.find("/", sep + 1) + if sep == -1: + return None + path = path[sep + 1 :] + if path.startswith("replicas"): + sep = path.find("/") + if sep == -1: + return None + sep = path.find("/", sep + 1) + if sep == -1: + return None + path = path[sep + 1 :] + return path + + +if __name__ == "__main__": + main() From d40dbdee6227cb0548f65abc900b1f7f91fd9959 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 27 Jun 2023 17:10:48 +0200 Subject: [PATCH 37/81] Fix usage of temporary directories during RESTORE Previously during RESTORE you may get the following warning in logs: TemporaryFileOnDisk: Temporary path 'tmp/21672kaaaaa' does not exist in './disks/s3_common_disk/' The reason is that it uses not disks/s3_common_disk/tmp but instead disks/s3_common_disk/disks/s3_common_disk/tmp path. Fix this by adding TemporaryFileOnDisk::getRelativePath() and use it when appropriate. And now - `find disks`, does not shows any more temporary leftovers. v2: rename getPath to getAbsolutePath Signed-off-by: Azat Khuzhin --- src/Disks/IO/WriteBufferFromTemporaryFile.cpp | 2 +- src/Disks/TemporaryFileOnDisk.cpp | 2 +- src/Disks/TemporaryFileOnDisk.h | 5 ++++- src/Interpreters/MergeJoin.cpp | 2 +- src/Interpreters/SortedBlocksWriter.cpp | 4 ++-- src/Interpreters/TemporaryDataOnDisk.cpp | 6 +++--- .../MergeTree/DataPartStorageOnDiskBase.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- src/Storages/StorageLog.cpp | 2 +- src/Storages/StorageMemory.cpp | 6 +++--- src/Storages/StorageStripeLog.cpp | 2 +- .../0_stateless/02801_backup_native_copy.sh | 4 ++-- .../02803_backup_tmp_files.reference | 0 .../0_stateless/02803_backup_tmp_files.sh | 18 ++++++++++++++++++ 14 files changed, 39 insertions(+), 18 deletions(-) create mode 100644 tests/queries/0_stateless/02803_backup_tmp_files.reference create mode 100755 tests/queries/0_stateless/02803_backup_tmp_files.sh diff --git a/src/Disks/IO/WriteBufferFromTemporaryFile.cpp b/src/Disks/IO/WriteBufferFromTemporaryFile.cpp index 5bfbb2fa440..de494c03789 100644 --- a/src/Disks/IO/WriteBufferFromTemporaryFile.cpp +++ b/src/Disks/IO/WriteBufferFromTemporaryFile.cpp @@ -14,7 +14,7 @@ namespace ErrorCodes } WriteBufferFromTemporaryFile::WriteBufferFromTemporaryFile(TemporaryFileOnDiskHolder && tmp_file_) - : WriteBufferFromFile(tmp_file_->getPath(), DBMS_DEFAULT_BUFFER_SIZE, O_RDWR | O_TRUNC | O_CREAT, /* throttler= */ {}, 0600) + : WriteBufferFromFile(tmp_file_->getAbsolutePath(), DBMS_DEFAULT_BUFFER_SIZE, O_RDWR | O_TRUNC | O_CREAT, /* throttler= */ {}, 0600) , tmp_file(std::move(tmp_file_)) { } diff --git a/src/Disks/TemporaryFileOnDisk.cpp b/src/Disks/TemporaryFileOnDisk.cpp index 8e5c8bcebbd..b9c9e238468 100644 --- a/src/Disks/TemporaryFileOnDisk.cpp +++ b/src/Disks/TemporaryFileOnDisk.cpp @@ -54,7 +54,7 @@ TemporaryFileOnDisk::TemporaryFileOnDisk(const DiskPtr & disk_, const String & p throw Exception(ErrorCodes::LOGICAL_ERROR, "Temporary file name is empty"); } -String TemporaryFileOnDisk::getPath() const +String TemporaryFileOnDisk::getAbsolutePath() const { return std::filesystem::path(disk->getPath()) / relative_path; } diff --git a/src/Disks/TemporaryFileOnDisk.h b/src/Disks/TemporaryFileOnDisk.h index bd82b9744ea..cccfc82cf9e 100644 --- a/src/Disks/TemporaryFileOnDisk.h +++ b/src/Disks/TemporaryFileOnDisk.h @@ -22,7 +22,10 @@ public: ~TemporaryFileOnDisk(); DiskPtr getDisk() const { return disk; } - String getPath() const; + /// Return absolute path (disk + relative_path) + String getAbsolutePath() const; + /// Return relative path (without disk) + const String & getRelativePath() const { return relative_path; } private: DiskPtr disk; diff --git a/src/Interpreters/MergeJoin.cpp b/src/Interpreters/MergeJoin.cpp index ceef1371f16..6f0c8f1bff4 100644 --- a/src/Interpreters/MergeJoin.cpp +++ b/src/Interpreters/MergeJoin.cpp @@ -1034,7 +1034,7 @@ std::shared_ptr MergeJoin::loadRightBlock(size_t pos) const { auto load_func = [&]() -> std::shared_ptr { - TemporaryFileStreamLegacy input(flushed_right_blocks[pos]->getPath(), materializeBlock(right_sample_block)); + TemporaryFileStreamLegacy input(flushed_right_blocks[pos]->getAbsolutePath(), materializeBlock(right_sample_block)); return std::make_shared(input.block_in->read()); }; diff --git a/src/Interpreters/SortedBlocksWriter.cpp b/src/Interpreters/SortedBlocksWriter.cpp index e09a66a38e6..c9f48ee1be9 100644 --- a/src/Interpreters/SortedBlocksWriter.cpp +++ b/src/Interpreters/SortedBlocksWriter.cpp @@ -39,7 +39,7 @@ namespace TemporaryFileOnDiskHolder flushToFile(const DiskPtr & disk, const Block & header, QueryPipelineBuilder pipeline, const String & codec) { auto tmp_file = std::make_unique(disk, CurrentMetrics::TemporaryFilesForJoin); - auto write_stat = TemporaryFileStreamLegacy::write(tmp_file->getPath(), header, std::move(pipeline), codec); + auto write_stat = TemporaryFileStreamLegacy::write(tmp_file->getAbsolutePath(), header, std::move(pipeline), codec); ProfileEvents::increment(ProfileEvents::ExternalProcessingCompressedBytesTotal, write_stat.compressed_bytes); ProfileEvents::increment(ProfileEvents::ExternalProcessingUncompressedBytesTotal, write_stat.uncompressed_bytes); @@ -267,7 +267,7 @@ SortedBlocksWriter::SortedFiles SortedBlocksWriter::finishMerge(std::function(file->getPath(), materializeBlock(sample_block))); + return Pipe(std::make_shared(file->getAbsolutePath(), materializeBlock(sample_block))); } diff --git a/src/Interpreters/TemporaryDataOnDisk.cpp b/src/Interpreters/TemporaryDataOnDisk.cpp index 69fef21dbab..0aa2a0b9f4a 100644 --- a/src/Interpreters/TemporaryDataOnDisk.cpp +++ b/src/Interpreters/TemporaryDataOnDisk.cpp @@ -235,9 +235,9 @@ TemporaryFileStream::TemporaryFileStream(TemporaryFileOnDiskHolder file_, const : parent(parent_) , header(header_) , file(std::move(file_)) - , out_writer(std::make_unique(std::make_unique(file->getPath()), header)) + , out_writer(std::make_unique(std::make_unique(file->getAbsolutePath()), header)) { - LOG_TEST(&Poco::Logger::get("TemporaryFileStream"), "Writing to temporary file {}", file->getPath()); + LOG_TEST(&Poco::Logger::get("TemporaryFileStream"), "Writing to temporary file {}", file->getAbsolutePath()); } TemporaryFileStream::TemporaryFileStream(FileSegmentsHolderPtr segments_, const Block & header_, TemporaryDataOnDisk * parent_) @@ -365,7 +365,7 @@ void TemporaryFileStream::release() String TemporaryFileStream::getPath() const { if (file) - return file->getPath(); + return file->getAbsolutePath(); if (segment_holder && !segment_holder->empty()) return segment_holder->front().getPathInLocalCache(); diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index e1921f45eda..a0492f5f38e 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -350,7 +350,7 @@ void DataPartStorageOnDiskBase::backup( temp_dir_it = temp_dirs->emplace(disk, std::make_shared(disk, "tmp/")).first; temp_dir_owner = temp_dir_it->second; - fs::path temp_dir = temp_dir_owner->getPath(); + fs::path temp_dir = temp_dir_owner->getRelativePath(); temp_part_dir = temp_dir / part_path_in_backup.relative_path(); disk->createDirectories(temp_part_dir); } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 6179c70ca57..558f5b7515e 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5266,7 +5266,7 @@ public: auto it = temp_dirs.find(disk); if (it == temp_dirs.end()) it = temp_dirs.emplace(disk, std::make_shared(disk, "tmp/")).first; - return it->second->getPath(); + return it->second->getRelativePath(); } private: diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index d8065b8bb3c..d02a51fab22 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -946,7 +946,7 @@ void StorageLog::backupData(BackupEntriesCollector & backup_entries_collector, c fs::path data_path_in_backup_fs = data_path_in_backup; auto temp_dir_owner = std::make_shared(disk, "tmp/"); - fs::path temp_dir = temp_dir_owner->getPath(); + fs::path temp_dir = temp_dir_owner->getRelativePath(); disk->createDirectories(temp_dir); bool copy_encrypted = !backup_entries_collector.getBackupSettings().decrypt_files_from_encrypted_disks; diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 6ed0583bd44..c990d488969 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -314,7 +314,7 @@ namespace backup_entries.resize(file_paths.size()); temp_dir_owner.emplace(temp_disk); - fs::path temp_dir = temp_dir_owner->getPath(); + fs::path temp_dir = temp_dir_owner->getRelativePath(); temp_disk->createDirectories(temp_dir); /// Writing data.bin @@ -453,10 +453,10 @@ void StorageMemory::restoreDataImpl(const BackupPtr & backup, const String & dat if (!dynamic_cast(in.get())) { temp_data_file.emplace(temporary_disk); - auto out = std::make_unique(temp_data_file->getPath()); + auto out = std::make_unique(temp_data_file->getAbsolutePath()); copyData(*in, *out); out.reset(); - in = createReadBufferFromFileBase(temp_data_file->getPath(), {}); + in = createReadBufferFromFileBase(temp_data_file->getAbsolutePath(), {}); } std::unique_ptr in_from_file{static_cast(in.release())}; CompressedReadBufferFromFile compressed_in{std::move(in_from_file)}; diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index d8bbd523cbf..2f3b5f25ee4 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -544,7 +544,7 @@ void StorageStripeLog::backupData(BackupEntriesCollector & backup_entries_collec fs::path data_path_in_backup_fs = data_path_in_backup; auto temp_dir_owner = std::make_shared(disk, "tmp/"); - fs::path temp_dir = temp_dir_owner->getPath(); + fs::path temp_dir = temp_dir_owner->getRelativePath(); disk->createDirectories(temp_dir); bool copy_encrypted = !backup_entries_collector.getBackupSettings().decrypt_files_from_encrypted_disks; diff --git a/tests/queries/0_stateless/02801_backup_native_copy.sh b/tests/queries/0_stateless/02801_backup_native_copy.sh index 31a7cc3b410..b8ee97a7c7d 100755 --- a/tests/queries/0_stateless/02801_backup_native_copy.sh +++ b/tests/queries/0_stateless/02801_backup_native_copy.sh @@ -29,14 +29,14 @@ $CLICKHOUSE_CLIENT -nm -q " " query_id=$(random_str 10) -$CLICKHOUSE_CLIENT --send_logs_level=error --format Null --query_id $query_id -q "RESTORE TABLE data AS data_native_copy FROM S3(s3_conn, 'backups/$CLICKHOUSE_DATABASE/data_native_copy') SETTINGS allow_s3_native_copy=true" +$CLICKHOUSE_CLIENT --format Null --query_id $query_id -q "RESTORE TABLE data AS data_native_copy FROM S3(s3_conn, 'backups/$CLICKHOUSE_DATABASE/data_native_copy') SETTINGS allow_s3_native_copy=true" $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; SELECT query, ProfileEvents['S3CopyObject']>0 FROM system.query_log WHERE type = 'QueryFinish' AND event_date >= yesterday() AND current_database = '$CLICKHOUSE_DATABASE' AND query_id = '$query_id' " query_id=$(random_str 10) -$CLICKHOUSE_CLIENT --send_logs_level=error --format Null --query_id $query_id -q "RESTORE TABLE data AS data_no_native_copy FROM S3(s3_conn, 'backups/$CLICKHOUSE_DATABASE/data_no_native_copy') SETTINGS allow_s3_native_copy=false" +$CLICKHOUSE_CLIENT --format Null --query_id $query_id -q "RESTORE TABLE data AS data_no_native_copy FROM S3(s3_conn, 'backups/$CLICKHOUSE_DATABASE/data_no_native_copy') SETTINGS allow_s3_native_copy=false" $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; SELECT query, ProfileEvents['S3CopyObject']>0 FROM system.query_log WHERE type = 'QueryFinish' AND event_date >= yesterday() AND current_database = '$CLICKHOUSE_DATABASE' AND query_id = '$query_id' diff --git a/tests/queries/0_stateless/02803_backup_tmp_files.reference b/tests/queries/0_stateless/02803_backup_tmp_files.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02803_backup_tmp_files.sh b/tests/queries/0_stateless/02803_backup_tmp_files.sh new file mode 100755 index 00000000000..d86beae4923 --- /dev/null +++ b/tests/queries/0_stateless/02803_backup_tmp_files.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +# Tag: no-fasttest - requires S3 + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +set -e + +$CLICKHOUSE_CLIENT -nm -q " + drop table if exists data; + create table data (key Int) engine=MergeTree() order by tuple() settings disk='s3_disk'; + insert into data select * from numbers(10); +" + +$CLICKHOUSE_CLIENT --format Null -q "BACKUP TABLE data TO S3(s3_conn, 'backups/$CLICKHOUSE_DATABASE/data')" +$CLICKHOUSE_CLIENT --format Null -q "RESTORE TABLE data AS data_native_copy FROM S3(s3_conn, 'backups/$CLICKHOUSE_DATABASE/data')" From 67b0993bdf8a5de1dd3a68db5c5082ba0a3ff759 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 27 Jul 2023 18:54:41 +0000 Subject: [PATCH 38/81] Add documentation --- docs/en/interfaces/formats.md | 55 ++++++ .../operations/settings/settings-formats.md | 14 ++ .../functions/other-functions.md | 184 ++++++++++++++++++ 3 files changed, 253 insertions(+) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index ed2f010a632..f80dac1d110 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1704,6 +1704,34 @@ You can select data from a ClickHouse table and save them into some file in the ``` bash $ clickhouse-client --query = "SELECT * FROM test.hits FORMAT CapnProto SETTINGS format_schema = 'schema:Message'" ``` + +### Using autogenerated schema {#using-autogenerated-capn-proto-schema} + +If you don't have an external CapnProto schema for your data, you can still output/input data in CapnProto format by using autogenerated schema. +For example: + +```sql +SELECT * FROM test.hits format CapnProto SETTINGS format_capn_proto_use_autogenerated_schema=1 +``` + +In this case ClickHouse will autogenerate CapnProto schema according to the table structure using function [structureToCapnProtoSchema](../sql-reference/functions/other-functions.md#structure_to_capn_proto_schema) and will use this schema to serialize data in CapnProto format. + +You can also read CapnProto file with autogenerated schema (in this case the file must be created using the same schema): + +```bash +$ cat hits.bin | clickhouse-client --query "INSERT INTO test.hits SETTINGS format_capn_proto_use_autogenerated_schema=1 FORMAT CapnProto" +``` + +The setting [format_capn_proto_use_autogenerated_schema](../operations/settings/settings-formats.md#format_capn_proto_use_autogenerated_schema) is enabled by default and applies if [format_schema](../operations/settings/settings-formats.md#formatschema-format-schema) is not set. + +You can also save autogenerated schema in the file during input/output using setting [output_format_schema](../operations/settings/settings-formats.md#outputformatschema-output-format-schema). For example: + +```sql +SELECT * FROM test.hits format CapnProto SETTINGS format_capn_proto_use_autogenerated_schema=1, output_format_schema='path/to/schema/schema.capnp' +``` + +In this case autogenerated CapnProto schema will be saved in file `path/to/schema/schema.capnp`. + ## Prometheus {#prometheus} Expose metrics in [Prometheus text-based exposition format](https://prometheus.io/docs/instrumenting/exposition_formats/#text-based-format). @@ -1842,6 +1870,33 @@ ClickHouse inputs and outputs protobuf messages in the `length-delimited` format It means before every message should be written its length as a [varint](https://developers.google.com/protocol-buffers/docs/encoding#varints). See also [how to read/write length-delimited protobuf messages in popular languages](https://cwiki.apache.org/confluence/display/GEODE/Delimiting+Protobuf+Messages). +### Using autogenerated schema {#using-autogenerated-protobuf-schema} + +If you don't have an external Protobuf schema for your data, you can still output/input data in Protobuf format by using autogenerated schema. +For example: + +```sql +SELECT * FROM test.hits format Protobuf SETTINGS format_protobuf_use_autogenerated_schema=1 +``` + +In this case ClickHouse will autogenerate Protobuf schema according to the table structure using function [structureToProtobufSchema](../sql-reference/functions/other-functions.md#structure_to_protobuf_schema) and will use this schema to serialize data in Protobuf format. + +You can also read Protobuf file with autogenerated schema (in this case the file must be created using the same schema): + +```bash +$ cat hits.bin | clickhouse-client --query "INSERT INTO test.hits SETTINGS format_protobuf_use_autogenerated_schema=1 FORMAT Protobuf" +``` + +The setting [format_protobuf_use_autogenerated_schema](../operations/settings/settings-formats.md#format_protobuf_use_autogenerated_schema) is enabled by default and applies if [format_schema](../operations/settings/settings-formats.md#formatschema-format-schema) is not set. + +You can also save autogenerated schema in the file during input/output using setting [output_format_schema](../operations/settings/settings-formats.md#outputformatschema-output-format-schema). For example: + +```sql +SELECT * FROM test.hits format Protobuf SETTINGS format_protobuf_use_autogenerated_schema=1, output_format_schema='path/to/schema/schema.proto' +``` + +In this case autogenerated Protobuf schema will be saved in file `path/to/schema/schema.capnp`. + ## ProtobufSingle {#protobufsingle} Same as [Protobuf](#protobuf) but for storing/parsing single Protobuf message without length delimiters. diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 0915c51806a..b54669cf8a8 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -321,6 +321,10 @@ If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` ar This parameter is useful when you are using formats that require a schema definition, such as [Cap’n Proto](https://capnproto.org/) or [Protobuf](https://developers.google.com/protocol-buffers/). The value depends on the format. +## output_format_schema {#output-format-schema} + +The path to the file where the automatically generated schema will be saved in [Cap’n Proto](../../interfaces/formats.md#capnproto-capnproto) or [Protobuf](../../interfaces/formats.md#protobuf-protobuf) formats. + ## output_format_enable_streaming {#output_format_enable_streaming} Enable streaming in output formats that support it. @@ -1308,6 +1312,11 @@ When serializing Nullable columns with Google wrappers, serialize default values Disabled by default. +### format_protobuf_use_autogenerated_schema {#format_capn_proto_use_autogenerated_schema} + +Use autogenerated Protobuf schema when [format_schema](#formatschema-format-schema) is not set. +The schema is generated from ClickHouse table structure using function [structureToProtobufSchema](../../sql-reference/functions/other-functions.md#structure_to_protobuf_schema) + ## Avro format settings {#avro-format-settings} ### input_format_avro_allow_missing_fields {#input_format_avro_allow_missing_fields} @@ -1593,6 +1602,11 @@ Possible values: Default value: `'by_values'`. +### format_capn_proto_use_autogenerated_schema {#format_capn_proto_use_autogenerated_schema} + +Use autogenerated CapnProto schema when [format_schema](#formatschema-format-schema) is not set. +The schema is generated from ClickHouse table structure using function [structureToCapnProtoSchema](../../sql-reference/functions/other-functions.md#structure_to_capnproto_schema) + ## MySQLDump format settings {#musqldump-format-settings} ### input_format_mysql_dump_table_name (#input_format_mysql_dump_table_name) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 26dcccfd42e..bfbd26551d3 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2552,3 +2552,187 @@ Result: This function can be used together with [generateRandom](../../sql-reference/table-functions/generate.md) to generate completely random tables. +## structureToCapnProtoSchema {#structure_to_capn_proto_schema} + +Converts ClickHouse table structure to CapnProto schema. + +**Syntax** + +``` sql +structureToCapnProtoSchema(structure) +``` + +**Arguments** + +- `structure` — Table structure in a format `column1_name column1_type, column2_name column2_type, ...`. +- `root_struct_name` — Name for root struct in CapnProto schema. Default value - `Message`; + +**Returned value** + +- CapnProto schema + +Type: [String](../../sql-reference/data-types/string.md). + +**Examples** + +Query: + +``` sql +SELECT structureToCapnProtoSchema('column1 String, column2 UInt32, column3 Array(String)') FORMAT RawBLOB +``` + +Result: + +``` text +@0xf96402dd754d0eb7; + +struct Message +{ + column1 @0 : Data; + column2 @1 : UInt32; + column3 @2 : List(Data); +} +``` + +Query: + +``` sql +SELECT structureToCapnProtoSchema('column1 Nullable(String), column2 Tuple(element1 UInt32, element2 Array(String)), column3 Map(String, String)') FORMAT RawBLOB +``` + +Result: + +``` text +@0xd1c8320fecad2b7f; + +struct Message +{ + struct Column1 + { + union + { + value @0 : Data; + null @1 : Void; + } + } + column1 @0 : Column1; + struct Column2 + { + element1 @0 : UInt32; + element2 @1 : List(Data); + } + column2 @1 : Column2; + struct Column3 + { + struct Entry + { + key @0 : Data; + value @1 : Data; + } + entries @0 : List(Entry); + } + column3 @2 : Column3; +} +``` + +Query: + +``` sql +SELECT structureToCapnProtoSchema('column1 String, column2 UInt32', 'Root') FORMAT RawBLOB +``` + +Result: + +``` text +@0x96ab2d4ab133c6e1; + +struct Root +{ + column1 @0 : Data; + column2 @1 : UInt32; +} +``` + +## structureToProtobufSchema {#structure_to_protobuf_schema} + +Converts ClickHouse table structure to Protobuf schema. + +**Syntax** + +``` sql +structureToProtobufSchema(structure) +``` + +**Arguments** + +- `structure` — Table structure in a format `column1_name column1_type, column2_name column2_type, ...`. +- `root_message_name` — Name for root message in Protobuf schema. Default value - `Message`; + +**Returned value** + +- Protobuf schema + +Type: [String](../../sql-reference/data-types/string.md). + +**Examples** + +Query: + +``` sql +SELECT structureToProtobufSchema('column1 String, column2 UInt32, column3 Array(String)') FORMAT RawBLOB +``` + +Result: + +``` text +syntax = "proto3"; + +message Message +{ + bytes column1 = 1; + uint32 column2 = 2; + repeated bytes column3 = 3; +} +``` + +Query: + +``` sql +SELECT structureToProtobufSchema('column1 Nullable(String), column2 Tuple(element1 UInt32, element2 Array(String)), column3 Map(String, String)') FORMAT RawBLOB +``` + +Result: + +``` text +syntax = "proto3"; + +message Message +{ + bytes column1 = 1; + message Column2 + { + uint32 element1 = 1; + repeated bytes element2 = 2; + } + Column2 column2 = 2; + map column3 = 3; +} +``` + +Query: + +``` sql +SELECT structureToProtobufSchema('column1 String, column2 UInt32', 'Root') FORMAT RawBLOB +``` + +Result: + +``` text +syntax = "proto3"; + +message Root +{ + bytes column1 = 1; + uint32 column2 = 2; +} +``` From b9c9933cc9eec7024e2af9085fff0e365ae6bad8 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 27 Jul 2023 18:56:23 +0000 Subject: [PATCH 39/81] Fix typo --- docs/en/interfaces/formats.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index f80dac1d110..a618de9e12b 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1707,7 +1707,7 @@ $ clickhouse-client --query = "SELECT * FROM test.hits FORMAT CapnProto SETTINGS ### Using autogenerated schema {#using-autogenerated-capn-proto-schema} -If you don't have an external CapnProto schema for your data, you can still output/input data in CapnProto format by using autogenerated schema. +If you don't have an external CapnProto schema for your data, you can still output/input data in CapnProto format using autogenerated schema. For example: ```sql @@ -1872,7 +1872,7 @@ See also [how to read/write length-delimited protobuf messages in popular langua ### Using autogenerated schema {#using-autogenerated-protobuf-schema} -If you don't have an external Protobuf schema for your data, you can still output/input data in Protobuf format by using autogenerated schema. +If you don't have an external Protobuf schema for your data, you can still output/input data in Protobuf format using autogenerated schema. For example: ```sql From 32d23b09049429512b7a20d32e7bda6a80685fcb Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 27 Jul 2023 21:28:28 +0200 Subject: [PATCH 40/81] Update aspell-dict.txt --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 6c88d63be49..3737f2264bf 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1081,6 +1081,7 @@ autocompletion autodetect autodetected autogenerated +autogenerate autogeneration autostart avgWeighted @@ -2217,6 +2218,8 @@ strtoll strtoull struct structs +structureToCapnProtoSchema +structureToProtobufSchema studentTTest studentttest subBitmap From 407fb58d4bd82d73886f77545d9053ea904b0e65 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 27 Jul 2023 20:53:08 +0000 Subject: [PATCH 41/81] Fix style check --- src/Processors/Transforms/DistinctSortedChunkTransform.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp index 720543991c5..91806a90083 100644 --- a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp @@ -5,6 +5,7 @@ namespace DB namespace ErrorCodes { + extern const int LOGICAL_ERROR; extern const int SET_SIZE_LIMIT_EXCEEDED; } From 5bd670d8c6ef266ce59625250a0861ed55ba4c47 Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Thu, 27 Jul 2023 15:54:33 -0700 Subject: [PATCH 42/81] Improve endianness-independent support for hash functions --- contrib/libmetrohash/src/platform.h | 18 +- src/Functions/FunctionsHashing.h | 156 ++++++------------ .../0_stateless/00678_murmurhash.reference | 4 +- .../queries/0_stateless/00678_murmurhash.sql | 6 +- .../00746_hashing_tuples.reference | 14 +- .../0_stateless/00746_hashing_tuples.sql | 14 +- .../0_stateless/02534_keyed_siphash.reference | 4 +- .../0_stateless/02534_keyed_siphash.sql | 4 +- .../02552_siphash128_reference.reference | 2 +- .../02552_siphash128_reference.sql | 2 +- 10 files changed, 91 insertions(+), 133 deletions(-) diff --git a/contrib/libmetrohash/src/platform.h b/contrib/libmetrohash/src/platform.h index bc00e5a286b..9e83d11cb7c 100644 --- a/contrib/libmetrohash/src/platform.h +++ b/contrib/libmetrohash/src/platform.h @@ -17,7 +17,8 @@ #ifndef METROHASH_PLATFORM_H #define METROHASH_PLATFORM_H -#include +#include +#include #include // rotate right idiom recognized by most compilers @@ -33,6 +34,11 @@ inline static uint64_t read_u64(const void * const ptr) // so we use memcpy() which is the most portable. clang & gcc usually translates `memcpy()` into a single `load` instruction // when hardware supports it, so using memcpy() is efficient too. memcpy(&result, ptr, sizeof(result)); + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + result = std::byteswap(result); +#endif + return result; } @@ -40,6 +46,11 @@ inline static uint64_t read_u32(const void * const ptr) { uint32_t result; memcpy(&result, ptr, sizeof(result)); + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + result = std::byteswap(result); +#endif + return result; } @@ -47,6 +58,11 @@ inline static uint64_t read_u16(const void * const ptr) { uint16_t result; memcpy(&result, ptr, sizeof(result)); + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + result = std::byteswap(result); +#endif + return result; } diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 279294b367c..929cb2f77f4 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -153,15 +153,10 @@ struct IntHash64Impl template T combineHashesFunc(T t1, T t2) { -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - T tmp; - reverseMemcpy(&tmp, &t1, sizeof(T)); - t1 = tmp; - reverseMemcpy(&tmp, &t2, sizeof(T)); - t2 = tmp; -#endif - T hashes[] = {t1, t2}; - return HashFunction::apply(reinterpret_cast(hashes), 2 * sizeof(T)); + transformEndianness(t1); + transformEndianness(t2); + const T hashes[] {t1, t2}; + return HashFunction::apply(reinterpret_cast(hashes), sizeof(hashes)); } @@ -184,21 +179,14 @@ struct HalfMD5Impl MD5_Update(&ctx, reinterpret_cast(begin), size); MD5_Final(buf.char_data, &ctx); -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return buf.uint64_data; /// No need to flip bytes on big endian machines -#else - return std::byteswap(buf.uint64_data); /// Compatibility with existing code. Cast need for old poco AND macos where UInt64 != uint64_t -#endif + /// Compatibility with existing code. Cast need for old poco AND macos where UInt64 != uint64_t + transformEndianness(buf.uint64_data); + return buf.uint64_data; } static UInt64 combineHashes(UInt64 h1, UInt64 h2) { -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - h1 = std::byteswap(h1); - h2 = std::byteswap(h2); -#endif - UInt64 hashes[] = {h1, h2}; - return apply(reinterpret_cast(hashes), 16); + return combineHashesFunc(h1, h2); } /// If true, it will use intHash32 or intHash64 to hash POD types. This behaviour is intended for better performance of some functions. @@ -311,15 +299,8 @@ struct SipHash64Impl static constexpr auto name = "sipHash64"; using ReturnType = UInt64; - static UInt64 apply(const char * begin, size_t size) - { - return sipHash64(begin, size); - } - - static UInt64 combineHashes(UInt64 h1, UInt64 h2) - { - return combineHashesFunc(h1, h2); - } + static UInt64 apply(const char * begin, size_t size) { return sipHash64(begin, size); } + static UInt64 combineHashes(UInt64 h1, UInt64 h2) { return combineHashesFunc(h1, h2); } static constexpr bool use_int_hash_for_pods = false; }; @@ -336,12 +317,10 @@ struct SipHash64KeyedImpl static UInt64 combineHashesKeyed(const Key & key, UInt64 h1, UInt64 h2) { -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - h1 = std::byteswap(h1); - h2 = std::byteswap(h2); -#endif - UInt64 hashes[] = {h1, h2}; - return applyKeyed(key, reinterpret_cast(hashes), 2 * sizeof(UInt64)); + transformEndianness(h1); + transformEndianness(h2); + const UInt64 hashes[]{h1, h2}; + return applyKeyed(key, reinterpret_cast(hashes), sizeof(hashes)); } static constexpr bool use_int_hash_for_pods = false; @@ -353,15 +332,8 @@ struct SipHash128Impl using ReturnType = UInt128; - static UInt128 combineHashes(UInt128 h1, UInt128 h2) - { - return combineHashesFunc(h1, h2); - } - - static UInt128 apply(const char * data, const size_t size) - { - return sipHash128(data, size); - } + static UInt128 combineHashes(UInt128 h1, UInt128 h2) { return combineHashesFunc(h1, h2); } + static UInt128 apply(const char * data, const size_t size) { return sipHash128(data, size); } static constexpr bool use_int_hash_for_pods = false; }; @@ -378,15 +350,10 @@ struct SipHash128KeyedImpl static UInt128 combineHashesKeyed(const Key & key, UInt128 h1, UInt128 h2) { -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - UInt128 tmp; - reverseMemcpy(&tmp, &h1, sizeof(UInt128)); - h1 = tmp; - reverseMemcpy(&tmp, &h2, sizeof(UInt128)); - h2 = tmp; -#endif - UInt128 hashes[] = {h1, h2}; - return applyKeyed(key, reinterpret_cast(hashes), 2 * sizeof(UInt128)); + transformEndianness(h1); + transformEndianness(h2); + const UInt128 hashes[]{h1, h2}; + return applyKeyed(key, reinterpret_cast(hashes), sizeof(hashes)); } static constexpr bool use_int_hash_for_pods = false; @@ -531,10 +498,7 @@ struct MurmurHash3Impl64 return h[0] ^ h[1]; } - static UInt64 combineHashes(UInt64 h1, UInt64 h2) - { - return IntHash64Impl::apply(h1) ^ h2; - } + static UInt64 combineHashes(UInt64 h1, UInt64 h2) { return IntHash64Impl::apply(h1) ^ h2; } static constexpr bool use_int_hash_for_pods = false; }; @@ -552,10 +516,7 @@ struct MurmurHash3Impl128 return *reinterpret_cast(bytes); } - static UInt128 combineHashes(UInt128 h1, UInt128 h2) - { - return combineHashesFunc(h1, h2); - } + static UInt128 combineHashes(UInt128 h1, UInt128 h2) { return combineHashesFunc(h1, h2); } static constexpr bool use_int_hash_for_pods = false; }; @@ -1040,11 +1001,10 @@ private: if (const ColVecType * col_from = checkAndGetColumn(column)) { const typename ColVecType::Container & vec_from = col_from->getData(); - size_t size = vec_from.size(); + const size_t size = vec_from.size(); for (size_t i = 0; i < size; ++i) { ToType hash; - if constexpr (Impl::use_int_hash_for_pods) { if constexpr (std::is_same_v) @@ -1058,13 +1018,8 @@ private: hash = JavaHashImpl::apply(vec_from[i]); else { - FromType value = vec_from[i]; - if constexpr (std::endian::native == std::endian::big) - { - FromType value_reversed; - reverseMemcpy(&value_reversed, &value, sizeof(value)); - value = value_reversed; - } + auto value = vec_from[i]; + transformEndianness(value); hash = apply(key, reinterpret_cast(&value), sizeof(value)); } } @@ -1078,8 +1033,8 @@ private: else if (auto col_from_const = checkAndGetColumnConst(column)) { auto value = col_from_const->template getValue(); - ToType hash; + ToType hash; if constexpr (Impl::use_int_hash_for_pods) { if constexpr (std::is_same_v) @@ -1093,17 +1048,12 @@ private: hash = JavaHashImpl::apply(value); else { - if constexpr (std::endian::native == std::endian::big) - { - FromType value_reversed; - reverseMemcpy(&value_reversed, &value, sizeof(value)); - value = value_reversed; - } + transformEndianness(value); hash = apply(key, reinterpret_cast(&value), sizeof(value)); } } - size_t size = vec_to.size(); + const size_t size = vec_to.size(); if constexpr (first) vec_to.assign(size, hash); else @@ -1120,6 +1070,16 @@ private: { using ColVecType = ColumnVectorOrDecimal; + static const auto to_little_endian = [](auto & value) + { + // IPv6 addresses are parsed into four 32-bit components in big-endian ordering on both platforms, so no change is necessary. + // Reference: `parseIPv6orIPv4` in src/Common/formatIPv6.h. + if constexpr (std::endian::native == std::endian::big && std::is_same_v) + return; + + transformEndianness(value); + }; + if (const ColVecType * col_from = checkAndGetColumn(column)) { const typename ColVecType::Container & vec_from = col_from->getData(); @@ -1131,9 +1091,10 @@ private: hash = apply(key, reinterpret_cast(&vec_from[i]), sizeof(vec_from[i])); else { - char tmp_buffer[sizeof(vec_from[i])]; - reverseMemcpy(tmp_buffer, &vec_from[i], sizeof(vec_from[i])); - hash = apply(key, reinterpret_cast(tmp_buffer), sizeof(vec_from[i])); + auto value = vec_from[i]; + to_little_endian(value); + + hash = apply(key, reinterpret_cast(&value), sizeof(value)); } if constexpr (first) vec_to[i] = hash; @@ -1144,17 +1105,10 @@ private: else if (auto col_from_const = checkAndGetColumnConst(column)) { auto value = col_from_const->template getValue(); + to_little_endian(value); - ToType hash; - if constexpr (std::endian::native == std::endian::little) - hash = apply(key, reinterpret_cast(&value), sizeof(value)); - else - { - char tmp_buffer[sizeof(value)]; - reverseMemcpy(tmp_buffer, &value, sizeof(value)); - hash = apply(key, reinterpret_cast(tmp_buffer), sizeof(value)); - } - size_t size = vec_to.size(); + const auto hash = apply(key, reinterpret_cast(&value), sizeof(value)); + const size_t size = vec_to.size(); if constexpr (first) vec_to.assign(size, hash); else @@ -1423,6 +1377,9 @@ public: if constexpr (std::is_same_v) /// backward-compatible { + if (std::endian::native == std::endian::big) + std::ranges::for_each(col_to->getData(), transformEndianness); + auto col_to_fixed_string = ColumnFixedString::create(sizeof(UInt128)); col_to_fixed_string->getChars() = std::move(*reinterpret_cast(&col_to->getData())); return col_to_fixed_string; @@ -1673,21 +1630,8 @@ struct ImplWyHash64 static constexpr auto name = "wyHash64"; using ReturnType = UInt64; - static UInt64 apply(const char * s, const size_t len) - { - return wyhash(s, len, 0, _wyp); - } - static UInt64 combineHashes(UInt64 h1, UInt64 h2) - { - union - { - UInt64 u64[2]; - char chars[16]; - }; - u64[0] = h1; - u64[1] = h2; - return apply(chars, 16); - } + static UInt64 apply(const char * s, const size_t len) { return wyhash(s, len, 0, _wyp); } + static UInt64 combineHashes(UInt64 h1, UInt64 h2) { return combineHashesFunc(h1, h2); } static constexpr bool use_int_hash_for_pods = false; }; diff --git a/tests/queries/0_stateless/00678_murmurhash.reference b/tests/queries/0_stateless/00678_murmurhash.reference index fb4a00ba046..988c022f1bf 100644 --- a/tests/queries/0_stateless/00678_murmurhash.reference +++ b/tests/queries/0_stateless/00678_murmurhash.reference @@ -25,5 +25,5 @@ 9631199822919835226 4334672815104069193 4334672815104069193 -1 -1 +6145F501578671E2877DBA2BE487AF7E +16FE7483905CCE7A85670E43E4678877 diff --git a/tests/queries/0_stateless/00678_murmurhash.sql b/tests/queries/0_stateless/00678_murmurhash.sql index eda29fd17cd..705c62480a0 100644 --- a/tests/queries/0_stateless/00678_murmurhash.sql +++ b/tests/queries/0_stateless/00678_murmurhash.sql @@ -32,7 +32,5 @@ SELECT gccMurmurHash('foo'); SELECT gccMurmurHash('\x01'); SELECT gccMurmurHash(1); --- Comparison with reverse for big endian -SELECT hex(murmurHash3_128('foo')) = hex(reverse(unhex('6145F501578671E2877DBA2BE487AF7E'))) or hex(murmurHash3_128('foo')) = '6145F501578671E2877DBA2BE487AF7E'; --- Comparison with reverse for big endian -SELECT hex(murmurHash3_128('\x01')) = hex(reverse(unhex('16FE7483905CCE7A85670E43E4678877'))) or hex(murmurHash3_128('\x01')) = '16FE7483905CCE7A85670E43E4678877'; +SELECT hex(murmurHash3_128('foo')); +SELECT hex(murmurHash3_128('\x01')); diff --git a/tests/queries/0_stateless/00746_hashing_tuples.reference b/tests/queries/0_stateless/00746_hashing_tuples.reference index e3b896f6077..71d45be5a54 100644 --- a/tests/queries/0_stateless/00746_hashing_tuples.reference +++ b/tests/queries/0_stateless/00746_hashing_tuples.reference @@ -1,11 +1,11 @@ 12940785793559895259 17926972817233444501 7456555839952096623 -1 -1 -1 -1 -1 +CC45107CC4B79F62D831BEF2103C7CBF +DF2EC2F0669B000EDFF6ADEE264E7D68 +4CD1C30C38AB935D418B5269EF197B9E +9D78134EE48654D753CCA1B76185CF8E +389D16428D2AADEC9713905572F42864 955237314186186656 8175794665478042155 9325786087413524176 @@ -18,8 +18,8 @@ 8163029322371165472 8788309436660676487 236561483980029756 -1 -1 +8DD5527CC43D76F4760D26BE0F641F7E +F8F7AD9B6CD4CF117A71E277E2EC2931 12384823029245979431 4507350192761038840 1188926775431157506 diff --git a/tests/queries/0_stateless/00746_hashing_tuples.sql b/tests/queries/0_stateless/00746_hashing_tuples.sql index f17ad6fa77f..466a2184c65 100644 --- a/tests/queries/0_stateless/00746_hashing_tuples.sql +++ b/tests/queries/0_stateless/00746_hashing_tuples.sql @@ -4,11 +4,11 @@ SELECT sipHash64(1, 2, 3); SELECT sipHash64(1, 3, 2); SELECT sipHash64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2)))); -SELECT hex(sipHash128('foo')) = hex(reverse(unhex('CC45107CC4B79F62D831BEF2103C7CBF'))) or hex(sipHash128('foo')) = 'CC45107CC4B79F62D831BEF2103C7CBF'; -SELECT hex(sipHash128('\x01')) = hex(reverse(unhex('DF2EC2F0669B000EDFF6ADEE264E7D68'))) or hex(sipHash128('\x01')) = 'DF2EC2F0669B000EDFF6ADEE264E7D68'; -SELECT hex(sipHash128('foo', 'foo')) = hex(reverse(unhex('4CD1C30C38AB935D418B5269EF197B9E'))) or hex(sipHash128('foo', 'foo')) = '4CD1C30C38AB935D418B5269EF197B9E'; -SELECT hex(sipHash128('foo', 'foo', 'foo')) = hex(reverse(unhex('9D78134EE48654D753CCA1B76185CF8E'))) or hex(sipHash128('foo', 'foo', 'foo')) = '9D78134EE48654D753CCA1B76185CF8E'; -SELECT hex(sipHash128(1, 2, 3)) = hex(reverse(unhex('389D16428D2AADEC9713905572F42864'))) or hex(sipHash128(1, 2, 3)) = '389D16428D2AADEC9713905572F42864'; +SELECT hex(sipHash128('foo')); +SELECT hex(sipHash128('\x01')); +SELECT hex(sipHash128('foo', 'foo')); +SELECT hex(sipHash128('foo', 'foo', 'foo')); +SELECT hex(sipHash128(1, 2, 3)); SELECT halfMD5(1, 2, 3); SELECT halfMD5(1, 3, 2); @@ -26,8 +26,8 @@ SELECT murmurHash3_64(1, 2, 3); SELECT murmurHash3_64(1, 3, 2); SELECT murmurHash3_64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2)))); -SELECT hex(murmurHash3_128('foo', 'foo')) = hex(reverse(unhex('8DD5527CC43D76F4760D26BE0F641F7E'))) or hex(murmurHash3_128('foo', 'foo')) = '8DD5527CC43D76F4760D26BE0F641F7E'; -SELECT hex(murmurHash3_128('foo', 'foo', 'foo')) = hex(reverse(unhex('F8F7AD9B6CD4CF117A71E277E2EC2931'))) or hex(murmurHash3_128('foo', 'foo', 'foo')) = 'F8F7AD9B6CD4CF117A71E277E2EC2931'; +SELECT hex(murmurHash3_128('foo', 'foo')); +SELECT hex(murmurHash3_128('foo', 'foo', 'foo')); SELECT gccMurmurHash(1, 2, 3); SELECT gccMurmurHash(1, 3, 2); diff --git a/tests/queries/0_stateless/02534_keyed_siphash.reference b/tests/queries/0_stateless/02534_keyed_siphash.reference index ccc514e7ea2..33838d5298c 100644 --- a/tests/queries/0_stateless/02534_keyed_siphash.reference +++ b/tests/queries/0_stateless/02534_keyed_siphash.reference @@ -191,9 +191,9 @@ E51B38608EF25F57 1 1 E28DBDE7FE22E41C -1 +1CE422FEE7BD8DE20000000000000000 E28DBDE7FE22E41C -1 +1CE422FEE7BD8DE20000000000000000 Check bug with hashing of const integer values 11862823756610506724 11862823756610506724 diff --git a/tests/queries/0_stateless/02534_keyed_siphash.sql b/tests/queries/0_stateless/02534_keyed_siphash.sql index 900b99f548a..80180cc568f 100644 --- a/tests/queries/0_stateless/02534_keyed_siphash.sql +++ b/tests/queries/0_stateless/02534_keyed_siphash.sql @@ -269,9 +269,9 @@ select sipHash64Keyed(toUInt64(0), '1'); -- { serverError 48 } select sipHash128Keyed(toUInt64(0), '1'); -- { serverError 48 } select hex(sipHash64()); -SELECT hex(sipHash128()) = hex(reverse(unhex('1CE422FEE7BD8DE20000000000000000'))) or hex(sipHash128()) = '1CE422FEE7BD8DE20000000000000000'; +SELECT hex(sipHash128()); select hex(sipHash64Keyed()); -SELECT hex(sipHash128Keyed()) = hex(reverse(unhex('1CE422FEE7BD8DE20000000000000000'))) or hex(sipHash128Keyed()) = '1CE422FEE7BD8DE20000000000000000'; +SELECT hex(sipHash128Keyed()); SELECT 'Check bug with hashing of const integer values'; DROP TABLE IF EXISTS tab; diff --git a/tests/queries/0_stateless/02552_siphash128_reference.reference b/tests/queries/0_stateless/02552_siphash128_reference.reference index d00491fd7e5..6aa6c7ae075 100644 --- a/tests/queries/0_stateless/02552_siphash128_reference.reference +++ b/tests/queries/0_stateless/02552_siphash128_reference.reference @@ -1 +1 @@ -1 +1CE422FEE7BD8DE20000000000000000 diff --git a/tests/queries/0_stateless/02552_siphash128_reference.sql b/tests/queries/0_stateless/02552_siphash128_reference.sql index 200954c3b57..f3cfa6093dc 100644 --- a/tests/queries/0_stateless/02552_siphash128_reference.sql +++ b/tests/queries/0_stateless/02552_siphash128_reference.sql @@ -1 +1 @@ -SELECT hex(sipHash128Reference()) = hex(reverse(unhex('1CE422FEE7BD8DE20000000000000000'))) or hex(sipHash128()) = '1CE422FEE7BD8DE20000000000000000'; +SELECT hex(sipHash128Reference()); From 0e15f098d19577845197cd025a53390c26f001e6 Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 28 Jul 2023 02:09:57 -0400 Subject: [PATCH 43/81] Yet another fix test --- programs/keeper-client/Commands.cpp | 17 +++-- tests/integration/test_keeper_client/test.py | 80 +++++++++----------- 2 files changed, 46 insertions(+), 51 deletions(-) diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index 985e844afdf..cbdcde4f89a 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -27,8 +27,15 @@ void LSCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) con auto children = client->zookeeper->getChildren(path); std::sort(children.begin(), children.end()); + + bool need_space = false; for (const auto & child : children) - std::cout << child << " "; + { + if (std::exchange(need_space, true)) + std::cout << " "; + + std::cout << child; + } std::cout << "\n"; } @@ -194,7 +201,7 @@ void FindSuperNodes::execute(const ASTKeeperQuery * query, KeeperClient * client if (stat.numChildren >= static_cast(threshold)) { - std::cout << path << "\t" << stat.numChildren << "\n"; + std::cout << static_cast(path) << "\t" << stat.numChildren << "\n"; return; } @@ -219,15 +226,15 @@ void DeleteStableBackups::execute(const ASTKeeperQuery * /* query */, KeeperClie "You are going to delete all inactive backups in /clickhouse/backups.", [client] { - String backup_root = "/clickhouse/backups"; + fs::path backup_root = "/clickhouse/backups"; auto backups = client->zookeeper->getChildren(backup_root); for (const auto & child : backups) { - String backup_path = backup_root + "/" + child; + auto backup_path = backup_root / child; std::cout << "Found backup " << backup_path << ", checking if it's active\n"; - String stage_path = backup_path + "/stage"; + String stage_path = backup_path / "stage"; auto stages = client->zookeeper->getChildren(stage_path); bool is_active = false; diff --git a/tests/integration/test_keeper_client/test.py b/tests/integration/test_keeper_client/test.py index 5115c7d3789..3a54d0d175d 100644 --- a/tests/integration/test_keeper_client/test.py +++ b/tests/integration/test_keeper_client/test.py @@ -14,7 +14,7 @@ node = cluster.add_instance( ) -@pytest.fixture(scope="module") +@pytest.fixture(scope="module", autouse=True) def started_cluster(): try: cluster.start() @@ -24,30 +24,24 @@ def started_cluster(): cluster.shutdown() -class KeeperClient: - def __init__(self, started_cluster: ClickHouseCluster): - self.cluster = started_cluster - - def query(self, query: str): - return CommandRequest( - [ - self.cluster.server_bin_path, - "keeper-client", - "--host", - str(cluster.get_instance_ip("zoo1")), - "--port", - str(cluster.zookeeper_port), - "-q", - query, - ], - stdin="", - ) +def keeper_query(query: str): + return CommandRequest( + [ + cluster.server_bin_path, + "keeper-client", + "--host", + str(cluster.get_instance_ip("zoo1")), + "--port", + str(cluster.zookeeper_port), + "-q", + query, + ], + stdin="", + ) -def test_big_family(started_cluster: ClickHouseCluster): - client = KeeperClient(started_cluster) - - command = client.query( +def test_big_family(): + command = keeper_query( "create test_big_family foo;" "create test_big_family/1 foo;" "create test_big_family/1/1 foo;" @@ -55,6 +49,7 @@ def test_big_family(started_cluster: ClickHouseCluster): "create test_big_family/1/3 foo;" "create test_big_family/1/4 foo;" "create test_big_family/1/5 foo;" + "create test_big_family/2 foo;" "create test_big_family/2/1 foo;" "create test_big_family/2/2 foo;" "create test_big_family/2/3 foo;" @@ -76,7 +71,7 @@ def test_big_family(started_cluster: ClickHouseCluster): ] ) - command = client.query("find_big_family test_big_family 1;") + command = keeper_query("find_big_family test_big_family 1;") assert command.get_answer() == TSV( [ @@ -85,16 +80,16 @@ def test_big_family(started_cluster: ClickHouseCluster): ) -def test_find_super_nodes(started_cluster: ClickHouseCluster): - client = KeeperClient(started_cluster) - - command = client.query( +def test_find_super_nodes(): + command = keeper_query( + "create test_find_super_nodes foo;" "create test_find_super_nodes/1 foo;" "create test_find_super_nodes/1/1 foo;" "create test_find_super_nodes/1/2 foo;" "create test_find_super_nodes/1/3 foo;" "create test_find_super_nodes/1/4 foo;" "create test_find_super_nodes/1/5 foo;" + "create test_find_super_nodes/2 foo;" "create test_find_super_nodes/2/1 foo;" "create test_find_super_nodes/2/2 foo;" "create test_find_super_nodes/2/3 foo;" @@ -111,11 +106,8 @@ def test_find_super_nodes(started_cluster: ClickHouseCluster): ) -def test_delete_stable_backups(started_cluster: ClickHouseCluster): - client = KeeperClient(started_cluster) - - command = client.query( - "create /clickhouse foo;" +def test_delete_stable_backups(): + command = keeper_query( "create /clickhouse/backups foo;" "create /clickhouse/backups/1 foo;" "create /clickhouse/backups/1/stage foo;" @@ -130,18 +122,16 @@ def test_delete_stable_backups(started_cluster: ClickHouseCluster): assert command.get_answer() == ( "You are going to delete all inactive backups in /clickhouse/backups. Continue?\n" - "Found backup /clickhouse/backups/1, checking if it's active\n" - "Backup /clickhouse/backups/1 is active, not going to delete\n" - "Found backup /clickhouse/backups/2, checking if it's active\n" - "Backup /clickhouse/backups/2 is not active, deleting it\n" - "1" + "Found backup \"/clickhouse/backups/1\", checking if it's active\n" + "Backup \"/clickhouse/backups/1\" is active, not going to delete\n" + "Found backup \"/clickhouse/backups/2\", checking if it's active\n" + "Backup \"/clickhouse/backups/2\" is not active, deleting it\n" + "1\n" ) -def test_base_commands(started_cluster: ClickHouseCluster): - client = KeeperClient(started_cluster) - - command = client.query( +def test_base_commands(): + command = keeper_query( "create test_create_zk_node1 testvalue1;" "create test_create_zk_node_2 testvalue2;" "get test_create_zk_node1;" @@ -150,8 +140,6 @@ def test_base_commands(started_cluster: ClickHouseCluster): assert command.get_answer() == "testvalue1\n" -def test_four_letter_word_commands(started_cluster: ClickHouseCluster): - client = KeeperClient(started_cluster) - - command = client.query("ruok") +def test_four_letter_word_commands(): + command = keeper_query("ruok") assert command.get_answer() == "imok\n" From 9fd99ce81496867aab1499bf59ce2e6121065c5e Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 28 Jul 2023 06:34:05 +0000 Subject: [PATCH 44/81] Automatic style fix --- tests/integration/test_keeper_client/test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_keeper_client/test.py b/tests/integration/test_keeper_client/test.py index 3a54d0d175d..3187ce10d2a 100644 --- a/tests/integration/test_keeper_client/test.py +++ b/tests/integration/test_keeper_client/test.py @@ -122,10 +122,10 @@ def test_delete_stable_backups(): assert command.get_answer() == ( "You are going to delete all inactive backups in /clickhouse/backups. Continue?\n" - "Found backup \"/clickhouse/backups/1\", checking if it's active\n" - "Backup \"/clickhouse/backups/1\" is active, not going to delete\n" - "Found backup \"/clickhouse/backups/2\", checking if it's active\n" - "Backup \"/clickhouse/backups/2\" is not active, deleting it\n" + 'Found backup "/clickhouse/backups/1", checking if it\'s active\n' + 'Backup "/clickhouse/backups/1" is active, not going to delete\n' + 'Found backup "/clickhouse/backups/2", checking if it\'s active\n' + 'Backup "/clickhouse/backups/2" is not active, deleting it\n' "1\n" ) From 8255e71554d52639075f66fdeba962f8742c1154 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 28 Jul 2023 11:39:06 +0200 Subject: [PATCH 45/81] Fix tests --- tests/queries/0_stateless/02833_url_without_path_encoding.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02833_url_without_path_encoding.sh b/tests/queries/0_stateless/02833_url_without_path_encoding.sh index 699b1c8e99b..b71586099cf 100755 --- a/tests/queries/0_stateless/02833_url_without_path_encoding.sh +++ b/tests/queries/0_stateless/02833_url_without_path_encoding.sh @@ -5,8 +5,8 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -q "select count() from url('http://localhost:11111/test%2Fa.tsv') settings enable_url_encoding=0" +$CLICKHOUSE_CLIENT -q "select count() from url('http://localhost:11111/test%2Fa.tsv') settings enable_url_encoding=1" # Grep 'test%2Fa.tsv' to ensure that path wasn't encoded/decoded -$CLICKHOUSE_CLIENT -q "select count() from url('http://localhost:11111/test%2Fa.tsv') settings enable_url_encoding=1" 2>&1 | grep -o "test%2Fa.tsv" -m1 +$CLICKHOUSE_CLIENT -q "select count() from url('http://localhost:11111/test%2Fa.tsv') settings enable_url_encoding=0" 2>&1 | grep -o "test%2Fa.tsv" -m1 From 1e77c7c73f3dc7db2e52d618999f909fcf3e9d97 Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 28 Jul 2023 10:48:22 -0400 Subject: [PATCH 46/81] Deterministic delete_stable_backups --- programs/keeper-client/Commands.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index cbdcde4f89a..fd0a00d59db 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -228,6 +228,7 @@ void DeleteStableBackups::execute(const ASTKeeperQuery * /* query */, KeeperClie { fs::path backup_root = "/clickhouse/backups"; auto backups = client->zookeeper->getChildren(backup_root); + std::sort(backups.begin(), backups.end()); for (const auto & child : backups) { From b2ee4505fb1feee2e3581d438544e491bfa7c9a1 Mon Sep 17 00:00:00 2001 From: Yury Bogomolov Date: Fri, 28 Jul 2023 16:06:03 +0000 Subject: [PATCH 47/81] review fixes --- src/Storages/Kafka/StorageKafka.cpp | 6 +----- tests/integration/test_storage_kafka/test.py | 22 ++++---------------- 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 8cc4fd90f8d..a7315eb51ea 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -250,11 +250,7 @@ StorageKafka::StorageKafka( : IStorage(table_id_) , WithContext(context_->getGlobalContext()) , kafka_settings(std::move(kafka_settings_)) - , macros_info([&table_id_](){ - Macros::MacroExpansionInfo info; - info.table_id = table_id_; - return info; - }()) + , macros_info{.table_id = table_id_} , topics(parseTopics(getContext()->getMacros()->expand(kafka_settings->kafka_topic_list.value, macros_info))) , brokers(getContext()->getMacros()->expand(kafka_settings->kafka_broker_list.value, macros_info)) , group(getContext()->getMacros()->expand(kafka_settings->kafka_group_name.value, macros_info)) diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 621fc72a607..aa227c08196 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -461,28 +461,14 @@ def test_kafka_settings_predefined_macros(kafka_cluster): ) messages = [] - for i in range(25): + for i in range(50): messages.append(json.dumps({"key": i, "value": i})) - kafka_produce(kafka_cluster, "test_kafka", messages) - - # Insert couple of malformed messages. - kafka_produce(kafka_cluster, "test_kafka", ["}{very_broken_message,"]) - kafka_produce(kafka_cluster, "test_kafka", ["}another{very_broken_message,"]) - - messages = [] - for i in range(25, 50): - messages.append(json.dumps({"key": i, "value": i})) - kafka_produce(kafka_cluster, "test_kafka", messages) - - result = "" - while True: - result += instance.query("SELECT * FROM test.kafka", ignore_error=True) - if kafka_check_result(result): - break + kafka_produce(kafka_cluster, "test_kafka_topic", messages) + result = instance.query("SELECT * FROM test.kafka", ignore_error=True) kafka_check_result(result, True) - members = describe_consumer_group(kafka_cluster, "new") + members = describe_consumer_group(kafka_cluster, "test_kafka_group") assert members[0]["client_id"] == "test_kafka test 1234" From 10a5230df4d8c352657327fdd75947acc269377a Mon Sep 17 00:00:00 2001 From: copperybean Date: Sat, 29 Jul 2023 11:20:12 +0800 Subject: [PATCH 48/81] fix unexpected sort result on multi columns with nulls first direction Change-Id: I94494019312d6ccfe25695f3614353be08800f5f --- src/Columns/ColumnNullable.cpp | 13 ++++++++++--- .../0_stateless/02832_nulls_first_sort.reference | 5 +++++ .../queries/0_stateless/02832_nulls_first_sort.sql | 7 +++++++ 3 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02832_nulls_first_sort.reference create mode 100644 tests/queries/0_stateless/02832_nulls_first_sort.sql diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index aa04fc910a5..2eb2ff0bf69 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -564,15 +564,22 @@ void ColumnNullable::updatePermutationImpl(IColumn::PermutationSortDirection dir else getNestedColumn().updatePermutation(direction, stability, limit, null_direction_hint, res, new_ranges); - equal_ranges = std::move(new_ranges); - if (unlikely(stability == PermutationSortStability::Stable)) { for (auto & null_range : null_ranges) ::sort(res.begin() + null_range.first, res.begin() + null_range.second); } - std::move(null_ranges.begin(), null_ranges.end(), std::back_inserter(equal_ranges)); + if (is_nulls_last || null_ranges.empty()) + { + equal_ranges = std::move(new_ranges); + std::move(null_ranges.begin(), null_ranges.end(), std::back_inserter(equal_ranges)); + } + else + { + equal_ranges = std::move(null_ranges); + std::move(new_ranges.begin(), new_ranges.end(), std::back_inserter(equal_ranges)); + } } void ColumnNullable::getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability, diff --git a/tests/queries/0_stateless/02832_nulls_first_sort.reference b/tests/queries/0_stateless/02832_nulls_first_sort.reference new file mode 100644 index 00000000000..c16f69ac3c1 --- /dev/null +++ b/tests/queries/0_stateless/02832_nulls_first_sort.reference @@ -0,0 +1,5 @@ +5 \N 1 +5 \N 2 +5 \N 3 +5 \N 7 +5 1 1 diff --git a/tests/queries/0_stateless/02832_nulls_first_sort.sql b/tests/queries/0_stateless/02832_nulls_first_sort.sql new file mode 100644 index 00000000000..e17a49baf24 --- /dev/null +++ b/tests/queries/0_stateless/02832_nulls_first_sort.sql @@ -0,0 +1,7 @@ +DROP TABLE IF EXISTS nulls_first_sort_test; +CREATE TABLE nulls_first_sort_test (a Nullable(Int32), b Nullable(Int32), c Nullable(Int32)) ENGINE = Memory; + +INSERT INTO nulls_first_sort_test VALUES (5,null,2), (5,null,1), (5,null,7), (5,null,3), (5,7,4), (5,7,6), (5,7,2), (5,7,1), (5,7,3), (5,7,9), (5,1,4), (5,1,6), (5,1,2), (5,1,1), (5,1,3), (5,1,9); + +SELECT * FROM nulls_first_sort_test ORDER BY a NULLS FIRST,b NULLS FIRST,c NULLS FIRST LIMIT 5; +DROP TABLE nulls_first_sort_test; From c270393495c1622c1e085c1abf750c18bfeb27d4 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Sat, 29 Jul 2023 14:56:43 +0000 Subject: [PATCH 49/81] Add abortOnFailedAssertion() declaration in defines.h --- base/base/defines.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/base/base/defines.h b/base/base/defines.h index 6abf8155b95..ee29ecf6118 100644 --- a/base/base/defines.h +++ b/base/base/defines.h @@ -115,8 +115,15 @@ /// because SIGABRT is easier to debug than SIGTRAP (the second one makes gdb crazy) #if !defined(chassert) #if defined(ABORT_ON_LOGICAL_ERROR) + // clang-format off + #include + namespace DB + { + void abortOnFailedAssertion(const String & description); + } #define chassert(x) static_cast(x) ? void(0) : ::DB::abortOnFailedAssertion(#x) #define UNREACHABLE() abort() + // clang-format off #else /// Here sizeof() trick is used to suppress unused warning for result, /// since simple "(void)x" will evaluate the expression, while From 9e0fb6b81da2a72c41a40f16dea245d1f6a16f35 Mon Sep 17 00:00:00 2001 From: copperybean Date: Sun, 30 Jul 2023 09:48:05 +0800 Subject: [PATCH 50/81] rename test files Change-Id: I2388f72af133e4496f957f8361fd23a5cf79469f --- ...ulls_first_sort.reference => 02834_nulls_first_sort.reference} | 0 .../{02832_nulls_first_sort.sql => 02834_nulls_first_sort.sql} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{02832_nulls_first_sort.reference => 02834_nulls_first_sort.reference} (100%) rename tests/queries/0_stateless/{02832_nulls_first_sort.sql => 02834_nulls_first_sort.sql} (100%) diff --git a/tests/queries/0_stateless/02832_nulls_first_sort.reference b/tests/queries/0_stateless/02834_nulls_first_sort.reference similarity index 100% rename from tests/queries/0_stateless/02832_nulls_first_sort.reference rename to tests/queries/0_stateless/02834_nulls_first_sort.reference diff --git a/tests/queries/0_stateless/02832_nulls_first_sort.sql b/tests/queries/0_stateless/02834_nulls_first_sort.sql similarity index 100% rename from tests/queries/0_stateless/02832_nulls_first_sort.sql rename to tests/queries/0_stateless/02834_nulls_first_sort.sql From 0ff0a4e41f39479530c1ef81388dc058266201b5 Mon Sep 17 00:00:00 2001 From: zvonand Date: Sun, 30 Jul 2023 19:42:10 +0200 Subject: [PATCH 51/81] diverge parsing methods --- src/Functions/FunctionsConversion.h | 76 ++++++++++++++----- src/IO/ReadHelpers.h | 4 +- src/IO/readFloatText.cpp | 12 ++- src/IO/readFloatText.h | 16 +++- .../0_stateless/02813_float_parsing.reference | 1 + .../0_stateless/02813_float_parsing.sql | 10 +++ 6 files changed, 92 insertions(+), 27 deletions(-) create mode 100644 tests/queries/0_stateless/02813_float_parsing.reference create mode 100644 tests/queries/0_stateless/02813_float_parsing.sql diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 00c4cfe7284..01ff47af18c 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -1024,14 +1024,23 @@ inline void convertFromTime(DataTypeDateTime::FieldType & x, t /** Conversion of strings to numbers, dates, datetimes: through parsing. */ +template +void parseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +{ + if constexpr (std::is_floating_point_v) + readFloatText(x, rb); + else + parseType(x, rb, time_zone); +} + template -void parseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +void parseType(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) { readText(x, rb); } template <> -inline void parseImpl(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline void parseType(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) { DayNum tmp(0); readDateText(tmp, rb, *time_zone); @@ -1039,7 +1048,7 @@ inline void parseImpl(DataTypeDate::FieldType & x, ReadBuffer & rb } template <> -inline void parseImpl(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline void parseType(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) { ExtendedDayNum tmp(0); readDateText(tmp, rb, *time_zone); @@ -1049,7 +1058,7 @@ inline void parseImpl(DataTypeDate32::FieldType & x, ReadBuffer // NOTE: no need of extra overload of DateTime64, since readDateTimeText64 has different signature and that case is explicitly handled in the calling code. template <> -inline void parseImpl(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline void parseType(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) { time_t time = 0; readDateTimeText(time, rb, *time_zone); @@ -1057,7 +1066,7 @@ inline void parseImpl(DataTypeDateTime::FieldType & x, ReadBuf } template <> -inline void parseImpl(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline void parseType(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) { UUID tmp; readUUIDText(tmp, rb); @@ -1065,7 +1074,7 @@ inline void parseImpl(DataTypeUUID::FieldType & x, ReadBuffer & rb } template <> -inline void parseImpl(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline void parseType(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) { IPv4 tmp; readIPv4Text(tmp, rb); @@ -1073,24 +1082,30 @@ inline void parseImpl(DataTypeIPv4::FieldType & x, ReadBuffer & rb } template <> -inline void parseImpl(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline void parseType(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) { IPv6 tmp; readIPv6Text(tmp, rb); x = tmp; } -template -bool tryParseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +template +bool tryParseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) { if constexpr (std::is_floating_point_v) - return tryReadFloatText(x, rb); + return tryReadFloatText(x, rb); else /*if constexpr (is_integer_v)*/ - return tryReadIntText(x, rb); + return tryParseTypeImpl(x, rb, time_zone); +} + +template +inline bool tryParseTypeImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +{ + return tryReadIntText(x, rb); } template <> -inline bool tryParseImpl(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline bool tryParseTypeImpl(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) { DayNum tmp(0); if (!tryReadDateText(tmp, rb, *time_zone)) @@ -1100,7 +1115,7 @@ inline bool tryParseImpl(DataTypeDate::FieldType & x, ReadBuffer & } template <> -inline bool tryParseImpl(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline bool tryParseTypeImpl(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) { ExtendedDayNum tmp(0); if (!tryReadDateText(tmp, rb, *time_zone)) @@ -1110,7 +1125,7 @@ inline bool tryParseImpl(DataTypeDate32::FieldType & x, ReadBuff } template <> -inline bool tryParseImpl(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline bool tryParseTypeImpl(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) { time_t tmp = 0; if (!tryReadDateTimeText(tmp, rb, *time_zone)) @@ -1120,7 +1135,7 @@ inline bool tryParseImpl(DataTypeDateTime::FieldType & x, Read } template <> -inline bool tryParseImpl(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline bool tryParseTypeImpl(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) { UUID tmp; if (!tryReadUUIDText(tmp, rb)) @@ -1131,7 +1146,7 @@ inline bool tryParseImpl(DataTypeUUID::FieldType & x, ReadBuffer & } template <> -inline bool tryParseImpl(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline bool tryParseTypeImpl(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) { IPv4 tmp; if (!tryReadIPv4Text(tmp, rb)) @@ -1142,7 +1157,7 @@ inline bool tryParseImpl(DataTypeIPv4::FieldType & x, ReadBuffer & } template <> -inline bool tryParseImpl(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline bool tryParseTypeImpl(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) { IPv6 tmp; if (!tryReadIPv6Text(tmp, rb)) @@ -1321,6 +1336,29 @@ struct ConvertThroughParsing size_t current_offset = 0; + bool precise_float_parsing = false; + + if (DB::CurrentThread::isInitialized()) + { + const DB::ContextPtr query_context = DB::CurrentThread::get().getQueryContext(); + + if (query_context) + precise_float_parsing = query_context->getSettingsRef().precise_float_parsing; + } + + std::function parseFunction; + std::function tryParseFunction; + + if constexpr (!(parsing_mode == ConvertFromStringParsingMode::BestEffort || + parsing_mode == ConvertFromStringParsingMode::BestEffortUS || + to_datetime64 || IsDataTypeDecimal)) + { + if constexpr (exception_mode == ConvertFromStringExceptionMode::Throw) + parseFunction = precise_float_parsing ? parseImpl : parseImpl; + else + tryParseFunction = precise_float_parsing ? tryParseImpl : tryParseImpl; + } + for (size_t i = 0; i < size; ++i) { size_t next_offset = std::is_same_v ? (*offsets)[i] : (current_offset + fixed_string_size); @@ -1387,7 +1425,7 @@ struct ConvertThroughParsing } } - parseImpl(vec_to[i], read_buffer, local_time_zone); + parseFunction(vec_to[i], read_buffer, local_time_zone); } while (false); } } @@ -1457,7 +1495,7 @@ struct ConvertThroughParsing } } - parsed = tryParseImpl(vec_to[i], read_buffer, local_time_zone); + parsed = tryParseFunction(vec_to[i], read_buffer, local_time_zone); } while (false); } } diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index b8ce162ec91..f4d31c2d3d9 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -526,8 +526,8 @@ void tryReadIntTextUnsafe(T & x, ReadBuffer & buf) /// Look at readFloatText.h -template void readFloatText(T & x, ReadBuffer & in); -template bool tryReadFloatText(T & x, ReadBuffer & in); +template void readFloatText(T & x, ReadBuffer & in); +template bool tryReadFloatText(T & x, ReadBuffer & in); /// simple: all until '\n' or '\t' diff --git a/src/IO/readFloatText.cpp b/src/IO/readFloatText.cpp index d1143f7c62c..60505e0fb22 100644 --- a/src/IO/readFloatText.cpp +++ b/src/IO/readFloatText.cpp @@ -62,9 +62,13 @@ template void readFloatTextSimple(Float64 &, ReadBuffer &); template bool tryReadFloatTextSimple(Float32 &, ReadBuffer &); template bool tryReadFloatTextSimple(Float64 &, ReadBuffer &); -template void readFloatText(Float32 &, ReadBuffer &); -template void readFloatText(Float64 &, ReadBuffer &); -template bool tryReadFloatText(Float32 &, ReadBuffer &); -template bool tryReadFloatText(Float64 &, ReadBuffer &); +template void readFloatText(Float32 &, ReadBuffer &); +template void readFloatText(Float64 &, ReadBuffer &); +template void readFloatText(Float32 &, ReadBuffer &); +template void readFloatText(Float64 &, ReadBuffer &); +template bool tryReadFloatText(Float32 &, ReadBuffer &); +template bool tryReadFloatText(Float64 &, ReadBuffer &); +template bool tryReadFloatText(Float32 &, ReadBuffer &); +template bool tryReadFloatText(Float64 &, ReadBuffer &); } diff --git a/src/IO/readFloatText.h b/src/IO/readFloatText.h index da4719b8dcb..de80342687f 100644 --- a/src/IO/readFloatText.h +++ b/src/IO/readFloatText.h @@ -590,7 +590,19 @@ template bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { retu /// Implementation that is selected as default. -template void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); } -template bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); } +template void readFloatText(T & x, ReadBuffer & in) +{ + if constexpr (precise_float_parsing) + readFloatTextPrecise(x, in); + else + readFloatTextFast(x, in); +} +template bool tryReadFloatText(T & x, ReadBuffer & in) +{ + if constexpr (precise_float_parsing) + return tryReadFloatTextPrecise(x, in); + else + return tryReadFloatTextFast(x, in); +} } diff --git a/tests/queries/0_stateless/02813_float_parsing.reference b/tests/queries/0_stateless/02813_float_parsing.reference new file mode 100644 index 00000000000..ed4797f7a37 --- /dev/null +++ b/tests/queries/0_stateless/02813_float_parsing.reference @@ -0,0 +1 @@ +1.7091 15008753 6e-9 6.000000000000001e-9 1.7091 15008753 6e-9 6e-9 diff --git a/tests/queries/0_stateless/02813_float_parsing.sql b/tests/queries/0_stateless/02813_float_parsing.sql new file mode 100644 index 00000000000..10f390f56d7 --- /dev/null +++ b/tests/queries/0_stateless/02813_float_parsing.sql @@ -0,0 +1,10 @@ +SELECT + toFloat64('1.7091'), + toFloat64('15008753.'), + toFloat64('6e-09'), + toFloat64('6.000000000000001e-9'), + toFloat32('1.7091'), + toFloat32('15008753.'), + toFloat32('6e-09'), + toFloat32('6.000000000000001e-9') +SETTINGS precise_float_parsing = 1; From 00a1a9f6bdcae72a5c115f467874b2b18f4a60e8 Mon Sep 17 00:00:00 2001 From: zvonand Date: Sun, 30 Jul 2023 20:59:15 +0200 Subject: [PATCH 52/81] added docs --- docs/en/operations/settings/settings.md | 25 ++++++++++++++++++++++++ docs/ru/operations/settings/settings.md | 26 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 8b969f87a4d..886177b253b 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4569,3 +4569,28 @@ Type: Int64 Default: 0 +## precise_float_parsing {#precise_float_parsing} + +Switches [Float32/Float64](../../sql-reference/data-types/float.md) parsing algorithms: +* If the value is `1`, then precise method is used. It is slower than fast method, but it always returns a number that is the closest machine representable number to the input. +* Otherwise, fast method is used (default). It usually returns the same value as precise, but in rare cases result may differ by one or two least significant bits. + +Possible values: `0`, `1`. + +Default value: `0`. + +Example: + +```sql +SELECT toFloat64('1.7091'), toFloat64('1.5008753E7') SETTINGS precise_float_parsing = 0; + +┌─toFloat64('1.7091')─┬─toFloat64('1.5008753E7')─┐ +│ 1.7090999999999998 │ 15008753.000000002 │ +└─────────────────────┴──────────────────────────┘ + +SELECT toFloat64('1.7091'), toFloat64('1.5008753E7') SETTINGS precise_float_parsing = 1; + +┌─toFloat64('1.7091')─┬─toFloat64('1.5008753E7')─┐ +│ 1.7091 │ 15008753 │ +└─────────────────────┴──────────────────────────┘ +``` diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 42e21f6140b..bb84c4225f7 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -4212,3 +4212,29 @@ SELECT *, timezone() FROM test_tz WHERE d = '2000-01-01 00:00:00' SETTINGS sessi - Запрос: `SELECT * FROM file('sample.csv')` Если чтение и обработка `sample.csv` прошли успешно, файл будет переименован в `processed_sample_1683473210851438.csv`. + +## precise_float_parsing {#precise_float_parsing} + +Позволяет выбрать алгоритм, используемый при парсинге [Float32/Float64](../../sql-reference/data-types/float.md): +* Если установлено значение `1`, то используется точный метод. Он более медленный, но всегда возвращает число, наиболее близкое к входному значению. +* В противном случае используется быстрый метод (поведение по умолчанию). Обычно результат его работы совпадает с результатом, полученным точным методом, однако в редких случаях он может отличаться на 1 или 2 наименее значимых бит. + +Возможные значения: `0`, `1`. + +Значение по умолчанию: `0`. + +Пример: + +```sql +SELECT toFloat64('1.7091'), toFloat64('1.5008753E7') SETTINGS precise_float_parsing = 0; + +┌─toFloat64('1.7091')─┬─toFloat64('1.5008753E7')─┐ +│ 1.7090999999999998 │ 15008753.000000002 │ +└─────────────────────┴──────────────────────────┘ + +SELECT toFloat64('1.7091'), toFloat64('1.5008753E7') SETTINGS precise_float_parsing = 1; + +┌─toFloat64('1.7091')─┬─toFloat64('1.5008753E7')─┐ +│ 1.7091 │ 15008753 │ +└─────────────────────┴──────────────────────────┘ +``` From d23dd7d30fde7b81273bdec99968422ec49ecabe Mon Sep 17 00:00:00 2001 From: zvonand Date: Sun, 30 Jul 2023 23:02:45 +0200 Subject: [PATCH 53/81] upd test --- tests/queries/0_stateless/02813_float_parsing.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02813_float_parsing.sql b/tests/queries/0_stateless/02813_float_parsing.sql index 10f390f56d7..ab74f1e64e4 100644 --- a/tests/queries/0_stateless/02813_float_parsing.sql +++ b/tests/queries/0_stateless/02813_float_parsing.sql @@ -1,10 +1,10 @@ SELECT toFloat64('1.7091'), - toFloat64('15008753.'), + toFloat64('1.5008753E7'), toFloat64('6e-09'), toFloat64('6.000000000000001e-9'), toFloat32('1.7091'), - toFloat32('15008753.'), + toFloat32('1.5008753E7'), toFloat32('6e-09'), toFloat32('6.000000000000001e-9') SETTINGS precise_float_parsing = 1; From 36b5ccfa3ad355af55ee2a6fa1b82bc631af1289 Mon Sep 17 00:00:00 2001 From: zvonand Date: Mon, 31 Jul 2023 10:03:44 +0200 Subject: [PATCH 54/81] update tests --- .../queries/0_stateless/02813_float_parsing.reference | 1 + tests/queries/0_stateless/02813_float_parsing.sql | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/tests/queries/0_stateless/02813_float_parsing.reference b/tests/queries/0_stateless/02813_float_parsing.reference index ed4797f7a37..c83331e0138 100644 --- a/tests/queries/0_stateless/02813_float_parsing.reference +++ b/tests/queries/0_stateless/02813_float_parsing.reference @@ -1 +1,2 @@ +1.7090999999999998 15008753.000000002 6.000000000000001e-9 6.000000000000002e-9 1.7091 15008752 5.9999996e-9 5.9999996e-9 1.7091 15008753 6e-9 6.000000000000001e-9 1.7091 15008753 6e-9 6e-9 diff --git a/tests/queries/0_stateless/02813_float_parsing.sql b/tests/queries/0_stateless/02813_float_parsing.sql index ab74f1e64e4..ba57b87f191 100644 --- a/tests/queries/0_stateless/02813_float_parsing.sql +++ b/tests/queries/0_stateless/02813_float_parsing.sql @@ -1,3 +1,14 @@ +SELECT + toFloat64('1.7091'), + toFloat64('1.5008753E7'), + toFloat64('6e-09'), + toFloat64('6.000000000000001e-9'), + toFloat32('1.7091'), + toFloat32('1.5008753E7'), + toFloat32('6e-09'), + toFloat32('6.000000000000001e-9') +SETTINGS precise_float_parsing = 0; + SELECT toFloat64('1.7091'), toFloat64('1.5008753E7'), From 987dd6602ab1ccf7e3af017f59d275c241406138 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov <32552679+zvonand@users.noreply.github.com> Date: Mon, 31 Jul 2023 11:08:51 +0300 Subject: [PATCH 55/81] Update docs/en/operations/settings/settings.md Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 886177b253b..345257ac6ed 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4573,7 +4573,7 @@ Default: 0 Switches [Float32/Float64](../../sql-reference/data-types/float.md) parsing algorithms: * If the value is `1`, then precise method is used. It is slower than fast method, but it always returns a number that is the closest machine representable number to the input. -* Otherwise, fast method is used (default). It usually returns the same value as precise, but in rare cases result may differ by one or two least significant bits. +* Otherwise, fast method is used (default). It usually returns the same value as precise, but in rare cases result may differ by one or two least significant digits. Possible values: `0`, `1`. From cb5bebf4e4582a1c581128d5087393ce37e3848a Mon Sep 17 00:00:00 2001 From: Andrey Zvonov <32552679+zvonand@users.noreply.github.com> Date: Mon, 31 Jul 2023 11:09:01 +0300 Subject: [PATCH 56/81] Update docs/ru/operations/settings/settings.md Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- docs/ru/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index bb84c4225f7..87a7aa3b0d4 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -4217,7 +4217,7 @@ SELECT *, timezone() FROM test_tz WHERE d = '2000-01-01 00:00:00' SETTINGS sessi Позволяет выбрать алгоритм, используемый при парсинге [Float32/Float64](../../sql-reference/data-types/float.md): * Если установлено значение `1`, то используется точный метод. Он более медленный, но всегда возвращает число, наиболее близкое к входному значению. -* В противном случае используется быстрый метод (поведение по умолчанию). Обычно результат его работы совпадает с результатом, полученным точным методом, однако в редких случаях он может отличаться на 1 или 2 наименее значимых бит. +* В противном случае используется быстрый метод (поведение по умолчанию). Обычно результат его работы совпадает с результатом, полученным точным методом, однако в редких случаях он может отличаться на 1 или 2 наименее значимых цифры. Возможные значения: `0`, `1`. From f457a51ac02ffa6b4ec00c416be2e5d9958f12e0 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 31 Jul 2023 13:50:36 +0200 Subject: [PATCH 57/81] Fix default value --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9823f8e9c95..fbe805cdfa5 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -622,7 +622,7 @@ class IColumn; M(Bool, engine_file_allow_create_multiple_files, false, "Enables or disables creating a new file on each insert in file engine tables if format has suffix.", 0) \ M(Bool, engine_file_skip_empty_files, false, "Allows to skip empty files in file table engine", 0) \ M(Bool, engine_url_skip_empty_files, false, "Allows to skip empty files in url table engine", 0) \ - M(Bool, enable_url_encoding, false, " Allows to enable/disable decoding/encoding path in uri in URL table engine", 0) \ + M(Bool, enable_url_encoding, true, " Allows to enable/disable decoding/encoding path in uri in URL table engine", 0) \ M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \ M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \ M(Bool, database_replicated_enforce_synchronous_settings, false, "Enforces synchronous waiting for some queries (see also database_atomic_wait_for_drop_and_detach_synchronously, mutation_sync, alter_sync). Not recommended to enable these settings.", 0) \ From e02805d9f777cddbc3d933c781aa6585efe3a6b3 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 31 Jul 2023 16:55:51 +0300 Subject: [PATCH 58/81] Update test.py --- tests/integration/test_crash_log/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_crash_log/test.py b/tests/integration/test_crash_log/test.py index 1b7e7f38242..a5b82039a84 100644 --- a/tests/integration/test_crash_log/test.py +++ b/tests/integration/test_crash_log/test.py @@ -30,7 +30,7 @@ def send_signal(started_node, signal): def wait_for_clickhouse_stop(started_node): result = None - for attempt in range(60): + for attempt in range(120): time.sleep(1) pid = started_node.get_process_pid("clickhouse") if pid is None: From 2088f846ad0e3fcedc37e8123cbc8c6d185958d4 Mon Sep 17 00:00:00 2001 From: zvonand Date: Mon, 31 Jul 2023 16:26:18 +0200 Subject: [PATCH 59/81] update as per reviews --- src/Functions/FunctionsConversion.h | 14 ++++++++++++-- src/IO/ReadHelpers.h | 10 ++++++++-- src/IO/readFloatText.cpp | 12 ++++-------- src/IO/readFloatText.h | 16 ++-------------- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 01ff47af18c..f3b93e5cf7a 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -1028,7 +1028,12 @@ template void parseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) { if constexpr (std::is_floating_point_v) - readFloatText(x, rb); + { + if constexpr (precise_float_parsing) + readFloatTextPrecise(x, rb); + else + readFloatTextFast(x, rb); + } else parseType(x, rb, time_zone); } @@ -1093,7 +1098,12 @@ template bool tryParseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) { if constexpr (std::is_floating_point_v) - return tryReadFloatText(x, rb); + { + if constexpr (precise_float_parsing) + return tryReadFloatTextPrecise(x, rb); + else + return tryReadFloatTextFast(x, rb); + } else /*if constexpr (is_integer_v)*/ return tryParseTypeImpl(x, rb, time_zone); } diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index f4d31c2d3d9..20fe8bd5998 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -526,8 +526,14 @@ void tryReadIntTextUnsafe(T & x, ReadBuffer & buf) /// Look at readFloatText.h -template void readFloatText(T & x, ReadBuffer & in); -template bool tryReadFloatText(T & x, ReadBuffer & in); +template void readFloatText(T & x, ReadBuffer & in); +template bool tryReadFloatText(T & x, ReadBuffer & in); + +template void readFloatTextPrecise(T & x, ReadBuffer & in); +template bool tryReadFloatTextPrecise(T & x, ReadBuffer & in); +template void readFloatTextFast(T & x, ReadBuffer & in); +template bool tryReadFloatTextFast(T & x, ReadBuffer & in); + /// simple: all until '\n' or '\t' diff --git a/src/IO/readFloatText.cpp b/src/IO/readFloatText.cpp index 60505e0fb22..d1143f7c62c 100644 --- a/src/IO/readFloatText.cpp +++ b/src/IO/readFloatText.cpp @@ -62,13 +62,9 @@ template void readFloatTextSimple(Float64 &, ReadBuffer &); template bool tryReadFloatTextSimple(Float32 &, ReadBuffer &); template bool tryReadFloatTextSimple(Float64 &, ReadBuffer &); -template void readFloatText(Float32 &, ReadBuffer &); -template void readFloatText(Float64 &, ReadBuffer &); -template void readFloatText(Float32 &, ReadBuffer &); -template void readFloatText(Float64 &, ReadBuffer &); -template bool tryReadFloatText(Float32 &, ReadBuffer &); -template bool tryReadFloatText(Float64 &, ReadBuffer &); -template bool tryReadFloatText(Float32 &, ReadBuffer &); -template bool tryReadFloatText(Float64 &, ReadBuffer &); +template void readFloatText(Float32 &, ReadBuffer &); +template void readFloatText(Float64 &, ReadBuffer &); +template bool tryReadFloatText(Float32 &, ReadBuffer &); +template bool tryReadFloatText(Float64 &, ReadBuffer &); } diff --git a/src/IO/readFloatText.h b/src/IO/readFloatText.h index de80342687f..da4719b8dcb 100644 --- a/src/IO/readFloatText.h +++ b/src/IO/readFloatText.h @@ -590,19 +590,7 @@ template bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { retu /// Implementation that is selected as default. -template void readFloatText(T & x, ReadBuffer & in) -{ - if constexpr (precise_float_parsing) - readFloatTextPrecise(x, in); - else - readFloatTextFast(x, in); -} -template bool tryReadFloatText(T & x, ReadBuffer & in) -{ - if constexpr (precise_float_parsing) - return tryReadFloatTextPrecise(x, in); - else - return tryReadFloatTextFast(x, in); -} +template void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); } +template bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); } } From e50ffa118134e534c275d1a744e2668a6e05eb26 Mon Sep 17 00:00:00 2001 From: zvonand Date: Mon, 31 Jul 2023 16:44:46 +0200 Subject: [PATCH 60/81] fix style --- src/IO/ReadHelpers.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 20fe8bd5998..ba6b6f453af 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -535,7 +535,6 @@ template void readFloatTextFast(T & x, ReadBuffer & in); template bool tryReadFloatTextFast(T & x, ReadBuffer & in); - /// simple: all until '\n' or '\t' void readString(String & s, ReadBuffer & buf); From f9e0a202a7eb55b17b427f4ad9439ca80ce21821 Mon Sep 17 00:00:00 2001 From: Andy Fiddaman Date: Mon, 31 Jul 2023 17:45:54 +0100 Subject: [PATCH 61/81] clickhouse-keeper: fix implementation of server with poll() --- src/Server/KeeperTCPHandler.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Server/KeeperTCPHandler.cpp b/src/Server/KeeperTCPHandler.cpp index e3edc281e83..e44ab4a4bd7 100644 --- a/src/Server/KeeperTCPHandler.cpp +++ b/src/Server/KeeperTCPHandler.cpp @@ -175,10 +175,12 @@ struct SocketInterruptablePollWrapper } while (rc < 0 && errno == POCO_EINTR); - if (rc >= 1 && poll_buf[0].revents & POLLIN) - socket_ready = true; - if (rc >= 2 && poll_buf[1].revents & POLLIN) - fd_ready = true; + if (rc >= 1) { + if (poll_buf[0].revents & POLLIN) + socket_ready = true; + if (poll_buf[1].revents & POLLIN) + fd_ready = true; + } #endif } From 8447a93931d59b3b3a2c1fc1fef0c14ae43d62b4 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 31 Jul 2023 14:22:30 +0000 Subject: [PATCH 62/81] fix sorting of sparse columns with large limit --- src/Columns/ColumnSparse.cpp | 3 ++- .../Transforms/PartialSortingTransform.cpp | 2 +- .../02834_sparse_columns_sort_with_limit.reference | 1 + .../02834_sparse_columns_sort_with_limit.sql | 12 ++++++++++++ 4 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/02834_sparse_columns_sort_with_limit.reference create mode 100644 tests/queries/0_stateless/02834_sparse_columns_sort_with_limit.sql diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index 0cad2c2d36b..d5edae7e7ea 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -1,3 +1,4 @@ +#include "Common/typeid_cast.h" #include #include #include @@ -439,7 +440,7 @@ void ColumnSparse::compareColumn(const IColumn & rhs, size_t rhs_row_num, PaddedPODArray * row_indexes, PaddedPODArray & compare_results, int direction, int nan_direction_hint) const { - if (row_indexes) + if (row_indexes || !typeid_cast(&rhs)) { /// TODO: implement without conversion to full column. auto this_full = convertToFullColumnIfSparse(); diff --git a/src/Processors/Transforms/PartialSortingTransform.cpp b/src/Processors/Transforms/PartialSortingTransform.cpp index 33c2d870b76..3fc9a4e71db 100644 --- a/src/Processors/Transforms/PartialSortingTransform.cpp +++ b/src/Processors/Transforms/PartialSortingTransform.cpp @@ -159,7 +159,7 @@ void PartialSortingTransform::transform(Chunk & chunk) { MutableColumnPtr sort_description_threshold_column_updated = raw_block_columns[i]->cloneEmpty(); sort_description_threshold_column_updated->insertFrom(*raw_block_columns[i], min_row_to_compare); - sort_description_threshold_columns_updated[i] = std::move(sort_description_threshold_column_updated); + sort_description_threshold_columns_updated[i] = sort_description_threshold_column_updated->convertToFullColumnIfSparse(); } sort_description_threshold_columns = std::move(sort_description_threshold_columns_updated); diff --git a/tests/queries/0_stateless/02834_sparse_columns_sort_with_limit.reference b/tests/queries/0_stateless/02834_sparse_columns_sort_with_limit.reference new file mode 100644 index 00000000000..e2ed8f4daf2 --- /dev/null +++ b/tests/queries/0_stateless/02834_sparse_columns_sort_with_limit.reference @@ -0,0 +1 @@ +65536 diff --git a/tests/queries/0_stateless/02834_sparse_columns_sort_with_limit.sql b/tests/queries/0_stateless/02834_sparse_columns_sort_with_limit.sql new file mode 100644 index 00000000000..32bd9694bd0 --- /dev/null +++ b/tests/queries/0_stateless/02834_sparse_columns_sort_with_limit.sql @@ -0,0 +1,12 @@ +DROP TABLE IF EXISTS t_sparse_sort_limit; + +CREATE TABLE t_sparse_sort_limit (date Date, i UInt64, v Int16) +ENGINE = MergeTree ORDER BY (date, i) +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9; + +INSERT INTO t_sparse_sort_limit SELECT '2020-10-10', number % 10, number FROM numbers(100000); +INSERT INTO t_sparse_sort_limit SELECT '2020-10-11', number % 10, number FROM numbers(100000); + +SELECT count() FROM (SELECT toStartOfMonth(date) AS d FROM t_sparse_sort_limit ORDER BY -i LIMIT 65536); + +DROP TABLE IF EXISTS t_sparse_sort_limit; From f13343b7ee49838fafa3ba71dd37d19741c98785 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 31 Jul 2023 17:50:24 +0000 Subject: [PATCH 63/81] ASD --- src/Columns/ColumnSparse.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index d5edae7e7ea..4f76a9be4b9 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -1,4 +1,3 @@ -#include "Common/typeid_cast.h" #include #include #include From f2fd1da89163b037495f2b10a6af936d1874be3b Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Mon, 31 Jul 2023 19:13:27 +0000 Subject: [PATCH 64/81] Fix a build error --- src/Functions/FunctionsHashing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 929cb2f77f4..5363220bd01 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -1074,7 +1074,7 @@ private: { // IPv6 addresses are parsed into four 32-bit components in big-endian ordering on both platforms, so no change is necessary. // Reference: `parseIPv6orIPv4` in src/Common/formatIPv6.h. - if constexpr (std::endian::native == std::endian::big && std::is_same_v) + if constexpr (std::endian::native == std::endian::big && std::is_same_v, IPv6>) return; transformEndianness(value); From e5240f900b2f0c05060ce538e0d07fe233f53883 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Tue, 1 Aug 2023 00:09:13 +0200 Subject: [PATCH 65/81] make regexp analyzer recognize named capturing groups --- src/Common/OptimizedRegularExpression.cpp | 24 +++++++++++++++++++ src/Common/tests/gtest_optimize_re.cpp | 4 ++++ .../02751_match_constant_needle.reference | 1 + .../02751_match_constant_needle.sql | 1 + 4 files changed, 30 insertions(+) diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp index 05e6aefbb5e..bcef49ce00c 100644 --- a/src/Common/OptimizedRegularExpression.cpp +++ b/src/Common/OptimizedRegularExpression.cpp @@ -45,6 +45,25 @@ size_t shortest_literal_length(const Literals & literals) return shortest; } +const char * skipNameCapturingGroup(const char * pos, size_t offset, const char * end) +{ + const char special = *(pos + offset) == '<' ? '>' : '\''; + offset ++; + while (pos + offset < end) + { + const char cur = *(pos + offset); + if (cur == special) + { + return pos + offset; + } + if (('0' <= cur && cur <= '9') || ('a' <= cur && cur <= 'z') || ('A' <= cur && cur <= 'Z')) + offset ++; + else + return pos; + } + return pos; +} + const char * analyzeImpl( std::string_view regexp, const char * pos, @@ -247,10 +266,15 @@ const char * analyzeImpl( break; } } + /// (?regex) means non-capturing parentheses group if (pos + 2 < end && pos[1] == '?' && pos[2] == ':') { pos += 2; } + if (pos + 3 < end && pos[1] == '?' && (pos[2] == '<' || pos[2] == '\'' || (pos[2] == 'P' && pos[3] == '<'))) + { + pos = skipNameCapturingGroup(pos, pos[2] == 'P' ? 3: 2, end); + } Literal group_required_substr; bool group_is_trival = true; Literals group_alters; diff --git a/src/Common/tests/gtest_optimize_re.cpp b/src/Common/tests/gtest_optimize_re.cpp index 3710666d336..a9fcb918b24 100644 --- a/src/Common/tests/gtest_optimize_re.cpp +++ b/src/Common/tests/gtest_optimize_re.cpp @@ -47,4 +47,8 @@ TEST(OptimizeRE, analyze) test_f("abc|(:?xx|yy|zz|x?)def", "", {"abc", "def"}); test_f("abc|(:?xx|yy|zz|x?){1,2}def", "", {"abc", "def"}); test_f(R"(\\A(?:(?:[-0-9_a-z]+(?:\\.[-0-9_a-z]+)*)/k8s1)\\z)", "/k8s1"); + test_f("[a-zA-Z]+(?P\\d+)", ""); + test_f("[a-zA-Z]+(?\\d+)", ""); + test_f("[a-zA-Z]+(?'num'\\d+)", ""); + test_f("[a-zA-Z]+(?x\\d+)", "x"); } diff --git a/tests/queries/0_stateless/02751_match_constant_needle.reference b/tests/queries/0_stateless/02751_match_constant_needle.reference index d00491fd7e5..6ed281c757a 100644 --- a/tests/queries/0_stateless/02751_match_constant_needle.reference +++ b/tests/queries/0_stateless/02751_match_constant_needle.reference @@ -1 +1,2 @@ 1 +1 diff --git a/tests/queries/0_stateless/02751_match_constant_needle.sql b/tests/queries/0_stateless/02751_match_constant_needle.sql index 71bdcc7cb0a..9980c3760f3 100644 --- a/tests/queries/0_stateless/02751_match_constant_needle.sql +++ b/tests/queries/0_stateless/02751_match_constant_needle.sql @@ -1 +1,2 @@ select match('default/k8s1', '\\A(?:(?:[-0-9_a-z]+(?:\\.[-0-9_a-z]+)*)/k8s1)\\z'); +select match('abc123', '[a-zA-Z]+(?P\\d+)'); From 0374b3de3e80edeac7e0b8580d32f313e2734d30 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 1 Aug 2023 09:04:55 +0200 Subject: [PATCH 66/81] Update src/Server/KeeperTCPHandler.cpp Co-authored-by: Azat Khuzhin --- src/Server/KeeperTCPHandler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Server/KeeperTCPHandler.cpp b/src/Server/KeeperTCPHandler.cpp index e44ab4a4bd7..58d227a5ae5 100644 --- a/src/Server/KeeperTCPHandler.cpp +++ b/src/Server/KeeperTCPHandler.cpp @@ -175,7 +175,8 @@ struct SocketInterruptablePollWrapper } while (rc < 0 && errno == POCO_EINTR); - if (rc >= 1) { + if (rc >= 1) + { if (poll_buf[0].revents & POLLIN) socket_ready = true; if (poll_buf[1].revents & POLLIN) From 9ab875f5ad115002148addc0e3757610b93965d0 Mon Sep 17 00:00:00 2001 From: zvonand Date: Tue, 1 Aug 2023 11:38:15 +0200 Subject: [PATCH 67/81] add explicit nullptr for now --- src/Functions/FunctionsConversion.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index e95699a0cec..d4dd843a66e 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -1371,8 +1371,8 @@ struct ConvertThroughParsing precise_float_parsing = query_context->getSettingsRef().precise_float_parsing; } - std::function parseFunction; - std::function tryParseFunction; + std::function parseFunction = nullptr; + std::function tryParseFunction = nullptr; if constexpr (!(parsing_mode == ConvertFromStringParsingMode::BestEffort || parsing_mode == ConvertFromStringParsingMode::BestEffortUS || From fe869207a1b6930826723cbaae6f0b391f1a04c0 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 1 Aug 2023 10:28:23 +0000 Subject: [PATCH 68/81] Disable a couple of long tests for debug build. --- .../00840_long_concurrent_select_and_drop_deadlock.sh | 2 +- .../0_stateless/02151_hash_table_sizes_stats_distributed.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh b/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh index 6714d8b35ca..cbe37de6651 100755 --- a/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh +++ b/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: deadlock, no-parallel +# Tags: deadlock, no-parallel, no-debug # NOTE: database = $CLICKHOUSE_DATABASE is unwanted diff --git a/tests/queries/0_stateless/02151_hash_table_sizes_stats_distributed.sh b/tests/queries/0_stateless/02151_hash_table_sizes_stats_distributed.sh index 703b2c4357c..b23be4283b2 100755 --- a/tests/queries/0_stateless/02151_hash_table_sizes_stats_distributed.sh +++ b/tests/queries/0_stateless/02151_hash_table_sizes_stats_distributed.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, distributed, no-tsan +# Tags: long, distributed, no-tsan, no-debug # These tests don't use `current_database = currentDatabase()` condition, because database name isn't propagated during remote queries. From 09cb5cbe8704f158b743b8c355e87aaf512b88e0 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 1 Aug 2023 14:30:41 +0000 Subject: [PATCH 69/81] Try to continue clickhouse process in stress test after terminating gdb. --- tests/ci/stress.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/ci/stress.py b/tests/ci/stress.py index 6d17384c63f..60ddeb85d35 100755 --- a/tests/ci/stress.py +++ b/tests/ci/stress.py @@ -125,6 +125,8 @@ def prepare_for_hung_check(drop_databases): # However, it obstruct checking for hung queries. logging.info("Will terminate gdb (if any)") call_with_retry("kill -TERM $(pidof gdb)") + # Sometimes there is a message `Child process was stopped by signal 19` in logs after stopping gdb + call_with_retry("kill -CONT $(lsof -ti:9000)") # ThreadFuzzer significantly slows down server and causes false-positive hung check failures call_with_retry("clickhouse client -q 'SYSTEM STOP THREAD FUZZER'") From 4e849a8abac2baec0156c9d306607422b31a09c7 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Tue, 1 Aug 2023 16:44:13 +0200 Subject: [PATCH 70/81] fix ci for #52091 --- .../Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp b/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp index 7205ac299a9..774c07ef1d4 100644 --- a/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp +++ b/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp @@ -48,7 +48,7 @@ public: return true; } - void visitImpl(QueryTreeNodePtr & node) const + void enterImpl(QueryTreeNodePtr & node) const { const static std::unordered_map swap_relations = { {"equals", "equals"}, From 79d49a006c2f179ea1aa4044bc1228eb6b687358 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 1 Aug 2023 17:06:23 +0200 Subject: [PATCH 71/81] Fix the PR body check for `Reverts #number` --- tests/ci/cancel_and_rerun_workflow_lambda/app.py | 4 +++- tests/ci/lambda_shared_package/lambda_shared/pr.py | 8 ++------ tests/ci/run_check.py | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/ci/cancel_and_rerun_workflow_lambda/app.py b/tests/ci/cancel_and_rerun_workflow_lambda/app.py index 250655ddeb2..8282fb7768a 100644 --- a/tests/ci/cancel_and_rerun_workflow_lambda/app.py +++ b/tests/ci/cancel_and_rerun_workflow_lambda/app.py @@ -323,7 +323,9 @@ def main(event): if action == "edited": print("PR is edited, check if the body is correct") - error, category = check_pr_description(pull_request["body"]) + error, _ = check_pr_description( + pull_request["body"], pull_request["base"]["repo"]["full_name"] + ) if error: print( f"The PR's body is wrong, is going to comment it. The error is: {error}" diff --git a/tests/ci/lambda_shared_package/lambda_shared/pr.py b/tests/ci/lambda_shared_package/lambda_shared/pr.py index ef47eacc082..82fc53b9356 100644 --- a/tests/ci/lambda_shared_package/lambda_shared/pr.py +++ b/tests/ci/lambda_shared_package/lambda_shared/pr.py @@ -101,7 +101,7 @@ LABELS = { CATEGORY_TO_LABEL = {c: lb for lb, categories in LABELS.items() for c in categories} -def check_pr_description(pr_body: str) -> Tuple[str, str]: +def check_pr_description(pr_body: str, repo_name: str) -> Tuple[str, str]: """The function checks the body to being properly formatted according to .github/PULL_REQUEST_TEMPLATE.md, if the first returned string is not empty, then there is an error.""" @@ -109,11 +109,7 @@ def check_pr_description(pr_body: str) -> Tuple[str, str]: lines = [re.sub(r"\s+", " ", line) for line in lines] # Check if body contains "Reverts ClickHouse/ClickHouse#36337" - if [ - True - for line in lines - if re.match(r"\AReverts {GITHUB_REPOSITORY}#[\d]+\Z", line) - ]: + if [True for line in lines if re.match(rf"\AReverts {repo_name}#[\d]+\Z", line)]: return "", LABELS["pr-not-for-changelog"][0] category = "" diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 330a1309016..4f022b6c0a5 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -108,7 +108,7 @@ def main(): gh = Github(get_best_robot_token(), per_page=100) commit = get_commit(gh, pr_info.sha) - description_error, category = check_pr_description(pr_info.body) + description_error, category = check_pr_description(pr_info.body, GITHUB_REPOSITORY) pr_labels_to_add = [] pr_labels_to_remove = [] if ( From b79cd49ccf11f0d3fc74c495ea8cae6809862ec6 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Tue, 1 Aug 2023 17:27:36 +0200 Subject: [PATCH 72/81] doc: add PGO article --- .../profile-guided-optimization.md | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 docs/en/operations/optimizing-performance/profile-guided-optimization.md diff --git a/docs/en/operations/optimizing-performance/profile-guided-optimization.md b/docs/en/operations/optimizing-performance/profile-guided-optimization.md new file mode 100644 index 00000000000..cda21e3c604 --- /dev/null +++ b/docs/en/operations/optimizing-performance/profile-guided-optimization.md @@ -0,0 +1,26 @@ +--- +slug: /en/operations/optimizing-performance/profile-guided-optimization +sidebar_position: 54 +sidebar_label: Profile Guided Optimization (PGO) +--- +import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.md'; + +# Profile Guided Optimization + +Profile-Guided Optimization (PGO) is a compiler optimization technique where a program is optimized based on the runtime profile. + +According to the tests, PGO helps with achieving better performance for ClickHouse. According to the tests, we see improvements up to 15% in QPS on the ClickBench test suite. The more detailed results are available [here](https://pastebin.com/xbue3HMU). The performance benefits depend on your typical workload - you can get better or worse results. + +More information about PGO in ClickHouse you can read in the corresponding GitHub [issue](https://github.com/ClickHouse/ClickHouse/issues/44567). + +## How to build ClickHouse with PGO? + +There are two major kinds of PGO: [Instrumentation](https://clang.llvm.org/docs/UsersManual.html#using-sampling-profilers) and [Sampling](https://clang.llvm.org/docs/UsersManual.html#using-sampling-profilers) (also known as AutoFDO). In this guide is described the Instrumentation PGO with ClickHouse. + +1. Build ClickHouse in Instrumented mode. In Clang it can be done via passing `-fprofile-instr-generate` option to `CXXFLAGS`. +2. Run instrumented ClickHouse on a sample workload. Here you need to use your usual workload. One of the approaches could be using [ClickBench](https://github.com/ClickHouse/ClickBench) as a sample workload. ClickHouse in the instrumentation mode could work slowly so be ready for that and do not run instrumented ClickHouse in performance-critical environments. +3. Recompile ClickHouse once again with `-fprofile-instr-use` compiler flags and profiles that are collected from the previous step. + +A more detailed guide on how to apply PGO is in the Clang [documentation](https://clang.llvm.org/docs/UsersManual.html#profile-guided-optimization). + +If you are going to collect a sample workload directly from a production environment, we recommend trying to use Sampling PGO. From a330a7ac38cb733ee386d8bd8a5a77e763b1b874 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 1 Aug 2023 15:47:35 +0200 Subject: [PATCH 73/81] PyYAML>=5.4 conflicts with cython 3 --- docker/test/integration/runner/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index 8e95d94b6dc..064538a0448 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -95,6 +95,7 @@ RUN python3 -m pip install --no-cache-dir \ pytest-timeout \ pytest-xdist \ pytz \ + pyyaml==5.3.1 \ redis \ requests-kerberos \ tzlocal==2.1 \ From 6b29cccf3e8c663cac47a7db076033bf7435e8bb Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 1 Aug 2023 15:50:02 +0200 Subject: [PATCH 74/81] Rename docker_images.json to docker_images_for_tests.json --- tests/ci/docker_test.py | 2 +- .../tests/{docker_images.json => docker_images_for_tests.json} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tests/ci/tests/{docker_images.json => docker_images_for_tests.json} (100%) diff --git a/tests/ci/docker_test.py b/tests/ci/docker_test.py index c679ab984ee..40c51702868 100644 --- a/tests/ci/docker_test.py +++ b/tests/ci/docker_test.py @@ -18,7 +18,7 @@ import docker_server as ds class TestDockerImageCheck(unittest.TestCase): docker_images_path = os.path.join( - os.path.dirname(__file__), "tests/docker_images.json" + os.path.dirname(__file__), "tests/docker_images_for_tests.json" ) def test_get_changed_docker_images(self): diff --git a/tests/ci/tests/docker_images.json b/tests/ci/tests/docker_images_for_tests.json similarity index 100% rename from tests/ci/tests/docker_images.json rename to tests/ci/tests/docker_images_for_tests.json From eb93d34e8db6a6ea3cffff91ed1583387fcac106 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 1 Aug 2023 15:39:56 +0000 Subject: [PATCH 75/81] Disable more tests for debug. --- tests/queries/0_stateless/00900_long_parquet_load.sh | 2 +- tests/queries/0_stateless/02352_rwlock.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/00900_long_parquet_load.sh b/tests/queries/0_stateless/00900_long_parquet_load.sh index 8142c5b5810..60317e1671c 100755 --- a/tests/queries/0_stateless/00900_long_parquet_load.sh +++ b/tests/queries/0_stateless/00900_long_parquet_load.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-fasttest +# Tags: long, no-fasttest, no-debug # # Load all possible .parquet files found in submodules. diff --git a/tests/queries/0_stateless/02352_rwlock.sh b/tests/queries/0_stateless/02352_rwlock.sh index 7505a03a382..22453be5c0b 100755 --- a/tests/queries/0_stateless/02352_rwlock.sh +++ b/tests/queries/0_stateless/02352_rwlock.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-parallel, no-fasttest +# Tags: no-parallel, no-fasttest, no-debug # Tag no-parallel -- to avoid running it in parallel, this will avoid possible issues due to high pressure # Test that ensures that WRITE lock failure notifies READ. From 107759aec834bf97c95bee1bce3e2c7852c7ce1b Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Tue, 1 Aug 2023 18:00:53 +0200 Subject: [PATCH 76/81] doc: add ClickBench and AutoFDO to the aspell ignore list --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 013ec546119..fe47cd83a41 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -41,6 +41,7 @@ AsynchronousMetricsUpdateInterval AsynchronousReadWait Authenticator Authenticators +AutoFDO AutoML Autocompletion AvroConfluent @@ -146,6 +147,7 @@ ChannelID Cidr Ciphertext CityHash +ClickBench ClickCat ClickHouse ClickHouse's From c485348486298ab66fac9ceab57b09004b81ff96 Mon Sep 17 00:00:00 2001 From: Justin de Guzman Date: Tue, 1 Aug 2023 09:28:27 -0700 Subject: [PATCH 77/81] Specify that written_bytes is uncompressed --- docs/en/operations/system-tables/query_log.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/system-tables/query_log.md b/docs/en/operations/system-tables/query_log.md index c6f565b8748..7143520835f 100644 --- a/docs/en/operations/system-tables/query_log.md +++ b/docs/en/operations/system-tables/query_log.md @@ -48,7 +48,7 @@ Columns: - `read_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Total number of rows read from all tables and table functions participated in query. It includes usual subqueries, subqueries for `IN` and `JOIN`. For distributed queries `read_rows` includes the total number of rows read at all replicas. Each replica sends it’s `read_rows` value, and the server-initiator of the query summarizes all received and local values. The cache volumes do not affect this value. - `read_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Total number of bytes read from all tables and table functions participated in query. It includes usual subqueries, subqueries for `IN` and `JOIN`. For distributed queries `read_bytes` includes the total number of rows read at all replicas. Each replica sends it’s `read_bytes` value, and the server-initiator of the query summarizes all received and local values. The cache volumes do not affect this value. - `written_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — For `INSERT` queries, the number of written rows. For other queries, the column value is 0. -- `written_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — For `INSERT` queries, the number of written bytes. For other queries, the column value is 0. +- `written_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — For `INSERT` queries, the number of written bytes (uncompressed). For other queries, the column value is 0. - `result_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of rows in a result of the `SELECT` query, or a number of rows in the `INSERT` query. - `result_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — RAM volume in bytes used to store a query result. - `memory_usage` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Memory consumption by the query. From 490554eb4f9a2ee339dde3b7a7dd393f3bf130ff Mon Sep 17 00:00:00 2001 From: zvonand Date: Tue, 1 Aug 2023 18:29:44 +0200 Subject: [PATCH 78/81] simplify --- src/Functions/FunctionsConversion.h | 69 +++++++++-------------------- 1 file changed, 22 insertions(+), 47 deletions(-) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index d4dd843a66e..79d17d8ac98 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -1039,28 +1039,22 @@ inline void convertFromTime(DataTypeDateTime::FieldType & x, t /** Conversion of strings to numbers, dates, datetimes: through parsing. */ -template -void parseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +template +void parseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool precise_float_parsing) { if constexpr (std::is_floating_point_v) { - if constexpr (precise_float_parsing) + if (precise_float_parsing) readFloatTextPrecise(x, rb); else readFloatTextFast(x, rb); } else - parseType(x, rb, time_zone); -} - -template -void parseType(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) -{ - readText(x, rb); + readText(x, rb); } template <> -inline void parseType(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline void parseImpl(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) { DayNum tmp(0); readDateText(tmp, rb, *time_zone); @@ -1068,7 +1062,7 @@ inline void parseType(DataTypeDate::FieldType & x, ReadBuffer & rb } template <> -inline void parseType(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline void parseImpl(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) { ExtendedDayNum tmp(0); readDateText(tmp, rb, *time_zone); @@ -1078,7 +1072,7 @@ inline void parseType(DataTypeDate32::FieldType & x, ReadBuffer // NOTE: no need of extra overload of DateTime64, since readDateTimeText64 has different signature and that case is explicitly handled in the calling code. template <> -inline void parseType(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline void parseImpl(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) { time_t time = 0; readDateTimeText(time, rb, *time_zone); @@ -1086,7 +1080,7 @@ inline void parseType(DataTypeDateTime::FieldType & x, ReadBuf } template <> -inline void parseType(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline void parseImpl(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) { UUID tmp; readUUIDText(tmp, rb); @@ -1094,7 +1088,7 @@ inline void parseType(DataTypeUUID::FieldType & x, ReadBuffer & rb } template <> -inline void parseType(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline void parseImpl(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) { IPv4 tmp; readIPv4Text(tmp, rb); @@ -1102,35 +1096,29 @@ inline void parseType(DataTypeIPv4::FieldType & x, ReadBuffer & rb } template <> -inline void parseType(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline void parseImpl(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) { IPv6 tmp; readIPv6Text(tmp, rb); x = tmp; } -template -bool tryParseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +template +bool tryParseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool precise_float_parsing) { if constexpr (std::is_floating_point_v) { - if constexpr (precise_float_parsing) + if (precise_float_parsing) return tryReadFloatTextPrecise(x, rb); else return tryReadFloatTextFast(x, rb); } else /*if constexpr (is_integer_v)*/ - return tryParseTypeImpl(x, rb, time_zone); -} - -template -inline bool tryParseTypeImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) -{ - return tryReadIntText(x, rb); + return tryReadIntText(x, rb); } template <> -inline bool tryParseTypeImpl(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline bool tryParseImpl(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) { DayNum tmp(0); if (!tryReadDateText(tmp, rb, *time_zone)) @@ -1140,7 +1128,7 @@ inline bool tryParseTypeImpl(DataTypeDate::FieldType & x, ReadBuff } template <> -inline bool tryParseTypeImpl(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline bool tryParseImpl(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) { ExtendedDayNum tmp(0); if (!tryReadDateText(tmp, rb, *time_zone)) @@ -1150,7 +1138,7 @@ inline bool tryParseTypeImpl(DataTypeDate32::FieldType & x, Read } template <> -inline bool tryParseTypeImpl(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone) +inline bool tryParseImpl(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) { time_t tmp = 0; if (!tryReadDateTimeText(tmp, rb, *time_zone)) @@ -1160,7 +1148,7 @@ inline bool tryParseTypeImpl(DataTypeDateTime::FieldType & x, } template <> -inline bool tryParseTypeImpl(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline bool tryParseImpl(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) { UUID tmp; if (!tryReadUUIDText(tmp, rb)) @@ -1171,7 +1159,7 @@ inline bool tryParseTypeImpl(DataTypeUUID::FieldType & x, ReadBuff } template <> -inline bool tryParseTypeImpl(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline bool tryParseImpl(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) { IPv4 tmp; if (!tryReadIPv4Text(tmp, rb)) @@ -1182,7 +1170,7 @@ inline bool tryParseTypeImpl(DataTypeIPv4::FieldType & x, ReadBuff } template <> -inline bool tryParseTypeImpl(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *) +inline bool tryParseImpl(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) { IPv6 tmp; if (!tryReadIPv6Text(tmp, rb)) @@ -1371,19 +1359,6 @@ struct ConvertThroughParsing precise_float_parsing = query_context->getSettingsRef().precise_float_parsing; } - std::function parseFunction = nullptr; - std::function tryParseFunction = nullptr; - - if constexpr (!(parsing_mode == ConvertFromStringParsingMode::BestEffort || - parsing_mode == ConvertFromStringParsingMode::BestEffortUS || - to_datetime64 || IsDataTypeDecimal)) - { - if constexpr (exception_mode == ConvertFromStringExceptionMode::Throw) - parseFunction = precise_float_parsing ? parseImpl : parseImpl; - else - tryParseFunction = precise_float_parsing ? tryParseImpl : tryParseImpl; - } - for (size_t i = 0; i < size; ++i) { size_t next_offset = std::is_same_v ? (*offsets)[i] : (current_offset + fixed_string_size); @@ -1450,7 +1425,7 @@ struct ConvertThroughParsing } } - parseFunction(vec_to[i], read_buffer, local_time_zone); + parseImpl(vec_to[i], read_buffer, local_time_zone, precise_float_parsing); } while (false); } } @@ -1520,7 +1495,7 @@ struct ConvertThroughParsing } } - parsed = tryParseFunction(vec_to[i], read_buffer, local_time_zone); + parsed = tryParseImpl(vec_to[i], read_buffer, local_time_zone, precise_float_parsing); } while (false); } } From 7aecec8bc727226bf9be98efa64fbc0f6858f707 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> Date: Tue, 1 Aug 2023 13:43:28 -0400 Subject: [PATCH 79/81] Update src/Common/OptimizedRegularExpression.cpp --- src/Common/OptimizedRegularExpression.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp index bcef49ce00c..dc35ddc14a9 100644 --- a/src/Common/OptimizedRegularExpression.cpp +++ b/src/Common/OptimizedRegularExpression.cpp @@ -266,7 +266,7 @@ const char * analyzeImpl( break; } } - /// (?regex) means non-capturing parentheses group + /// (?:regex) means non-capturing parentheses group if (pos + 2 < end && pos[1] == '?' && pos[2] == ':') { pos += 2; From dc33564044c2c59680c4a046367178d3f08a2bf5 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 2 Aug 2023 06:30:22 +0000 Subject: [PATCH 80/81] Disable reconfig test until proper fix --- .../test_keeper_reconfig_replace_leader_in_one_command/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py index e23d0674c12..3a8bf3fc9b1 100644 --- a/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py +++ b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py @@ -42,7 +42,7 @@ def started_cluster(): def get_fake_zk(node): return ku.get_fake_zk(cluster, node) - +@pytest.mark.skip(reason="test is flaky because changes are not properly waited for") def test_reconfig_replace_leader_in_one_command(started_cluster): """ Remove leader from a cluster of 3 and add a new node to this cluster in a single command From c1752355b48e5da5a0154cd333f1ecbbee94f700 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 2 Aug 2023 06:40:32 +0000 Subject: [PATCH 81/81] Automatic style fix --- .../test_keeper_reconfig_replace_leader_in_one_command/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py index 3a8bf3fc9b1..1ec44d8a002 100644 --- a/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py +++ b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py @@ -42,6 +42,7 @@ def started_cluster(): def get_fake_zk(node): return ku.get_fake_zk(cluster, node) + @pytest.mark.skip(reason="test is flaky because changes are not properly waited for") def test_reconfig_replace_leader_in_one_command(started_cluster): """