diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp
index 53e295b7fbb..aa4747636c9 100644
--- a/programs/local/LocalServer.cpp
+++ b/programs/local/LocalServer.cpp
@@ -313,11 +313,11 @@ void LocalServer::cleanup()
 
 std::string LocalServer::getInitialCreateTableQuery()
 {
-    if (!config().has("table-structure"))
+    if (!config().has("table-structure") && !config().has("table-file"))
         return {};
 
     auto table_name = backQuoteIfNeed(config().getString("table-name", "table"));
-    auto table_structure = config().getString("table-structure");
+    auto table_structure = config().getString("table-structure", "auto");
     auto data_format = backQuoteIfNeed(config().getString("table-data-format", "TSV"));
 
     String table_file;
@@ -332,7 +332,12 @@ std::string LocalServer::getInitialCreateTableQuery()
         table_file = quoteString(config().getString("table-file"));
     }
 
-    return fmt::format("CREATE TABLE {} ({}) ENGINE = File({}, {});",
+    if (table_structure == "auto")
+        table_structure = "";
+    else
+        table_structure = "(" + table_structure + ")";
+
+    return fmt::format("CREATE TABLE {} {} ENGINE = File({}, {});",
                        table_name, table_structure, data_format, table_file);
 }
 
@@ -422,7 +427,7 @@ try
 #else
     is_interactive = stdin_is_a_tty
         && (config().hasOption("interactive")
-            || (!config().has("query") && !config().has("table-structure") && queries_files.empty()));
+            || (!config().has("query") && !config().has("table-structure") && queries_files.empty() && !config().has("table-file")));
 #endif
     if (!is_interactive)
     {
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7124961821e..0fe66314114 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -526,6 +526,14 @@ if (USE_BZIP2)
     target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${BZIP2_INCLUDE_DIR})
 endif()
 
+if(USE_SIMDJSON)
+    dbms_target_link_libraries(PRIVATE simdjson)
+endif()
+
+if(USE_RAPIDJSON)
+    dbms_target_include_directories(SYSTEM PRIVATE ${RAPIDJSON_INCLUDE_DIR})
+endif()
+
 dbms_target_link_libraries(PUBLIC consistent-hashing)
 
 include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake")
diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp
index f2e5e018e1b..16f85fcae61 100644
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@@ -604,6 +604,7 @@
     M(633, QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW) \
     M(634, MONGODB_ERROR) \
     M(635, CANNOT_POLL) \
+    M(636, CANNOT_EXTRACT_TABLE_STRUCTURE) \
     \
     M(999, KEEPER_EXCEPTION) \
     M(1000, POCO_EXCEPTION) \
diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp
index f05a10b8815..c8753c8edaf 100644
--- a/src/Common/ZooKeeper/ZooKeeper.cpp
+++ b/src/Common/ZooKeeper/ZooKeeper.cpp
@@ -26,6 +26,7 @@ namespace ErrorCodes
 {
     extern const int LOGICAL_ERROR;
     extern const int NOT_IMPLEMENTED;
+    extern const int BAD_ARGUMENTS;
 }
 }
 
@@ -1133,4 +1134,54 @@ Coordination::RequestPtr makeCheckRequest(const std::string & path, int version)
     return request;
 }
 
+std::string normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log)
+{
+    if (!zookeeper_path.empty() && zookeeper_path.back() == '/')
+        zookeeper_path.resize(zookeeper_path.size() - 1);
+    /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it.
+    if (!zookeeper_path.empty() && zookeeper_path.front() != '/')
+    {
+        /// Do not allow this for new tables, print warning for tables created in old versions
+        if (check_starts_with_slash)
+            throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "ZooKeeper path must start with '/', got '{}'", zookeeper_path);
+        if (log)
+            LOG_WARNING(log, "ZooKeeper path ('{}') does not start with '/'. It will not be supported in future releases", zookeeper_path);
+        zookeeper_path = "/" + zookeeper_path;
+    }
+
+    return zookeeper_path;
+}
+
+String extractZooKeeperName(const String & path)
+{
+    static constexpr auto default_zookeeper_name = "default";
+    if (path.empty())
+        throw DB::Exception("ZooKeeper path should not be empty", DB::ErrorCodes::BAD_ARGUMENTS);
+    if (path[0] == '/')
+        return default_zookeeper_name;
+    auto pos = path.find(":/");
+    if (pos != String::npos && pos < path.find('/'))
+    {
+        auto zookeeper_name = path.substr(0, pos);
+        if (zookeeper_name.empty())
+            throw DB::Exception("ZooKeeper path should start with '/' or '<auxiliary_zookeeper_name>:/'", DB::ErrorCodes::BAD_ARGUMENTS);
+        return zookeeper_name;
+    }
+    return default_zookeeper_name;
+}
+
+String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log)
+{
+    if (path.empty())
+        throw DB::Exception("ZooKeeper path should not be empty", DB::ErrorCodes::BAD_ARGUMENTS);
+    if (path[0] == '/')
+        return normalizeZooKeeperPath(path, check_starts_with_slash, log);
+    auto pos = path.find(":/");
+    if (pos != String::npos && pos < path.find('/'))
+    {
+        return normalizeZooKeeperPath(path.substr(pos + 1, String::npos), check_starts_with_slash, log);
+    }
+    return normalizeZooKeeperPath(path, check_starts_with_slash, log);
+}
+
 }
diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h
index 8e015b1f331..371f93f6df3 100644
--- a/src/Common/ZooKeeper/ZooKeeper.h
+++ b/src/Common/ZooKeeper/ZooKeeper.h
@@ -379,4 +379,11 @@ private:
 };
 
 using EphemeralNodeHolderPtr = EphemeralNodeHolder::Ptr;
+
+String normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log = nullptr);
+
+String extractZooKeeperName(const String & path);
+
+String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log = nullptr);
+
 }
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 952009047d4..6e53fa4342c 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -596,6 +596,8 @@ class IColumn;
     M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \
     M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \
     M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
+    M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \
+    M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum number of rows of data to read for automatic schema inference", 0) \
     \
     M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \
     M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \
@@ -661,6 +663,7 @@ class IColumn;
     M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \
     \
     M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0)\
+
 
 // End of FORMAT_FACTORY_SETTINGS
 // Please add settings non-related to formats into the COMMON_SETTINGS above.
diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h
index e74df5c327a..85644b6f6ca 100644
--- a/src/DataTypes/IDataType.h
+++ b/src/DataTypes/IDataType.h
@@ -377,6 +377,8 @@ struct WhichDataType
     constexpr bool isNullable() const { return idx == TypeIndex::Nullable; }
     constexpr bool isFunction() const { return idx == TypeIndex::Function; }
     constexpr bool isAggregateFunction() const { return idx == TypeIndex::AggregateFunction; }
+
+    constexpr bool isLowCardinality() const { return idx == TypeIndex::LowCardinality; }
 };
 
 /// IDataType helpers (alternative for IDataType virtual methods with single point of truth)
diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp
index e9944b592ed..165bad950f5 100644
--- a/src/Databases/DatabaseOnDisk.cpp
+++ b/src/Databases/DatabaseOnDisk.cpp
@@ -76,10 +76,16 @@ std::pair<String, StoragePtr> createTableFromAST(
         /// - the database has not been loaded yet;
         /// - the code is simpler, since the query is already brought to a suitable form.
         if (!ast_create_query.columns_list || !ast_create_query.columns_list->columns)
-            throw Exception("Missing definition of columns.", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED);
-
-        columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true);
-        constraints = InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints);
+        {
+            if (!StorageFactory::instance().checkIfStorageSupportsSchemaInterface(ast_create_query.storage->engine->name))
+                throw Exception("Missing definition of columns.", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED);
+            /// Leave columns empty.
+        }
+        else
+        {
+            columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true);
+            constraints = InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints);
+        }
     }
 
     return
diff --git a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp
index ecfa5df8351..bed46a97c1b 100644
--- a/src/Formats/CapnProtoUtils.cpp
+++ b/src/Formats/CapnProtoUtils.cpp
@@ -7,6 +7,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -26,6 +28,7 @@ namespace ErrorCodes
     extern const int FILE_DOESNT_EXIST;
     extern const int UNKNOWN_EXCEPTION;
     extern const int INCORRECT_DATA;
+    extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
 }
 
 capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info)
@@ -427,6 +430,113 @@ void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Blo
     }
 }
 
+template <typename Type>
+static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants)
+{
+    std::vector<std::pair<String, Type>> values;
+    for (auto enumerant : enumerants)
+        values.emplace_back(enumerant.getProto().getName(), Type(enumerant.getOrdinal()));
+    return std::make_shared<DataTypeEnum<Type>>(std::move(values));
+}
+
+static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema)
+{
+    auto enumerants = enum_schema.getEnumerants();
+    if (enumerants.size() < 128)
+        return getEnumDataTypeFromEnumerants<Int8>(enumerants);
+    if (enumerants.size() < 32768)
+        return getEnumDataTypeFromEnumerants<Int16>(enumerants);
+
+    throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "ClickHouse supports only 8 and 16-bit Enums");
+}
+
+static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type)
+{
+    switch (capnp_type.which())
+    {
+        case capnp::schema::Type::INT8:
+            return std::make_shared<DataTypeInt8>();
+        case capnp::schema::Type::INT16:
+            return std::make_shared<DataTypeInt16>();
+        case capnp::schema::Type::INT32:
+            return std::make_shared<DataTypeInt32>();
+        case capnp::schema::Type::INT64:
+            return std::make_shared<DataTypeInt64>();
+        case capnp::schema::Type::BOOL: [[fallthrough]];
+        case capnp::schema::Type::UINT8:
+            return std::make_shared<DataTypeUInt8>();
+        case capnp::schema::Type::UINT16:
+            return std::make_shared<DataTypeUInt16>();
+        case capnp::schema::Type::UINT32:
+            return std::make_shared<DataTypeUInt32>();
+        case capnp::schema::Type::UINT64:
+            return std::make_shared<DataTypeUInt64>();
+        case capnp::schema::Type::FLOAT32:
+            return std::make_shared<DataTypeFloat32>();
+        case capnp::schema::Type::FLOAT64:
+            return std::make_shared<DataTypeFloat64>();
+        case capnp::schema::Type::DATA: [[fallthrough]];
+        case capnp::schema::Type::TEXT:
+            return std::make_shared<DataTypeString>();
+        case capnp::schema::Type::ENUM:
+            return getEnumDataTypeFromEnumSchema(capnp_type.asEnum());
+        case capnp::schema::Type::LIST:
+        {
+            auto list_schema = capnp_type.asList();
+            auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType());
+            return std::make_shared<DataTypeArray>(nested_type);
+        }
+        case capnp::schema::Type::STRUCT:
+        {
+            auto struct_schema = capnp_type.asStruct();
+
+            /// Check if it can be Nullable.
+            if (checkIfStructIsNamedUnion(struct_schema))
+            {
+                auto fields = struct_schema.getUnionFields();
+                if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid()))
+                    throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unions are not supported");
+                auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType();
+                if (value_type.isStruct() || value_type.isList())
+                    throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Tuples and Lists cannot be inside Nullable");
+
+                auto nested_type = getDataTypeFromCapnProtoType(value_type);
+                return std::make_shared<DataTypeNullable>(nested_type);
+            }
+
+            if (checkIfStructContainsUnnamedUnion(struct_schema))
+                throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unnamed union is not supported");
+
+            /// Treat Struct as Tuple.
+            DataTypes nested_types;
+            Names nested_names;
+            for (auto field : struct_schema.getNonUnionFields())
+            {
+                nested_names.push_back(field.getProto().getName());
+                nested_types.push_back(getDataTypeFromCapnProtoType(field.getType()));
+            }
+            return std::make_shared<DataTypeTuple>(std::move(nested_types), std::move(nested_names));
+        }
+        default:
+            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type));
+    }
+}
+
+NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema)
+{
+    if (checkIfStructContainsUnnamedUnion(schema))
+        throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unnamed union is not supported");
+
+    NamesAndTypesList names_and_types;
+    for (auto field : schema.getNonUnionFields())
+    {
+        auto name = field.getProto().getName();
+        auto type = getDataTypeFromCapnProtoType(field.getType());
+        names_and_types.emplace_back(name, type);
+    }
+    return names_and_types;
+}
+
 }
 
 #endif
diff --git a/src/Formats/CapnProtoUtils.h b/src/Formats/CapnProtoUtils.h
index 93ca0a5e616..51c152de17f 100644
--- a/src/Formats/CapnProtoUtils.h
+++ b/src/Formats/CapnProtoUtils.h
@@ -38,6 +38,7 @@ capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Re
 
 void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode);
 
+NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema);
 }
 
 #endif
diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp
index d956d9e6bfb..0a7747fc864 100644
--- a/src/Formats/EscapingRuleUtils.cpp
+++ b/src/Formats/EscapingRuleUtils.cpp
@@ -1,7 +1,16 @@
 #include
+#include
+#include
 #include
+#include
+#include
+#include
 #include
 #include
+#include
+#include
+#include
+#include
 
 namespace DB
 {
@@ -9,6 +18,7 @@ namespace DB
 namespace ErrorCodes
 {
     extern const int BAD_ARGUMENTS;
+    extern const int LOGICAL_ERROR;
 }
 
 FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule)
@@ -193,30 +203,145 @@ void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSe
     }
 }
 
-String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
+template <bool read_string>
+String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
 {
     String result;
     switch (escaping_rule)
     {
         case FormatSettings::EscapingRule::Quoted:
-            readQuotedString(result, buf);
+            if constexpr (read_string)
+                readQuotedString(result, buf);
+            else
+                readQuotedFieldIntoString(result, buf);
             break;
         case FormatSettings::EscapingRule::JSON:
-            readJSONString(result, buf);
+            if constexpr (read_string)
+                readJSONString(result, buf);
+            else
+                readJSONFieldIntoString(result, buf);
             break;
         case FormatSettings::EscapingRule::Raw:
             readString(result, buf);
             break;
         case FormatSettings::EscapingRule::CSV:
-            readCSVString(result, buf, format_settings.csv);
+            if constexpr (read_string)
+                readCSVString(result, buf, format_settings.csv);
+            else
+                readCSVField(result, buf, format_settings.csv);
             break;
         case FormatSettings::EscapingRule::Escaped:
             readEscapedString(result, buf);
             break;
         default:
-            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read string with {} escaping rule", escapingRuleToString(escaping_rule));
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule));
     }
     return result;
 }
 
+String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
+{
+    return readByEscapingRule<false>(buf, escaping_rule, format_settings);
+}
+
+String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
+{
+    return readByEscapingRule<true>(buf, escaping_rule, format_settings);
+}
+
+static bool evaluateConstantExpressionFromString(const StringRef & field, DataTypePtr & type, ContextPtr context)
+{
+    if (!context)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "You must provide context to evaluate constant expression");
+
+    ParserExpression parser;
+    Expected expected;
+    Tokens tokens(field.data, field.data + field.size);
+    IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth);
+    ASTPtr ast;
+
+    /// FIXME: Our parser cannot parse maps in the form of '{key : value}' that is used in text formats.
+    bool parsed = parser.parse(token_iterator, ast, expected);
+    if (!parsed)
+        return false;
+
+    try
+    {
+        std::pair<Field, DataTypePtr> result = evaluateConstantExpression(ast, context);
+        type = generalizeDataType(result.second);
+        return true;
+    }
+    catch (...)
+    {
+        return false;
+    }
+}
+
+DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context)
+{
+    switch (escaping_rule)
+    {
+        case FormatSettings::EscapingRule::Quoted:
+        {
+            DataTypePtr type;
+            bool parsed = evaluateConstantExpressionFromString(field, type, context);
+            return parsed ? type : nullptr;
+        }
+        case FormatSettings::EscapingRule::JSON:
+            return getDataTypeFromJSONField(field);
+        case FormatSettings::EscapingRule::CSV:
+        {
+            if (field.empty() || field == format_settings.csv.null_representation)
+                return nullptr;
+
+            if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
+                return std::make_shared<DataTypeUInt8>();
+
+            DataTypePtr type;
+            bool parsed;
+            if (field[0] == '\'' || field[0] == '"')
+            {
+                /// Try to evaluate expression inside quotes.
+                parsed = evaluateConstantExpressionFromString(StringRef(field.data() + 1, field.size() - 2), type, context);
+                /// If it's a number in quotes we determine it as a string.
+                if (parsed && type && isNumber(removeNullable(type)))
+                    return makeNullable(std::make_shared<DataTypeString>());
+            }
+            else
+                parsed = evaluateConstantExpressionFromString(field, type, context);
+
+            /// If we couldn't parse an expression, determine it as a string.
+            return parsed ? type : makeNullable(std::make_shared<DataTypeString>());
+        }
+        case FormatSettings::EscapingRule::Raw: [[fallthrough]];
+        case FormatSettings::EscapingRule::Escaped:
+            /// TODO: Try to use some heuristics here to determine the type of data.
+            return field.empty() ? nullptr : makeNullable(std::make_shared<DataTypeString>());
+        default:
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule));
+    }
+}
+
+DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context)
+{
+    DataTypes data_types;
+    data_types.reserve(fields.size());
+    for (const auto & field : fields)
+        data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule, context));
+    return data_types;
+}
+
+DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule)
+{
+    switch (escaping_rule)
+    {
+        case FormatSettings::EscapingRule::CSV: [[fallthrough]];
+        case FormatSettings::EscapingRule::Escaped: [[fallthrough]];
+        case FormatSettings::EscapingRule::Raw:
+            return makeNullable(std::make_shared<DataTypeString>());
+        default:
+            return nullptr;
+    }
+}
+
 }
diff --git a/src/Formats/EscapingRuleUtils.h b/src/Formats/EscapingRuleUtils.h
index 02f027db74d..10147b29ad6 100644
--- a/src/Formats/EscapingRuleUtils.h
+++ b/src/Formats/EscapingRuleUtils.h
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include
 
 namespace DB
 {
@@ -33,5 +34,24 @@ void serializeFieldByEscapingRule(
 void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings);
 String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings);
+String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings);
+
+/// Try to determine the type of the field written by a specific escaping rule.
+/// If cannot, return nullptr.
+/// - For Quoted escaping rule we can interpret a single field as a constant
+///   expression and get its type by evaluating this expression.
+/// - For JSON escaping rule we can use JSON parser to parse a single field
+///   and then convert JSON type of this field to ClickHouse type.
+/// - For CSV escaping rule we can do the following:
+///   - If the field is an unquoted string, then we could try to evaluate it
+///     as a constant expression, and if it fails, treat it as a String.
+///   - If the field is a string in quotes, then we can try to evaluate
+///     expression inside quotes as a constant expression, and if it fails or
+///     the result is a number (we don't parse numbers in quotes) we treat it as a String.
+/// - For TSV and TSVRaw we treat each field as a String (TODO: try to use some tweaks and heuristics here)
+DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr);
+DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr);
+
+DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule);
 
 }
diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp
index cf2cdd6c547..a0a5550627d 100644
--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@@ -14,9 +14,6 @@
 #include
 #include
 
-#include
-#include
-
 namespace DB
 {
 
@@ -120,6 +117,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
     format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
     format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
     format_settings.seekable_read = settings.input_format_allow_seeks;
+    format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns;
+    format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference;
 
     /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
     if (format_settings.schema.is_server)
@@ -201,7 +200,6 @@ InputFormatPtr FormatFactory::getInput(
     return format;
 }
 
-
 InputFormatPtr FormatFactory::getInputFormat(
     const String & name,
     ReadBuffer & buf,
@@ -342,6 +340,32 @@ String FormatFactory::getContentType(
     return format->getContentType();
 }
 
+SchemaReaderPtr FormatFactory::getSchemaReader(
+    const String & name,
+    ReadBuffer & buf,
+    ContextPtr context,
+    const std::optional<FormatSettings> & _format_settings) const
+{
+    const auto & schema_reader_creator = dict.at(name).schema_reader_creator;
+    if (!schema_reader_creator)
+        throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR);
+
+    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
+    return schema_reader_creator(buf, format_settings, context);
+}
+
+ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader(
+    const String & name,
+    ContextPtr context,
+    const std::optional<FormatSettings> & _format_settings) const
+{
+    const auto & external_schema_reader_creator = dict.at(name).external_schema_reader_creator;
+    if (!external_schema_reader_creator)
+        throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR);
+
+    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
+    return external_schema_reader_creator(format_settings);
+}
 
 void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator)
 {
@@ -375,6 +399,21 @@ void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegm
     target = std::move(file_segmentation_engine);
 }
 
+void FormatFactory::registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator)
+{
+    auto & target = dict[name].schema_reader_creator;
+    if (target)
+        throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
+    target = std::move(schema_reader_creator);
+}
+
+void FormatFactory::registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator)
+{
+    auto & target = dict[name].external_schema_reader_creator;
+    if (target)
+        throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
+    target = std::move(external_schema_reader_creator);
+}
 
 void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & name)
 {
@@ -412,6 +451,23 @@ bool FormatFactory::isOutputFormat(const String & name) const
     return it != dict.end() && it->second.output_creator;
 }
 
+bool FormatFactory::checkIfFormatHasSchemaReader(const String & name)
+{
+    const auto & target = getCreators(name);
+    return bool(target.schema_reader_creator);
+}
+
+bool FormatFactory::checkIfFormatHasExternalSchemaReader(const String & name)
+{
+    const auto & target = getCreators(name);
+    return bool(target.external_schema_reader_creator);
+}
+
+bool FormatFactory::checkIfFormatHasAnySchemaReader(const String & name)
+{
+    return checkIfFormatHasSchemaReader(name) || checkIfFormatHasExternalSchemaReader(name);
+}
+
 FormatFactory & FormatFactory::instance()
 {
     static FormatFactory ret;
diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h
index ea285c47996..a62b32da0cc 100644
--- a/src/Formats/FormatFactory.h
+++ b/src/Formats/FormatFactory.h
@@ -4,7 +4,9 @@
 #include
 #include
 #include
+#include
 #include
+#include
 
 #include
 
@@ -31,6 +33,11 @@ class IOutputFormat;
 struct RowInputFormatParams;
 struct RowOutputFormatParams;
 
+class ISchemaReader;
+class IExternalSchemaReader;
+using SchemaReaderPtr = std::shared_ptr<ISchemaReader>;
+using ExternalSchemaReaderPtr = std::shared_ptr<IExternalSchemaReader>;
+
 using InputFormatPtr = std::shared_ptr<IInputFormat>;
 using OutputFormatPtr = std::shared_ptr<IOutputFormat>;
 
@@ -85,11 +92,16 @@ private:
     /// The checker should return true if parallel parsing should be disabled.
     using NonTrivialPrefixAndSuffixChecker = std::function<bool(ReadBuffer & buf)>;
 
+    using SchemaReaderCreator = std::function<SchemaReaderPtr(ReadBuffer & in, const FormatSettings & settings, ContextPtr context)>;
+    using ExternalSchemaReaderCreator = std::function<ExternalSchemaReaderPtr(const FormatSettings & settings)>;
+
     struct Creators
     {
         InputCreator input_creator;
         OutputCreator output_creator;
         FileSegmentationEngine file_segmentation_engine;
+        SchemaReaderCreator schema_reader_creator;
+        ExternalSchemaReaderCreator external_schema_reader_creator;
         bool supports_parallel_formatting{false};
         bool is_column_oriented{false};
         NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker;
@@ -138,6 +150,17 @@ public:
         ContextPtr context,
         const std::optional<FormatSettings> & format_settings = std::nullopt) const;
 
+    SchemaReaderPtr getSchemaReader(
+        const String & name,
+        ReadBuffer & buf,
+        ContextPtr context,
+        const std::optional<FormatSettings> & format_settings = std::nullopt) const;
+
+    ExternalSchemaReaderPtr getExternalSchemaReader(
+        const String & name,
+        ContextPtr context,
+        const std::optional<FormatSettings> & format_settings = std::nullopt) const;
+
     void registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine);
 
     void registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker);
@@ -146,11 +169,19 @@ public:
     void registerInputFormat(const String & name, InputCreator input_creator);
     void registerOutputFormat(const String & name, OutputCreator output_creator);
 
+    /// Register schema readers for a format by its name.
+    void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator);
+    void registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator);
+
     void markOutputFormatSupportsParallelFormatting(const String & name);
     void markFormatAsColumnOriented(const String & name);
 
     bool checkIfFormatIsColumnOriented(const String & name);
 
+    bool checkIfFormatHasSchemaReader(const String & name);
+    bool checkIfFormatHasExternalSchemaReader(const String & name);
+    bool checkIfFormatHasAnySchemaReader(const String & name);
+
     const FormatsDictionary & getAllFormats() const
     {
         return dict;
@@ -163,6 +194,7 @@ private:
     FormatsDictionary dict;
 
     const Creators & getCreators(const String & name) const;
+
 };
 
 }
diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h
index d9af07fdc9c..6298e959c3e 100644
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@@ -33,6 +33,7 @@ struct FormatSettings
     bool defaults_for_omitted_fields = true;
 
     bool seekable_read = true;
+    UInt64 max_rows_to_read_for_schema_inference = 100;
 
     enum class DateTimeInputFormat
     {
@@ -217,6 +218,11 @@ struct FormatSettings
     {
         EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES;
     } capn_proto;
+
+    struct
+    {
+        UInt64 number_of_columns = 0;
+    } msgpack;
 };
 
 }
diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp
index b55e9f59cc7..c63b8453634 100644
--- a/src/Formats/JSONEachRowUtils.cpp
+++ b/src/Formats/JSONEachRowUtils.cpp
@@ -1,7 +1,17 @@
 #include
 #include
+#include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 
 #include
 
@@ -26,7 +36,7 @@ static std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer
     while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast<size_t>(pos - in.position()) < min_chunk_size || number_of_rows < min_rows))
     {
         const auto current_object_size = memory.size() + static_cast<size_t>(pos - in.position());
-        if (current_object_size > 10 * min_chunk_size)
+        if (min_chunk_size != 0 && current_object_size > 10 * min_chunk_size)
             throw ParsingException("Size of JSON object is extremely large. Expected not greater than " +
                 std::to_string(min_chunk_size) + " bytes, but current is " + std::to_string(current_object_size) +
                 " bytes per row. Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely JSON is malformed", ErrorCodes::INCORRECT_DATA);
@@ -92,6 +102,122 @@ static std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer
     return {loadAtPosition(in, memory, pos), number_of_rows};
 }
 
+template <const char opening_bracket, const char closing_bracket>
+static String readJSONEachRowLineIntoStringImpl(ReadBuffer & in)
+{
+    Memory memory;
+    fileSegmentationEngineJSONEachRowImpl<opening_bracket, closing_bracket>(in, memory, 0, 1);
+    return String(memory.data(), memory.size());
+}
+
+template <class Element>
+DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field)
+{
+    if (field.isNull())
+        return nullptr;
+
+    if (field.isBool())
+        return makeNullable(std::make_shared<DataTypeUInt8>());
+
+    if (field.isInt64() || field.isUInt64() || field.isDouble())
+        return makeNullable(std::make_shared<DataTypeFloat64>());
+
+    if (field.isString())
+        return makeNullable(std::make_shared<DataTypeString>());
+
+    if (field.isArray())
+    {
+        auto array = field.getArray();
+
+        /// Return nullptr in case of empty array because we cannot determine nested type.
+        if (array.size() == 0)
+            return nullptr;
+
+        DataTypes nested_data_types;
+        /// If this array contains fields with different types we will treat it as Tuple.
+        bool is_tuple = false;
+        for (const auto element : array)
+        {
+            auto type = getDataTypeFromJSONFieldImpl(element);
+            if (!type)
+                return nullptr;
+
+            if (!nested_data_types.empty() && type->getName() != nested_data_types.back()->getName())
+                is_tuple = true;
+
+            nested_data_types.push_back(std::move(type));
+        }
+
+        if (is_tuple)
+            return std::make_shared<DataTypeTuple>(nested_data_types);
+
+        return std::make_shared<DataTypeArray>(nested_data_types.back());
+    }
+
+    if (field.isObject())
+    {
+        auto object = field.getObject();
+        DataTypePtr value_type;
+        for (const auto key_value_pair : object)
+        {
+            auto type = getDataTypeFromJSONFieldImpl(key_value_pair.second);
+            if (!type)
+                return nullptr;
+
+            if (value_type && value_type->getName() != type->getName())
+                return nullptr;
+
+            value_type = type;
+        }
+        return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), value_type);
+    }
+
+    throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"};
+}
+
+auto getJSONParserAndElement()
+{
+#if USE_SIMDJSON
+    return std::pair<SimdJSONParser, SimdJSONParser::Element>();
+#elif USE_RAPIDJSON
+    return std::pair<RapidJSONParser, RapidJSONParser::Element>();
+#else
+    return std::pair<DummyJSONParser, DummyJSONParser::Element>();
+#endif
+}
+
+DataTypePtr getDataTypeFromJSONField(const String & field)
+{
+    auto [parser, element] = getJSONParserAndElement();
+    bool parsed = parser.parse(field, element);
+    if (!parsed)
+        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object");
+
+    return getDataTypeFromJSONFieldImpl(element);
+}
+
+template <class Extractor, const char opening_bracket, const char closing_bracket>
+static DataTypes determineColumnDataTypesFromJSONEachRowDataImpl(ReadBuffer & in, bool /*json_strings*/, Extractor & extractor)
+{
+    String line = readJSONEachRowLineIntoStringImpl<opening_bracket, closing_bracket>(in);
+    auto [parser, element] = getJSONParserAndElement();
+    bool parsed = parser.parse(line, element);
+    if (!parsed)
+        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object");
+
+    auto fields = extractor.extract(element);
+
+    DataTypes data_types;
+    data_types.reserve(fields.size());
+    for (const auto & field : fields)
+        data_types.push_back(getDataTypeFromJSONFieldImpl(field));
+
+    /// TODO: For JSONStringsEachRow/JSONCompactStringsEachRow all types will be strings.
+    /// Should we try to parse data inside strings somehow in this case?
+
+    return data_types;
+}
+
 std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
 {
     return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_chunk_size, 1);
@@ -102,6 +228,60 @@ std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in
     return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_chunk_size, min_rows);
 }
 
+struct JSONEachRowFieldsExtractor
+{
+    template <class Element>
+    std::vector<Element> extract(const Element & element)
+    {
+        /// {..., "<column_name>" : <value>, ...}
+        auto object = element.getObject();
+        std::vector<Element> fields;
+        fields.reserve(object.size());
+        column_names.reserve(object.size());
+        for (const auto & key_value_pair : object)
+        {
+            column_names.emplace_back(key_value_pair.first);
+            fields.push_back(key_value_pair.second);
+        }
+
+        return fields;
+    }
+
+    std::vector<String> column_names;
+};
+
+std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings)
+{
+    JSONEachRowFieldsExtractor extractor;
+    auto data_types = determineColumnDataTypesFromJSONEachRowDataImpl<JSONEachRowFieldsExtractor, '{', '}'>(in, json_strings, extractor);
+    std::unordered_map<String, DataTypePtr> result;
+    for (size_t i = 0; i != extractor.column_names.size(); ++i)
+        result[extractor.column_names[i]] = data_types[i];
+    return result;
+}
+
+struct JSONCompactEachRowFieldsExtractor
+{
+    template <class Element>
+    std::vector<Element> extract(const Element & element)
+    {
+        /// [..., <value>, ...]
+        auto array = element.getArray();
+        std::vector<Element> fields;
+        fields.reserve(array.size());
+        for (size_t i = 0; i != array.size(); ++i)
+            fields.push_back(array[i]);
+        return fields;
+    }
+};
+
+DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings)
+{
+    JSONCompactEachRowFieldsExtractor extractor;
+    return determineColumnDataTypesFromJSONEachRowDataImpl<JSONCompactEachRowFieldsExtractor, '[', ']'>(in, json_strings, extractor);
+}
+
+
 bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf)
 {
     /// For JSONEachRow we can safely skip whitespace characters
diff --git a/src/Formats/JSONEachRowUtils.h b/src/Formats/JSONEachRowUtils.h
index 4a049aa1abd..6f71baa8b40 100644
--- a/src/Formats/JSONEachRowUtils.h
+++ b/src/Formats/JSONEachRowUtils.h
@@ -11,6 +11,21 @@ namespace DB
 
 std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size);
 std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows);
+
+/// Parse JSON from string and convert its type to ClickHouse type. Make the result type always Nullable.
+/// JSON array with different nested types is treated as Tuple.
+/// If cannot convert (for example when field contains null), return nullptr.
+DataTypePtr getDataTypeFromJSONField(const String & field);
+
+/// Read row in JSONEachRow format and try to determine type for each field.
+/// Return map {column_name : type}.
+/// If cannot determine the type of some field, return nullptr for it.
+std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings);
+
+/// Read row in JSONCompactEachRow format and try to determine type for each field.
+/// If cannot determine the type of some field, return nullptr for it.
+DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings);
+
 bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf);
 
 bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings);
diff --git a/src/Formats/ParsedTemplateFormatString.cpp b/src/Formats/ParsedTemplateFormatString.cpp
index 4966420f05b..8d1b987d01a 100644
--- a/src/Formats/ParsedTemplateFormatString.cpp
+++ b/src/Formats/ParsedTemplateFormatString.cpp
@@ -14,14 +14,14 @@ namespace ErrorCodes
     extern const int INVALID_TEMPLATE_FORMAT;
 }
 
-ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name)
+ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes)
 {
     ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096);
     String format_string;
     readStringUntilEOF(format_string, schema_file);
     try
     {
-        parse(format_string, idx_by_name);
+        parse(format_string, idx_by_name, allow_indexes);
     }
     catch (DB::Exception & e)
     {
@@ -33,7 +33,7 @@ ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo &
 }
 
 
-void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name)
+void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes)
 {
     enum ParserState
     {
@@ -100,6 +100,8 @@ void ParsedTemplateFormatString::parse(const String & format_string, const Colum
                     column_idx = strtoull(column_names.back().c_str(), &col_idx_end, 10);
                     if (col_idx_end != column_names.back().c_str() + column_names.back().size() || errno)
                         column_idx = idx_by_name(column_names.back());
+                    else if (!allow_indexes)
+                        throw Exception(ErrorCodes::INVALID_TEMPLATE_FORMAT, "Indexes instead of names are not allowed");
                 }
                 format_idx_to_column_idx.emplace_back(column_idx);
                 break;
diff --git a/src/Formats/ParsedTemplateFormatString.h b/src/Formats/ParsedTemplateFormatString.h
index ba0ebdf5aa8..c5617d0f0ef 100644
--- a/src/Formats/ParsedTemplateFormatString.h
+++ b/src/Formats/ParsedTemplateFormatString.h
@@ -31,9 +31,9 @@ struct ParsedTemplateFormatString
     typedef std::function<std::optional<size_t>(const String &)> ColumnIdxGetter;
 
     ParsedTemplateFormatString() = default;
-    ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name);
+    ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true);
 
-    void parse(const String & format_string, const ColumnIdxGetter & idx_by_name);
+    void parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true);
 
     static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s);
     size_t columnsCount() const;
diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp
index 5232b76b7fe..b59db12a16c 100644
--- a/src/Formats/ProtobufSerializer.cpp
+++ b/src/Formats/ProtobufSerializer.cpp
@@ -24,6 +24,7 @@
 # include
 # include
 # include
+# include
 # include
 # include
 # include
@@ -56,6 +57,7 @@ namespace ErrorCodes
     extern const int PROTOBUF_FIELD_NOT_REPEATED;
     extern const int PROTOBUF_BAD_CAST;
     extern const int LOGICAL_ERROR;
+    extern const int BAD_ARGUMENTS;
 }
 
 namespace
@@ -3017,10 +3019,8 @@ namespace
         {
             std::vector<std::string_view> column_names_used;
             column_names_used.reserve(used_column_indices_in_nested.size());
-
             for (size_t i : used_column_indices_in_nested)
                 column_names_used.emplace_back(nested_column_names[i]);
-
             auto field_serializer = std::make_unique<ProtobufSerializerFlattenedNestedAsArrayOfNestedMessages>(
                 std::move(column_names_used), field_descriptor, std::move(nested_message_serializer), get_root_desc_function);
             transformColumnIndices(used_column_indices_in_nested, nested_column_indices);
@@ -3230,8 +3230,105 @@ namespace
         std::function get_root_desc_function;
         std::shared_ptr root_serializer_ptr;
     };
-}
 
+    template <typename Type>
+    DataTypePtr getEnumDataType(const google::protobuf::EnumDescriptor * enum_descriptor)
+    {
+        std::vector<std::pair<String, Type>> values;
+        for (int i = 0; i != enum_descriptor->value_count(); ++i)
+        {
+            const auto * enum_value_descriptor = enum_descriptor->value(i);
+            values.emplace_back(enum_value_descriptor->name(), enum_value_descriptor->number());
+        }
+        return std::make_shared<DataTypeEnum<Type>>(std::move(values));
+    }
+
+    NameAndTypePair getNameAndDataTypeFromField(const google::protobuf::FieldDescriptor * field_descriptor, bool allow_repeat = true)
+    {
+        if (allow_repeat && field_descriptor->is_map())
+        {
+            auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false);
+            const auto * tuple_type = assert_cast<const DataTypeTuple *>(name_and_type.type.get());
+            return {name_and_type.name, std::make_shared<DataTypeMap>(tuple_type->getElements())};
+        }
+
+        if (allow_repeat && field_descriptor->is_repeated())
+        {
+            auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false);
+            return {name_and_type.name, std::make_shared<DataTypeArray>(name_and_type.type)};
+        }
+
+        switch (field_descriptor->type())
+        {
+            case FieldTypeId::TYPE_SFIXED32: [[fallthrough]];
+            case FieldTypeId::TYPE_SINT32: [[fallthrough]];
+            case FieldTypeId::TYPE_INT32:
+                return {field_descriptor->name(), std::make_shared<DataTypeInt32>()};
+            case FieldTypeId::TYPE_SFIXED64: [[fallthrough]];
+            case FieldTypeId::TYPE_SINT64: [[fallthrough]];
+            case FieldTypeId::TYPE_INT64:
+                return {field_descriptor->name(), std::make_shared<DataTypeInt64>()};
+            case FieldTypeId::TYPE_BOOL:
+                return {field_descriptor->name(), std::make_shared<DataTypeUInt8>()};
+            case FieldTypeId::TYPE_FLOAT:
+                return {field_descriptor->name(), std::make_shared<DataTypeFloat32>()};
+            case FieldTypeId::TYPE_DOUBLE:
+                return {field_descriptor->name(), std::make_shared<DataTypeFloat64>()};
+            case FieldTypeId::TYPE_UINT32: [[fallthrough]];
+            case FieldTypeId::TYPE_FIXED32:
+                return {field_descriptor->name(), std::make_shared<DataTypeUInt32>()};
+            case FieldTypeId::TYPE_UINT64: [[fallthrough]];
+            case FieldTypeId::TYPE_FIXED64:
+                return {field_descriptor->name(), std::make_shared<DataTypeUInt64>()};
+            case FieldTypeId::TYPE_BYTES: [[fallthrough]];
+            case FieldTypeId::TYPE_STRING:
+                return {field_descriptor->name(), std::make_shared<DataTypeString>()};
+            case FieldTypeId::TYPE_ENUM:
+            {
+                const auto * enum_descriptor = field_descriptor->enum_type();
+                if (enum_descriptor->value_count() == 0)
+                    throw Exception("Empty enum field", ErrorCodes::BAD_ARGUMENTS);
+                int max_abs = std::abs(enum_descriptor->value(0)->number());
+                for (int i = 1; i != enum_descriptor->value_count(); ++i)
+                {
+                    if (std::abs(enum_descriptor->value(i)->number()) > max_abs)
+                        max_abs = std::abs(enum_descriptor->value(i)->number());
+                }
+                if (max_abs < 128)
+                    return {field_descriptor->name(), getEnumDataType<Int8>(enum_descriptor)};
+                else if (max_abs < 32768)
+                    return {field_descriptor->name(), getEnumDataType<Int16>(enum_descriptor)};
+                else
+                    throw Exception("ClickHouse supports only 8-bit and 16-bit enums", ErrorCodes::BAD_ARGUMENTS);
+            }
+            case FieldTypeId::TYPE_GROUP: [[fallthrough]];
+            case FieldTypeId::TYPE_MESSAGE:
+            {
+                const auto * message_descriptor = field_descriptor->message_type();
+                if (message_descriptor->field_count() == 1)
+                {
+                    const auto * nested_field_descriptor = message_descriptor->field(0);
+                    auto nested_name_and_type = getNameAndDataTypeFromField(nested_field_descriptor);
+                    return {field_descriptor->name() + "_" + nested_name_and_type.name, nested_name_and_type.type};
+                }
+                else
+                {
+                    DataTypes nested_types;
+                    Strings nested_names;
+                    for (int i = 0; i != message_descriptor->field_count(); ++i)
+                    {
+                        auto nested_name_and_type = getNameAndDataTypeFromField(message_descriptor->field(i));
+                        nested_types.push_back(nested_name_and_type.type);
+                        nested_names.push_back(nested_name_and_type.name);
+                    }
+                    return {field_descriptor->name(), std::make_shared<DataTypeTuple>(std::move(nested_types), std::move(nested_names))};
+                }
+            }
+        }
+
+        __builtin_unreachable();
+    }
+}
 
 std::unique_ptr<ProtobufSerializer> ProtobufSerializer::create(
     const Strings & column_names,
@@ -3254,5 +3351,14 @@ std::unique_ptr<ProtobufSerializer> ProtobufSerializer::create(
     std::vector<size_t> missing_column_indices;
     return ProtobufSerializerBuilder(writer).buildMessageSerializer(column_names, data_types, missing_column_indices, message_descriptor, with_length_delimiter);
 }
+
+NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor)
+{
+    NamesAndTypesList schema;
+    for (int i = 0; i != message_descriptor->field_count(); ++i)
+        schema.push_back(getNameAndDataTypeFromField(message_descriptor->field(i)));
+    return schema;
+}
+
 }
 #endif
diff --git a/src/Formats/ProtobufSerializer.h b/src/Formats/ProtobufSerializer.h
index 3eaca6a18d6..d9bed913517 100644
--- a/src/Formats/ProtobufSerializer.h
+++ b/src/Formats/ProtobufSerializer.h
@@ -4,6 +4,7 @@
 #if USE_PROTOBUF
 # include
+#include
 
 namespace google::protobuf { class Descriptor; }
 
@@ -48,5 +49,7 @@ public:
         ProtobufWriter & writer);
 };
 
+NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor);
+
 }
 #endif
diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp
new file mode 100644
index 00000000000..37067eae64f
--- /dev/null
+++ b/src/Formats/ReadSchemaUtils.cpp
@@ -0,0 +1,112 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
+    extern const int BAD_ARGUMENTS;
+}
+
+ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional<FormatSettings> & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context)
+{
+    NamesAndTypesList names_and_types;
+    if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name))
+    {
+        auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(format_name, context, format_settings);
+        try
+        {
+            names_and_types = external_schema_reader->readSchema();
+        }
+        catch (const DB::Exception & e)
+        {
+            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, e.message());
+        }
+    }
+    else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name))
+    {
+        auto read_buf = read_buffer_creator();
+        if (read_buf->eof())
+            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file, file is empty", format_name);
+
+        auto schema_reader = FormatFactory::instance().getSchemaReader(format_name, *read_buf, context, format_settings);
+        try
+        {
+            names_and_types = schema_reader->readSchema();
+        }
+        catch (const DB::Exception & e)
+        {
+            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, e.message());
+        }
+    }
+    else
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} file format doesn't support schema inference", format_name);
+
+    return ColumnsDescription(names_and_types);
+}
+
+DataTypePtr generalizeDataType(DataTypePtr type)
+{
+    WhichDataType which(type);
+
+    if (which.isNothing())
+        return nullptr;
+
+    if (which.isNullable())
+    {
+        const auto * nullable_type = assert_cast<const DataTypeNullable *>(type.get());
+        return generalizeDataType(nullable_type->getNestedType());
+    }
+
+    if (isNumber(type))
+        return makeNullable(std::make_shared<DataTypeFloat64>());
+
+    if (which.isArray())
+    {
+        const auto * array_type = assert_cast<const DataTypeArray *>(type.get());
+        auto nested_type = generalizeDataType(array_type->getNestedType());
+        return nested_type ? std::make_shared<DataTypeArray>(nested_type) : nullptr;
+    }
+
+    if (which.isTuple())
+    {
+        const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get());
+        DataTypes nested_types;
+        for (const auto & element : tuple_type->getElements())
+        {
+            auto nested_type = generalizeDataType(element);
+            if (!nested_type)
+                return nullptr;
+            nested_types.push_back(nested_type);
+        }
+        return std::make_shared<DataTypeTuple>(std::move(nested_types));
+    }
+
+    if (which.isMap())
+    {
+        const auto * map_type = assert_cast<const DataTypeMap *>(type.get());
+        auto key_type = removeNullable(generalizeDataType(map_type->getKeyType()));
+        auto value_type = generalizeDataType(map_type->getValueType());
+        return key_type && value_type ? std::make_shared<DataTypeMap>(key_type, value_type) : nullptr;
+    }
+
+    if (which.isLowCardinality())
+    {
+        const auto * lc_type = assert_cast<const DataTypeLowCardinality *>(type.get());
+        auto nested_type = generalizeDataType(lc_type->getDictionaryType());
+        return nested_type ? std::make_shared<DataTypeLowCardinality>(nested_type) : nullptr;
+    }
+
+    return makeNullable(type);
+}
+
+}
diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h
new file mode 100644
index 00000000000..fb43acc3cd6
--- /dev/null
+++ b/src/Formats/ReadSchemaUtils.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include
+#include
+
+namespace DB
+{
+
+/// Try to determine the schema of the data in the specified format.
+/// For formats that have an external schema reader, it will
+/// use it and won't create a read buffer.
+/// For formats that have a schema reader from the data,
+/// a read buffer will be created by the provided creator and
+/// the schema will be extracted from the data.
+/// If the format doesn't have any schema reader or the schema reader
+/// couldn't determine the schema, an exception will be thrown.
+using ReadBufferCreator = std::function<std::unique_ptr<ReadBuffer>()>;
+ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional<FormatSettings> & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context);
+
+/// Convert type to the most general type:
+/// - IntN, UIntN, FloatN, Decimal -> Float64
+/// - Type -> Nullable(type)
+/// - Array(Type) -> Array(Nullable(Type))
+/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN))
+/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType))
+/// - LowCardinality(Type) -> LowCardinality(Nullable(Type))
+/// If type is Nothing or one of the nested types is Nothing, return nullptr.
+DataTypePtr generalizeDataType(DataTypePtr type);
+
+}
diff --git a/src/Formats/config_formats.h.in b/src/Formats/config_formats.h.in
index f6497b4830b..427abc7d1ce 100644
--- a/src/Formats/config_formats.h.in
+++ b/src/Formats/config_formats.h.in
@@ -10,4 +10,3 @@
 #cmakedefine01 USE_ARROW
 #cmakedefine01 USE_PROTOBUF
 #cmakedefine01 USE_MSGPACK
-
diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp
index 7425c6898de..1349c9e3323 100644
--- a/src/Formats/registerFormats.cpp
+++ b/src/Formats/registerFormats.cpp
@@ -81,6 +81,28 @@ void registerInputFormatCapnProto(FormatFactory & factory);
 void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory);
 void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory);
 
+void registerArrowSchemaReader(FormatFactory & factory);
+void registerParquetSchemaReader(FormatFactory & factory);
+void registerORCSchemaReader(FormatFactory & factory);
+void registerTSVSchemaReader(FormatFactory & factory);
+void registerCSVSchemaReader(FormatFactory & factory);
+void registerJSONCompactEachRowSchemaReader(FormatFactory & factory);
+void registerJSONEachRowSchemaReader(FormatFactory & factory);
+void registerNativeSchemaReader(FormatFactory & factory);
+void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory);
+void registerAvroSchemaReader(FormatFactory & factory);
+void registerProtobufSchemaReader(FormatFactory & factory);
+void registerLineAsStringSchemaReader(FormatFactory & factory);
+void registerJSONAsStringSchemaReader(FormatFactory & factory);
+void registerRawBLOBSchemaReader(FormatFactory & factory);
+void registerMsgPackSchemaReader(FormatFactory & factory);
+void registerCapnProtoSchemaReader(FormatFactory & factory);
+void registerCustomSeparatedSchemaReader(FormatFactory & factory);
+void registerRegexpSchemaReader(FormatFactory & factory);
+void registerTSKVSchemaReader(FormatFactory & factory);
+void registerValuesSchemaReader(FormatFactory & factory);
+void registerTemplateSchemaReader(FormatFactory & factory);
+
 void registerFormats()
 {
     auto & factory = FormatFactory::instance();
@@ -152,6 +174,28 @@ void registerFormats()
 
     registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(factory);
     registerNonTrivialPrefixAndSuffixCheckerJSONAsString(factory);
+
+    registerArrowSchemaReader(factory);
+    registerParquetSchemaReader(factory);
+    registerORCSchemaReader(factory);
+    registerTSVSchemaReader(factory);
+    registerCSVSchemaReader(factory);
+    registerJSONCompactEachRowSchemaReader(factory);
+    registerJSONEachRowSchemaReader(factory);
+    registerNativeSchemaReader(factory);
+    registerRowBinaryWithNamesAndTypesSchemaReader(factory);
+    registerAvroSchemaReader(factory);
+    registerProtobufSchemaReader(factory);
+    registerLineAsStringSchemaReader(factory);
+    registerJSONAsStringSchemaReader(factory);
+    registerRawBLOBSchemaReader(factory);
+    registerMsgPackSchemaReader(factory);
+    registerCapnProtoSchemaReader(factory);
+    registerCustomSeparatedSchemaReader(factory);
+    registerRegexpSchemaReader(factory);
+    registerTSKVSchemaReader(factory);
+    registerValuesSchemaReader(factory);
+    registerTemplateSchemaReader(factory);
 }
 
 }
diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp
index b0a6838b81e..48811a41edd 100644
--- a/src/IO/ReadHelpers.cpp
+++ b/src/IO/ReadHelpers.cpp
@@ -702,6 +702,25 @@ void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & set
     readCSVStringInto(s, buf, settings);
 }
 
+void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings)
+{
+    s.clear();
+    bool add_quote = false;
+    char quote = '\'';
+
+    if (!buf.eof() && (*buf.position() == '\'' || *buf.position() == '"'))
+    {
+        quote = *buf.position();
+        s.push_back(quote);
+        add_quote = true;
+    }
+
+    readCSVStringInto(s, buf, settings);
+
+    if (add_quote)
+        s.push_back(quote);
+}
+
 template void readCSVStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
 
@@ -1212,6 +1231,19 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim
     }
 }
 
+// Use PeekableReadBuffer to copy field to string after parsing.
+template <typename ParseFunc>
+static void readParsedValueIntoString(String & s, ReadBuffer & buf, ParseFunc parse_func)
+{
+    PeekableReadBuffer peekable_buf(buf);
+    peekable_buf.setCheckpoint();
+    parse_func(peekable_buf);
+    peekable_buf.makeContinuousMemoryFromCheckpointToPos();
+    auto * end = peekable_buf.position();
+    peekable_buf.rollbackToCheckpoint();
+    s.append(peekable_buf.position(), end);
+    peekable_buf.position() = end;
+}
 
 template <char opening_bracket, char closing_bracket>
 static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf)
@@ -1266,7 +1298,11 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
     /// - Number: integer, float, decimal.
 
     if (*buf.position() == '\'')
-        readQuotedString(s, buf);
+    {
+        s.push_back('\'');
+        readQuotedStringInto<false>(s, buf);
+        s.push_back('\'');
+    }
     else if (*buf.position() == '[')
         readQuotedFieldInBrackets<'[', ']'>(s, buf);
     else if (*buf.position() == '(')
@@ -1290,18 +1326,19 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
     else
     {
         /// It's an integer, float or decimal. They all can be parsed as float.
-        /// Use PeekableReadBuffer to copy field to string after parsing.
-        PeekableReadBuffer peekable_buf(buf);
-        peekable_buf.setCheckpoint();
-        Float64 tmp;
-        readFloatText(tmp, peekable_buf);
-        peekable_buf.makeContinuousMemoryFromCheckpointToPos();
-        auto * end = peekable_buf.position();
-        peekable_buf.rollbackToCheckpoint();
-        s.append(peekable_buf.position(), end);
-        peekable_buf.position() = end;
+        auto parse_func = [](ReadBuffer & in)
+        {
+            Float64 tmp;
+            readFloatText(tmp, in);
+        };
+        readParsedValueIntoString(s, buf, parse_func);
     }
 }
 
+void readJSONFieldIntoString(String & s, ReadBuffer & buf)
+{
+    auto parse_func = [](ReadBuffer & in) { skipJSONField(in, "json_field"); };
+    readParsedValueIntoString(s, buf, parse_func);
+}
 
 }
diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h
index b2ad4035cdc..6d1023947a5 100644
--- a/src/IO/ReadHelpers.h
+++ b/src/IO/ReadHelpers.h
@@ -563,6 +563,8 @@ void readStringUntilWhitespace(String & s, ReadBuffer & buf);
  */
 void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
 
+/// Differs from readCSVString in that it doesn't remove quotes around the field if any.
 
 /// Read and append result to array of characters.
 template <typename Vector>
@@ -1381,4 +1383,7 @@ struct PcgDeserializer
 
 void readQuotedFieldIntoString(String & s, ReadBuffer & buf);
 
+void readJSONFieldIntoString(String & s, ReadBuffer & buf);
+
 }
+
diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp
index 8f003e75a07..7ddb0c8c26e 100644
--- a/src/Interpreters/InterpreterCreateQuery.cpp
+++ b/src/Interpreters/InterpreterCreateQuery.cpp
@@ -637,13 +637,14 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperties(
         /// Table function without columns list.
         auto table_function = TableFunctionFactory::instance().get(create.as_table_function, getContext());
         properties.columns = table_function->getActualTableStructure(getContext());
-        assert(!properties.columns.empty());
     }
     else if (create.is_dictionary)
     {
         return {};
     }
-    else
+    /// We can have queries like "CREATE TABLE <table> ENGINE=<engine>" if <engine>
+    /// supports schema inference (will determine table structure in its constructor).
+    else if (!StorageFactory::instance().checkIfStorageSupportsSchemaInference(create.storage->engine->name))
        throw Exception("Incorrect CREATE query: required list of column descriptions or AS section or SELECT.", ErrorCodes::INCORRECT_QUERY);
 
     /// Even if query has list of columns, canonicalize it (unfold Nested columns).
@@ -1083,7 +1084,10 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create,
     {
         const auto & factory = TableFunctionFactory::instance();
         auto table_func = factory.get(create.as_table_function, getContext());
-        res = table_func->execute(create.as_table_function, getContext(), create.getTable(), properties.columns);
+        /// In case of CREATE AS table_function() query we should use global context
+        /// in storage creation because there will be no query context on server startup
+        /// and because storage lifetime is bigger than query context lifetime.
+        res = table_func->execute(create.as_table_function, getContext(), create.getTable(), properties.columns, /*use_global_context=*/true);
         res->renameInMemory({create.getDatabase(), create.getTable(), create.uuid});
     }
     else
diff --git a/src/Parsers/ASTCreateQuery.cpp b/src/Parsers/ASTCreateQuery.cpp
index 3e77bee19a9..e61a0f55142 100644
--- a/src/Parsers/ASTCreateQuery.cpp
+++ b/src/Parsers/ASTCreateQuery.cpp
@@ -359,7 +359,7 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
 
     if (as_table_function)
     {
-        if (columns_list)
+        if (columns_list && !columns_list->empty())
         {
             frame.expression_list_always_start_on_new_line = true;
             settings.ostr << (settings.one_line ? " (" : "\n(");
@@ -375,7 +375,7 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
 
     frame.expression_list_always_start_on_new_line = true;
 
-    if (columns_list && !as_table_function)
+    if (columns_list && !columns_list->empty() && !as_table_function)
     {
         settings.ostr << (settings.one_line ? " (" : "\n(");
         FormatStateStacked frame_nested = frame;
diff --git a/src/Parsers/ASTCreateQuery.h b/src/Parsers/ASTCreateQuery.h
index 93fced7dba5..2e35731acad 100644
--- a/src/Parsers/ASTCreateQuery.h
+++ b/src/Parsers/ASTCreateQuery.h
@@ -50,6 +50,12 @@ public:
     ASTPtr clone() const override;
     void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const override;
+
+    bool empty() const
+    {
+        return (!columns || columns->children.empty()) && (!indices || indices->children.empty()) && (!constraints || constraints->children.empty())
+            && (!projections || projections->children.empty());
+    }
 };
diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp
index dbbea986404..7f47e1efb49 100644
--- a/src/Parsers/ParserCreateQuery.cpp
+++ b/src/Parsers/ParserCreateQuery.cpp
@@ -557,34 +557,43 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
             }
         }
     }
+    /** Create queries without list of columns:
+      *  - CREATE|ATTACH TABLE ... AS ...
+      *  - CREATE|ATTACH TABLE ... ENGINE = engine
+      */
     else
     {
         storage_p.parse(pos, storage, expected);
 
-        if (!s_as.ignore(pos, expected))
-            return false;
-
-        if (!select_p.parse(pos, select, expected)) /// AS SELECT ...
+        /// CREATE|ATTACH TABLE ... AS ...
+        if (s_as.ignore(pos, expected))
         {
-            /// ENGINE can not be specified for table functions.
-            if (storage || !table_function_p.parse(pos, as_table_function, expected))
+            if (!select_p.parse(pos, select, expected)) /// AS SELECT ...
             {
-                /// AS [db.]table
-                if (!name_p.parse(pos, as_table, expected))
-                    return false;
-
-                if (s_dot.ignore(pos, expected))
+                /// ENGINE can not be specified for table functions.
+                if (storage || !table_function_p.parse(pos, as_table_function, expected))
                 {
-                    as_database = as_table;
+                    /// AS [db.]table
                     if (!name_p.parse(pos, as_table, expected))
                         return false;
-                }
 
-                /// Optional - ENGINE can be specified.
-                if (!storage)
-                    storage_p.parse(pos, storage, expected);
+                    if (s_dot.ignore(pos, expected))
+                    {
+                        as_database = as_table;
+                        if (!name_p.parse(pos, as_table, expected))
+                            return false;
+                    }
+
+                    /// Optional - ENGINE can be specified.
+                    if (!storage)
+                        storage_p.parse(pos, storage, expected);
+                }
             }
         }
+        else if (!storage)
+        {
+            return false;
+        }
     }
 
     auto comment = parseComment(pos, expected);
diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h
index bc1ebd65639..33aafb40d83 100644
--- a/src/Parsers/ParserCreateQuery.h
+++ b/src/Parsers/ParserCreateQuery.h
@@ -361,6 +361,8 @@ protected:
   * Or:
   * CREATE|ATTACH TABLE [IF NOT EXISTS] [db.]name [UUID 'uuid'] [ON CLUSTER cluster] AS ENGINE = engine SELECT ...
   *
+  * Or (for engines that support schema inference):
+  * CREATE|ATTACH TABLE [IF NOT EXISTS] [db.]name [UUID 'uuid'] [ON CLUSTER cluster] ENGINE = engine
   */
 class ParserCreateTableQuery : public IParserBase
 {
diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp
new file mode 100644
index 00000000000..096e39a2893
--- /dev/null
+++ b/src/Processors/Formats/ISchemaReader.cpp
@@ -0,0 +1,160 @@
+#include <Processors/Formats/ISchemaReader.h>
+#include <DataTypes/IDataType.h>
+#include <Common/Exception.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
+}
+
+IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_)
+    : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_)
+{
+}
+
+NamesAndTypesList IRowSchemaReader::readSchema()
+{
+    DataTypes data_types = readRowAndGetDataTypes();
+    for (size_t row = 1; row < max_rows_to_read; ++row)
+    {
+        DataTypes new_data_types = readRowAndGetDataTypes();
+        if (new_data_types.empty())
+            /// We reached eof.
+            break;
+
+        if (new_data_types.size() != data_types.size())
+            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Rows have different numbers of values");
+
+        for (size_t i = 0; i != data_types.size(); ++i)
+        {
+            /// We couldn't determine the type of this column in a new row, just skip it.
+            if (!new_data_types[i])
+                continue;
+
+            /// If we couldn't determine the type of column yet, just set the new type.
+            if (!data_types[i])
+                data_types[i] = new_data_types[i];
+            /// If the new type and the previous type for this column are different,
+            /// we will use default type if we have it or throw an exception.
+            else if (data_types[i]->getName() != new_data_types[i]->getName())
+            {
+                if (default_type)
+                    data_types[i] = default_type;
+                else
+                    throw Exception(
+                        ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
+                        "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", new_data_types[i]->getName(), i + 1, row, data_types[i]->getName());
+            }
+        }
+    }
+
+    /// Check that we read at least one column.
+    if (data_types.empty())
+        throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data");
+
+    /// If column names weren't set, use default names 'c1', 'c2', ...
+    if (column_names.empty())
+    {
+        column_names.reserve(data_types.size());
+        for (size_t i = 0; i != data_types.size(); ++i)
+            column_names.push_back("c" + std::to_string(i + 1));
+    }
+    /// If column names were set, check that the number of names matches the number of types.
+    else if (column_names.size() != data_types.size())
+        throw Exception(
+            ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
+            "The number of column names {} differs from the number of types {}", column_names.size(), data_types.size());
+
+    NamesAndTypesList result;
+    for (size_t i = 0; i != data_types.size(); ++i)
+    {
+        /// Check that we could determine the type of this column.
+        if (!data_types[i])
+        {
+            if (!default_type)
+                throw Exception(
+                    ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
+                    "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum "
+                    "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference",
+                    max_rows_to_read);
+
+            data_types[i] = default_type;
+        }
+        result.emplace_back(column_names[i], data_types[i]);
+    }
+
+    return result;
+}
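A worked example of the merging loop above (illustrative types, not from the patch):

    /// row 1 yields {Int64, String, nullptr}
    /// row 2 yields {Float64, String, Int64}
    /// Column 1: "Int64" vs "Float64" -> replaced by default_type if one is set,
    ///           otherwise the CANNOT_EXTRACT_TABLE_STRUCTURE exception above is thrown;
    /// column 2: same type in both rows -> kept as String;
    /// column 3: the nullptr from row 1 is overwritten by the first concrete type, Int64.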
+
+IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_)
+    : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_)
+{
+}
+
+NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
+{
+    auto names_and_types = readRowAndGetNamesAndDataTypes();
+    for (size_t row = 1; row < max_rows_to_read; ++row)
+    {
+        auto new_names_and_types = readRowAndGetNamesAndDataTypes();
+        if (new_names_and_types.empty())
+            /// We reached eof.
+            break;
+
+        for (const auto & [name, new_type] : new_names_and_types)
+        {
+            auto it = names_and_types.find(name);
+            /// If we didn't see this column before, just add it.
+            if (it == names_and_types.end())
+            {
+                names_and_types[name] = new_type;
+                continue;
+            }
+
+            auto & type = it->second;
+            /// If we couldn't determine the type of column yet, just set the new type.
+            if (!type)
+                type = new_type;
+            /// If the new type and the previous type for this column are different,
+            /// we will use default type if we have it or throw an exception.
+            else if (new_type && type->getName() != new_type->getName())
+            {
+                if (default_type)
+                    type = default_type;
+                else
+                    throw Exception(
+                        ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
+                        "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", type->getName(), name, row, new_type->getName());
+            }
+        }
+    }
+
+    /// Check that we read at least one column.
+    if (names_and_types.empty())
+        throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data");
+
+    NamesAndTypesList result;
+    for (auto & [name, type] : names_and_types)
+    {
+        /// Check that we could determine the type of this column.
+        if (!type)
+        {
+            if (!default_type)
+                throw Exception(
+                    ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
+                    "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum "
+                    "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference",
+                    max_rows_to_read);
+
+            type = default_type;
+        }
+        result.emplace_back(name, type);
+    }
+
+    return result;
+}
+
+}
diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h
new file mode 100644
index 00000000000..67a8eb88d61
--- /dev/null
+++ b/src/Processors/Formats/ISchemaReader.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <Core/NamesAndTypes.h>
+#include <DataTypes/IDataType.h>
+#include <IO/ReadBuffer.h>
+#include <unordered_map>
+
+namespace DB
+{
+
+/// Base class for schema inference for the data in some specific format.
+/// It reads some data from a read buffer and tries to determine the schema
+/// from the read data.
+class ISchemaReader
+{
+public:
+    ISchemaReader(ReadBuffer & in_) : in(in_) {}
+
+    virtual NamesAndTypesList readSchema() = 0;
+
+    virtual ~ISchemaReader() = default;
+
+protected:
+    ReadBuffer & in;
+};
+
+/// Base class for schema inference for formats that read data row by row.
+/// It reads data row by row (up to max_rows_to_read), determines the types of columns
+/// for each row and compares them with the types from the previous rows. If some column
+/// contains values with different types in different rows, the default type will be
+/// used for this column, or an exception will be thrown (if the default type is not set).
+class IRowSchemaReader : public ISchemaReader
+{
+public:
+    IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr);
+    NamesAndTypesList readSchema() override;
+
+protected:
+    /// Read one row and determine the types of columns in it.
+    /// Return types in the same order in which the values were in the row.
+    /// If it's impossible to determine the type for some column, return nullptr for it.
+    /// Return an empty list if no more data can be read.
+    virtual DataTypes readRowAndGetDataTypes() = 0;
+
+    void setColumnNames(const std::vector<String> & names) { column_names = names; }
+
+private:
+    size_t max_rows_to_read;
+    DataTypePtr default_type;
+    std::vector<String> column_names;
+};
+
+/// Base class for schema inference for formats that read data row by row and where each
+/// row contains column names and values (ex: JSONEachRow, TSKV).
+/// Differs from IRowSchemaReader in that after reading a row we get
+/// a map {column_name : type}, and some columns may be missing in a single row
+/// (in this case we will use types from the previous rows for the missing columns).
+class IRowWithNamesSchemaReader : public ISchemaReader
+{
+public:
+    IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr);
+    NamesAndTypesList readSchema() override;
+
+protected:
+    /// Read one row and determine the types of columns in it.
+    /// Return a map {column_name : type}.
+    /// If it's impossible to determine the type for some column, return nullptr for it.
+    /// Return an empty map if no more data can be read.
+    virtual std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() = 0;
+
+private:
+    size_t max_rows_to_read;
+    DataTypePtr default_type;
+};
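A sketch of the minimum a format has to provide on top of IRowSchemaReader (not part of the patch; "MyFormat" and its parsing are hypothetical):

    class MyFormatSchemaReader : public IRowSchemaReader
    {
    public:
        MyFormatSchemaReader(ReadBuffer & in_, const FormatSettings & settings)
            : IRowSchemaReader(in_, settings.max_rows_to_read_for_schema_inference)
        {
        }

    private:
        DataTypes readRowAndGetDataTypes() override
        {
            if (in.eof())
                return {};  /// An empty list stops the inference loop.

            DataTypes types;
            /// Parse one MyFormat row here and push one DataTypePtr per column,
            /// using nullptr when the type of a value cannot be determined.
            return types;
        }
    };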
+
+/// Base class for schema inference for formats that don't need any data to
+/// determine the schema: formats with constant schema (ex: JSONAsString, LineAsString)
+/// and formats that use external format schema (ex: Protobuf, CapnProto).
+class IExternalSchemaReader
+{
+public:
+    virtual NamesAndTypesList readSchema() = 0;
+
+    virtual ~IExternalSchemaReader() = default;
+};
+
+}
diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp
index 1f6b530d72f..4af2c651c39 100644
--- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp
@@ -85,31 +85,38 @@ void ArrowBlockInputFormat::resetParser()
     record_batch_current = 0;
 }
 
+static std::shared_ptr<arrow::ipc::RecordBatchStreamReader> createStreamReader(ReadBuffer & in)
+{
+    auto stream_reader_status = arrow::ipc::RecordBatchStreamReader::Open(std::make_unique<ArrowInputStreamFromReadBuffer>(in));
+    if (!stream_reader_status.ok())
+        throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
+            "Error while opening a table: {}", stream_reader_status.status().ToString());
+    return *stream_reader_status;
+}
+
+static std::shared_ptr<arrow::ipc::RecordBatchFileReader> createFileReader(ReadBuffer & in, const FormatSettings & format_settings, std::atomic<int> & is_stopped)
+{
+    auto arrow_file = asArrowFile(in, format_settings, is_stopped);
+    if (is_stopped)
+        return nullptr;
+
+    auto file_reader_status = arrow::ipc::RecordBatchFileReader::Open(std::move(arrow_file));
+    if (!file_reader_status.ok())
+        throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
+            "Error while opening a table: {}", file_reader_status.status().ToString());
+    return *file_reader_status;
+}
+
+
 void ArrowBlockInputFormat::prepareReader()
 {
-    std::shared_ptr<arrow::Schema> schema;
-    if (stream)
-    {
-        auto stream_reader_status = arrow::ipc::RecordBatchStreamReader::Open(std::make_unique<ArrowInputStreamFromReadBuffer>(*in));
-        if (!stream_reader_status.ok())
-            throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
-                "Error while opening a table: {}", stream_reader_status.status().ToString());
-        stream_reader = *stream_reader_status;
-        schema = stream_reader->schema();
-    }
+    if (stream)
+        stream_reader = createStreamReader(*in);
     else
     {
-        auto arrow_file = asArrowFile(*in, format_settings, is_stopped);
-        if (is_stopped)
+        file_reader = createFileReader(*in, format_settings, is_stopped);
+        if (!file_reader)
             return;
-
-        auto file_reader_status = arrow::ipc::RecordBatchFileReader::Open(std::move(arrow_file));
-        if (!file_reader_status.ok())
-            throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
-                "Error while opening a table: {}", file_reader_status.status().ToString());
-        file_reader = *file_reader_status;
-        schema = file_reader->schema();
     }
 
     arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), "Arrow", format_settings.arrow.import_nested);
@@ -122,6 +129,27 @@ void ArrowBlockInputFormat::prepareReader()
     record_batch_current = 0;
 }
 
+ArrowSchemaReader::ArrowSchemaReader(ReadBuffer & in_, bool stream_, const FormatSettings & format_settings_)
+    : ISchemaReader(in_), stream(stream_), format_settings(format_settings_)
+{
+}
+
+NamesAndTypesList ArrowSchemaReader::readSchema()
+{
+    std::shared_ptr<arrow::Schema> schema;
+
+    if (stream)
+        schema = createStreamReader(in)->schema();
+    else
+    {
+        std::atomic<int> is_stopped = 0;
+        schema = createFileReader(in, format_settings, is_stopped)->schema();
+    }
+
+    auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, stream ? "ArrowStream" : "Arrow");
+    return header.getNamesAndTypesList();
+}
+
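A usage sketch for the reader defined above (not part of the patch; the file name and default-constructed settings are assumptions):

    ReadBufferFromFile buf("data.arrow");                 /// hypothetical input file
    ArrowSchemaReader reader(buf, /*stream_=*/false, FormatSettings{});
    NamesAndTypesList schema = reader.readSchema();       /// e.g. {id Int64, name String}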
"ArrowStream" : "Arrow"); + return header.getNamesAndTypesList(); +} + void registerInputFormatArrow(FormatFactory & factory) { factory.registerInputFormat( @@ -145,6 +173,20 @@ void registerInputFormatArrow(FormatFactory & factory) }); } +void registerArrowSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "Arrow", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, false, settings); + }); + factory.registerSchemaReader( + "ArrowStream", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, true, settings); + });} } #else @@ -154,6 +196,8 @@ class FormatFactory; void registerInputFormatArrow(FormatFactory &) { } + +void registerArrowSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.h b/src/Processors/Formats/Impl/ArrowBlockInputFormat.h index bb8a000477c..62cbf949fc2 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.h @@ -4,6 +4,7 @@ #if USE_ARROW #include +#include #include namespace arrow { class RecordBatchReader; } @@ -51,6 +52,18 @@ private: std::atomic is_stopped{0}; }; +class ArrowSchemaReader : public ISchemaReader +{ +public: + ArrowSchemaReader(ReadBuffer & in_, bool stream_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + bool stream; + const FormatSettings format_settings; +}; + } #endif diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 272907022a1..aa181ea0b8b 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -239,10 +239,8 @@ static ColumnWithTypeAndName readColumnWithTimestampData(std::shared_ptr -static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr & arrow_column, const String & column_name) +static ColumnWithTypeAndName readColumnWithDecimalDataImpl(std::shared_ptr & arrow_column, const String & column_name, DataTypePtr internal_type) { - const auto * arrow_decimal_type = static_cast(arrow_column->type().get()); - auto internal_type = std::make_shared>(arrow_decimal_type->precision(), arrow_decimal_type->scale()); auto internal_column = internal_type->createColumn(); auto & column = assert_cast &>(*internal_column); auto & column_data = column.getData(); @@ -259,6 +257,21 @@ static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr +static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr & arrow_column, const String & column_name) +{ + const auto * arrow_decimal_type = static_cast(arrow_column->type().get()); + size_t precision = arrow_decimal_type->precision(); + auto internal_type = createDecimal(precision, arrow_decimal_type->scale()); + if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + else if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + else if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); +} + /// Creates a null bytemap from arrow's null bitmap static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr & arrow_column) { @@ -328,12 +341,13 @@ static 
ColumnWithTypeAndName readColumnFromArrowColumn( const std::string & column_name, const std::string & format_name, bool is_nullable, - std::unordered_map> & dictionary_values) + std::unordered_map> & dictionary_values, + bool read_ints_as_dates) { if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT) { - auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates); auto nullmap_column = readByteMapFromArrowColumn(arrow_column); auto nullable_type = std::make_shared(std::move(nested_column.type)); auto nullable_column = ColumnNullable::create(std::move(nested_column.column), std::move(nullmap_column)); @@ -358,25 +372,27 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::UINT16: { auto column = readColumnWithNumericData(arrow_column, column_name); - column.type = std::make_shared(); + if (read_ints_as_dates) + column.type = std::make_shared(); return column; } case arrow::Type::UINT32: { auto column = readColumnWithNumericData(arrow_column, column_name); - column.type = std::make_shared(); + if (read_ints_as_dates) + column.type = std::make_shared(); return column; } case arrow::Type::TIMESTAMP: return readColumnWithTimestampData(arrow_column, column_name); case arrow::Type::DECIMAL128: - return readColumnWithDecimalData(arrow_column, column_name); + return readColumnWithDecimalData(arrow_column, column_name); case arrow::Type::DECIMAL256: - return readColumnWithDecimalData(arrow_column, column_name); + return readColumnWithDecimalData(arrow_column, column_name); case arrow::Type::MAP: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); const auto * tuple_column = assert_cast(nested_column.column.get()); @@ -388,7 +404,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::LIST: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); auto array_column = ColumnArray::create(std::move(nested_column.column), std::move(offsets_column)); auto array_type = std::make_shared(nested_column.type); @@ -413,7 +429,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( for (int i = 0; i != arrow_struct_type->num_fields(); ++i) { auto nested_arrow_column = std::make_shared(nested_arrow_columns[i]); - auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values); + auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates); 
tuple_elements.emplace_back(std::move(element.column)); tuple_types.emplace_back(std::move(element.type)); tuple_names.emplace_back(std::move(element.name)); @@ -436,7 +452,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( dict_array.emplace_back(dict_chunk.dictionary()); } auto arrow_dict_column = std::make_shared(dict_array); - auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values); + auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); /// We should convert read column to ColumnUnique. auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn(); @@ -483,7 +499,7 @@ static void checkStatus(const arrow::Status & status, const String & column_name throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()}; } -static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name) +Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name) { ColumnsWithTypeAndName sample_columns; for (const auto & field : schema.fields()) @@ -493,24 +509,21 @@ static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::stri std::unique_ptr array_builder; arrow::Status status = MakeBuilder(pool, field->type(), &array_builder); checkStatus(status, field->name(), format_name); + std::shared_ptr arrow_array; status = array_builder->Finish(&arrow_array); checkStatus(status, field->name(), format_name); + arrow::ArrayVector array_vector = {arrow_array}; auto arrow_column = std::make_shared(array_vector); std::unordered_map> dict_values; - ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values); + ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); + sample_columns.emplace_back(std::move(sample_column)); } return Block(std::move(sample_columns)); } -ArrowColumnToCHColumn::ArrowColumnToCHColumn( - const arrow::Schema & schema, const std::string & format_name_, bool import_nested_) - : header(arrowSchemaToCHHeader(schema, format_name_)), format_name(format_name_), import_nested(import_nested_) -{ -} - ArrowColumnToCHColumn::ArrowColumnToCHColumn( const Block & header_, const std::string & format_name_, bool import_nested_) : header(header_), format_name(format_name_), import_nested(import_nested_) @@ -553,7 +566,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & if (!nested_tables.contains(nested_table_name)) { std::shared_ptr arrow_column = name_to_column_ptr[nested_table_name]; - ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values)}; + ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)}; Block block(cols); nested_tables[nested_table_name] = std::make_shared(Nested::flatten(block)); } @@ -573,7 +586,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & if (read_from_nested) column = nested_tables[nested_table_name]->getByName(header_column.name); else - column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values); + column = readColumnFromArrowColumn(arrow_column, 
header_column.name, format_name, false, dictionary_values, true); try { diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index 46976093f0b..58f8f1536b5 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -23,16 +23,14 @@ public: ArrowColumnToCHColumn(const Block & header_, const std::string & format_name_, bool import_nested_); - /// Constructor that create header by arrow schema. It will be useful for inserting - /// data from file without knowing table structure. - ArrowColumnToCHColumn(const arrow::Schema & schema, const std::string & format_name, bool import_nested_); - void arrowTableToCHChunk(Chunk & res, std::shared_ptr & table); void arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr); + static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name); + private: - const Block header; + const Block & header; const std::string format_name; bool import_nested; diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 11e56ecbe0c..a372df41344 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -815,6 +815,92 @@ const AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(Sc return it->second; } +AvroSchemaReader::AvroSchemaReader(ReadBuffer & in_, bool confluent_, const FormatSettings & format_settings_) + : ISchemaReader(in_), confluent(confluent_), format_settings(format_settings_) +{ +} + +NamesAndTypesList AvroSchemaReader::readSchema() +{ + avro::NodePtr root_node; + if (confluent) + { + UInt32 schema_id = readConfluentSchemaId(in); + root_node = getConfluentSchemaRegistry(format_settings)->getSchema(schema_id).root(); + } + else + { + auto file_reader_ptr = std::make_unique(std::make_unique(in)); + root_node = file_reader_ptr->dataSchema().root(); + } + + if (root_node->type() != avro::Type::AVRO_RECORD) + throw Exception("Root schema must be a record", ErrorCodes::TYPE_MISMATCH); + + NamesAndTypesList names_and_types; + for (size_t i = 0; i != root_node->leaves(); ++i) + names_and_types.emplace_back(root_node->nameAt(i), avroNodeToDataType(root_node->leafAt(i))); + + return names_and_types; +} + +DataTypePtr AvroSchemaReader::avroNodeToDataType(avro::NodePtr node) +{ + switch (node->type()) + { + case avro::Type::AVRO_INT: + return {std::make_shared()}; + case avro::Type::AVRO_LONG: + return std::make_shared(); + case avro::Type::AVRO_BOOL: + return std::make_shared(); + case avro::Type::AVRO_FLOAT: + return std::make_shared(); + case avro::Type::AVRO_DOUBLE: + return std::make_shared(); + case avro::Type::AVRO_STRING: + return std::make_shared(); + case avro::Type::AVRO_BYTES: + return std::make_shared(); + case avro::Type::AVRO_ENUM: + { + if (node->names() < 128) + { + EnumValues::Values values; + for (size_t i = 0; i != node->names(); ++i) + values.emplace_back(node->nameAt(i), i); + return std::make_shared(std::move(values)); + } + else if (node->names() < 32768) + { + EnumValues::Values values; + for (size_t i = 0; i != node->names(); ++i) + values.emplace_back(node->nameAt(i), i); + return std::make_shared(std::move(values)); + } + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ClickHouse supports only 8 and 16-bit Enum."); + } + case avro::Type::AVRO_FIXED: + return std::make_shared(node->fixedSize()); + case 
avro::Type::AVRO_ARRAY: + return std::make_shared(avroNodeToDataType(node->leafAt(0))); + case avro::Type::AVRO_NULL: + return std::make_shared(); + case avro::Type::AVRO_UNION: + if (node->leaves() == 2 && (node->leafAt(0)->type() == avro::Type::AVRO_NULL || node->leafAt(1)->type() == avro::Type::AVRO_NULL)) + { + size_t nested_leaf_index = node->leafAt(0)->type() == avro::Type::AVRO_NULL ? 1 : 0; + return makeNullable(avroNodeToDataType(node->leafAt(nested_leaf_index))); + } + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Avro type UNION is not supported for inserting."); + case avro::Type::AVRO_SYMBOLIC: + return avroNodeToDataType(avro::resolveSymbol(node)); + default: + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Avro column {} is not supported for inserting."); + } +} + void registerInputFormatAvro(FormatFactory & factory) { factory.registerInputFormat("Avro", []( @@ -836,6 +922,21 @@ void registerInputFormatAvro(FormatFactory & factory) }); } +void registerAvroSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, false, settings); + }); + + factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, true, settings); + }); + +} + + } #else @@ -846,6 +947,8 @@ class FormatFactory; void registerInputFormatAvro(FormatFactory &) { } + +void registerAvroSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.h b/src/Processors/Formats/Impl/AvroRowInputFormat.h index 73237369e56..46e571d87ec 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.h +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -160,6 +161,20 @@ private: FormatSettings format_settings; }; +class AvroSchemaReader : public ISchemaReader +{ +public: + AvroSchemaReader(ReadBuffer & in_, bool confluent_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + DataTypePtr avroNodeToDataType(avro::NodePtr node); + + bool confluent; + const FormatSettings format_settings; +}; + } #endif diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp index 0506c539c0f..b356967a544 100644 --- a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp @@ -5,7 +5,6 @@ #include #include - namespace DB { @@ -15,11 +14,23 @@ namespace ErrorCodes } BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(std::move(header), in_, std::move(params_), with_names_, with_types_, format_settings_) + : RowInputFormatWithNamesAndTypes( + std::move(header), + in_, + std::move(params_), + with_names_, + with_types_, + format_settings_, + std::make_unique(in_, format_settings_)) { } -std::vector BinaryRowInputFormat::readHeaderRow() + +BinaryFormatReader::BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_) +{ +} + +std::vector BinaryFormatReader::readHeaderRow() { std::vector fields; String field; @@ -31,13 +42,13 @@ std::vector BinaryRowInputFormat::readHeaderRow() return fields; } -std::vector BinaryRowInputFormat::readNames() 
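An illustration of the mapping above on a concrete, made-up Avro record schema (not part of the patch):

    /// {"type": "record", "name": "t", "fields": [
    ///     {"name": "id",   "type": "long"},                              // -> Int64
    ///     {"name": "tag",  "type": ["null", "string"]},                  // -> Nullable(String)
    ///     {"name": "vals", "type": {"type": "array", "items": "double"}} // -> Array(Float64)
    /// ]}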
diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp
index 0506c539c0f..b356967a544 100644
--- a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp
@@ -5,7 +5,6 @@
 #include
 #include
 
-
 namespace DB
 {
 
@@ -15,11 +14,23 @@ namespace ErrorCodes
 }
 
 BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
-    : RowInputFormatWithNamesAndTypes(std::move(header), in_, std::move(params_), with_names_, with_types_, format_settings_)
+    : RowInputFormatWithNamesAndTypes(
+        std::move(header),
+        in_,
+        std::move(params_),
+        with_names_,
+        with_types_,
+        format_settings_,
+        std::make_unique<BinaryFormatReader>(in_, format_settings_))
 {
 }
 
-std::vector<String> BinaryRowInputFormat::readHeaderRow()
+
+BinaryFormatReader::BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_)
+{
+}
+
+std::vector<String> BinaryFormatReader::readHeaderRow()
 {
     std::vector<String> fields;
     String field;
@@ -31,13 +42,13 @@ std::vector<String> BinaryFormatReader::readHeaderRow()
     return fields;
 }
 
-std::vector<String> BinaryRowInputFormat::readNames()
+std::vector<String> BinaryFormatReader::readNames()
 {
     readVarUInt(read_columns, *in);
     return readHeaderRow();
 }
 
-std::vector<String> BinaryRowInputFormat::readTypes()
+std::vector<String> BinaryFormatReader::readTypes()
 {
     auto types = readHeaderRow();
     for (const auto & type_name : types)
@@ -45,31 +56,37 @@ std::vector<String> BinaryFormatReader::readTypes()
     return types;
 }
 
-bool BinaryRowInputFormat::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/)
+bool BinaryFormatReader::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/)
 {
     serialization->deserializeBinary(column, *in);
     return true;
 }
 
-void BinaryRowInputFormat::skipHeaderRow()
+void BinaryFormatReader::skipHeaderRow()
 {
     String tmp;
     for (size_t i = 0; i < read_columns; ++i)
         readStringBinary(tmp, *in);
 }
 
-void BinaryRowInputFormat::skipNames()
+void BinaryFormatReader::skipNames()
 {
     readVarUInt(read_columns, *in);
     skipHeaderRow();
 }
 
-void BinaryRowInputFormat::skipTypes()
+void BinaryFormatReader::skipTypes()
 {
+    if (read_columns == 0)
+    {
+        /// It's possible only when with_names = false and with_types = true
+        readVarUInt(read_columns, *in);
+    }
+
     skipHeaderRow();
 }
 
-void BinaryRowInputFormat::skipField(size_t file_column)
+void BinaryFormatReader::skipField(size_t file_column)
 {
     if (file_column >= read_data_types.size())
         throw Exception(ErrorCodes::CANNOT_SKIP_UNKNOWN_FIELD, "Cannot skip unknown field in RowBinaryWithNames format, because its type is unknown");
@@ -77,6 +94,11 @@ void BinaryFormatReader::skipField(size_t file_column)
     read_data_types[file_column]->getDefaultSerialization()->deserializeBinary(field, *in);
 }
 
+BinaryWithNamesAndTypesSchemaReader::BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
+    : FormatWithNamesAndTypesSchemaReader(in_, 0, true, true, &reader), reader(in_, format_settings_)
+{
+}
+
 void registerInputFormatRowBinary(FormatFactory & factory)
 {
     auto register_func = [&](const String & format_name, bool with_names, bool with_types)
@@ -94,4 +116,13 @@ void registerInputFormatRowBinary(FormatFactory & factory)
     registerWithNamesAndTypes("RowBinary", register_func);
 }
 
+void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory)
+{
+    factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
+    {
+        return std::make_shared<BinaryWithNamesAndTypesSchemaReader>(buf, settings);
+    });
+}
+
 }
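An illustration of the header this schema reader consumes: RowBinaryWithNamesAndTypes starts with a varint column count, then the names, then the types, each as a length-prefixed string. Reading it by hand with the helpers used above (`buf` stands for any ReadBuffer over such data):

    UInt64 columns = 0;
    readVarUInt(columns, buf);
    std::vector<String> names(columns), types(columns);
    for (auto & name : names)
        readStringBinary(name, buf);
    for (auto & type : types)
        readStringBinary(type, buf);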
diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.h b/src/Processors/Formats/Impl/BinaryRowInputFormat.h
index 61d6df77522..d98e75bf621 100644
--- a/src/Processors/Formats/Impl/BinaryRowInputFormat.h
+++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.h
@@ -1,15 +1,19 @@
 #pragma once
 
 #include
-#include
 #include
+#include
 
 namespace DB
 {
 
-class ReadBuffer;
+namespace ErrorCodes
+{
+    extern const int NOT_IMPLEMENTED;
+}
 
+class ReadBuffer;
 
 /** A stream for inputting data in a binary line-by-line format.
   */
@@ -24,9 +28,15 @@ public:
     /// in this format we cannot provide any DiagnosticInfo, because here we have
     /// just binary data.
     std::string getDiagnosticInfo() override { return {}; }
+};
+
+class BinaryFormatReader : public FormatWithNamesAndTypesReader
+{
+public:
+    BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_);
 
-private:
     bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
+
     void skipField(size_t file_column) override;
 
     void skipNames() override;
@@ -37,9 +47,24 @@ private:
     std::vector<String> readTypes() override;
     std::vector<String> readHeaderRow();
 
+private:
     /// Data types read from input data.
     DataTypes read_data_types;
-    UInt64 read_columns = 0;
+    UInt64 read_columns;
+};
+
+class BinaryWithNamesAndTypesSchemaReader : public FormatWithNamesAndTypesSchemaReader
+{
+public:
+    BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);
+
+private:
+    DataTypes readRowAndGetDataTypes() override
+    {
+        throw Exception{ErrorCodes::NOT_IMPLEMENTED, "Method readRowAndGetDataTypes is not implemented"};
+    }
+
+    BinaryFormatReader reader;
 };
 
 }
diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
index 9de2b908b1e..735a549d0a6 100644
--- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
@@ -5,13 +5,16 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
-#include
+#include
+#include
+
 namespace DB
 {
-
 namespace ErrorCodes
 {
     extern const int BAD_ARGUMENTS;
@@ -26,7 +29,14 @@ CSVRowInputFormat::CSVRowInputFormat(
     bool with_names_,
     bool with_types_,
     const FormatSettings & format_settings_)
-    : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_)
+    : RowInputFormatWithNamesAndTypes(
+        header_,
+        in_,
+        params_,
+        with_names_,
+        with_types_,
+        format_settings_,
+        std::make_unique<CSVFormatReader>(in_, format_settings_))
 {
     const String bad_delimiters = " \t\"'.UL";
     if (bad_delimiters.find(format_settings.csv.delimiter) != String::npos)
@@ -36,6 +46,11 @@ CSVRowInputFormat::CSVRowInputFormat(
             ErrorCodes::BAD_ARGUMENTS);
 }
 
+void CSVRowInputFormat::syncAfterError()
+{
+    skipToNextLineOrEOF(*in);
+}
+
 static void skipEndOfLine(ReadBuffer & in)
 {
     /// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
@@ -52,8 +67,10 @@ static void skipEndOfLine(ReadBuffer & in)
         if (!in.eof() && *in.position() == '\n')
             ++in.position();
         else
-            throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
-                " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA);
+            throw Exception(
+                "Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
+                " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.",
+                ErrorCodes::INCORRECT_DATA);
     }
     else if (!in.eof())
         throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA);
@@ -62,32 +79,38 @@ static void skipEndOfLine(ReadBuffer & in)
 
 /// Skip `whitespace` symbols allowed in CSV.
 static inline void skipWhitespacesAndTabs(ReadBuffer & in)
 {
-    while (!in.eof()
-            && (*in.position() == ' '
-                || *in.position() == '\t'))
+    while (!in.eof() && (*in.position() == ' ' || *in.position() == '\t'))
         ++in.position();
 }
 
-void CSVRowInputFormat::skipFieldDelimiter()
+CSVFormatReader::CSVFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_)
+{
+}
+
+void CSVFormatReader::skipFieldDelimiter()
 {
     skipWhitespacesAndTabs(*in);
     assertChar(format_settings.csv.delimiter, *in);
 }
 
-String CSVRowInputFormat::readFieldIntoString()
+template <bool read_string>
+String CSVFormatReader::readCSVFieldIntoString()
 {
     skipWhitespacesAndTabs(*in);
     String field;
-    readCSVString(field, *in, format_settings.csv);
+    if constexpr (read_string)
+        readCSVString(field, *in, format_settings.csv);
+    else
+        readCSVField(field, *in, format_settings.csv);
     return field;
 }
 
-void CSVRowInputFormat::skipField()
+void CSVFormatReader::skipField()
 {
-    readFieldIntoString();
+    readCSVFieldIntoString<true>();
 }
 
-void CSVRowInputFormat::skipRowEndDelimiter()
+void CSVFormatReader::skipRowEndDelimiter()
 {
     skipWhitespacesAndTabs(*in);
 
@@ -105,33 +128,32 @@ void CSVFormatReader::skipRowEndDelimiter()
     skipEndOfLine(*in);
 }
 
-void CSVRowInputFormat::skipHeaderRow()
+void CSVFormatReader::skipHeaderRow()
 {
     do
     {
         skipField();
         skipWhitespacesAndTabs(*in);
-    }
-    while (checkChar(format_settings.csv.delimiter, *in));
+    } while (checkChar(format_settings.csv.delimiter, *in));
 
     skipRowEndDelimiter();
 }
 
-std::vector<String> CSVRowInputFormat::readHeaderRow()
+template <bool read_string>
+std::vector<String> CSVFormatReader::readRowImpl()
 {
     std::vector<String> fields;
     do
     {
-        fields.push_back(readFieldIntoString());
+        fields.push_back(readCSVFieldIntoString<read_string>());
         skipWhitespacesAndTabs(*in);
-    }
-    while (checkChar(format_settings.csv.delimiter, *in));
+    } while (checkChar(format_settings.csv.delimiter, *in));
 
     skipRowEndDelimiter();
     return fields;
 }
 
-bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
+bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
 {
     const char delimiter = format_settings.csv.delimiter;
 
@@ -144,7 +166,8 @@ bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
     {
         if (*in->position() == '\n' || *in->position() == '\r')
         {
-            out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected."
+            out << "ERROR: Line feed found where delimiter (" << delimiter
+                << ") is expected."
                    " It's like your file has fewer columns than expected.\n"
                    "And if your file has the right number of columns, maybe it has unescaped quotes in values.\n";
         }
@@ -160,7 +183,7 @@ bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
     return true;
 }
 
-bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
+bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
 {
     skipWhitespacesAndTabs(*in);
 
@@ -191,23 +214,21 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
     return true;
 }
 
-void CSVRowInputFormat::syncAfterError()
-{
-    skipToNextLineOrEOF(*in);
-}
-
-bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/)
+bool CSVFormatReader::readField(
+    IColumn & column,
+    const DataTypePtr & type,
+    const SerializationPtr & serialization,
+    bool is_last_file_column,
+    const String & /*column_name*/)
 {
     skipWhitespacesAndTabs(*in);
 
     const bool at_delimiter = !in->eof() && *in->position() == format_settings.csv.delimiter;
-    const bool at_last_column_line_end = is_last_file_column
-        && (in->eof() || *in->position() == '\n' || *in->position() == '\r');
+    const bool at_last_column_line_end = is_last_file_column && (in->eof() || *in->position() == '\n' || *in->position() == '\r');
 
     /// Note: Tuples are serialized in CSV as separate columns, but with empty_as_default or null_as_default
     /// only one empty or NULL column will be expected
-    if (format_settings.csv.empty_as_default
-        && (at_delimiter || at_last_column_line_end))
+    if (format_settings.csv.empty_as_default && (at_delimiter || at_last_column_line_end))
     {
         /// Treat empty unquoted column value as default value, if
        /// specified in the settings. Tuple columns might seem
@@ -231,6 +252,31 @@ bool CSVFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String &)
     }
 }
 
+
+CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_)
+    : FormatWithNamesAndTypesSchemaReader(
+        in_,
+        format_setting_.max_rows_to_read_for_schema_inference,
+        with_names_,
+        with_types_,
+        &reader,
+        getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV))
+    , reader(in_, format_setting_)
+    , context(context_)
+{
+}
+
+
+DataTypes CSVSchemaReader::readRowAndGetDataTypes()
+{
+    if (in.eof())
+        return {};
+
+    auto fields = reader.readRow();
+    return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV, context);
+}
+
+
 void registerInputFormatCSV(FormatFactory & factory)
 {
     auto register_func = [&](const String & format_name, bool with_names, bool with_types)
@@ -326,4 +372,17 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory)
     registerWithNamesAndTypes("CSV", register_func);
 }
 
+void registerCSVSchemaReader(FormatFactory & factory)
+{
+    auto register_func = [&](const String & format_name, bool with_names, bool with_types)
+    {
+        factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
+        {
+            return std::make_shared<CSVSchemaReader>(buf, with_names, with_types, settings, context);
+        });
+    };
+
+    registerWithNamesAndTypes("CSV", register_func);
+}
+
 }
diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h
index d7c557b58d8..d723647595e 100644
--- a/src/Processors/Formats/Impl/CSVRowInputFormat.h
+++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h
@@ -5,6 +5,7 @@
 #include
 #include
+#include
 #include
 
@@ -28,6 +29,12 @@ public:
 
 private:
     bool allowSyncAfterError() const override { return true; }
     void syncAfterError() override;
+};
+
+class CSVFormatReader : public FormatWithNamesAndTypesReader
+{
+public:
+    CSVFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_);
 
     bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override;
     bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override;
@@ -42,17 +49,34 @@ public:
     void skipField(size_t /*file_column*/) override { skipField(); }
     void skipField();
 
-    void skipHeaderRow() ;
+    void skipHeaderRow();
     void skipNames() override { skipHeaderRow(); }
     void skipTypes() override { skipHeaderRow(); }
     void skipFieldDelimiter() override;
     void skipRowEndDelimiter() override;
 
-    std::vector<String> readHeaderRow();
     std::vector<String> readNames() override { return readHeaderRow(); }
     std::vector<String> readTypes() override { return readHeaderRow(); }
+    std::vector<String> readHeaderRow() { return readRowImpl<true>(); }
+    std::vector<String> readRow() { return readRowImpl<false>(); }
 
-    String readFieldIntoString();
+    template <bool read_string>
+    std::vector<String> readRowImpl();
+
+    template <bool read_string>
+    String readCSVFieldIntoString();
+};
+
+class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader
+{
+public:
+    CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_);
+
+private:
+    DataTypes readRowAndGetDataTypes() override;
+
+    CSVFormatReader reader;
+    ContextPtr context;
 };
 
 }
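An illustration (made-up data, not part of the patch) of what CSVSchemaReader produces for a header-less input read with the default settings:

    /// Input:
    ///     1,"abc"
    ///     2,"def"
    /// Each field is re-parsed under escaping rule CSV, so the result is roughly
    ///     c1 Float64, c2 String
    /// with the names auto-generated by IRowSchemaReader ('c1', 'c2', ...).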
diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp
index 4d000bb1f35..311f4742335 100644
--- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp
@@ -273,6 +273,7 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext)
 #endif
     auto root_reader = msg.getRoot<capnp::DynamicStruct>(root);
+
     for (size_t i = 0; i != columns.size(); ++i)
     {
         auto value = getReaderByColumnName(root_reader, column_names[i]);
@@ -282,6 +283,24 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext)
     return true;
 }
 
+CapnProtoSchemaReader::CapnProtoSchemaReader(const FormatSettings & format_settings_) : format_settings(format_settings_)
+{
+}
+
+NamesAndTypesList CapnProtoSchemaReader::readSchema()
+{
+    auto schema_info = FormatSchemaInfo(
+        format_settings.schema.format_schema,
+        "CapnProto",
+        true,
+        format_settings.schema.is_server,
+        format_settings.schema.format_schema_path);
+
+    auto schema_parser = CapnProtoSchemaParser();
+    auto schema = schema_parser.getMessageSchema(schema_info);
+    return capnProtoSchemaToCHSchema(schema);
+}
+
 void registerInputFormatCapnProto(FormatFactory & factory)
 {
     factory.registerInputFormat(
@@ -293,6 +312,14 @@ void registerInputFormatCapnProto(FormatFactory & factory)
         });
 }
 
+void registerCapnProtoSchemaReader(FormatFactory & factory)
+{
+    factory.registerExternalSchemaReader("CapnProto", [](const FormatSettings & settings)
+    {
+        return std::make_shared<CapnProtoSchemaReader>(settings);
+    });
+}
+
 }
 
 #else
@@ -301,6 +328,7 @@ namespace DB
 {
     class FormatFactory;
     void registerInputFormatCapnProto(FormatFactory &) {}
+    void registerCapnProtoSchemaReader(FormatFactory &) {}
 }
 
 #endif // USE_CAPNP
diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h
index 4c0f34d70a3..053de14d1a4 100644
--- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h
+++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include <Processors/Formats/ISchemaReader.h>
 
 namespace DB
 {
@@ -38,6 +39,17 @@ private:
     Names column_names;
 };
 
+class CapnProtoSchemaReader : public IExternalSchemaReader
+{
+public:
+    explicit CapnProtoSchemaReader(const FormatSettings & format_settings_);
+
+    NamesAndTypesList readSchema() override;
+
+private:
+    const FormatSettings format_settings;
+};
+
 }
 
 #endif // USE_CAPNP
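A sketch of a constant-schema reader in the same style as the CapnProto one above (not part of the patch; the format name and column are hypothetical):

    class MyFixedSchemaReader : public IExternalSchemaReader
    {
    public:
        NamesAndTypesList readSchema() override
        {
            /// No data is inspected; the schema is fixed.
            return {{"line", std::make_shared<DataTypeString>()}};
        }
    };

    /// It would be registered the same way as CapnProto above:
    ///     factory.registerExternalSchemaReader("MyFixedFormat",
    ///         [](const FormatSettings &) { return std::make_shared<MyFixedSchemaReader>(); });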
CustomSeparatedWithNames(AndTypes) formats and enabled setting input_format_with_names_use_header we don't know /// the exact number of columns in data (because it can contain unknown columns). So, if field_delimiter and row_after_delimiter are @@ -61,43 +66,76 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat( } } -void CustomSeparatedRowInputFormat::skipPrefixBeforeHeader() + +bool CustomSeparatedRowInputFormat::allowSyncAfterError() const +{ + return !format_settings.custom.row_after_delimiter.empty() || !format_settings.custom.row_between_delimiter.empty(); +} + +void CustomSeparatedRowInputFormat::syncAfterError() +{ + skipToNextRowOrEof(*buf, format_settings.custom.row_after_delimiter, format_settings.custom.row_between_delimiter, ignore_spaces); + end_of_stream = buf->eof(); + /// It can happen that buf->position() is not at the beginning of row + /// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter. + /// It will cause another parsing error. +} + +void CustomSeparatedRowInputFormat::setReadBuffer(ReadBuffer & in_) +{ + buf = std::make_unique(in_); + RowInputFormatWithNamesAndTypes::setReadBuffer(*buf); +} + +CustomSeparatedFormatReader::CustomSeparatedFormatReader( + PeekableReadBuffer & buf_, bool ignore_spaces_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesReader(buf_, format_settings_), buf(&buf_), ignore_spaces(ignore_spaces_) +{ +} + +void CustomSeparatedRowInputFormat::resetParser() +{ + RowInputFormatWithNamesAndTypes::resetParser(); + buf->reset(); +} + +void CustomSeparatedFormatReader::skipPrefixBeforeHeader() { skipSpaces(); assertString(format_settings.custom.result_before_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipRowStartDelimiter() +void CustomSeparatedFormatReader::skipRowStartDelimiter() { skipSpaces(); assertString(format_settings.custom.row_before_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipFieldDelimiter() +void CustomSeparatedFormatReader::skipFieldDelimiter() { skipSpaces(); assertString(format_settings.custom.field_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipRowEndDelimiter() +void CustomSeparatedFormatReader::skipRowEndDelimiter() { skipSpaces(); assertString(format_settings.custom.row_after_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipRowBetweenDelimiter() +void CustomSeparatedFormatReader::skipRowBetweenDelimiter() { skipSpaces(); assertString(format_settings.custom.row_between_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipField() +void CustomSeparatedFormatReader::skipField() { skipSpaces(); - skipFieldByEscapingRule(*buf, escaping_rule, format_settings); + skipFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); } -bool CustomSeparatedRowInputFormat::checkEndOfRow() +bool CustomSeparatedFormatReader::checkEndOfRow() { PeekableReadBufferCheckpoint checkpoint{*buf, true}; @@ -118,43 +156,66 @@ bool CustomSeparatedRowInputFormat::checkEndOfRow() return checkForSuffixImpl(true); } -std::vector CustomSeparatedRowInputFormat::readHeaderRow() +template +String CustomSeparatedFormatReader::readFieldIntoString(bool is_first) +{ + if (!is_first) + skipFieldDelimiter(); + skipSpaces(); + if constexpr (is_header) + return readStringByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); + else + return readFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); +} + +template +std::vector CustomSeparatedFormatReader::readRowImpl() { 
std::vector values; skipRowStartDelimiter(); - do + + if (columns == 0) { - if (!values.empty()) - skipFieldDelimiter(); - skipSpaces(); - values.push_back(readStringByEscapingRule(*buf, escaping_rule, format_settings)); + do + { + values.push_back(readFieldIntoString(values.empty())); + } while (!checkEndOfRow()); + columns = values.size(); + } + else + { + for (size_t i = 0; i != columns; ++i) + values.push_back(readFieldIntoString(i == 0)); } - while (!checkEndOfRow()); skipRowEndDelimiter(); return values; } -void CustomSeparatedRowInputFormat::skipHeaderRow() +void CustomSeparatedFormatReader::skipHeaderRow() { - size_t columns = getPort().getHeader().columns(); skipRowStartDelimiter(); - for (size_t i = 0; i != columns; ++i) + bool first = true; + do { - skipField(); - if (i + 1 != columns) + if (!first) skipFieldDelimiter(); + first = false; + + skipField(); } + while (!checkEndOfRow()); + skipRowEndDelimiter(); } -bool CustomSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool, const String &) +bool CustomSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool, const String &) { skipSpaces(); - return deserializeFieldByEscapingRule(type, serialization, column, *buf, escaping_rule, format_settings); + return deserializeFieldByEscapingRule(type, serialization, column, *buf, format_settings.custom.escaping_rule, format_settings); } -bool CustomSeparatedRowInputFormat::checkForSuffixImpl(bool check_eof) +bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof) { skipSpaces(); if (format_settings.custom.result_after_delimiter.empty()) @@ -177,7 +238,7 @@ bool CustomSeparatedRowInputFormat::checkForSuffixImpl(bool check_eof) return false; } -bool CustomSeparatedRowInputFormat::tryParseSuffixWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::tryParseSuffixWithDiagnosticInfo(WriteBuffer & out) { PeekableReadBufferCheckpoint checkpoint{*buf}; if (checkForSuffixImpl(false)) @@ -192,7 +253,7 @@ bool CustomSeparatedRowInputFormat::tryParseSuffixWithDiagnosticInfo(WriteBuffer return true; } -bool CustomSeparatedRowInputFormat::checkForSuffix() +bool CustomSeparatedFormatReader::checkForSuffix() { PeekableReadBufferCheckpoint checkpoint{*buf}; if (checkForSuffixImpl(true)) @@ -201,51 +262,60 @@ bool CustomSeparatedRowInputFormat::checkForSuffix() return false; } - -bool CustomSeparatedRowInputFormat::allowSyncAfterError() const -{ - return !format_settings.custom.row_after_delimiter.empty() || !format_settings.custom.row_between_delimiter.empty(); -} - -void CustomSeparatedRowInputFormat::syncAfterError() -{ - skipToNextRowOrEof(*buf, format_settings.custom.row_after_delimiter, format_settings.custom.row_between_delimiter, ignore_spaces); - end_of_stream = buf->eof(); - /// It can happen that buf->position() is not at the beginning of row - /// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter. - /// It will cause another parsing error. 
-} - -bool CustomSeparatedRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_before_delimiter, "delimiter before first field", ignore_spaces); } -bool CustomSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.field_delimiter, "delimiter between fields", ignore_spaces); } -bool CustomSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_after_delimiter, "delimiter after last field", ignore_spaces); } -bool CustomSeparatedRowInputFormat::parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_between_delimiter, "delimiter between rows", ignore_spaces); } -void CustomSeparatedRowInputFormat::resetParser() +void CustomSeparatedFormatReader::setReadBuffer(ReadBuffer & in_) { - RowInputFormatWithNamesAndTypes::resetParser(); - buf->reset(); + buf = assert_cast(&in_); + FormatWithNamesAndTypesReader::setReadBuffer(in_); } -void CustomSeparatedRowInputFormat::setReadBuffer(ReadBuffer & in_) +CustomSeparatedSchemaReader::CustomSeparatedSchemaReader( + ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_) + : FormatWithNamesAndTypesSchemaReader( + buf, + format_setting_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + getDefaultDataTypeForEscapingRule(format_setting_.custom.escaping_rule)) + , buf(in_) + , reader(buf, ignore_spaces_, updateFormatSettings(format_setting_)) + , context(context_) { - buf = std::make_unique(in_); - IInputFormat::setReadBuffer(*buf); +} + +DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes() +{ + if (reader.checkForSuffix()) + return {}; + + if (!first_row || with_names || with_types) + reader.skipRowBetweenDelimiter(); + + if (first_row) + first_row = false; + + auto fields = reader.readRow(); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), context); } void registerInputFormatCustomSeparated(FormatFactory & factory) @@ -267,4 +337,20 @@ void registerInputFormatCustomSeparated(FormatFactory & factory) } } +void registerCustomSeparatedSchemaReader(FormatFactory & factory) +{ + for (bool ignore_spaces : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, with_names, with_types, ignore_spaces, settings, context); + }); + }; + + registerWithNamesAndTypes(ignore_spaces ? 
"CustomSeparatedIgnoreSpaces" : "CustomSeparated", register_func); + } +} + } diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h index 6b572ca1417..d38d5bf0da4 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h @@ -19,7 +19,6 @@ public: void resetParser() override; String getName() const override { return "CustomSeparatedRowInputFormat"; } - void setReadBuffer(ReadBuffer & in_) override; private: @@ -28,6 +27,19 @@ private: std::unique_ptr in_buf_, const Params & params_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_settings_); + + bool allowSyncAfterError() const override; + void syncAfterError() override; + + std::unique_ptr buf; + bool ignore_spaces; +}; + +class CustomSeparatedFormatReader : public FormatWithNamesAndTypesReader +{ +public: + CustomSeparatedFormatReader(PeekableReadBuffer & buf_, bool ignore_spaces_, const FormatSettings & format_settings_); + using EscapingRule = FormatSettings::EscapingRule; bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; @@ -46,9 +58,6 @@ private: bool checkForSuffix() override; - bool allowSyncAfterError() const override; - void syncAfterError() override; - bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; @@ -57,15 +66,41 @@ private: std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } - std::vector readHeaderRow(); + std::vector readHeaderRow() {return readRowImpl(); } + + std::vector readRow() { return readRowImpl(); } bool checkEndOfRow(); bool checkForSuffixImpl(bool check_eof); inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); } - std::unique_ptr buf; + EscapingRule getEscapingRule() { return format_settings.custom.escaping_rule; } + + void setReadBuffer(ReadBuffer & in_) override; +private: + template + std::vector readRowImpl(); + + template + String readFieldIntoString(bool is_first); + + PeekableReadBuffer * buf; bool ignore_spaces; - EscapingRule escaping_rule; + size_t columns = 0; +}; + +class CustomSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + PeekableReadBuffer buf; + CustomSeparatedFormatReader reader; + ContextPtr context; + bool first_row = true; }; } diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp index 476985c2509..56ba975dea1 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp @@ -202,4 +202,12 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factor factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsString", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } +void registerJSONAsStringSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("JSONAsString", [](const FormatSettings &) + { 
+ return std::make_shared(); + }); +} + } diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h index d86142af795..ea6e9a1ed2f 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h @@ -1,8 +1,10 @@ #pragma once #include +#include #include #include +#include namespace DB { @@ -39,4 +41,13 @@ private: bool allow_new_rows = true; }; +class JSONAsStringExternalSchemaReader : public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"json", std::make_shared()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 88fb411ffbd..263702ad20f 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -8,16 +9,13 @@ #include #include #include +#include +#include +#include namespace DB { -namespace ErrorCodes -{ - extern const int INCORRECT_DATA; -} - - JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat( const Block & header_, ReadBuffer & in_, @@ -26,24 +24,40 @@ JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat( bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, in_, std::move(params_), with_names_, with_types_, format_settings_) - , yield_strings(yield_strings_) + : RowInputFormatWithNamesAndTypes( + header_, + in_, + std::move(params_), + with_names_, + with_types_, + format_settings_, + std::make_unique(in_, yield_strings_, format_settings_)) { } -void JSONCompactEachRowRowInputFormat::skipRowStartDelimiter() +void JSONCompactEachRowRowInputFormat::syncAfterError() +{ + skipToUnescapedNextLineOrEOF(*in); +} + +JSONCompactEachRowFormatReader::JSONCompactEachRowFormatReader(ReadBuffer & in_, bool yield_strings_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesReader(in_, format_settings_), yield_strings(yield_strings_) +{ +} + +void JSONCompactEachRowFormatReader::skipRowStartDelimiter() { skipWhitespaceIfAny(*in); assertChar('[', *in); } -void JSONCompactEachRowRowInputFormat::skipFieldDelimiter() +void JSONCompactEachRowFormatReader::skipFieldDelimiter() { skipWhitespaceIfAny(*in); assertChar(',', *in); } -void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter() +void JSONCompactEachRowFormatReader::skipRowEndDelimiter() { skipWhitespaceIfAny(*in); assertChar(']', *in); @@ -55,29 +69,18 @@ void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter() skipWhitespaceIfAny(*in); } -String JSONCompactEachRowRowInputFormat::readFieldIntoString() +void JSONCompactEachRowFormatReader::skipField() { skipWhitespaceIfAny(*in); - String field; - readJSONString(field, *in); - return field; + skipJSONField(*in, "skipped_field"); } -void JSONCompactEachRowRowInputFormat::skipField(size_t file_column) -{ - skipWhitespaceIfAny(*in); - skipJSONField(*in, column_mapping->names_of_columns[file_column]); -} - -void JSONCompactEachRowRowInputFormat::skipHeaderRow() +void JSONCompactEachRowFormatReader::skipHeaderRow() { skipRowStartDelimiter(); - size_t i = 0; do { - if (i >= column_mapping->names_of_columns.size()) - throw Exception(ErrorCodes::INCORRECT_DATA, "The number of columns in a row differs from the number of column 
names"); - skipField(i++); + skipField(); skipWhitespaceIfAny(*in); } while (checkChar(',', *in)); @@ -85,13 +88,16 @@ void JSONCompactEachRowRowInputFormat::skipHeaderRow() skipRowEndDelimiter(); } -std::vector JSONCompactEachRowRowInputFormat::readHeaderRow() +std::vector JSONCompactEachRowFormatReader::readHeaderRow() { skipRowStartDelimiter(); std::vector fields; + String field; do { - fields.push_back(readFieldIntoString()); + skipWhitespaceIfAny(*in); + readJSONString(field, *in); + fields.push_back(field); skipWhitespaceIfAny(*in); } while (checkChar(',', *in)); @@ -100,18 +106,13 @@ std::vector JSONCompactEachRowRowInputFormat::readHeaderRow() return fields; } -bool JSONCompactEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name) +bool JSONCompactEachRowFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name) { skipWhitespaceIfAny(*in); return readFieldImpl(*in, column, type, serialization, column_name, format_settings, yield_strings); } -void JSONCompactEachRowRowInputFormat::syncAfterError() -{ - skipToUnescapedNextLineOrEOF(*in); -} - -bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out) { skipWhitespaceIfAny(*in); if (!checkChar('[', *in)) @@ -123,7 +124,7 @@ bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuff return true; } -bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { try { @@ -150,7 +151,7 @@ bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(Wri return true; } -bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { skipWhitespaceIfAny(*in); @@ -180,6 +181,20 @@ bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer return true; } +JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesSchemaReader(in_, format_settings_.max_rows_to_read_for_schema_inference, with_names_, with_types_, &reader), reader(in_, yield_strings_, format_settings_) +{ +} + +DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypes() +{ + skipWhitespaceIfAny(in); + if (in.eof()) + return {}; + + return readRowAndGetDataTypesForJSONCompactEachRow(in, reader.yieldStrings()); +} + void registerInputFormatJSONCompactEachRow(FormatFactory & factory) { for (bool yield_strings : {true, false}) @@ -200,6 +215,21 @@ void registerInputFormatJSONCompactEachRow(FormatFactory & factory) } } +void registerJSONCompactEachRowSchemaReader(FormatFactory & factory) +{ + for (bool json_strings : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, with_names, with_types, json_strings, settings); + }); + }; + registerWithNamesAndTypes(json_strings ? 
"JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func); + } +} + void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h index e01a4f49b30..0551aa8b64e 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -10,6 +11,7 @@ namespace DB class ReadBuffer; + /** A stream for reading data in a bunch of formats: * - JSONCompactEachRow * - JSONCompactEachRowWithNamesAndTypes @@ -34,6 +36,13 @@ public: private: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; +}; + +class JSONCompactEachRowFormatReader : public FormatWithNamesAndTypesReader +{ +public: + JSONCompactEachRowFormatReader(ReadBuffer & in_, bool yield_strings_, const FormatSettings & format_settings_); + bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; @@ -45,7 +54,8 @@ private: bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; - void skipField(size_t file_column) override; + void skipField(size_t /*column_index*/) override { skipField(); } + void skipField(); void skipHeaderRow(); void skipNames() override { skipHeaderRow(); } void skipTypes() override { skipHeaderRow(); } @@ -56,9 +66,21 @@ private: std::vector readHeaderRow(); std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } - String readFieldIntoString(); + bool yieldStrings() const { return yield_strings; } +private: bool yield_strings; }; +class JSONCompactEachRowRowSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_); + +private: + DataTypes readRowAndGetDataTypes() override; + + JSONCompactEachRowFormatReader reader; +}; + } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 28481313974..75beca955b9 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB { @@ -286,11 +287,7 @@ void JSONEachRowRowInputFormat::readPrefix() skipBOMIfExists(*in); skipWhitespaceIfAny(*in); - if (!in->eof() && *in->position() == '[') - { - ++in->position(); - data_in_square_brackets = true; - } + data_in_square_brackets = checkChar('[', *in); } void JSONEachRowRowInputFormat::readSuffix() @@ -309,6 +306,28 @@ void JSONEachRowRowInputFormat::readSuffix() assertEOF(*in); } +JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings_, const FormatSettings & format_settings) + : IRowWithNamesSchemaReader(in_, format_settings.max_rows_to_read_for_schema_inference), json_strings(json_strings_) +{ +} + + +std::unordered_map JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes() +{ + if (first_row) + { + skipBOMIfExists(in); + skipWhitespaceIfAny(in); + 
checkChar('[', in); + first_row = false; + } + + skipWhitespaceIfAny(in); + if (in.eof()) + return {}; + + return readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings); } void registerInputFormatJSONEachRow(FormatFactory & factory) { @@ -343,4 +362,17 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory factory.registerNonTrivialPrefixAndSuffixChecker("JSONStringsEachRow", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } +void registerJSONEachRowSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_unique<JSONEachRowSchemaReader>(buf, false, settings); + }); + + factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_unique<JSONEachRowSchemaReader>(buf, true, settings); + }); +} + } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index 9810f2dc765..323909a7730 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -84,4 +85,16 @@ private: bool yield_strings; }; +class JSONEachRowSchemaReader : public IRowWithNamesSchemaReader +{ +public: + JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings); + +private: + std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() override; + + bool json_strings; + bool first_row = true; +}; + } diff --git a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp index 1a05f61d36b..5983f3170e5 100644 --- a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp @@ -72,4 +72,13 @@ void registerInputFormatLineAsString(FormatFactory & factory) return std::make_shared<LineAsStringRowInputFormat>(sample, buf, params); }); } + +void registerLineAsStringSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("LineAsString", []( + const FormatSettings &) + { + return std::make_shared<LineAsStringSchemaReader>(); + }); +} } diff --git a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h index 1a6c6247558..c4c17c47dbe 100644 --- a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h +++ b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h @@ -1,7 +1,9 @@ #pragma once #include +#include #include +#include namespace DB { @@ -26,4 +28,13 @@ private: void readLineObject(IColumn & column); }; +class LineAsStringSchemaReader : public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"line", std::make_shared<DataTypeString>()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index 60db32d879a..c56af536e15 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,8 @@ namespace ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int INCORRECT_DATA; + extern const int BAD_ARGUMENTS; + extern const int UNEXPECTED_END_OF_FILE; } MsgPackRowInputFormat::MsgPackRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) @@ -369,7 +372,108 @@ bool
MsgPackRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & void MsgPackRowInputFormat::setReadBuffer(ReadBuffer & in_) { buf = std::make_unique(in_); - IInputFormat::setReadBuffer(*buf); + IInputFormat::setReadBuffer(in_); +} + +MsgPackSchemaReader::MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns) +{ + if (!number_of_columns) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "You must specify setting input_format_msgpack_number_of_columns to extract table schema from MsgPack data"); +} + + +msgpack::object_handle MsgPackSchemaReader::readObject() +{ + if (buf.eof()) + throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected eof while parsing msgpack object"); + + PeekableReadBufferCheckpoint checkpoint{buf}; + size_t offset = 0; + bool need_more_data = true; + msgpack::object_handle object_handle; + while (need_more_data) + { + offset = 0; + try + { + object_handle = msgpack::unpack(buf.position(), buf.buffer().end() - buf.position(), offset); + need_more_data = false; + } + catch (msgpack::insufficient_bytes &) + { + buf.position() = buf.buffer().end(); + if (buf.eof()) + throw Exception("Unexpected end of file while parsing msgpack object", ErrorCodes::UNEXPECTED_END_OF_FILE); + buf.position() = buf.buffer().end(); + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + } + } + buf.position() += offset; + return object_handle; +} + +DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object) +{ + switch (object.type) + { + case msgpack::type::object_type::POSITIVE_INTEGER: [[fallthrough]]; + case msgpack::type::object_type::NEGATIVE_INTEGER: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::FLOAT32: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::FLOAT64: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::BOOLEAN: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::BIN: [[fallthrough]]; + case msgpack::type::object_type::STR: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::ARRAY: + { + msgpack::object_array object_array = object.via.array; + if (object_array.size) + { + auto nested_type = getDataType(object_array.ptr[0]); + if (nested_type) + return std::make_shared(getDataType(object_array.ptr[0])); + } + return nullptr; + } + case msgpack::type::object_type::MAP: + { + msgpack::object_map object_map = object.via.map; + if (object_map.size) + { + auto key_type = removeNullable(getDataType(object_map.ptr[0].key)); + auto value_type = getDataType(object_map.ptr[0].val); + if (key_type && value_type) + return std::make_shared(key_type, value_type); + } + return nullptr; + } + case msgpack::type::object_type::NIL: + return nullptr; + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack type is not supported"); + } +} + +DataTypes MsgPackSchemaReader::readRowAndGetDataTypes() +{ + if (buf.eof()) + return {}; + + DataTypes data_types; + data_types.reserve(number_of_columns); + for (size_t i = 0; i != number_of_columns; ++i) + { + auto object_handle = readObject(); + data_types.push_back(getDataType(object_handle.get())); + } + + return data_types; } void registerInputFormatMsgPack(FormatFactory & factory) @@ -384,6 +488,14 @@ void registerInputFormatMsgPack(FormatFactory & factory) }); } +void 
registerMsgPackSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + }); +} + } #else @@ -394,6 +506,10 @@ class FormatFactory; void registerInputFormatMsgPack(FormatFactory &) { } + +void registerMsgPackSchemaReader(FormatFactory &) +{ +} } #endif diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h index bb3887695eb..dd5655c80fc 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h @@ -6,6 +6,7 @@ #if USE_MSGPACK #include +#include #include #include #include @@ -76,6 +77,20 @@ private: const DataTypes data_types; }; +class MsgPackSchemaReader : public IRowSchemaReader +{ +public: + MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + +private: + msgpack::object_handle readObject(); + DataTypePtr getDataType(const msgpack::object & object); + DataTypes readRowAndGetDataTypes() override; + + PeekableReadBuffer buf; + UInt64 number_of_columns; +}; + } #endif diff --git a/src/Processors/Formats/Impl/NativeFormat.cpp b/src/Processors/Formats/Impl/NativeFormat.cpp index 07cf4670981..19e2ede6b65 100644 --- a/src/Processors/Formats/Impl/NativeFormat.cpp +++ b/src/Processors/Formats/Impl/NativeFormat.cpp @@ -1,8 +1,10 @@ #include #include + #include #include #include +#include #include @@ -82,6 +84,20 @@ private: NativeWriter writer; }; +class NativeSchemaReader : public ISchemaReader +{ +public: + explicit NativeSchemaReader(ReadBuffer & in_) : ISchemaReader(in_) {} + + NamesAndTypesList readSchema() override + { + auto reader = NativeReader(in, 0); + auto block = reader.read(); + return block.getNamesAndTypesList(); + } +}; + + void registerInputFormatNative(FormatFactory & factory) { factory.registerInputFormat("Native", []( @@ -106,4 +122,14 @@ void registerOutputFormatNative(FormatFactory & factory) }); } + +void registerNativeSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &, ContextPtr) + { + return std::make_shared(buf); + }); +} + + } diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index 87eec459aa3..9a787e5a614 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -87,9 +87,14 @@ static size_t countIndicesForType(std::shared_ptr type) return 1; } -void ORCBlockInputFormat::prepareReader() +static void getFileReaderAndSchema( + ReadBuffer & in, + std::unique_ptr & file_reader, + std::shared_ptr & schema, + const FormatSettings & format_settings, + std::atomic & is_stopped) { - auto arrow_file = asArrowFile(*in, format_settings, is_stopped); + auto arrow_file = asArrowFile(in, format_settings, is_stopped); if (is_stopped) return; @@ -101,7 +106,15 @@ void ORCBlockInputFormat::prepareReader() auto read_schema_result = file_reader->ReadSchema(); if (!read_schema_result.ok()) throw Exception(read_schema_result.status().ToString(), ErrorCodes::BAD_ARGUMENTS); - std::shared_ptr schema = std::move(read_schema_result).ValueOrDie(); + schema = std::move(read_schema_result).ValueOrDie(); +} + +void ORCBlockInputFormat::prepareReader() +{ + std::shared_ptr schema; + getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped); + if (is_stopped) + return; 
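+    /// getFileReaderAndSchema() is shared with ORCSchemaReader::readSchema() below, so schema inference and data reading derive from the same Arrow schema.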
arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "ORC", format_settings.orc.import_nested); @@ -128,7 +141,21 @@ void ORCBlockInputFormat::prepareReader() } } -void registerInputFormatORC(FormatFactory &factory) +ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +{ +} + +NamesAndTypesList ORCSchemaReader::readSchema() +{ + std::unique_ptr file_reader; + std::shared_ptr schema; + std::atomic is_stopped = 0; + getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "ORC"); + return header.getNamesAndTypesList(); +} + +void registerInputFormatORC(FormatFactory & factory) { factory.registerInputFormat( "ORC", @@ -142,6 +169,17 @@ void registerInputFormatORC(FormatFactory &factory) factory.markFormatAsColumnOriented("ORC"); } +void registerORCSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "ORC", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + } + ); +} + } #else @@ -151,6 +189,10 @@ namespace DB void registerInputFormatORC(FormatFactory &) { } + + void registerORCSchemaReader(FormatFactory &) + { + } } #endif diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.h b/src/Processors/Formats/Impl/ORCBlockInputFormat.h index c7dc1c4a710..9b55747f552 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.h @@ -3,6 +3,7 @@ #if USE_ORC #include +#include #include #include @@ -54,5 +55,16 @@ private: std::atomic is_stopped{0}; }; +class ORCSchemaReader : public ISchemaReader +{ +public: + ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; +}; + } #endif diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 4c8f6ab2c54..651b9545c81 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -87,6 +87,7 @@ ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & t { return orc::createPrimitiveType(orc::TypeKind::DOUBLE); } + case TypeIndex::Date32: [[fallthrough]]; case TypeIndex::Date: { return orc::createPrimitiveType(orc::TypeKind::DATE); @@ -292,6 +293,7 @@ void ORCBlockOutputFormat::writeColumn( writeNumbers(orc_column, column, null_bytemap, [](const UInt16 & value){ return value; }); break; } + case TypeIndex::Date32: [[fallthrough]]; case TypeIndex::Int32: { writeNumbers(orc_column, column, null_bytemap, [](const Int32 & value){ return value; }); diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 0b6cd006300..1d303014d31 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -94,19 +94,30 @@ static size_t countIndicesForType(std::shared_ptr type) return 1; } +static void getFileReaderAndSchema( + ReadBuffer & in, + std::unique_ptr & file_reader, + std::shared_ptr & schema, + const FormatSettings & format_settings, + std::atomic & is_stopped) +{ + auto arrow_file = asArrowFile(in, format_settings, is_stopped); + if (is_stopped) + return; + THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), 
arrow::default_memory_pool(), &file_reader)); + THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); +} + void ParquetBlockInputFormat::prepareReader() { - auto arrow_file = asArrowFile(*in, format_settings, is_stopped); + std::shared_ptr schema; + getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped); if (is_stopped) return; - THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), arrow::default_memory_pool(), &file_reader)); row_group_total = file_reader->num_row_groups(); row_group_current = 0; - std::shared_ptr schema; - THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); - arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested); std::unordered_set nested_table_names; @@ -130,7 +141,21 @@ void ParquetBlockInputFormat::prepareReader() } } -void registerInputFormatParquet(FormatFactory &factory) +ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +{ +} + +NamesAndTypesList ParquetSchemaReader::readSchema() +{ + std::unique_ptr file_reader; + std::shared_ptr schema; + std::atomic is_stopped = 0; + getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "Parquet"); + return header.getNamesAndTypesList(); +} + +void registerInputFormatParquet(FormatFactory & factory) { factory.registerInputFormat( "Parquet", @@ -144,6 +169,17 @@ void registerInputFormatParquet(FormatFactory &factory) factory.markFormatAsColumnOriented("Parquet"); } +void registerParquetSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "Parquet", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + } + ); +} + } #else @@ -154,6 +190,8 @@ class FormatFactory; void registerInputFormatParquet(FormatFactory &) { } + +void registerParquetSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index 3e04c523442..dbc99c08a35 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -3,6 +3,7 @@ #if USE_PARQUET #include +#include #include namespace parquet::arrow { class FileReader; } @@ -44,6 +45,17 @@ private: std::atomic is_stopped{0}; }; +class ParquetSchemaReader : public ISchemaReader +{ +public: + ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; +}; + } #endif diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index df7b7102739..66da27e8829 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -73,6 +73,34 @@ void registerInputFormatProtobuf(FormatFactory & factory) } } +ProtobufSchemaReader::ProtobufSchemaReader(const FormatSettings & format_settings) + : schema_info( + format_settings.schema.format_schema, + "Protobuf", + true, + format_settings.schema.is_server, + format_settings.schema.format_schema_path) +{ +} + +NamesAndTypesList ProtobufSchemaReader::readSchema() +{ + const auto * message_descriptor = ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info); + return 
protobufSchemaToCHSchema(message_descriptor); +} + +void registerProtobufSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("Protobuf", [](const FormatSettings & settings) + { + return std::make_shared(settings); + }); + factory.registerExternalSchemaReader("ProtobufSingle", [](const FormatSettings & settings) + { + return std::make_shared(settings); + }); +} + } #else @@ -81,6 +109,8 @@ namespace DB { class FormatFactory; void registerInputFormatProtobuf(FormatFactory &) {} + +void registerProtobufSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h index 6f465e3f0b8..d7d16d36ddf 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h @@ -3,7 +3,9 @@ #include "config_formats.h" #if USE_PROTOBUF +# include # include +# include namespace DB { @@ -42,5 +44,16 @@ private: std::unique_ptr serializer; }; +class ProtobufSchemaReader : public IExternalSchemaReader +{ +public: + explicit ProtobufSchemaReader(const FormatSettings & format_settings); + + NamesAndTypesList readSchema() override; + +private: + FormatSchemaInfo schema_info; +}; + } #endif diff --git a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp index 34424fffd34..91b1cc60fae 100644 --- a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp @@ -51,5 +51,14 @@ void registerInputFormatRawBLOB(FormatFactory & factory) }); } +void registerRawBLOBSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("RawBLOB", []( + const FormatSettings &) + { + return std::make_shared(); + }); +} + } diff --git a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h index 343af9f4068..367ca04f9d8 100644 --- a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h +++ b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include namespace DB @@ -22,5 +24,14 @@ private: bool readRow(MutableColumns & columns, RowReadExtension &) override; }; +class RawBLOBSchemaReader: public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"raw_blob", std::make_shared()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index 279ae89aba5..90db6f6f0ec 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -14,18 +14,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -RegexpRowInputFormat::RegexpRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_) - : RegexpRowInputFormat(std::make_unique(in_), header_, params_, format_settings_) -{ -} - -RegexpRowInputFormat::RegexpRowInputFormat( - std::unique_ptr buf_, const Block & header_, Params params_, const FormatSettings & format_settings_) - : IRowInputFormat(header_, *buf_, std::move(params_)) - , buf(std::move(buf_)) - , format_settings(format_settings_) - , escaping_rule(format_settings_.regexp.escaping_rule) - , regexp(format_settings_.regexp.regexp) +RegexpFieldExtractor::RegexpFieldExtractor(const FormatSettings & format_settings) : regexp(format_settings.regexp.regexp), 
skip_unmatched(format_settings.regexp.skip_unmatched) { size_t fields_count = regexp.NumberOfCapturingGroups(); matched_fields.resize(fields_count); @@ -40,6 +29,50 @@ RegexpRowInputFormat::RegexpRowInputFormat( } } +bool RegexpFieldExtractor::parseRow(PeekableReadBuffer & buf) +{ + PeekableReadBufferCheckpoint checkpoint{buf}; + + size_t line_size = 0; + + do + { + char * pos = find_first_symbols<'\n', '\r'>(buf.position(), buf.buffer().end()); + line_size += pos - buf.position(); + buf.position() = pos; + } while (buf.position() == buf.buffer().end() && !buf.eof()); + + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + + bool match = RE2::FullMatchN(re2::StringPiece(buf.position(), line_size), regexp, re2_arguments_ptrs.data(), re2_arguments_ptrs.size()); + + if (!match && !skip_unmatched) + throw Exception("Line \"" + std::string(buf.position(), line_size) + "\" doesn't match the regexp.", ErrorCodes::INCORRECT_DATA); + + buf.position() += line_size; + checkChar('\r', buf); + if (!buf.eof() && !checkChar('\n', buf)) + throw Exception("No \\n after \\r at the end of line.", ErrorCodes::INCORRECT_DATA); + + return match; +} + +RegexpRowInputFormat::RegexpRowInputFormat( + ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_) + : RegexpRowInputFormat(std::make_unique(in_), header_, params_, format_settings_) +{ +} + +RegexpRowInputFormat::RegexpRowInputFormat( + std::unique_ptr buf_, const Block & header_, Params params_, const FormatSettings & format_settings_) + : IRowInputFormat(header_, *buf_, std::move(params_)) + , buf(std::move(buf_)) + , format_settings(format_settings_) + , escaping_rule(format_settings_.regexp.escaping_rule) + , field_extractor(RegexpFieldExtractor(format_settings_)) +{ +} void RegexpRowInputFormat::resetParser() { @@ -50,7 +83,8 @@ void RegexpRowInputFormat::resetParser() bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns) { const auto & type = getPort().getHeader().getByPosition(index).type; - ReadBuffer field_buf(const_cast(matched_fields[index].data()), matched_fields[index].size(), 0); + auto matched_field = field_extractor.getField(index); + ReadBuffer field_buf(const_cast(matched_field.data()), matched_field.size(), 0); try { return deserializeFieldByEscapingRule(type, serializations[index], *columns[index], field_buf, escaping_rule, format_settings); @@ -64,7 +98,7 @@ bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns) void RegexpRowInputFormat::readFieldsFromMatch(MutableColumns & columns, RowReadExtension & ext) { - if (matched_fields.size() != columns.size()) + if (field_extractor.getMatchedFieldsSize() != columns.size()) throw Exception("The number of matched fields in line doesn't match the number of columns.", ErrorCodes::INCORRECT_DATA); ext.read_columns.assign(columns.size(), false); @@ -79,39 +113,8 @@ bool RegexpRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & if (buf->eof()) return false; - PeekableReadBufferCheckpoint checkpoint{*buf}; - - size_t line_size = 0; - - do - { - char * pos = find_first_symbols<'\n', '\r'>(buf->position(), buf->buffer().end()); - line_size += pos - buf->position(); - buf->position() = pos; - } while (buf->position() == buf->buffer().end() && !buf->eof()); - - buf->makeContinuousMemoryFromCheckpointToPos(); - buf->rollbackToCheckpoint(); - - bool match = RE2::FullMatchN(re2::StringPiece(buf->position(), line_size), regexp, re2_arguments_ptrs.data(), re2_arguments_ptrs.size()); 
- bool read_line = true; - - if (!match) - { - if (!format_settings.regexp.skip_unmatched) - throw Exception("Line \"" + std::string(buf->position(), line_size) + "\" doesn't match the regexp.", ErrorCodes::INCORRECT_DATA); - read_line = false; - } - - if (read_line) + if (field_extractor.parseRow(*buf)) readFieldsFromMatch(columns, ext); - - buf->position() += line_size; - - checkChar('\r', *buf); - if (!buf->eof() && !checkChar('\n', *buf)) - throw Exception("No \\n after \\r at the end of line.", ErrorCodes::INCORRECT_DATA); - return true; } @@ -121,6 +124,36 @@ void RegexpRowInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } +RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_) + : IRowSchemaReader( + buf, + format_settings_.max_rows_to_read_for_schema_inference, + getDefaultDataTypeForEscapingRule(format_settings_.regexp.escaping_rule)) + , format_settings(format_settings_) + , field_extractor(format_settings) + , buf(in_) + , context(context_) +{ +} + +DataTypes RegexpSchemaReader::readRowAndGetDataTypes() +{ + if (buf.eof()) + return {}; + + field_extractor.parseRow(buf); + + DataTypes data_types; + data_types.reserve(field_extractor.getMatchedFieldsSize()); + for (size_t i = 0; i != field_extractor.getMatchedFieldsSize(); ++i) + { + String field(field_extractor.getField(i)); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule, context)); + } + + return data_types; +} + void registerInputFormatRegexp(FormatFactory & factory) { factory.registerInputFormat("Regexp", []( @@ -172,4 +205,12 @@ void registerFileSegmentationEngineRegexp(FormatFactory & factory) factory.registerFileSegmentationEngine("Regexp", &fileSegmentationEngineRegexpImpl); } +void registerRegexpSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, settings, context); + }); +} + } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.h b/src/Processors/Formats/Impl/RegexpRowInputFormat.h index dbce31a9b49..dffd2f82e02 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.h +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,29 @@ namespace DB class ReadBuffer; +/// Class for extracting row fields from data by regexp. +class RegexpFieldExtractor +{ +public: + RegexpFieldExtractor(const FormatSettings & format_settings); + + /// Return true if row was successfully parsed and row fields were extracted. + bool parseRow(PeekableReadBuffer & buf); + + re2::StringPiece getField(size_t index) { return matched_fields[index]; } + size_t getMatchedFieldsSize() const { return matched_fields.size(); } + size_t getNumberOfGroups() const { return regexp.NumberOfCapturingGroups(); } + +private: + const RE2 regexp; + // The vector of fields extracted from line using regexp. + std::vector matched_fields; + // These two vectors are needed to use RE2::FullMatchN (function for extracting fields). + std::vector re2_arguments; + std::vector re2_arguments_ptrs; + bool skip_unmatched; +}; + /// Regexp input format. /// This format applies regular expression from format_regexp setting for every line of file /// (the lines must be separated by newline character ('\n') or DOS-style newline ("\r\n")). 
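(A minimal illustrative sketch of how the extracted RegexpFieldExtractor is meant to be driven from both call sites; the buffer setup and `process` are assumptions for the example, not part of this patch:)

    RegexpFieldExtractor extractor(format_settings); // compiles format_settings.regexp.regexp
    PeekableReadBuffer buf(in);
    while (!buf.eof())
    {
        /// parseRow() throws INCORRECT_DATA on a non-matching line unless
        /// skip_unmatched is set, in which case it returns false and the line is skipped.
        if (extractor.parseRow(buf))
            for (size_t i = 0; i != extractor.getMatchedFieldsSize(); ++i)
                process(extractor.getField(i)); // re2::StringPiece pointing into buf's memory
    }
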
@@ -25,7 +49,6 @@ class ReadBuffer; class RegexpRowInputFormat : public IRowInputFormat { - using EscapingRule = FormatSettings::EscapingRule; public: RegexpRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_); @@ -36,6 +59,8 @@ public: private: RegexpRowInputFormat(std::unique_ptr buf_, const Block & header_, Params params_, const FormatSettings & format_settings_); + using EscapingRule = FormatSettings::EscapingRule; + bool readRow(MutableColumns & columns, RowReadExtension & ext) override; bool readField(size_t index, MutableColumns & columns); @@ -44,13 +69,22 @@ private: std::unique_ptr buf; const FormatSettings format_settings; const EscapingRule escaping_rule; + RegexpFieldExtractor field_extractor; +}; - const RE2 regexp; - // The vector of fields extracted from line using regexp. - std::vector matched_fields; - // These two vectors are needed to use RE2::FullMatchN (function for extracting fields). - std::vector re2_arguments; - std::vector re2_arguments_ptrs; +class RegexpSchemaReader : public IRowSchemaReader +{ +public: + RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + using EscapingRule = FormatSettings::EscapingRule; + const FormatSettings format_settings; + RegexpFieldExtractor field_extractor; + PeekableReadBuffer buf; + ContextPtr context; }; } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index eef97e15dd5..8a56c2ed5c7 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -1,7 +1,10 @@ #include #include #include +#include #include +#include +#include namespace DB @@ -211,6 +214,59 @@ void TSKVRowInputFormat::resetParser() name_buf.clear(); } +TSKVSchemaReader::TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : IRowWithNamesSchemaReader( + in_, + format_settings_.max_rows_to_read_for_schema_inference, + getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::Escaped)) + , format_settings(format_settings_) +{ +} + +std::unordered_map TSKVSchemaReader::readRowAndGetNamesAndDataTypes() +{ + if (first_row) + { + skipBOMIfExists(in); + first_row = false; + } + + if (in.eof()) + return {}; + + if (*in.position() == '\n') + { + ++in.position(); + return {}; + } + + std::unordered_map names_and_types; + StringRef name_ref; + String name_tmp; + String value; + do + { + bool has_value = readName(in, name_ref, name_tmp); + if (has_value) + { + readEscapedString(value, in); + names_and_types[String(name_ref)] = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped); + } + else + { + /// The only thing that can go without value is `tskv` fragment that is ignored. 
+ if (!(name_ref.size == 4 && 0 == memcmp(name_ref.data, "tskv", 4))) + throw Exception("Found field without value while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA); + } + + } + while (checkChar('\t', in)); + + assertChar('\n', in); + + return names_and_types; +} + void registerInputFormatTSKV(FormatFactory & factory) { factory.registerInputFormat("TSKV", []( @@ -222,5 +278,12 @@ void registerInputFormatTSKV(FormatFactory & factory) return std::make_shared(buf, sample, std::move(params), settings); }); } +void registerTSKVSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + }); +} } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.h b/src/Processors/Formats/Impl/TSKVRowInputFormat.h index 7d732bae691..6aef50a0f84 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.h +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -52,4 +53,16 @@ private: /// for row like ..., non-nullable column name=\N, ... }; +class TSKVSchemaReader : public IRowWithNamesSchemaReader +{ +public: + TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + +private: + std::unordered_map readRowAndGetNamesAndDataTypes() override; + + const FormatSettings format_settings; + bool first_row = true; +}; + } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 1e6d238b202..bb844ec68ea 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -1,13 +1,15 @@ #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include +#include +#include +#include +#include +#include +#include namespace DB { @@ -38,40 +40,50 @@ TabSeparatedRowInputFormat::TabSeparatedRowInputFormat( bool with_types_, bool is_raw_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_), is_raw(is_raw_) + : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_, std::make_unique(in_, format_settings_, is_raw_)) { } -void TabSeparatedRowInputFormat::skipFieldDelimiter() +TabSeparatedFormatReader::TabSeparatedFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_, bool is_raw_) + : FormatWithNamesAndTypesReader(in_, format_settings_), is_raw(is_raw_) +{ +} + +void TabSeparatedFormatReader::skipFieldDelimiter() { assertChar('\t', *in); } -void TabSeparatedRowInputFormat::skipRowEndDelimiter() +void TabSeparatedFormatReader::skipRowEndDelimiter() { if (in->eof()) return; - if (unlikely(row_num <= 1)) + if (unlikely(first_row)) + { checkForCarriageReturn(*in); + first_row = false; + } assertChar('\n', *in); } -String TabSeparatedRowInputFormat::readFieldIntoString() +String TabSeparatedFormatReader::readFieldIntoString() { String field; - readEscapedString(field, *in); + if (is_raw) + readString(field, *in); + else + readEscapedString(field, *in); return field; } -void TabSeparatedRowInputFormat::skipField() +void TabSeparatedFormatReader::skipField() { - NullOutput null_sink; - readEscapedStringInto(null_sink, *in); + readFieldIntoString(); } -void TabSeparatedRowInputFormat::skipHeaderRow() +void 
TabSeparatedFormatReader::skipHeaderRow() { do { @@ -82,7 +94,7 @@ void TabSeparatedRowInputFormat::skipHeaderRow() skipRowEndDelimiter(); } -std::vector TabSeparatedRowInputFormat::readHeaderRow() +std::vector TabSeparatedFormatReader::readRow() { std::vector fields; do @@ -95,7 +107,7 @@ std::vector TabSeparatedRowInputFormat::readHeaderRow() return fields; } -bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, +bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/) { const bool at_delimiter = !is_last_file_column && !in->eof() && *in->position() == '\t'; @@ -118,6 +130,7 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & return true; } + if (as_nullable) return SerializationNullable::deserializeTextEscapedImpl(column, *in, format_settings, serialization); @@ -125,7 +138,7 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & return true; } -bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool TabSeparatedFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { try { @@ -156,7 +169,7 @@ bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuff return true; } -bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool TabSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { if (in->eof()) return true; @@ -190,7 +203,7 @@ bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out return true; } -void TabSeparatedRowInputFormat::checkNullValueForNonNullable(DataTypePtr type) +void TabSeparatedFormatReader::checkNullValueForNonNullable(DataTypePtr type) { bool can_be_parsed_as_null = type->isNullable() || type->isLowCardinalityNullable() || format_settings.null_as_default; @@ -218,6 +231,28 @@ void TabSeparatedRowInputFormat::syncAfterError() skipToUnescapedNextLineOrEOF(*in); } +TabSeparatedSchemaReader::TabSeparatedSchemaReader( + ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesSchemaReader( + in_, + format_settings_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + getDefaultDataTypeForEscapingRule(is_raw_ ? FormatSettings::EscapingRule::Raw : FormatSettings::EscapingRule::Escaped)) + , reader(in_, format_settings_, is_raw_) +{ +} + +DataTypes TabSeparatedSchemaReader::readRowAndGetDataTypes() +{ + if (in.eof()) + return {}; + + auto fields = reader.readRow(); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule()); +} + void registerInputFormatTabSeparated(FormatFactory & factory) { for (bool is_raw : {false, true}) @@ -239,6 +274,23 @@ void registerInputFormatTabSeparated(FormatFactory & factory) } } +void registerTSVSchemaReader(FormatFactory & factory) +{ + for (bool is_raw : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, with_names, with_types, is_raw, settings); + }); + }; + + registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func); + registerWithNamesAndTypes(is_raw ? 
"TSVRaw" : "TSV", register_func); + } +} + static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, bool is_raw, size_t min_rows) { bool need_more_data = true; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 6e2e283e792..1f2bfc255b8 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -24,6 +25,13 @@ public: private: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; + bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } +}; + +class TabSeparatedFormatReader : public FormatWithNamesAndTypesReader +{ +public: + TabSeparatedFormatReader(ReadBuffer & in_, const FormatSettings & format_settings, bool is_raw_); bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; @@ -36,18 +44,34 @@ private: void skipFieldDelimiter() override; void skipRowEndDelimiter() override; - std::vector readHeaderRow(); - std::vector readNames() override { return readHeaderRow(); } - std::vector readTypes() override { return readHeaderRow(); } + std::vector readRow(); + std::vector readNames() override { return readRow(); } + std::vector readTypes() override { return readRow(); } String readFieldIntoString(); void checkNullValueForNonNullable(DataTypePtr type) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; - bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } + FormatSettings::EscapingRule getEscapingRule() + { + return is_raw ? 
+
+private:
     bool is_raw;
+    bool first_row = true;
+};
+
+class TabSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader
+{
+public:
+    TabSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings);
+
+private:
+    DataTypes readRowAndGetDataTypes() override;
+
+    TabSeparatedFormatReader reader;
+};
 
 }
diff --git a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp
index 5d87f5a0b14..37bd8daa502 100644
--- a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp
+++ b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp
@@ -22,7 +22,10 @@ void TabSeparatedRowOutputFormat::writeLine(const std::vector<String> & values)
 {
     for (size_t i = 0; i < values.size(); ++i)
     {
-        writeEscapedString(values[i], out);
+        if (is_raw)
+            writeString(values[i], out);
+        else
+            writeEscapedString(values[i], out);
         if (i + 1 == values.size())
             writeRowEndDelimiter();
         else
diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp
index fccf6eb10df..06d6ba06bcc 100644
--- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp
@@ -4,7 +4,6 @@
 #include
 #include
 #include
-#include
 #include
 
 namespace DB
@@ -12,13 +11,19 @@ namespace DB
 
 namespace ErrorCodes
 {
-extern const int ATTEMPT_TO_READ_AFTER_EOF;
-extern const int CANNOT_READ_ALL_DATA;
-extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
-extern const int CANNOT_PARSE_QUOTED_STRING;
-extern const int SYNTAX_ERROR;
+    extern const int ATTEMPT_TO_READ_AFTER_EOF;
+    extern const int CANNOT_READ_ALL_DATA;
+    extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
+    extern const int CANNOT_PARSE_QUOTED_STRING;
+    extern const int SYNTAX_ERROR;
 }
 
+[[noreturn]] static void throwUnexpectedEof(size_t row_num)
+{
+    throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". "
+        "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.",
+        ErrorCodes::CANNOT_READ_ALL_DATA);
+}
" + "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.", + ErrorCodes::CANNOT_READ_ALL_DATA); +} TemplateRowInputFormat::TemplateRowInputFormat( const Block & header_, @@ -41,37 +46,13 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::uniqu : RowInputFormatWithDiagnosticInfo(header_, *buf_, params_), buf(std::move(buf_)), data_types(header_.getDataTypes()), settings(std::move(settings_)), ignore_spaces(ignore_spaces_), format(std::move(format_)), row_format(std::move(row_format_)), - default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(std::move(row_between_delimiter_)) + default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(row_between_delimiter_), + format_reader(std::make_unique(*buf, ignore_spaces_, format, row_format, row_between_delimiter, settings)) { - /// Validate format string for result set - bool has_data = false; - for (size_t i = 0; i < format.columnsCount(); ++i) - { - if (format.format_idx_to_column_idx[i]) - { - if (*format.format_idx_to_column_idx[i] != 0) - format.throwInvalidFormat("Invalid input part", i); - if (has_data) - format.throwInvalidFormat("${data} can occur only once", i); - if (format.escaping_rules[i] != EscapingRule::None) - format.throwInvalidFormat("${data} must have empty or None deserialization type", i); - has_data = true; - format_data_idx = i; - } - else - { - if (format.escaping_rules[i] == EscapingRule::XML) - format.throwInvalidFormat("XML deserialization is not supported", i); - } - } - /// Validate format string for rows std::vector column_in_format(header_.columns(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { - if (row_format.escaping_rules[i] == EscapingRule::XML) - row_format.throwInvalidFormat("XML deserialization is not supported", i); - if (row_format.format_idx_to_column_idx[i]) { if (header_.columns() <= *row_format.format_idx_to_column_idx[i]) @@ -94,69 +75,7 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::uniqu void TemplateRowInputFormat::readPrefix() { - size_t last_successfully_parsed_idx = 0; - try - { - tryReadPrefixOrSuffix(last_successfully_parsed_idx, format_data_idx); - } - catch (Exception & e) - { - format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx); - } -} - -/// Asserts delimiters and skips fields in prefix or suffix. -/// tryReadPrefixOrSuffix(...) 
-template <typename ReturnType>
-ReturnType TemplateRowInputFormat::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end)
-{
-    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
-
-    skipSpaces();
-    if constexpr (throw_exception)
-        assertString(format.delimiters[input_part_beg], *buf);
-    else
-    {
-        if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
-            return ReturnType(false);
-    }
-
-    while (input_part_beg < input_part_end)
-    {
-        skipSpaces();
-        if constexpr (throw_exception)
-            skipField(format.escaping_rules[input_part_beg]);
-        else
-        {
-            try
-            {
-                skipField(format.escaping_rules[input_part_beg]);
-            }
-            catch (const Exception & e)
-            {
-                if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
-                    e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
-                    e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
-                    throw;
-                /// If it's parsing error, then suffix is not found
-                return ReturnType(false);
-            }
-        }
-        ++input_part_beg;
-
-        skipSpaces();
-        if constexpr (throw_exception)
-            assertString(format.delimiters[input_part_beg], *buf);
-        else
-        {
-            if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
-                return ReturnType(false);
-        }
-    }
-
-    if constexpr (!throw_exception)
-        return ReturnType(true);
+    format_reader->readPrefix();
 }
 
 bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra)
@@ -165,9 +84,7 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension
     if (unlikely(end_of_stream))
         return false;
 
-    skipSpaces();
-
-    if (unlikely(checkForSuffix()))
+    if (unlikely(format_reader->checkForSuffix()))
     {
         end_of_stream = true;
         return false;
@@ -176,27 +93,24 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension
     updateDiagnosticInfo();
 
     if (likely(row_num != 1))
-        assertString(row_between_delimiter, *buf);
+        format_reader->skipRowBetweenDelimiter();
 
     extra.read_columns.assign(columns.size(), false);
 
     for (size_t i = 0; i < row_format.columnsCount(); ++i)
     {
-        skipSpaces();
-        assertString(row_format.delimiters[i], *buf);
-        skipSpaces();
+        format_reader->skipDelimiter(i);
+
         if (row_format.format_idx_to_column_idx[i])
         {
             size_t col_idx = *row_format.format_idx_to_column_idx[i];
             extra.read_columns[col_idx] = deserializeField(data_types[col_idx], serializations[col_idx], *columns[col_idx], i);
         }
         else
-            skipField(row_format.escaping_rules[i]);
-
+            format_reader->skipField(row_format.escaping_rules[i]);
     }
 
-    skipSpaces();
-    assertString(row_format.delimiters.back(), *buf);
+    format_reader->skipRowEndDelimiter();
 
     for (const auto & idx : always_default_columns)
         data_types[idx]->insertDefaultInto(*columns[idx]);
@@ -219,65 +133,21 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type,
     catch (Exception & e)
     {
         if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
-            throwUnexpectedEof();
+            throwUnexpectedEof(row_num);
         throw;
     }
 }
 
-void TemplateRowInputFormat::skipField(TemplateRowInputFormat::EscapingRule escaping_rule)
-{
-    try
-    {
-        skipFieldByEscapingRule(*buf, escaping_rule, settings);
-    }
-    catch (Exception & e)
-    {
-        if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
-            throwUnexpectedEof();
-        throw;
-    }
-}
-
-/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF.
-/// Otherwise returns false
-bool TemplateRowInputFormat::checkForSuffix()
-{
-    PeekableReadBufferCheckpoint checkpoint{*buf};
-    bool suffix_found = false;
-    size_t last_successfully_parsed_idx = format_data_idx + 1;
-    try
-    {
-        suffix_found = tryReadPrefixOrSuffix<bool>(last_successfully_parsed_idx, format.columnsCount());
-    }
-    catch (const Exception & e)
-    {
-        if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
-            e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
-            e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
-            throw;
-    }
-
-    if (unlikely(suffix_found))
-    {
-        skipSpaces();
-        if (buf->eof())
-            return true;
-    }
-
-    buf->rollbackToCheckpoint();
-    return false;
-}
-
 bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
 {
     out << "Suffix does not match: ";
-    size_t last_successfully_parsed_idx = format_data_idx + 1;
+    size_t last_successfully_parsed_idx = format_reader->getFormatDataIdx() + 1;
     const ReadBuffer::Position row_begin_pos = buf->position();
     bool caught = false;
     try
     {
         PeekableReadBufferCheckpoint checkpoint{*buf, true};
-        tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format.columnsCount());
+        format_reader->tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format.columnsCount());
     }
     catch (Exception & e)
     {
@@ -309,7 +179,7 @@ bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & col
         if (!parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters[i], "delimiter before field " + std::to_string(i), ignore_spaces))
             return false;
 
-        skipSpaces();
+        format_reader->skipSpaces();
         if (row_format.format_idx_to_column_idx[i])
         {
             const auto & header = getPort().getHeader();
@@ -364,7 +234,7 @@ void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColu
     if (index)
         deserializeField(type, serializations[*index], column, file_column);
     else
-        skipField(row_format.escaping_rules[file_column]);
+        format_reader->skipField(row_format.escaping_rules[file_column]);
 }
 
 bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position)
@@ -387,13 +257,6 @@ void TemplateRowInputFormat::syncAfterError()
     /// It will cause another parsing error.
 }
 
-void TemplateRowInputFormat::throwUnexpectedEof()
-{
-    throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". "
-        "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.",
-        ErrorCodes::CANNOT_READ_ALL_DATA);
-}
" - "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.", - ErrorCodes::CANNOT_READ_ALL_DATA); -} - void TemplateRowInputFormat::resetParser() { RowInputFormatWithDiagnosticInfo::resetParser(); @@ -407,6 +270,268 @@ void TemplateRowInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } +TemplateFormatReader::TemplateFormatReader( + PeekableReadBuffer & buf_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter_, + const FormatSettings & format_settings_) + : buf(&buf_) + , ignore_spaces(ignore_spaces_) + , format(format_) + , row_format(row_format_) + , row_between_delimiter(row_between_delimiter_) + , format_settings(format_settings_) +{ + /// Validate format string for result set + bool has_data = false; + for (size_t i = 0; i < format.columnsCount(); ++i) + { + if (format.format_idx_to_column_idx[i]) + { + if (*format.format_idx_to_column_idx[i] != 0) + format.throwInvalidFormat("Invalid input part", i); + if (has_data) + format.throwInvalidFormat("${data} can occur only once", i); + if (format.escaping_rules[i] != EscapingRule::None) + format.throwInvalidFormat("${data} must have empty or None deserialization type", i); + has_data = true; + format_data_idx = i; + } + else + { + if (format.escaping_rules[i] == EscapingRule::XML) + format.throwInvalidFormat("XML deserialization is not supported", i); + } + } + + /// Validate format string for rows + for (size_t i = 0; i < row_format.columnsCount(); ++i) + { + if (row_format.escaping_rules[i] == EscapingRule::XML) + row_format.throwInvalidFormat("XML deserialization is not supported", i); + } +} + +void TemplateFormatReader::readPrefix() +{ + size_t last_successfully_parsed_idx = 0; + try + { + tryReadPrefixOrSuffix(last_successfully_parsed_idx, format_data_idx); + } + catch (Exception & e) + { + format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx); + } +} + +void TemplateFormatReader::skipField(EscapingRule escaping_rule) +{ + try + { + skipFieldByEscapingRule(*buf, escaping_rule, format_settings); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + throwUnexpectedEof(row_num); + throw; + } +} + +/// Asserts delimiters and skips fields in prefix or suffix. +/// tryReadPrefixOrSuffix(...) 
+template <typename ReturnType>
+ReturnType TemplateFormatReader::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end)
+{
+    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+    skipSpaces();
+    if constexpr (throw_exception)
+        assertString(format.delimiters[input_part_beg], *buf);
+    else
+    {
+        if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
+            return ReturnType(false);
+    }
+
+    while (input_part_beg < input_part_end)
+    {
+        skipSpaces();
+        if constexpr (throw_exception)
+            skipField(format.escaping_rules[input_part_beg]);
+        else
+        {
+            try
+            {
+                skipField(format.escaping_rules[input_part_beg]);
+            }
+            catch (const Exception & e)
+            {
+                if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
+                    e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
+                    e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
+                    throw;
+                /// If it's parsing error, then suffix is not found
+                return ReturnType(false);
+            }
+        }
+        ++input_part_beg;
+
+        skipSpaces();
+        if constexpr (throw_exception)
+            assertString(format.delimiters[input_part_beg], *buf);
+        else
+        {
+            if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
+                return ReturnType(false);
+        }
+    }
+
+    if constexpr (!throw_exception)
+        return ReturnType(true);
+}
+
+/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF.
+/// Otherwise returns false
+bool TemplateFormatReader::checkForSuffix()
+{
+    PeekableReadBufferCheckpoint checkpoint{*buf};
+    bool suffix_found = false;
+    size_t last_successfully_parsed_idx = format_data_idx + 1;
+    try
+    {
+        suffix_found = tryReadPrefixOrSuffix<bool>(last_successfully_parsed_idx, format.columnsCount());
+    }
+    catch (const Exception & e)
+    {
+        if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
+            e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
+            e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
+            throw;
+    }
+
+    if (unlikely(suffix_found))
+    {
+        skipSpaces();
+        if (buf->eof())
+            return true;
+    }
+
+    buf->rollbackToCheckpoint();
+    return false;
+}
+
+void TemplateFormatReader::skipDelimiter(size_t index)
+{
+    skipSpaces();
+    assertString(row_format.delimiters[index], *buf);
+    skipSpaces();
+}
+
+void TemplateFormatReader::skipRowEndDelimiter()
+{
+    ++row_num;
+    skipSpaces();
+    assertString(row_format.delimiters.back(), *buf);
+    skipSpaces();
+}
+
+void TemplateFormatReader::skipRowBetweenDelimiter()
+{
+    skipSpaces();
+    assertString(row_between_delimiter, *buf);
+    skipSpaces();
+}
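An editor's note on the two instantiations above (illustration only, not part of the diff):

    /// readPrefix() calls tryReadPrefixOrSuffix<void>(...), which asserts every
    /// delimiter and throws on the first mismatch while consuming the prefix.
    /// checkForSuffix() calls tryReadPrefixOrSuffix<bool>(...) under a
    /// PeekableReadBufferCheckpoint: a failed checkString() simply returns false
    /// and the buffer is rolled back, so probing for the suffix after every row
    /// stays cheap and non-destructive.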
+
+TemplateSchemaReader::TemplateSchemaReader(
+    ReadBuffer & in_,
+    bool ignore_spaces_,
+    const ParsedTemplateFormatString & format_,
+    const ParsedTemplateFormatString & row_format_,
+    std::string row_between_delimiter,
+    const FormatSettings & format_settings_,
+    ContextPtr context_)
+    : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference)
+    , buf(in_)
+    , format(format_)
+    , row_format(row_format_)
+    , format_settings(format_settings_)
+    , context(context_)
+    , format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings)
+{
+    setColumnNames(row_format.column_names);
+}
+
+DataTypes TemplateSchemaReader::readRowAndGetDataTypes()
+{
+    if (first_row)
+        format_reader.readPrefix();
+
+    if (format_reader.checkForSuffix())
+        return {};
+
+    if (first_row)
+        first_row = false;
+    else
+        format_reader.skipRowBetweenDelimiter();
+
+    DataTypes data_types;
+    data_types.reserve(row_format.columnsCount());
+    String field;
+    for (size_t i = 0; i != row_format.columnsCount(); ++i)
+    {
+        format_reader.skipDelimiter(i);
+        if (row_format.escaping_rules[i] == FormatSettings::EscapingRule::CSV)
+            format_settings.csv.delimiter = row_format.delimiters[i + 1].empty() ? format_settings.csv.delimiter : row_format.delimiters[i + 1].front();
+
+        field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings);
+        data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], context));
+    }
+
+    format_reader.skipRowEndDelimiter();
+    return data_types;
+}
+
+static ParsedTemplateFormatString fillResultSetFormat(const FormatSettings & settings)
+{
+    ParsedTemplateFormatString resultset_format;
+    if (settings.template_settings.resultset_format.empty())
+    {
+        /// Default format string: "${data}"
+        resultset_format.delimiters.resize(2);
+        resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None);
+        resultset_format.format_idx_to_column_idx.emplace_back(0);
+        resultset_format.column_names.emplace_back("data");
+    }
+    else
+    {
+        /// Read format string from file
+        resultset_format = ParsedTemplateFormatString(
+            FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false,
+                settings.schema.is_server, settings.schema.format_schema_path),
+            [&](const String & partName) -> std::optional<size_t>
+            {
+                if (partName == "data")
+                    return 0;
+                throw Exception("Unknown input part " + partName,
+                    ErrorCodes::SYNTAX_ERROR);
+            });
+    }
+    return resultset_format;
+}
+
+static ParsedTemplateFormatString fillRowFormat(const FormatSettings & settings, ParsedTemplateFormatString::ColumnIdxGetter idx_getter, bool allow_indexes)
+{
+    return ParsedTemplateFormatString(
+        FormatSchemaInfo(
+            settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path),
+        idx_getter, allow_indexes);
+}
+
 void registerInputFormatTemplate(FormatFactory & factory)
 {
     for (bool ignore_spaces : {false, true})
@@ -417,39 +542,34 @@ void registerInputFormatTemplate(FormatFactory & factory)
             IRowInputFormat::Params params,
             const FormatSettings & settings)
         {
-            ParsedTemplateFormatString resultset_format;
-            if (settings.template_settings.resultset_format.empty())
+            auto idx_getter = [&](const String & colName) -> std::optional<size_t>
             {
-                /// Default format string: "${data}"
-                resultset_format.delimiters.resize(2);
-                resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None);
-                resultset_format.format_idx_to_column_idx.emplace_back(0);
-                resultset_format.column_names.emplace_back("data");
-            }
-            else
-            {
-                /// Read format string from file
-                resultset_format = ParsedTemplateFormatString(
-                    FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false,
-                        settings.schema.is_server, settings.schema.format_schema_path),
-                    [&](const String & partName) -> std::optional<size_t>
-                    {
-                        if (partName == "data")
-                            return 0;
-                        throw Exception("Unknown input part " + partName,
-                            ErrorCodes::SYNTAX_ERROR);
-                    });
-            }
+                return sample.getPositionByName(colName);
+            };
 
-            ParsedTemplateFormatString row_format = ParsedTemplateFormatString(
-                FormatSchemaInfo(settings.template_settings.row_format, "Template", false,
-                    settings.schema.is_server, settings.schema.format_schema_path),
-                [&](const String & colName) -> std::optional<size_t>
-                {
-                    return sample.getPositionByName(colName);
-                });
+            return std::make_shared<TemplateRowInputFormat>(
+                sample,
+                buf,
+                params,
+                settings,
+                ignore_spaces,
+                fillResultSetFormat(settings),
+                fillRowFormat(settings, idx_getter, true),
+                settings.template_settings.row_between_delimiter);
+        });
+    }
+}
 
-            return std::make_shared<TemplateRowInputFormat>(sample, buf, params, settings, ignore_spaces, resultset_format, row_format, settings.template_settings.row_between_delimiter);
+void registerTemplateSchemaReader(FormatFactory & factory)
+{
+    for (bool ignore_spaces : {false, true})
+    {
+        factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
+        {
+            size_t index = 0;
+            auto idx_getter = [&](const String &) -> std::optional<size_t> { return index++; };
+            auto row_format = fillRowFormat(settings, idx_getter, false);
+            return std::make_shared<TemplateSchemaReader>(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings, context);
         });
     }
 }
diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/src/Processors/Formats/Impl/TemplateRowInputFormat.h
index 61cd97413bf..755ad6cb39b 100644
--- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h
+++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h
@@ -2,15 +2,19 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
 #include
+#include
 
 namespace DB
 {
 
+class TemplateFormatReader;
+
 class TemplateRowInputFormat : public RowInputFormatWithDiagnosticInfo
 {
     using EscapingRule = FormatSettings::EscapingRule;
@@ -40,14 +44,6 @@ private:
     bool deserializeField(const DataTypePtr & type,
         const SerializationPtr & serialization, IColumn & column, size_t file_column);
 
-    void skipField(EscapingRule escaping_rule);
-    inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); }
-
-    template <typename ReturnType>
-    ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end);
-    bool checkForSuffix();
-    [[noreturn]] void throwUnexpectedEof();
-
     bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
     void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
 
@@ -63,12 +59,76 @@ private:
     const ParsedTemplateFormatString format;
     const ParsedTemplateFormatString row_format;
 
-    size_t format_data_idx;
     bool end_of_stream = false;
     std::vector<size_t> always_default_columns;
     const char default_csv_delimiter;
 
     const std::string row_between_delimiter;
+
+    std::unique_ptr<TemplateFormatReader> format_reader;
+};
+
+class TemplateFormatReader
+{
+    using EscapingRule = FormatSettings::EscapingRule;
+
+public:
+    TemplateFormatReader(
+        PeekableReadBuffer & buf_,
+        bool ignore_spaces_,
+        const ParsedTemplateFormatString & format_,
+        const ParsedTemplateFormatString & row_format_,
+        std::string row_between_delimiter,
+        const FormatSettings & format_settings_);
+
+    void readPrefix();
+    void skipField(EscapingRule escaping_rule);
+    inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); }
+
+    template <typename ReturnType>
+    ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end);
+    bool checkForSuffix();
+
+    void setReadBuffer(PeekableReadBuffer & buf_) { buf = &buf_; }
+
+    void skipDelimiter(size_t index);
+    void skipRowEndDelimiter();
+    void skipRowBetweenDelimiter();
+
+    size_t getFormatDataIdx() const { return format_data_idx; }
+
+private:
+    PeekableReadBuffer * buf;
+    bool ignore_spaces;
+    const ParsedTemplateFormatString & format;
+    const ParsedTemplateFormatString & row_format;
+    const std::string row_between_delimiter;
+    const FormatSettings & format_settings;
+    size_t format_data_idx;
+    size_t row_num;
+};
+
+class TemplateSchemaReader : public IRowSchemaReader
+{
+public:
+    TemplateSchemaReader(ReadBuffer & in_,
+        bool ignore_spaces_,
+        const ParsedTemplateFormatString & format_,
+        const ParsedTemplateFormatString & row_format_,
+        std::string row_between_delimiter,
+        const FormatSettings & format_settings_,
+        ContextPtr context_);
+
+    DataTypes readRowAndGetDataTypes() override;
+
+private:
+    PeekableReadBuffer buf;
+    const ParsedTemplateFormatString format;
+    const ParsedTemplateFormatString row_format;
+    FormatSettings format_settings;
+    ContextPtr context;
+    TemplateFormatReader format_reader;
+    bool first_row = true;
 };
 
 bool parseDelimiterWithDiagnosticInfo(WriteBuffer & out, ReadBuffer & buf, const String & delimiter, const String & description, bool skip_spaces);
diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
index adf6d2e8a25..b58be3f5526 100644
--- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -15,6 +16,7 @@
 #include
 #include
 
+#include
 
 namespace DB
 {
@@ -286,6 +288,50 @@ namespace
     }
 }
 
+/// Can be used in fileSegmentationEngine for parallel parsing of Values
+static bool skipToNextRow(PeekableReadBuffer * buf, size_t min_chunk_bytes, int balance)
+{
+    skipWhitespaceIfAny(*buf);
+    if (buf->eof() || *buf->position() == ';')
+        return false;
+    bool quoted = false;
+
+    size_t chunk_begin_buf_count = buf->count();
+    while (!buf->eof() && (balance || buf->count() - chunk_begin_buf_count < min_chunk_bytes))
+    {
+        buf->position() = find_first_symbols<'\\', '\'', ')', '('>(buf->position(), buf->buffer().end());
+        if (buf->position() == buf->buffer().end())
+            continue;
+        if (*buf->position() == '\\')
+        {
+            ++buf->position();
+            if (!buf->eof())
+                ++buf->position();
+        }
+        else if (*buf->position() == '\'')
+        {
+            quoted ^= true;
+            ++buf->position();
+        }
+        else if (*buf->position() == ')')
+        {
+            ++buf->position();
+            if (!quoted)
+                --balance;
+        }
+        else if (*buf->position() == '(')
+        {
+            ++buf->position();
+            if (!quoted)
+                ++balance;
+        }
+    }
+
+    if (!buf->eof() && *buf->position() == ',')
+        ++buf->position();
+    return true;
+}
+
 bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx)
 {
     const Block & header = getPort().getHeader();
@@ -293,7 +339,7 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx
     auto settings = context->getSettingsRef();
 
     /// We need continuous memory containing the expression to use Lexer
-    skipToNextRow(0, 1);
+    skipToNextRow(buf.get(), 0, 1);
     buf->makeContinuousMemoryFromCheckpointToPos();
     buf->rollbackToCheckpoint();
 
@@ -437,50 +483,6 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx
     return true;
 }
 
-/// Can be used in fileSegmentationEngine for parallel parsing of Values
-bool ValuesBlockInputFormat::skipToNextRow(size_t min_chunk_bytes, int balance)
-{
-    skipWhitespaceIfAny(*buf);
-    if (buf->eof() || *buf->position() == ';')
-        return false;
-    bool quoted = false;
-
-    size_t chunk_begin_buf_count = buf->count();
-    while (!buf->eof() && (balance || buf->count() - chunk_begin_buf_count < min_chunk_bytes))
-    {
-        buf->position() = find_first_symbols<'\\', '\'', ')', '('>(buf->position(), buf->buffer().end());
-        if (buf->position() == buf->buffer().end())
-            continue;
-        if (*buf->position() == '\\')
-        {
-            ++buf->position();
-            if (!buf->eof())
-                ++buf->position();
-        }
-        else if (*buf->position() == '\'')
-        {
-            quoted ^= true;
-            ++buf->position();
-        }
-        else if (*buf->position() == ')')
-        {
-            ++buf->position();
-            if (!quoted)
-                --balance;
-        }
-        else if (*buf->position() == '(')
-        {
-            ++buf->position();
-            if (!quoted)
-                ++balance;
-        }
-    }
-
-    if (!buf->eof() && *buf->position() == ',')
-        ++buf->position();
-    return true;
-}
-
 void ValuesBlockInputFormat::assertDelimiterAfterValue(size_t column_idx)
 {
     if (unlikely(!checkDelimiterAfterValue(column_idx)))
@@ -559,6 +561,63 @@ void ValuesBlockInputFormat::setReadBuffer(ReadBuffer & in_)
     IInputFormat::setReadBuffer(*buf);
 }
 
+ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_)
+    : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), context(context_)
+{
+}
+
+DataTypes ValuesSchemaReader::readRowAndGetDataTypes()
+{
+    if (first_row)
+    {
+        skipBOMIfExists(buf);
+        first_row = false;
+    }
+
+    skipWhitespaceIfAny(buf);
+    if (buf.eof())
+        return {};
+
+    assertChar('(', buf);
+    PeekableReadBufferCheckpoint checkpoint(buf);
+    skipToNextRow(&buf, 0, 1);
+    buf.makeContinuousMemoryFromCheckpointToPos();
+    buf.rollbackToCheckpoint();
+
+    Tokens tokens(buf.position(), buf.buffer().end());
+    IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth);
+
+    DataTypes data_types;
+    bool finish = false;
+    while (!finish)
+    {
+        Expected expected;
+        ASTPtr ast;
+
+        bool parsed = parser.parse(token_iterator, ast, expected);
+        /// Consider delimiter after value (',' or ')') as part of expression
+        parsed &= token_iterator->type == TokenType::Comma || token_iterator->type == TokenType::ClosingRoundBracket;
+
+        if (!parsed)
+            throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot parse expression here: {}, token: {}",
+                String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())), String(token_iterator.get().begin, token_iterator.get().end));
+
+        std::pair<Field, DataTypePtr> result = evaluateConstantExpression(ast, context);
+        data_types.push_back(generalizeDataType(result.second));
+
+        if (token_iterator->type == TokenType::ClosingRoundBracket)
+            finish = true;
+        ++token_iterator;
+        buf.position() = const_cast<char *>(token_iterator->begin);
+    }
+
+    skipWhitespaceIfAny(buf);
+    if (!buf.eof() && *buf.position() == ',')
+        ++buf.position();
+
+    return data_types;
+}
+
 void registerInputFormatValues(FormatFactory & factory)
 {
     factory.registerInputFormat("Values", [](
@@ -571,4 +630,12 @@ void registerInputFormatValues(FormatFactory & factory)
     });
 }
 
+void registerValuesSchemaReader(FormatFactory & factory)
+{
+    factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
+    {
+        return std::make_shared<ValuesSchemaReader>(buf, settings, context);
+    });
+}
+
 }
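A short worked example of the balance counter in the static skipToNextRow() above (editor's illustration; the sample row is an assumption). The schema reader consumes the opening '(' with assertChar and then calls skipToNextRow(&buf, 0, 1):

    /// Input after the consumed '(':  1, ('a', 2)), (3, ...
    /// balance starts at 1 (the row's '(' was already consumed);
    /// '(' of the inner tuple -> balance 2
    /// ')' of the inner tuple -> balance 1
    /// ')' of the row         -> balance 0, loop exits (min_chunk_bytes == 0)
    /// a trailing ','         -> consumed, so the next call starts at the next row.
    /// Only parentheses are counted, and nothing inside '...' is, because the
    /// quoted flag flips on each unescaped single quote.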
diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h
index 5bbd4bea5ba..e1521955472 100644
--- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h
+++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include
 
 namespace DB
@@ -68,8 +69,6 @@ private:
     void readPrefix();
     void readSuffix();
 
-    bool skipToNextRow(size_t min_chunk_bytes = 0, int balance = 0);
-
     std::unique_ptr<PeekableReadBuffer> buf;
 
     const RowInputFormatParams params;
@@ -95,4 +94,18 @@ private:
     BlockMissingValues block_missing_values;
 };
 
+class ValuesSchemaReader : public IRowSchemaReader
+{
+public:
+    ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_);
+
+private:
+    DataTypes readRowAndGetDataTypes() override;
+
+    PeekableReadBuffer buf;
+    ContextPtr context;
+    ParserExpression parser;
+    bool first_row = true;
+};
+
 }
diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp
index 87fa5ec1c4a..7720b01dc74 100644
--- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp
+++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp
@@ -1,5 +1,7 @@
 #include
+#include
 #include
+#include
 
 #include
 #include
@@ -9,6 +11,7 @@ namespace DB
 namespace ErrorCodes
 {
     extern const int INCORRECT_DATA;
+    extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
 }
 
 RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes(
@@ -17,8 +20,13 @@ RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes(
     const Params & params_,
     bool with_names_,
     bool with_types_,
-    const FormatSettings & format_settings_)
-    : RowInputFormatWithDiagnosticInfo(header_, in_, params_), format_settings(format_settings_), with_names(with_names_), with_types(with_types_)
+    const FormatSettings & format_settings_,
+    std::unique_ptr<FormatWithNamesAndTypesReader> format_reader_)
+    : RowInputFormatWithDiagnosticInfo(header_, in_, params_)
+    , format_settings(format_settings_)
+    , with_names(with_names_)
+    , with_types(with_types_)
+    , format_reader(std::move(format_reader_))
 {
     const auto & sample = getPort().getHeader();
     size_t num_columns = sample.columns();
@@ -88,7 +96,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix()
     }
 
     /// Skip prefix before names and types.
-    skipPrefixBeforeHeader();
+    format_reader->skipPrefixBeforeHeader();
 
     /// This is a bit of abstraction leakage, but we need it in parallel parsing:
     /// we check if this InputFormat is working with the "real" beginning of the data.
@@ -97,7 +105,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix()
     if (format_settings.with_names_use_header)
     {
         std::vector<bool> read_columns(data_types.size(), false);
-        auto column_names = readNames();
+        auto column_names = format_reader->readNames();
         for (const auto & name : column_names)
             addInputColumn(name, read_columns);
 
@@ -110,7 +118,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix()
     else
     {
         setupAllColumnsByTableSchema();
-        skipNames();
+        format_reader->skipNames();
     }
 }
 else if (!column_mapping->is_set)
@@ -119,10 +127,10 @@
     if (with_types)
     {
         /// Skip delimiter between names and types.
-        skipRowBetweenDelimiter();
+        format_reader->skipRowBetweenDelimiter();
 
         if (format_settings.with_types_use_header)
         {
-            auto types = readTypes();
+            auto types = format_reader->readTypes();
             if (types.size() != column_mapping->column_indexes_for_input_fields.size())
                 throw Exception(
                     ErrorCodes::INCORRECT_DATA,
@@ -143,7 +151,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix()
         }
     }
     else
-        skipTypes();
+        format_reader->skipTypes();
 }
 
@@ -161,7 +169,7 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE
     if (unlikely(end_of_stream))
         return false;
 
-    if (unlikely(checkForSuffix()))
+    if (unlikely(format_reader->checkForSuffix()))
     {
         end_of_stream = true;
         return false;
@@ -170,9 +178,9 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE
     updateDiagnosticInfo();
 
     if (likely(row_num != 1 || (getCurrentUnitNumber() == 0 && (with_names || with_types))))
-        skipRowBetweenDelimiter();
+        format_reader->skipRowBetweenDelimiter();
 
-    skipRowStartDelimiter();
+    format_reader->skipRowStartDelimiter();
 
     ext.read_columns.resize(data_types.size());
     for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
@@ -180,20 +188,20 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE
         const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column];
         const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size();
         if (column_index)
-            ext.read_columns[*column_index] = readField(
+            ext.read_columns[*column_index] = format_reader->readField(
                 *columns[*column_index],
                 data_types[*column_index],
                 serializations[*column_index],
                 is_last_file_column,
                 column_mapping->names_of_columns[file_column]);
         else
-            skipField(file_column);
+            format_reader->skipField(file_column);
 
         if (!is_last_file_column)
-            skipFieldDelimiter();
+            format_reader->skipFieldDelimiter();
     }
 
-    skipRowEndDelimiter();
+    format_reader->skipRowEndDelimiter();
 
     insertDefaultsForNotSeenColumns(columns, ext);
 
@@ -218,13 +226,13 @@ void RowInputFormatWithNamesAndTypes::tryDeserializeField(const DataTypePtr & ty
     const auto & index = column_mapping->column_indexes_for_input_fields[file_column];
     if (index)
     {
-        checkNullValueForNonNullable(type);
+        format_reader->checkNullValueForNonNullable(type);
         const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size();
-        readField(column, type, serializations[*index], is_last_file_column, column_mapping->names_of_columns[file_column]);
+        format_reader->readField(column, type, serializations[*index], is_last_file_column, column_mapping->names_of_columns[file_column]);
     }
     else
     {
-        skipField(file_column);
+        format_reader->skipField(file_column);
     }
 }
 
@@ -236,13 +244,13 @@ bool RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColu
         return false;
     }
 
-    if (!tryParseSuffixWithDiagnosticInfo(out))
+    if (!format_reader->tryParseSuffixWithDiagnosticInfo(out))
         return false;
 
-    if (likely(row_num != 1) && !parseRowBetweenDelimiterWithDiagnosticInfo(out))
+    if (likely(row_num != 1) && !format_reader->parseRowBetweenDelimiterWithDiagnosticInfo(out))
         return false;
 
-    if (!parseRowStartWithDiagnosticInfo(out))
+    if (!format_reader->parseRowStartWithDiagnosticInfo(out))
         return false;
 
     for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
@@ -266,22 +274,68 @@ bool RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColu
         /// Delimiters
         if (file_column + 1 != column_mapping->column_indexes_for_input_fields.size())
         {
-            if (!parseFieldDelimiterWithDiagnosticInfo(out))
+            if (!format_reader->parseFieldDelimiterWithDiagnosticInfo(out))
                 return false;
         }
     }
 
-    return parseRowEndWithDiagnosticInfo(out);
+    return format_reader->parseRowEndWithDiagnosticInfo(out);
 }
 
-
-void registerFileSegmentationEngineForFormatWithNamesAndTypes(
-    FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine)
+bool RowInputFormatWithNamesAndTypes::isGarbageAfterField(size_t index, ReadBuffer::Position pos)
 {
-    factory.registerFileSegmentationEngine(base_format_name, segmentation_engine);
-    factory.registerFileSegmentationEngine(base_format_name + "WithNames", segmentation_engine);
-    factory.registerFileSegmentationEngine(base_format_name + "WithNamesAndTypes", segmentation_engine);
+    return format_reader->isGarbageAfterField(index, pos);
 }
 
+void RowInputFormatWithNamesAndTypes::setReadBuffer(ReadBuffer & in_)
+{
+    format_reader->setReadBuffer(in_);
+    IInputFormat::setReadBuffer(in_);
+}
+
+FormatWithNamesAndTypesSchemaReader::FormatWithNamesAndTypesSchemaReader(
+    ReadBuffer & in_,
+    size_t max_rows_to_read_,
+    bool with_names_,
+    bool with_types_,
+    FormatWithNamesAndTypesReader * format_reader_,
+    DataTypePtr default_type_)
+    : IRowSchemaReader(in_, max_rows_to_read_, default_type_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_)
+{
+}
+
+NamesAndTypesList FormatWithNamesAndTypesSchemaReader::readSchema()
+{
+    if (with_names || with_types)
+        skipBOMIfExists(in);
+
+    format_reader->skipPrefixBeforeHeader();
+
+    Names names;
+    if (with_names)
+        names = format_reader->readNames();
+
+    if (with_types)
+    {
+        format_reader->skipRowBetweenDelimiter();
+        std::vector<String> data_type_names = format_reader->readTypes();
+        if (data_type_names.size() != names.size())
+            throw Exception(
+                ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
+                "The number of column names {} differs from the number of types {}", names.size(), data_type_names.size());
+
+        NamesAndTypesList result;
+        for (size_t i = 0; i != data_type_names.size(); ++i)
+            result.emplace_back(names[i], DataTypeFactory::instance().get(data_type_names[i]));
+        return result;
+    }
+
+    if (!names.empty())
+        setColumnNames(names);
+
+    /// We should determine types by reading rows with data. Use the implementation from IRowSchemaReader.
+    return IRowSchemaReader::readSchema();
+}
 }
 
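To make the three code paths of readSchema() above concrete, an editor's sketch of what each suffix variant yields for the same logical table (sample inputs assumed):

    /// TSVWithNamesAndTypes  "a\tb\nUInt64\tString\n1\tx\n"  -> {a UInt64, b String}, taken from the two header rows
    /// TSVWithNames          "a\tb\n1\tx\n"                  -> names from the header, types inferred from data rows
    /// TSV                   "1\tx\n"                        -> auto-generated column names, types inferred from data rows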
diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h
index cd7cd34d7e6..25ffc8d6de2 100644
--- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h
+++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h
@@ -1,12 +1,15 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 
 namespace DB
 {
 
+class FormatWithNamesAndTypesReader;
+
 /// Base class for input formats with -WithNames and -WithNamesAndTypes suffixes.
 /// It accepts 2 parameters in constructor - with_names and with_types and implements
 /// input format depending on them:
@@ -20,7 +23,7 @@
 /// then reads/skips types. So you can rely on this invariant.
 class RowInputFormatWithNamesAndTypes : public RowInputFormatWithDiagnosticInfo
 {
-public:
+protected:
     /** with_names - in the first line the header with column names
       * with_types - in the second line the header with column names
       */
@@ -28,44 +31,14 @@ public:
         const Block & header_,
         ReadBuffer & in_,
         const Params & params_,
-        bool with_names_, bool with_types_, const FormatSettings & format_settings_);
+        bool with_names_,
+        bool with_types_,
+        const FormatSettings & format_settings_,
+        std::unique_ptr<FormatWithNamesAndTypesReader> format_reader_);
 
     void resetParser() override;
-
-protected:
-    /// Read single field from input. Return false if there was no real value and we inserted default value.
-    virtual bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) = 0;
-
-    /// Skip single field, it's used to skip unknown columns.
-    virtual void skipField(size_t file_column) = 0;
-    /// Skip the whole row with names.
-    virtual void skipNames() = 0;
-    /// Skip the whole row with types.
-    virtual void skipTypes() = 0;
-
-    /// Skip delimiters, if any.
-    virtual void skipPrefixBeforeHeader() {}
-    virtual void skipRowStartDelimiter() {}
-    virtual void skipFieldDelimiter() {}
-    virtual void skipRowEndDelimiter() {}
-    virtual void skipRowBetweenDelimiter() {}
-
-    /// Check suffix.
-    virtual bool checkForSuffix() { return in->eof(); }
-
-    /// Methods for parsing with diagnostic info.
-    virtual void checkNullValueForNonNullable(DataTypePtr) {}
-    virtual bool parseRowStartWithDiagnosticInfo(WriteBuffer &) { return true; }
-    virtual bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer &) { return true; }
-    virtual bool parseRowEndWithDiagnosticInfo(WriteBuffer &) { return true;}
-    virtual bool parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer &) { return true;}
-    virtual bool tryParseSuffixWithDiagnosticInfo(WriteBuffer &) { return true; }
-    bool isGarbageAfterField(size_t, ReadBuffer::Position) override {return false; }
-
-    /// Read row with names and return the list of them.
-    virtual std::vector<String> readNames() = 0;
-    /// Read row with types and return the list of them.
-    virtual std::vector<String> readTypes() = 0;
+    bool isGarbageAfterField(size_t index, ReadBuffer::Position pos) override;
+    void setReadBuffer(ReadBuffer & in_) override;
 
     const FormatSettings format_settings;
     DataTypes data_types;
@@ -84,10 +57,90 @@ private:
     bool with_names;
     bool with_types;
+    std::unique_ptr<FormatWithNamesAndTypesReader> format_reader;
     std::unordered_map<String, size_t> column_indexes_by_names;
 };
 
-void registerFileSegmentationEngineForFormatWithNamesAndTypes(
-    FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine);
+/// Base class for parsing data in input formats with -WithNames and -WithNamesAndTypes suffixes.
+/// Used for reading/skipping names/types/delimiters in specific format.
+class FormatWithNamesAndTypesReader
+{
+public:
+    explicit FormatWithNamesAndTypesReader(ReadBuffer & in_, const FormatSettings & format_settings_) : in(&in_), format_settings(format_settings_) {}
+
+    /// Read single field from input. Return false if there was no real value and we inserted default value.
+    virtual bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) = 0;
+
+    /// Methods for parsing with diagnostic info.
+    virtual void checkNullValueForNonNullable(DataTypePtr) {}
+    virtual bool parseRowStartWithDiagnosticInfo(WriteBuffer &) { return true; }
+    virtual bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer &) { return true; }
+    virtual bool parseRowEndWithDiagnosticInfo(WriteBuffer &) { return true;}
+    virtual bool parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer &) { return true;}
+    virtual bool tryParseSuffixWithDiagnosticInfo(WriteBuffer &) { return true; }
+    virtual bool isGarbageAfterField(size_t, ReadBuffer::Position) { return false; }
+
+    /// Read row with names and return the list of them.
+    virtual std::vector<String> readNames() = 0;
+    /// Read row with types and return the list of them.
+    virtual std::vector<String> readTypes() = 0;
+
+    /// Skip single field, it's used to skip unknown columns.
+    virtual void skipField(size_t file_column) = 0;
+    /// Skip the whole row with names.
+    virtual void skipNames() = 0;
+    /// Skip the whole row with types.
+    virtual void skipTypes() = 0;
+
+    /// Skip delimiters, if any.
+    virtual void skipPrefixBeforeHeader() {}
+    virtual void skipRowStartDelimiter() {}
+    virtual void skipFieldDelimiter() {}
+    virtual void skipRowEndDelimiter() {}
+    virtual void skipRowBetweenDelimiter() {}
+
+    /// Check suffix.
+    virtual bool checkForSuffix() { return in->eof(); }
+
+    const FormatSettings & getFormatSettings() const { return format_settings; }
+
+    virtual void setReadBuffer(ReadBuffer & in_) { in = &in_; }
+
+    virtual ~FormatWithNamesAndTypesReader() = default;
+
+protected:
+    ReadBuffer * in;
+    const FormatSettings format_settings;
+};
+
+/// Base class for schema inference for formats with -WithNames and -WithNamesAndTypes suffixes.
+/// For formats with -WithNamesAndTypes suffix the schema will be determined by first two rows.
+/// For formats with -WithNames suffix the names of columns will be determined by the first row
+/// and types of columns by the rows with data.
+/// For formats without suffixes default column names will be used
+/// and types will be determined by the rows with data.
+class FormatWithNamesAndTypesSchemaReader : public IRowSchemaReader
+{
+public:
+    FormatWithNamesAndTypesSchemaReader(
+        ReadBuffer & in,
+        size_t max_rows_to_read_,
+        bool with_names_,
+        bool with_types_,
+        FormatWithNamesAndTypesReader * format_reader_,
+        DataTypePtr default_type_ = nullptr);
+
+    NamesAndTypesList readSchema() override;
+
+protected:
+    virtual DataTypes readRowAndGetDataTypes() override = 0;
+
+    bool with_names;
+    bool with_types;
+
+private:
+    FormatWithNamesAndTypesReader * format_reader;
+};
 
 }
+
diff --git a/src/Storages/ExternalDataSourceConfiguration.h b/src/Storages/ExternalDataSourceConfiguration.h
index 502f8b800e3..f33fd938092 100644
--- a/src/Storages/ExternalDataSourceConfiguration.h
+++ b/src/Storages/ExternalDataSourceConfiguration.h
@@ -88,7 +88,7 @@ struct URLBasedDataSourceConfiguration
     String url;
     String format;
     String compression_method = "auto";
-    String structure;
+    String structure = "auto";
 
     std::vector<std::pair<String, String>> headers;
     String http_method;
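The new "auto" default above is what lets URL-based sources omit an explicit structure; an editor's illustration (hypothetical URL, not part of the diff):

    /// Before: SELECT * FROM url('http://host/data.tsv', 'TSV', 'a UInt64, b String')
    /// After:  SELECT * FROM url('http://host/data.tsv', 'TSV')
    /// With structure == "auto", the schema is inferred from the data by the
    /// format's schema reader instead of being a mandatory argument.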
diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp
index 2105228abf6..f22f6f66ced 100644
--- a/src/Storages/HDFS/StorageHDFS.cpp
+++ b/src/Storages/HDFS/StorageHDFS.cpp
@@ -14,7 +14,6 @@
 #include
 #include
 
-#include
 #include
 #include
 
@@ -29,6 +28,8 @@
 #include
 #include
 
+
+#include
 #include
 #include
 
@@ -51,10 +52,70 @@ namespace ErrorCodes
 {
     extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
     extern const int ACCESS_DENIED;
+    extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
 }
 
+namespace
+{
+    /* Recursive directory listing with matched paths as a result.
+     * Have the same method in StorageFile.
+     */
+    Strings LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match)
+    {
+        const size_t first_glob = for_match.find_first_of("*?{");
 
-static Strings listFilesWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match);
+        const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/');
+        const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/'
+        const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/'
 
+        const size_t next_slash = suffix_with_globs.find('/', 1);
+        re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash)));
+
+        HDFSFileInfo ls;
+        ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length);
+        Strings result;
+        for (int i = 0; i < ls.length; ++i)
+        {
+            const String full_path = String(ls.file_info[i].mName);
+            const size_t last_slash = full_path.rfind('/');
+            const String file_name = full_path.substr(last_slash);
+            const bool looking_for_directory = next_slash != std::string::npos;
+            const bool is_directory = ls.file_info[i].mKind == 'D';
+            /// Condition with type of current file_info means what kind of path is it in current iteration of ls
+            if (!is_directory && !looking_for_directory)
+            {
+                if (re2::RE2::FullMatch(file_name, matcher))
+                {
+                    result.push_back(String(ls.file_info[i].mName));
+                }
+            }
+            else if (is_directory && looking_for_directory)
+            {
+                if (re2::RE2::FullMatch(file_name, matcher))
+                {
+                    Strings result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash));
+                    /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check.
+                    std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
+                }
+            }
+        }
+
+        return result;
+    }
+
+    std::pair<String, String> getPathFromUriAndUriWithoutPath(const String & uri)
+    {
+        const size_t begin_of_path = uri.find('/', uri.find("//") + 2);
+        return {uri.substr(begin_of_path), uri.substr(0, begin_of_path)};
+    }
+
+    std::vector<String> getPathsList(const String & path_from_uri, const String & uri_without_path, ContextPtr context)
+    {
+        HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef());
+        HDFSFSPtr fs = createHDFSFS(builder.get());
+
+        return LSWithRegexpMatching("/", fs, path_from_uri);
+    }
+}
 
 StorageHDFS::StorageHDFS(
     const String & uri_,
@@ -79,25 +140,52 @@ StorageHDFS::StorageHDFS(
     checkHDFSURL(uri);
 
     StorageInMemoryMetadata storage_metadata;
-    storage_metadata.setColumns(columns_);
+
+    if (columns_.empty())
+    {
+        auto columns = getTableStructureFromData(format_name, uri, compression_method, context_);
+        storage_metadata.setColumns(columns);
+    }
+    else
+        storage_metadata.setColumns(columns_);
+
     storage_metadata.setConstraints(constraints_);
     storage_metadata.setComment(comment);
     setInMemoryMetadata(storage_metadata);
 }
 
+ColumnsDescription StorageHDFS::getTableStructureFromData(
+    const String & format,
+    const String & uri,
+    const String & compression_method,
+    ContextPtr ctx)
+{
+    auto read_buffer_creator = [&]()
+    {
+        const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri);
+        auto paths = getPathsList(path_from_uri, uri, ctx);
+        if (paths.empty())
+            throw Exception(
+                ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
+                "Cannot extract table structure from {} format file, because there are no files in HDFS with provided path. You must "
+                "specify table structure manually",
+                format);
+
+        auto compression = chooseCompressionMethod(paths[0], compression_method);
+        return wrapReadBufferWithCompressionMethod(
+            std::make_unique<ReadBufferFromHDFS>(uri_without_path, paths[0], ctx->getGlobalContext()->getConfigRef()), compression);
+    };
+
+    return readSchemaFromFormat(format, std::nullopt, read_buffer_creator, ctx);
+}
+
 class HDFSSource::DisclosedGlobIterator::Impl
 {
 public:
     Impl(ContextPtr context_, const String & uri)
     {
-        const size_t begin_of_path = uri.find('/', uri.find("//") + 2);
-        const String path_from_uri = uri.substr(begin_of_path);
-        const String uri_without_path = uri.substr(0, begin_of_path); /// ends without '/'
-
-        HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context_->getGlobalContext()->getConfigRef());
-        HDFSFSPtr fs = createHDFSFS(builder.get());
-
-        uris = listFilesWithRegexpMatching("/", fs, path_from_uri);
+        const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri);
+        uris = getPathsList(path_from_uri, uri_without_path, context_);
         for (auto & elem : uris)
            elem = uri_without_path + elem;
        uris_iter = uris.begin();
@@ -339,51 +427,6 @@
 private:
 };
 
-/* Recursive directory listing with matched paths as a result.
- * Have the same method in StorageFile.
- */
-Strings listFilesWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match)
-{
-    const size_t first_glob = for_match.find_first_of("*?{");
-
-    const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/');
-    const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/'
-    const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/'
-
-    const size_t next_slash = suffix_with_globs.find('/', 1);
-    re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash)));
-
-    HDFSFileInfo ls;
-    ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length);
-    Strings result;
-    for (int i = 0; i < ls.length; ++i)
-    {
-        const String full_path = String(ls.file_info[i].mName);
-        const size_t last_slash = full_path.rfind('/');
-        const String file_name = full_path.substr(last_slash);
-        const bool looking_for_directory = next_slash != std::string::npos;
-        const bool is_directory = ls.file_info[i].mKind == 'D';
-        /// Condition with type of current file_info means what kind of path is it in current iteration of ls
-        if (!is_directory && !looking_for_directory)
-        {
-            if (re2::RE2::FullMatch(file_name, matcher))
-            {
-                result.push_back(String(ls.file_info[i].mName));
-            }
-        }
-        else if (is_directory && looking_for_directory)
-        {
-            if (re2::RE2::FullMatch(file_name, matcher))
-            {
-                Strings result_part = listFilesWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash));
-                /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check.
-                std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
-            }
-        }
-    }
-    return result;
-}
-
 bool StorageHDFS::isColumnOriented() const
 {
     return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name);
@@ -400,6 +443,7 @@ Pipe StorageHDFS::read(
 {
     bool need_path_column = false;
     bool need_file_column = false;
+
     for (const auto & column : column_names)
     {
         if (column == "_path")
@@ -528,6 +572,7 @@ void registerStorageHDFS(StorageFactory & factory)
     },
     {
         .supports_sort_order = true, // for partition by
+        .supports_schema_inference = true,
         .source_access_type = AccessType::HDFS,
     });
 }
diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h
index 3e2f7a43127..9e845d8fd74 100644
--- a/src/Storages/HDFS/StorageHDFS.h
+++ b/src/Storages/HDFS/StorageHDFS.h
@@ -31,7 +31,7 @@ public:
         size_t max_block_size,
         unsigned num_streams) override;
 
-    SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override;
+    SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr /*context*/) override;
 
     void truncate(
         const ASTPtr & query,
@@ -49,6 +49,12 @@ public:
     /// format to read only them. Note: this hack cannot be done with ordinary formats like TSV.
     bool isColumnOriented() const;
 
+    static ColumnsDescription getTableStructureFromData(
+        const String & format,
+        const String & uri,
+        const String & compression_method,
+        ContextPtr ctx);
+
 protected:
     friend class HDFSSource;
     StorageHDFS(
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 1b7be8ca98d..6597c28360d 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -224,7 +224,6 @@ MergeTreeData::MergeTreeData(
 {
     try
     {
-        checkPartitionKeyAndInitMinMax(metadata_.partition_key);
         setProperties(metadata_, metadata_, attach);
 
         if (minmax_idx_date_column_pos == -1)
diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp
index cb52c8b86c0..fc3eff7459b 100644
--- a/src/Storages/MergeTree/registerStorageMergeTree.cpp
+++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp
@@ -35,6 +35,7 @@ namespace ErrorCodes
     extern const int NO_ELEMENTS_IN_CONFIG;
     extern const int UNKNOWN_STORAGE;
     extern const int NO_REPLICA_NAME_GIVEN;
+    extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
 }
 
@@ -258,6 +259,34 @@ If you use the Replicated version of engines, see https://clickhouse.com/docs/en
     return help;
 }
 
+static ColumnsDescription getColumnsDescriptionFromZookeeper(const String & raw_zookeeper_path, ContextMutablePtr context)
+{
+    String zookeeper_name = zkutil::extractZooKeeperName(raw_zookeeper_path);
+    String zookeeper_path = zkutil::extractZooKeeperPath(raw_zookeeper_path, true);
+
+    if (!context->hasZooKeeper() && !context->hasAuxiliaryZooKeeper(zookeeper_name))
+        throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure without zookeeper, you must specify the structure manually"};
+
+    zkutil::ZooKeeperPtr zookeeper;
+    try
+    {
+        if (zookeeper_name == StorageReplicatedMergeTree::getDefaultZooKeeperName())
+            zookeeper = context->getZooKeeper();
+        else
+            zookeeper = context->getAuxiliaryZooKeeper(zookeeper_name);
+    }
+    catch (...)
+    {
+        throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure from zookeeper, because cannot get zookeeper: {}. You must specify structure manually", getCurrentExceptionMessage(false)};
+    }
+
+    if (!zookeeper->exists(zookeeper_path + "/replicas"))
+        throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure, because there are no other replicas in ZooKeeper.
You must specify the structure manually"}; + + Coordination::Stat columns_stat; + return ColumnsDescription::parse(zookeeper->get(fs::path(zookeeper_path) / "columns", &columns_stat)); +} + static StoragePtr create(const StorageFactory::Arguments & args) { @@ -638,7 +667,14 @@ static StoragePtr create(const StorageFactory::Arguments & args) String date_column_name; StorageInMemoryMetadata metadata; - metadata.setColumns(args.columns); + + ColumnsDescription columns; + if (args.columns.empty() && replicated) + columns = getColumnsDescriptionFromZookeeper(zookeeper_path, args.getContext()); + else + columns = args.columns; + + metadata.setColumns(columns); metadata.setComment(args.comment); std::unique_ptr storage_settings; @@ -705,12 +741,12 @@ static StoragePtr create(const StorageFactory::Arguments & args) if (args.query.columns_list && args.query.columns_list->indices) for (auto & index : args.query.columns_list->indices->children) - metadata.secondary_indices.push_back(IndexDescription::getIndexFromAST(index, args.columns, args.getContext())); + metadata.secondary_indices.push_back(IndexDescription::getIndexFromAST(index, columns, args.getContext())); if (args.query.columns_list && args.query.columns_list->projections) for (auto & projection_ast : args.query.columns_list->projections->children) { - auto projection = ProjectionDescription::getProjectionFromAST(projection_ast, args.columns, args.getContext()); + auto projection = ProjectionDescription::getProjectionFromAST(projection_ast, columns, args.getContext()); metadata.projections.add(std::move(projection)); } @@ -720,10 +756,10 @@ static StoragePtr create(const StorageFactory::Arguments & args) constraints.push_back(constraint); metadata.constraints = ConstraintsDescription(constraints); - auto column_ttl_asts = args.columns.getColumnTTLs(); + auto column_ttl_asts = columns.getColumnTTLs(); for (const auto & [name, ast] : column_ttl_asts) { - auto new_ttl_entry = TTLDescription::getTTLFromAST(ast, args.columns, args.getContext(), metadata.primary_key); + auto new_ttl_entry = TTLDescription::getTTLFromAST(ast, columns, args.getContext(), metadata.primary_key); metadata.column_ttls_by_name[name] = new_ttl_entry; } @@ -850,6 +886,7 @@ void registerStorageMergeTree(StorageFactory & factory) features.supports_replication = true; features.supports_deduplication = true; + features.supports_schema_inference = true; factory.registerStorage("ReplicatedMergeTree", create, features); factory.registerStorage("ReplicatedCollapsingMergeTree", create, features); diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index 87a8ea2315d..0cc401aa93c 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -126,7 +126,13 @@ StorageBuffer::StorageBuffer( , bg_pool(getContext()->getBufferFlushSchedulePool()) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + auto dest_table = DatabaseCatalog::instance().getTable(destination_id, context_); + storage_metadata.setColumns(dest_table->getInMemoryMetadataPtr()->getColumns()); + } + else + storage_metadata.setColumns(columns_); storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); @@ -1167,6 +1173,7 @@ void registerStorageBuffer(StorageFactory & factory) }, { .supports_parallel_insert = true, + .supports_schema_inference = true, }); } diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 
ddf363e3957..19869b77106 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include #include @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -63,7 +62,6 @@ #include #include -#include #include #include @@ -71,8 +69,6 @@ #include #include -#include - #include #include #include @@ -329,7 +325,16 @@ StorageDistributed::StorageDistributed( , rng(randomSeed()) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + StorageID id = StorageID::createEmpty(); + id.table_name = remote_table; + id.database_name = remote_database; + storage_metadata.setColumns(getStructureOfRemoteTable(*getCluster(), id, getContext(), remote_table_function_ptr)); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); @@ -1398,6 +1403,7 @@ void registerStorageDistributed(StorageFactory & factory) { .supports_settings = true, .supports_parallel_insert = true, + .supports_schema_inference = true, .source_access_type = AccessType::REMOTE, }); } diff --git a/src/Storages/StorageFactory.h b/src/Storages/StorageFactory.h index 20db1a44897..6ffa6327176 100644 --- a/src/Storages/StorageFactory.h +++ b/src/Storages/StorageFactory.h @@ -66,6 +66,7 @@ public: bool supports_deduplication = false; /// See also IStorage::supportsParallelInsert() bool supports_parallel_insert = false; + bool supports_schema_inference = false; AccessType source_access_type = AccessType::NONE; }; @@ -98,6 +99,7 @@ public: .supports_replication = false, .supports_deduplication = false, .supports_parallel_insert = false, + .supports_schema_inference = false, .source_access_type = AccessType::NONE, }); @@ -126,6 +128,12 @@ public: AccessType getSourceAccessType(const String & table_engine) const; + bool checkIfStorageSupportsSchemaInterface(const String & storage_name) + { + if (storages.contains(storage_name)) + return storages[storage_name].features.supports_schema_inference; + return false; + } private: Storages storages; }; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 13a70af2ada..a479f982c70 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -15,8 +15,9 @@ #include #include -#include #include +#include +#include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +65,7 @@ namespace ErrorCodes extern const int INCOMPATIBLE_COLUMNS; extern const int CANNOT_STAT; extern const int LOGICAL_ERROR; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } namespace @@ -135,6 +138,56 @@ void checkCreationIsAllowed(ContextPtr context_global, const std::string & db_di throw Exception("File must not be a directory", ErrorCodes::INCORRECT_FILE_NAME); } +std::unique_ptr createReadBuffer( + const String & current_path, + bool use_table_fd, + const String & storage_name, + int table_fd, + const String & compression_method, + ContextPtr context) +{ + std::unique_ptr nested_buffer; + CompressionMethod method; + + struct stat file_stat{}; + + if (use_table_fd) + { + /// Check if file descriptor allows random reads (and reading it twice). 
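The fstat/S_ISREG dispatch that follows (and its stat() twin in the other branch) is what makes re-reading for schema inference safe. A self-contained POSIX sketch of the same decision — "/etc/hosts" is just an example path:

```cpp
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

#include <cstdio>

/// Schema inference may need to read the beginning of the input more than
/// once; that is only safe for regular files. Pipes and sockets get a purely
/// sequential buffer instead.
int main()
{
    int fd = open("/etc/hosts", O_RDONLY);
    if (fd < 0)
        return 1;

    struct stat file_stat{};
    if (0 != fstat(fd, &file_stat))
    {
        close(fd);
        return 1;
    }

    if (S_ISREG(file_stat.st_mode))
        std::puts("regular file: seekable, can be re-read");
    else
        std::puts("pipe/socket/device: single sequential pass only");

    close(fd);
    return 0;
}
```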
+ if (0 != fstat(table_fd, &file_stat)) + throwFromErrno("Cannot stat table file descriptor, inside " + storage_name, ErrorCodes::CANNOT_STAT); + + if (S_ISREG(file_stat.st_mode)) + nested_buffer = std::make_unique(table_fd); + else + nested_buffer = std::make_unique(table_fd); + + method = chooseCompressionMethod("", compression_method); + } + else + { + /// Check if file descriptor allows random reads (and reading it twice). + if (0 != stat(current_path.c_str(), &file_stat)) + throwFromErrno("Cannot stat file " + current_path, ErrorCodes::CANNOT_STAT); + + if (S_ISREG(file_stat.st_mode)) + nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); + else + nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); + + method = chooseCompressionMethod(current_path, compression_method); + } + + /// For clickhouse-local add progress callback to display progress bar. + if (context->getApplicationType() == Context::ApplicationType::LOCAL) + { + auto & in = static_cast(*nested_buffer); + in.setProgressCallback(context); + } + + return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); +} + } Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read) @@ -164,6 +217,42 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user return paths; } + +ColumnsDescription StorageFile::getTableStructureFromData( + const String & format, + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + ContextPtr context) +{ + if (format == "Distributed") + { + if (paths.empty()) + throw Exception( + "Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); + + auto source = StorageDistributedDirectoryMonitor::createSourceFromFile(paths[0]); + return ColumnsDescription(source->getOutputs().front().getHeader().getNamesAndTypesList()); + } + + auto read_buffer_creator = [&]() + { + String path; + auto it = std::find_if(paths.begin(), paths.end(), [](const String & p){ return std::filesystem::exists(p); }); + if (it == paths.end()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files with provided path. 
You must specify " + "table structure manually", + format); + + path = *it; + return createReadBuffer(path, false, "File", -1, compression_method, context); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, context); +} + bool StorageFile::isColumnOriented() const { return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name); @@ -182,10 +271,13 @@ StorageFile::StorageFile(int table_fd_, CommonArguments args) throw Exception("Using file descriptor as source of storage isn't allowed for server daemons", ErrorCodes::DATABASE_ACCESS_DENIED); if (args.format_name == "Distributed") throw Exception("Distributed format is allowed only with explicit file path", ErrorCodes::INCORRECT_FILE_NAME); + if (args.columns.empty()) + throw Exception("Automatic schema inference is not allowed when using file descriptor as source of storage", ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE); is_db_table = false; use_table_fd = true; table_fd = table_fd_; + setStorageMetadata(args); } StorageFile::StorageFile(const std::string & table_path_, const std::string & user_files_path, CommonArguments args) @@ -194,22 +286,7 @@ StorageFile::StorageFile(const std::string & table_path_, const std::string & us is_db_table = false; paths = getPathsList(table_path_, user_files_path, args.getContext(), total_bytes_to_read); path_for_partitioned_write = table_path_; - - if (args.format_name == "Distributed") - { - if (paths.empty()) - throw Exception("Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); - - auto & first_path = paths[0]; - Block header = StorageDistributedDirectoryMonitor::createSourceFromFile(first_path)->getOutputs().front().getHeader(); - - StorageInMemoryMetadata storage_metadata; - auto columns = ColumnsDescription(header.getNamesAndTypesList()); - if (!args.columns.empty() && columns != args.columns) - throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS); - storage_metadata.setColumns(columns); - setInMemoryMetadata(storage_metadata); - } + setStorageMetadata(args); } StorageFile::StorageFile(const std::string & relative_table_dir_path, CommonArguments args) @@ -225,6 +302,8 @@ StorageFile::StorageFile(const std::string & relative_table_dir_path, CommonArgu paths = {getTablePath(table_dir_path, format_name)}; if (fs::exists(paths[0])) total_bytes_to_read = fs::file_size(paths[0]); + + setStorageMetadata(args); } StorageFile::StorageFile(CommonArguments args) @@ -233,9 +312,21 @@ StorageFile::StorageFile(CommonArguments args) , format_settings(args.format_settings) , compression_method(args.compression_method) , base_path(args.getContext()->getPath()) +{ +} + +void StorageFile::setStorageMetadata(CommonArguments args) { StorageInMemoryMetadata storage_metadata; - if (args.format_name != "Distributed") + + if (args.format_name == "Distributed" || args.columns.empty()) + { + auto columns = getTableStructureFromData(format_name, paths, compression_method, format_settings, args.getContext()); + if (!args.columns.empty() && args.columns != columns) + throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS); + storage_metadata.setColumns(columns); + } + else storage_metadata.setColumns(args.columns); storage_metadata.setConstraints(args.constraints); @@ -350,46 +441,7 @@ public: } } - std::unique_ptr nested_buffer; - CompressionMethod method; - - struct stat file_stat{}; - - if 
(storage->use_table_fd) - { - /// Check if file descriptor allows random reads (and reading it twice). - if (0 != fstat(storage->table_fd, &file_stat)) - throwFromErrno("Cannot stat table file descriptor, inside " + storage->getName(), ErrorCodes::CANNOT_STAT); - - if (S_ISREG(file_stat.st_mode)) - nested_buffer = std::make_unique(storage->table_fd); - else - nested_buffer = std::make_unique(storage->table_fd); - - method = chooseCompressionMethod("", storage->compression_method); - } - else - { - /// Check if file descriptor allows random reads (and reading it twice). - if (0 != stat(current_path.c_str(), &file_stat)) - throwFromErrno("Cannot stat file " + current_path, ErrorCodes::CANNOT_STAT); - - if (S_ISREG(file_stat.st_mode)) - nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); - else - nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); - - method = chooseCompressionMethod(current_path, storage->compression_method); - } - - /// For clickhouse-local add progress callback to display progress bar. - if (context->getApplicationType() == Context::ApplicationType::LOCAL) - { - auto & in = static_cast(*nested_buffer); - in.setProgressCallback(context); - } - - read_buf = wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); + read_buf = createReadBuffer(current_path, storage->use_table_fd, storage->getName(), storage->table_fd, storage->compression_method, context); auto get_block_for_format = [&]() -> Block { @@ -853,7 +905,8 @@ void registerStorageFile(StorageFactory & factory) { StorageFactory::StorageFeatures storage_features{ .supports_settings = true, - .source_access_type = AccessType::FILE + .supports_schema_inference = true, + .source_access_type = AccessType::FILE, }; factory.registerStorage( diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index f48d1c285da..6b015976589 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -1,6 +1,7 @@ #pragma once #include + #include #include @@ -70,6 +71,13 @@ public: bool supportsPartitionBy() const override { return true; } + static ColumnsDescription getTableStructureFromData( + const String & format, + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + ContextPtr context); + protected: friend class StorageFileSource; friend class StorageFileSink; @@ -86,6 +94,8 @@ protected: private: explicit StorageFile(CommonArguments args); + void setStorageMetadata(CommonArguments args); + std::string format_name; // We use format settings from global context + CREATE query for File table // function -- in this case, format_settings is set. diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index f82f9d21217..bdb7ddb744a 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -43,6 +43,7 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int SAMPLING_NOT_SUPPORTED; extern const int ALTER_OF_COLUMN_IS_FORBIDDEN; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } StorageMerge::StorageMerge( @@ -61,7 +62,7 @@ StorageMerge::StorageMerge( , database_is_regexp(database_is_regexp_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + storage_metadata.setColumns(columns_.empty() ? 
getColumnsDescriptionFromSourceTables() : columns_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } @@ -82,11 +83,19 @@ StorageMerge::StorageMerge( , database_is_regexp(database_is_regexp_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + storage_metadata.setColumns(columns_.empty() ? getColumnsDescriptionFromSourceTables() : columns_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } +ColumnsDescription StorageMerge::getColumnsDescriptionFromSourceTables() const +{ + auto table = getFirstTable([](auto && t) { return t; }); + if (!table) + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "There are no tables satisfied provided regexp, you must specify table structure manually"}; + return table->getInMemoryMetadataPtr()->getColumns(); +} + template StoragePtr StorageMerge::getFirstTable(F && predicate) const { @@ -762,7 +771,6 @@ void StorageMerge::convertingSourceStream( IStorage::ColumnSizeByName StorageMerge::getColumnSizes() const { - auto first_materialized_mysql = getFirstTable([](const StoragePtr & table) { return table && table->getName() == "MaterializedMySQL"; }); if (!first_materialized_mysql) return {}; @@ -816,6 +824,9 @@ void registerStorageMerge(StorageFactory & factory) return StorageMerge::create( args.table_id, args.columns, args.comment, source_database_name_or_regexp, is_regexp, table_name_regexp, args.getContext()); + }, + { + .supports_schema_inference = true }); } diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 56adeab9279..ad3075efd08 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -132,6 +132,8 @@ protected: static SelectQueryInfo getModifiedQueryInfo( const SelectQueryInfo & query_info, ContextPtr modified_context, const StorageID & current_storage_id, bool is_merge_engine); + + ColumnsDescription getColumnsDescriptionFromSourceTables() const; }; } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a1f82e14868..ce5576bd809 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3,7 +3,6 @@ #include "Common/hex.h" #include #include -#include #include #include #include @@ -20,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -35,7 +33,6 @@ #include -#include #include #include @@ -45,7 +42,6 @@ #include #include #include -#include #include #include @@ -68,7 +64,6 @@ #include -#include #include #include @@ -194,56 +189,6 @@ zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeper() const return res; } -static std::string normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log = nullptr) -{ - if (!zookeeper_path.empty() && zookeeper_path.back() == '/') - zookeeper_path.resize(zookeeper_path.size() - 1); - /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. - if (!zookeeper_path.empty() && zookeeper_path.front() != '/') - { - /// Do not allow this for new tables, print warning for tables created in old versions - if (check_starts_with_slash) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "ZooKeeper path must starts with '/', got '{}'", zookeeper_path); - if (log) - LOG_WARNING(log, "ZooKeeper path ('{}') does not start with '/'. 
It will not be supported in future releases"); - zookeeper_path = "/" + zookeeper_path; - } - - return zookeeper_path; -} - -static String extractZooKeeperName(const String & path) -{ - static constexpr auto default_zookeeper_name = "default"; - if (path.empty()) - throw Exception("ZooKeeper path should not be empty", ErrorCodes::BAD_ARGUMENTS); - if (path[0] == '/') - return default_zookeeper_name; - auto pos = path.find(":/"); - if (pos != String::npos && pos < path.find('/')) - { - auto zookeeper_name = path.substr(0, pos); - if (zookeeper_name.empty()) - throw Exception("Zookeeper path should start with '/' or ':/'", ErrorCodes::BAD_ARGUMENTS); - return zookeeper_name; - } - return default_zookeeper_name; -} - -static String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log = nullptr) -{ - if (path.empty()) - throw Exception("ZooKeeper path should not be empty", ErrorCodes::BAD_ARGUMENTS); - if (path[0] == '/') - return normalizeZooKeeperPath(path, check_starts_with_slash, log); - auto pos = path.find(":/"); - if (pos != String::npos && pos < path.find('/')) - { - return normalizeZooKeeperPath(path.substr(pos + 1, String::npos), check_starts_with_slash, log); - } - return normalizeZooKeeperPath(path, check_starts_with_slash, log); -} - static MergeTreePartInfo makeDummyDropRangeForMovePartitionOrAttachPartitionFrom(const String & partition_id) { /// NOTE We don't have special log entry type for MOVE PARTITION/ATTACH PARTITION FROM, @@ -287,8 +232,8 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( true, /// require_part_metadata attach, [this] (const std::string & name) { enqueuePartForCheck(name); }) - , zookeeper_name(extractZooKeeperName(zookeeper_path_)) - , zookeeper_path(extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log)) + , zookeeper_name(zkutil::extractZooKeeperName(zookeeper_path_)) + , zookeeper_path(zkutil::extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log)) , replica_name(replica_name_) , replica_path(fs::path(zookeeper_path) / "replicas" / replica_name_) , reader(*this) @@ -5561,8 +5506,8 @@ void StorageReplicatedMergeTree::fetchPartition( info.table_id = getStorageID(); info.table_id.uuid = UUIDHelpers::Nil; auto expand_from = query_context->getMacros()->expand(from_, info); - String auxiliary_zookeeper_name = extractZooKeeperName(expand_from); - String from = extractZooKeeperPath(expand_from, /* check_starts_with_slash */ true); + String auxiliary_zookeeper_name = zkutil::extractZooKeeperName(expand_from); + String from = zkutil::extractZooKeeperPath(expand_from, /* check_starts_with_slash */ true); if (from.empty()) throw Exception("ZooKeeper path should not be empty", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -6638,7 +6583,7 @@ void StorageReplicatedMergeTree::movePartitionToShard( if (!move_part) throw Exception("MOVE PARTITION TO SHARD is not supported, use MOVE PART instead", ErrorCodes::NOT_IMPLEMENTED); - if (normalizeZooKeeperPath(zookeeper_path, /* check_starts_with_slash */ true) == normalizeZooKeeperPath(to, /* check_starts_with_slash */ true)) + if (zkutil::normalizeZooKeeperPath(zookeeper_path, /* check_starts_with_slash */ true) == zkutil::normalizeZooKeeperPath(to, /* check_starts_with_slash */ true)) throw Exception("Source and destination are the same", ErrorCodes::BAD_ARGUMENTS); auto zookeeper = getZooKeeper(); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 6861d89f070..b2721210344 
100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -263,6 +263,8 @@ public: bool createEmptyPartInsteadOfLost(zkutil::ZooKeeperPtr zookeeper, const String & lost_part_name); + static const String getDefaultZooKeeperName() { return default_zookeeper_name; } + private: std::atomic_bool are_restoring_replica {false}; diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 3a03ac3906c..3d988472b54 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -3,7 +3,6 @@ #if USE_AWS_S3 -#include #include #include @@ -25,9 +24,9 @@ #include #include -#include #include +#include #include #include @@ -70,6 +69,7 @@ namespace ErrorCodes extern const int S3_ERROR; extern const int UNEXPECTED_EXPRESSION; extern const int CANNOT_OPEN_FILE; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } class IOutputFormat; @@ -480,13 +480,39 @@ StorageS3::StorageS3( { context_->getGlobalContext()->getRemoteHostFilter().checkURL(uri_.uri); StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + + updateClientAndAuthSettings(context_, client_auth); + if (columns_.empty()) + { + auto columns = getTableStructureFromDataImpl(format_name, client_auth, max_single_read_retries_, compression_method, distributed_processing_, format_settings, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); - updateClientAndAuthSettings(context_, client_auth); } +std::shared_ptr StorageS3::createFileIterator(const ClientAuthentication & client_auth, bool distributed_processing, ContextPtr local_context) +{ + std::shared_ptr iterator_wrapper{nullptr}; + if (distributed_processing) + { + return std::make_shared( + [callback = local_context->getReadTaskCallback()]() -> String { + return callback(); + }); + } + + /// Iterate through disclosed globs and make a source for each file + auto glob_iterator = std::make_shared(*client_auth.client, client_auth.uri); + return std::make_shared([glob_iterator]() + { + return glob_iterator->next(); + }); +} Pipe StorageS3::read( const Names & column_names, @@ -510,23 +536,7 @@ Pipe StorageS3::read( need_file_column = true; } - std::shared_ptr iterator_wrapper{nullptr}; - if (distributed_processing) - { - iterator_wrapper = std::make_shared( - [callback = local_context->getReadTaskCallback()]() -> String { - return callback(); - }); - } - else - { - /// Iterate through disclosed globs and make a source for each file - auto glob_iterator = std::make_shared(*client_auth.client, client_auth.uri); - iterator_wrapper = std::make_shared([glob_iterator]() - { - return glob_iterator->next(); - }); - } + std::shared_ptr iterator_wrapper = createFileIterator(client_auth, distributed_processing, local_context); for (size_t i = 0; i < num_streams; ++i) { @@ -707,6 +717,51 @@ StorageS3Configuration StorageS3::getConfiguration(ASTs & engine_args, ContextPt return configuration; } +ColumnsDescription StorageS3::getTableStructureFromData( + const String & format, + const S3::URI & uri, + const String & access_key_id, + const String & secret_access_key, + UInt64 max_connections, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx) +{ + ClientAuthentication client_auth{uri, access_key_id, secret_access_key, max_connections, 
{}, {}}; + updateClientAndAuthSettings(ctx, client_auth); + return getTableStructureFromDataImpl(format, client_auth, max_single_read_retries, compression_method, distributed_processing, format_settings, ctx); +} + +ColumnsDescription StorageS3::getTableStructureFromDataImpl( + const String & format, + const ClientAuthentication & client_auth, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx) +{ + auto read_buffer_creator = [&]() + { + auto file_iterator = createFileIterator(client_auth, distributed_processing, ctx); + String current_key = (*file_iterator)(); + if (current_key.empty()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files with provided path in S3. You must specify " + "table structure manually", + format); + + return wrapReadBufferWithCompressionMethod( + std::make_unique(client_auth.client, client_auth.uri.bucket, current_key, max_single_read_retries, ctx->getReadSettings()), + chooseCompressionMethod(current_key, compression_method)); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, ctx); +} + void registerStorageS3Impl(const String & name, StorageFactory & factory) { @@ -775,6 +830,7 @@ void registerStorageS3Impl(const String & name, StorageFactory & factory) { .supports_settings = true, .supports_sort_order = true, // for partition by + .supports_schema_inference = true, .source_access_type = AccessType::S3, }); } diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 248238379dc..0690040915d 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -147,8 +147,19 @@ public: static StorageS3Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context); -private: + static ColumnsDescription getTableStructureFromData( + const String & format, + const S3::URI & uri, + const String & access_key_id, + const String & secret_access_key, + UInt64 max_connections, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx); +private: friend class StorageS3Cluster; friend class TableFunctionS3Cluster; @@ -175,6 +186,17 @@ private: ASTPtr partition_by; static void updateClientAndAuthSettings(ContextPtr, ClientAuthentication &); + + static std::shared_ptr createFileIterator(const ClientAuthentication & client_auth, bool distributed_processing, ContextPtr local_context); + + static ColumnsDescription getTableStructureFromDataImpl( + const String & format, + const ClientAuthentication & client_auth, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx); }; } diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 0eec77ac8e7..471b460d349 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -13,8 +13,9 @@ #include #include -#include +#include #include +#include #include #include @@ -40,7 +41,7 @@ namespace ErrorCodes IStorageURLBase::IStorageURLBase( const String & uri_, - ContextPtr /*context_*/, + ContextPtr context_, const StorageID & table_id_, const String & format_name_, const std::optional & format_settings_, @@ -61,12 +62,48 @@ IStorageURLBase::IStorageURLBase( , partition_by(partition_by_) { StorageInMemoryMetadata storage_metadata; - 
storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + auto columns = getTableStructureFromData(format_name, uri, compression_method, headers, format_settings, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } +ColumnsDescription IStorageURLBase::getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + const ReadWriteBufferFromHTTP::HTTPHeaderEntries & headers, + const std::optional & format_settings, + ContextPtr context) +{ + auto read_buffer_creator = [&]() + { + auto parsed_uri = Poco::URI(uri); + return wrapReadBufferWithCompressionMethod( + std::make_unique( + parsed_uri, + Poco::Net::HTTPRequest::HTTP_GET, + nullptr, + ConnectionTimeouts::getHTTPTimeouts(context), + Poco::Net::HTTPBasicCredentials{}, + context->getSettingsRef().max_http_get_redirects, + DBMS_DEFAULT_BUFFER_SIZE, + context->getReadSettings(), + headers, + ReadWriteBufferFromHTTP::Range{}, + context->getRemoteHostFilter()), + chooseCompressionMethod(parsed_uri.getPath(), compression_method)); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, context); +} + namespace { ReadWriteBufferFromHTTP::HTTPHeaderEntries getHeaders( @@ -642,6 +679,7 @@ void registerStorageURL(StorageFactory & factory) }, { .supports_settings = true, + .supports_schema_inference = true, .source_access_type = AccessType::URL, }); } diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index cf72352a183..790f01135d3 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -41,6 +41,14 @@ public: bool supportsPartitionBy() const override { return true; } + static ColumnsDescription getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + const ReadWriteBufferFromHTTP::HTTPHeaderEntries & headers, + const std::optional & format_settings, + ContextPtr context); + protected: IStorageURLBase( const String & uri_, diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index fa7f6e52220..42b24abdbbe 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -15,25 +15,23 @@ namespace DB { StoragePtr ITableFunction::execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, - ColumnsDescription cached_columns) const + ColumnsDescription cached_columns, bool use_global_context) const { ProfileEvents::increment(ProfileEvents::TableFunctionExecute); context->checkAccess(AccessType::CREATE_TEMPORARY_TABLE | StorageFactory::instance().getSourceAccessType(getStorageTypeName())); + auto context_to_use = use_global_context ? context->getGlobalContext() : context; + if (cached_columns.empty()) return executeImpl(ast_function, context, table_name, std::move(cached_columns)); - /// We have table structure, so it's CREATE AS table_function(). - /// We should use global context here because there will be no query context on server startup - /// and because storage lifetime is bigger than query context lifetime. 
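Stepping back: the getTableStructureFromData implementations added above for File, S3, and URL all share one shape — hand readSchemaFromFormat a factory that locates and opens the data lazily, so nothing is opened unless inference actually runs. A simplified sketch of that shape, with std::ifstream standing in for ReadBuffer and illustrative names:

```cpp
#include <algorithm>
#include <filesystem>
#include <fstream>
#include <functional>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

/// The schema reader receives a factory rather than an open stream: the first
/// existing candidate path is resolved only when inference asks for data.
using BufferCreator = std::function<std::unique_ptr<std::ifstream>()>;

BufferCreator makeBufferCreator(std::vector<std::string> paths)
{
    return [paths = std::move(paths)]() -> std::unique_ptr<std::ifstream>
    {
        auto it = std::find_if(paths.begin(), paths.end(),
            [](const std::string & p) { return std::filesystem::exists(p); });
        if (it == paths.end())
            throw std::runtime_error("no files with provided path; specify the table structure manually");
        return std::make_unique<std::ifstream>(*it);
    };
}

/// A format-specific schema reader would call the factory once (or several
/// times, for formats that need more than one pass over the sample rows).
std::string readHeaderLine(const BufferCreator & creator)
{
    auto in = creator();
    std::string header;
    std::getline(*in, header);
    return header;
}

int main()
{
    auto creator = makeBufferCreator({"/etc/hostname", "/etc/hosts"}); /// example paths
    std::cout << readHeaderLine(creator) << '\n';
}
```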
- auto global_context = context->getGlobalContext(); if (hasStaticStructure() && cached_columns == getActualTableStructure(context)) - return executeImpl(ast_function, global_context, table_name, std::move(cached_columns)); + return executeImpl(ast_function, context_to_use, table_name, std::move(cached_columns)); auto this_table_function = shared_from_this(); auto get_storage = [=]() -> StoragePtr { - return this_table_function->executeImpl(ast_function, global_context, table_name, cached_columns); + return this_table_function->executeImpl(ast_function, context_to_use, table_name, cached_columns); }; /// It will request actual table structure and create underlying storage lazily diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index 56147ffd598..93cf5057e88 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -54,7 +54,7 @@ public: /// Create storage according to the query. StoragePtr - execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}) const; + execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}, bool use_global_context = false) const; virtual ~ITableFunction() = default; diff --git a/src/TableFunctions/ITableFunctionFileLike.cpp b/src/TableFunctions/ITableFunctionFileLike.cpp index 699ad698bd8..4395c318983 100644 --- a/src/TableFunctions/ITableFunctionFileLike.cpp +++ b/src/TableFunctions/ITableFunctionFileLike.cpp @@ -1,4 +1,3 @@ -#include #include #include @@ -6,16 +5,16 @@ #include #include -#include #include #include -#include #include #include +#include + namespace DB { @@ -23,10 +22,27 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int INCORRECT_FILE_NAME; extern const int BAD_ARGUMENTS; } +namespace +{ + void checkIfFormatSupportsAutoStructure(const String & name, const String & format) + { + if (name == "file" && format == "Distributed") + return; + + if (FormatFactory::instance().checkIfFormatHasAnySchemaReader(format)) + return; + + throw Exception( + "Table function '" + name + + "' allows automatic structure determination only for formats that support schema inference and for Distributed format in table function " + "'file'", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + } +} + void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, ContextPtr context) { /// Parse args @@ -46,21 +62,23 @@ void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, Context filename = args[0]->as().value.safeGet(); format = args[1]->as().value.safeGet(); - if (args.size() == 2 && getName() == "file") + if (args.size() == 2) { - if (format == "Distributed") - return; - throw Exception("Table function '" + getName() + "' allows 2 arguments only for Distributed format.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + checkIfFormatSupportsAutoStructure(getName(), format); + return; } if (args.size() != 3 && args.size() != 4) - throw Exception("Table function '" + getName() + "' requires 3 or 4 arguments: filename, format, structure and compression method (default auto).", + throw Exception("Table function '" + getName() + "' requires 2, 3 or 4 arguments: filename, format, structure (default auto) and compression method (default auto)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); structure = args[2]->as().value.safeGet(); + if (structure == "auto") + 
checkIfFormatSupportsAutoStructure(getName(), format); + if (structure.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Table structure is empty for table function '{}'", + "Table structure is empty for table function '{}'. If you want to use automatic schema inference, use 'auto'", ast_function->formatForErrorMessage()); if (args.size() == 4) @@ -69,25 +87,12 @@ void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, Context StoragePtr ITableFunctionFileLike::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - auto columns = getActualTableStructure(context); + ColumnsDescription columns; + if (structure != "auto") + columns = parseColumnsListFromString(structure, context); StoragePtr storage = getStorage(filename, format, columns, context, table_name, compression_method); storage->startup(); return storage; } -ColumnsDescription ITableFunctionFileLike::getActualTableStructure(ContextPtr context) const -{ - if (structure.empty()) - { - assert(getName() == "file" && format == "Distributed"); - size_t total_bytes_to_read = 0; - Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context, total_bytes_to_read); - if (paths.empty()) - throw Exception("Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); - auto source = StorageDistributedDirectoryMonitor::createSourceFromFile(paths[0]); - return ColumnsDescription{source->getOutputs().front().getHeader().getNamesAndTypesList()}; - } - return parseColumnsListFromString(structure, context); -} - } diff --git a/src/TableFunctions/ITableFunctionFileLike.h b/src/TableFunctions/ITableFunctionFileLike.h index 2069f02b0dd..2ceafdee229 100644 --- a/src/TableFunctions/ITableFunctionFileLike.h +++ b/src/TableFunctions/ITableFunctionFileLike.h @@ -8,7 +8,7 @@ class ColumnsDescription; class Context; /* - * function(source, format, structure) - creates a temporary storage from formatted source + * function(source, format, structure[, compression_method]) - creates a temporary storage from formatted source */ class ITableFunctionFileLike : public ITableFunction { @@ -18,7 +18,7 @@ protected: String filename; String format; - String structure; + String structure = "auto"; String compression_method = "auto"; private: @@ -28,8 +28,7 @@ private: const String & source, const String & format, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method) const = 0; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; - - bool hasStaticStructure() const override { return true; } + bool hasStaticStructure() const override { return structure != "auto"; } }; + } diff --git a/src/TableFunctions/TableFunctionFile.cpp b/src/TableFunctions/TableFunctionFile.cpp index d8bdb3b45c4..71aba5494e8 100644 --- a/src/TableFunctions/TableFunctionFile.cpp +++ b/src/TableFunctions/TableFunctionFile.cpp @@ -1,4 +1,5 @@ #include +#include #include "registerTableFunctions.h" #include @@ -9,11 +10,12 @@ namespace DB { + StoragePtr TableFunctionFile::getStorage(const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const std::string & compression_method_) const { // For `file` table function, we are going to use format settings from the // 
query context. StorageFile::CommonArguments args{ @@ -30,8 +33,21 @@ StoragePtr TableFunctionFile::getStorage(const String & source, return StorageFile::create(source, global_context->getUserFilesPath(), args); } +ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + { + size_t total_bytes_to_read = 0; + Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context, total_bytes_to_read); + return StorageFile::getTableStructureFromData(format, paths, compression_method, std::nullopt, context); + } + + return parseColumnsListFromString(structure, context); +} + void registerTableFunctionFile(TableFunctionFactory & factory) { factory.registerFunction(); } + } diff --git a/src/TableFunctions/TableFunctionFile.h b/src/TableFunctions/TableFunctionFile.h index 460656a7218..f26e4a9c06d 100644 --- a/src/TableFunctions/TableFunctionFile.h +++ b/src/TableFunctions/TableFunctionFile.h @@ -6,7 +6,7 @@ namespace DB { -/* file(path, format, structure) - creates a temporary storage from file +/* file(path, format[, structure, compression]) - creates a temporary storage from file * * The file must be in the clickhouse data directory. * The relative path begins with the clickhouse data directory. @@ -20,9 +20,13 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + private: StoragePtr getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const std::string & compression_method_) const override; const char * getStorageTypeName() const override { return "File"; } -};} +}; + +} diff --git a/src/TableFunctions/TableFunctionHDFS.cpp b/src/TableFunctions/TableFunctionHDFS.cpp index 245674b0e06..b626f563977 100644 --- a/src/TableFunctions/TableFunctionHDFS.cpp +++ b/src/TableFunctions/TableFunctionHDFS.cpp @@ -6,9 +6,11 @@ #include #include #include +#include namespace DB { + StoragePtr TableFunctionHDFS::getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method_) const @@ -24,12 +26,18 @@ StoragePtr TableFunctionHDFS::getStorage( compression_method_); } +ColumnsDescription TableFunctionHDFS::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + return StorageHDFS::getTableStructureFromData(format, filename, compression_method, context); + + return parseColumnsListFromString(structure, context); +} -#if USE_HDFS void registerTableFunctionHDFS(TableFunctionFactory & factory) { factory.registerFunction(); } -#endif + } #endif diff --git a/src/TableFunctions/TableFunctionHDFS.h b/src/TableFunctions/TableFunctionHDFS.h index 70bdc67efc8..74139818209 100644 --- a/src/TableFunctions/TableFunctionHDFS.h +++ b/src/TableFunctions/TableFunctionHDFS.h @@ -12,7 +12,7 @@ namespace DB class Context; -/* hdfs(URI, format, structure) - creates a temporary storage from hdfs files +/* hdfs(URI, format[, structure, compression]) - creates a temporary storage from hdfs files * */ class TableFunctionHDFS : public ITableFunctionFileLike @@ -24,6 +24,8 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + private: StoragePtr getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, diff --git 
a/src/TableFunctions/TableFunctionRemote.h b/src/TableFunctions/TableFunctionRemote.h index 845c36182dc..976397ddc45 100644 --- a/src/TableFunctions/TableFunctionRemote.h +++ b/src/TableFunctions/TableFunctionRemote.h @@ -27,6 +27,7 @@ public: bool needStructureConversion() const override { return false; } private: + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; const char * getStorageTypeName() const override { return "Distributed"; } diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index e26c282c622..c4be01c6b5c 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "registerTableFunctions.h" @@ -28,6 +29,7 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con const auto message = fmt::format( "The signature of table function {} could be the following:\n" \ + " - url, format\n" \ " - url, format, structure\n" \ " - url, format, structure, compression_method\n" \ " - url, access_key_id, secret_access_key, format, structure\n" \ @@ -69,17 +71,32 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con /// Size -> argument indexes static auto size_to_args = std::map<size_t, std::map<String, size_t>> { + {2, {{"format", 1}}}, {3, {{"format", 1}, {"structure", 2}}}, - {4, {{"format", 1}, {"structure", 2}, {"compression_method", 3}}}, {5, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}}}, {6, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}, {"compression_method", 5}}} }; + std::map<String, size_t> args_to_idx; + /// For 4 arguments we support 2 possible variants: + /// s3(source, format, structure, compression_method) and s3(source, access_key_id, secret_access_key, format) + /// We can distinguish them by looking at the 4th argument: check if it's a format name or not. 
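A standalone sketch of that disambiguation (the actual hunk continues below; the hard-coded format list here stands in for FormatFactory::instance().getAllFormats()):

```cpp
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

/// If the 4th argument is a known format name, the call must be
/// s3(url, access_key_id, secret_access_key, format); otherwise it is
/// s3(url, format, structure, compression_method).
std::map<std::string, size_t> mapFourS3Args(
    const std::vector<std::string> & args, const std::set<std::string> & known_formats)
{
    if (known_formats.count(args[3]))
        return {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}};
    return {{"format", 1}, {"structure", 2}, {"compression_method", 3}};
}

int main()
{
    const std::set<std::string> formats{"CSV", "TSV", "Native", "Parquet"};
    auto idx = mapFourS3Args({"url", "key", "secret", "CSV"}, formats);
    std::cout << "format is argument #" << idx["format"] << '\n'; /// 3
}
```

The heuristic stays unambiguous as long as compression method names ('gzip', 'auto', ...) never collide with format names.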
+ if (args.size() == 4) + { + auto last_arg = args[3]->as<ASTLiteral &>().value.safeGet<String>(); + if (FormatFactory::instance().getAllFormats().contains(last_arg)) + args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}}; + else + args_to_idx = {{"format", 1}, {"structure", 2}, {"compression_method", 3}}; + } + else + { + args_to_idx = size_to_args[args.size()]; + } + /// This argument is always the first configuration.url = args[0]->as<ASTLiteral &>().value.safeGet<String>(); - auto & args_to_idx = size_to_args[args.size()]; - if (args_to_idx.contains("format")) configuration.format = args[args_to_idx["format"]]->as<ASTLiteral &>().value.safeGet<String>(); @@ -101,6 +118,21 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) const { + if (s3_configuration->structure == "auto") + { + return StorageS3::getTableStructureFromData( + s3_configuration->format, + S3::URI(Poco::URI(s3_configuration->url)), + s3_configuration->access_key_id, + s3_configuration->secret_access_key, + context->getSettingsRef().s3_max_connections, + context->getSettingsRef().s3_max_single_read_retries, + s3_configuration->compression_method, + false, + std::nullopt, + context); + } + return parseColumnsListFromString(s3_configuration->structure, context); } @@ -113,6 +145,10 @@ StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, Context UInt64 max_single_part_upload_size = context->getSettingsRef().s3_max_single_part_upload_size; UInt64 max_connections = context->getSettingsRef().s3_max_connections; + ColumnsDescription columns; + if (s3_configuration->structure != "auto") + columns = parseColumnsListFromString(s3_configuration->structure, context); + StoragePtr storage = StorageS3::create( s3_uri, s3_configuration->access_key_id, diff --git a/src/TableFunctions/TableFunctionS3.h b/src/TableFunctions/TableFunctionS3.h index 8d4c1391236..374e653072e 100644 --- a/src/TableFunctions/TableFunctionS3.h +++ b/src/TableFunctions/TableFunctionS3.h @@ -13,7 +13,7 @@ namespace DB class Context; -/* s3(source, [access_key_id, secret_access_key,] format, structure) - creates a temporary storage for a file in S3 +/* s3(source, [access_key_id, secret_access_key,] format, structure[, compression]) - creates a temporary storage for a file in S3 */ class TableFunctionS3 : public ITableFunction { @@ -23,7 +23,7 @@ public: { return name; } - bool hasStaticStructure() const override { return true; } + bool hasStaticStructure() const override { return s3_configuration->structure != "auto"; } protected: StoragePtr executeImpl( diff --git a/src/TableFunctions/TableFunctionURL.cpp b/src/TableFunctions/TableFunctionURL.cpp index c3ea30f800f..7c4d7b4a444 100644 --- a/src/TableFunctions/TableFunctionURL.cpp +++ b/src/TableFunctions/TableFunctionURL.cpp @@ -2,11 +2,11 @@ #include "registerTableFunctions.h" #include -#include #include #include #include #include +#include #include @@ -59,20 +59,10 @@ void TableFunctionURL::parseArguments(const ASTPtr & ast_function, ContextPtr co } } - StoragePtr TableFunctionURL::getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method_) const { - ReadWriteBufferFromHTTP::HTTPHeaderEntries headers; - for (const auto & [header, value] : configuration.headers) - { - auto value_literal = value.safeGet(); - if (header == "Range") - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Range headers are not 
allowed"); - headers.emplace_back(std::make_pair(header, value_literal)); - } - return StorageURL::create( source, StorageID(getDatabaseName(), table_name), @@ -83,10 +73,31 @@ StoragePtr TableFunctionURL::getStorage( String{}, global_context, compression_method_, - headers, + getHeaders(), configuration.http_method); } +ReadWriteBufferFromHTTP::HTTPHeaderEntries TableFunctionURL::getHeaders() const +{ + ReadWriteBufferFromHTTP::HTTPHeaderEntries headers; + for (const auto & [header, value] : configuration.headers) + { + auto value_literal = value.safeGet(); + if (header == "Range") + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Range headers are not allowed"); + headers.emplace_back(std::make_pair(header, value_literal)); + } + return headers; +} + +ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + return StorageURL::getTableStructureFromData(format, filename, compression_method, getHeaders(), std::nullopt, context); + + return parseColumnsListFromString(structure, context); +} + void registerTableFunctionURL(TableFunctionFactory & factory) { factory.registerFunction(); diff --git a/src/TableFunctions/TableFunctionURL.h b/src/TableFunctions/TableFunctionURL.h index 9425112acb2..798a37dc478 100644 --- a/src/TableFunctions/TableFunctionURL.h +++ b/src/TableFunctions/TableFunctionURL.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB @@ -9,7 +10,7 @@ namespace DB class Context; -/* url(source, format, structure) - creates a temporary storage from url +/* url(source, format[, structure, compression]) - creates a temporary storage from url */ class TableFunctionURL : public ITableFunctionFileLike { @@ -20,6 +21,8 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + protected: void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; @@ -29,6 +32,8 @@ private: const std::string & table_name, const String & compression_method_) const override; const char * getStorageTypeName() const override { return "URL"; } + ReadWriteBufferFromHTTP::HTTPHeaderEntries getHeaders() const; + URLBasedDataSourceConfiguration configuration; }; diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 33ce94a7a29..f317fb5429a 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -310,6 +310,7 @@ def test_seekable_formats(started_cluster): result = node1.query(f"SELECT count() FROM {table_function}") assert(int(result) == 5000000) + def test_read_table_with_default(started_cluster): hdfs_api = started_cluster.hdfs_api @@ -322,6 +323,22 @@ def test_read_table_with_default(started_cluster): "select * from hdfs('hdfs://hdfs1:9000/simple_table_function', 'TSVWithNames', 'n UInt32, m UInt32 DEFAULT n * 2') FORMAT TSVWithNames") == output +def test_schema_inference(started_cluster): + node1.query(f"insert into table function hdfs('hdfs://hdfs1:9000/native', 'Native', 'a Int32, b String') SELECT number, randomString(100) FROM numbers(5000000)") + + result = node1.query(f"desc hdfs('hdfs://hdfs1:9000/native', 'Native')") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = node1.query(f"select count(*) from hdfs('hdfs://hdfs1:9000/native', 'Native')") + assert(int(result) == 5000000) + + node1.query(f"create table schema_inference engine=HDFS('hdfs://hdfs1:9000/native', 'Native')") + result = node1.query(f"desc schema_inference") + assert result 
== "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = node1.query(f"select count(*) from schema_inference") + assert(int(result) == 5000000) + def test_hdfsCluster(started_cluster): hdfs_api = started_cluster.hdfs_api diff --git a/tests/integration/test_storage_s3/configs/named_collections.xml b/tests/integration/test_storage_s3/configs/named_collections.xml index dfcbeeb2d4a..efadedc1bde 100644 --- a/tests/integration/test_storage_s3/configs/named_collections.xml +++ b/tests/integration/test_storage_s3/configs/named_collections.xml @@ -15,5 +15,10 @@ minio minio123 + + http://minio1:9001/root/test_native + minio + minio123 + diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index f3c4b1dd0cf..885a37f875c 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -126,7 +126,7 @@ def run_query(instance, query, stdin=None, settings=None): pytest.param("'wrongid','wrongkey',", False, 'xz', id="xz"), pytest.param("'wrongid','wrongkey',", False, 'zstd', id="zstd") ]) -def test_put(started_cluster, maybe_auth, positive, compression): +def _test_put(started_cluster, maybe_auth, positive, compression): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -148,7 +148,7 @@ def test_put(started_cluster, maybe_auth, positive, compression): assert values_csv == get_s3_file_content(started_cluster, bucket, filename) -def test_partition_by(started_cluster): +def _test_partition_by(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -173,7 +173,7 @@ def test_partition_by(started_cluster): assert "78,43,45\n" == get_s3_file_content(started_cluster, bucket, "test2_45.csv") -def test_partition_by_string_column(started_cluster): +def _test_partition_by_string_column(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "col_num UInt32, col_str String" @@ -191,7 +191,7 @@ def test_partition_by_string_column(started_cluster): assert '78,"你好"\n' == get_s3_file_content(started_cluster, bucket, "test_你好.csv") -def test_partition_by_const_column(started_cluster): +def _test_partition_by_const_column(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -212,7 +212,7 @@ def test_partition_by_const_column(started_cluster): "space", "plus" ]) -def test_get_file_with_special(started_cluster, special): +def _test_get_file_with_special(started_cluster, special): symbol = {"space": " ", "plus": "+"}[special] urlsafe_symbol = {"space": "%20", "plus": "%2B"}[special] auth = "'minio','minio123'," @@ -239,7 +239,7 @@ def test_get_file_with_special(started_cluster, special): "plus", "plus2" ]) -def test_get_path_with_special(started_cluster, special): +def _test_get_path_with_special(started_cluster, special): symbol = {"space": "%20", "plus": "%2B", "plus2": "%2B"}[special] safe_symbol = {"space": "%20", "plus": "+", "plus2": "%2B"}[special] auth = "'minio','minio123'," @@ -253,7 +253,7 @@ def test_get_path_with_special(started_cluster, special): @pytest.mark.parametrize("auth", [ pytest.param("'minio','minio123',", id="minio") ]) -def test_empty_put(started_cluster, 
auth): +def _test_empty_put(started_cluster, auth): # type: (ClickHouseCluster, str) -> None bucket = started_cluster.minio_bucket @@ -291,7 +291,7 @@ def test_empty_put(started_cluster, auth): pytest.param("'minio','minio123',", True, id="auth_positive"), pytest.param("'wrongid','wrongkey',", False, id="negative"), ]) -def test_put_csv(started_cluster, maybe_auth, positive): +def _test_put_csv(started_cluster, maybe_auth, positive): # type: (ClickHouseCluster, bool, str) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -313,7 +313,7 @@ def test_put_csv(started_cluster, maybe_auth, positive): # Test put and get with S3 server redirect. -def test_put_get_with_redirect(started_cluster): +def _test_put_get_with_redirect(started_cluster): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket @@ -340,7 +340,7 @@ def test_put_get_with_redirect(started_cluster): # Test put with restricted S3 server redirect. -def test_put_with_zero_redirect(started_cluster): +def _test_put_with_zero_redirect(started_cluster): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket @@ -367,7 +367,7 @@ def test_put_with_zero_redirect(started_cluster): assert exception_raised -def test_put_get_with_globs(started_cluster): +def _test_put_get_with_globs(started_cluster): # type: (ClickHouseCluster) -> None unique_prefix = random.randint(1,10000) bucket = started_cluster.minio_bucket @@ -399,7 +399,7 @@ def test_put_get_with_globs(started_cluster): pytest.param("'wrongid','wrongkey'", False, id="negative"), # ("'minio','minio123',",True), Redirect with credentials not working with nginx. ]) -def test_multipart_put(started_cluster, maybe_auth, positive): +def _test_multipart_put(started_cluster, maybe_auth, positive): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -439,7 +439,7 @@ def test_multipart_put(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) -def test_remote_host_filter(started_cluster): +def _test_remote_host_filter(started_cluster): instance = started_cluster.instances["restricted_dummy"] format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -457,7 +457,7 @@ def test_remote_host_filter(started_cluster): pytest.param("''", id="1_argument"), pytest.param("'','','','','',''", id="6_arguments"), ]) -def test_wrong_s3_syntax(started_cluster, s3_storage_args): +def _test_wrong_s3_syntax(started_cluster, s3_storage_args): instance = started_cluster.instances["dummy"] # type: ClickHouseInstance expected_err_msg = "Code: 42" # NUMBER_OF_ARGUMENTS_DOESNT_MATCH @@ -466,7 +466,7 @@ def test_wrong_s3_syntax(started_cluster, s3_storage_args): # https://en.wikipedia.org/wiki/One_Thousand_and_One_Nights -def test_s3_glob_scheherazade(started_cluster): +def _test_s3_glob_scheherazade(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -535,7 +535,7 @@ def replace_config(old, new): config.close() -def test_custom_auth_headers(started_cluster): +def _test_custom_auth_headers(started_cluster): table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" get_query = "select * from s3('http://resolver:8080/{bucket}/{file}', 'CSV', '{table_format}')".format( @@ -566,7 +566,7 @@ def 
test_custom_auth_headers(started_cluster): instance.query("DROP TABLE test") -def test_custom_auth_headers_exclusion(started_cluster): +def _test_custom_auth_headers_exclusion(started_cluster): table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" get_query = f"SELECT * FROM s3('http://resolver:8080/{started_cluster.minio_restricted_bucket}/restricteddirectory/{filename}', 'CSV', '{table_format}')" @@ -580,7 +580,7 @@ def test_custom_auth_headers_exclusion(started_cluster): assert 'Forbidden Error' in ei.value.stderr -def test_infinite_redirect(started_cluster): +def _test_infinite_redirect(started_cluster): bucket = "redirected" table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" @@ -598,7 +598,7 @@ def test_infinite_redirect(started_cluster): pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz"), ]) -def test_storage_s3_get_gzip(started_cluster, extension, method): +def _test_storage_s3_get_gzip(started_cluster, extension, method): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = f"test_get_gzip.{extension}" @@ -638,7 +638,7 @@ def test_storage_s3_get_gzip(started_cluster, extension, method): run_query(instance, f"DROP TABLE {name}") -def test_storage_s3_get_unstable(started_cluster): +def _test_storage_s3_get_unstable(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64" @@ -647,7 +647,7 @@ def test_storage_s3_get_unstable(started_cluster): assert result.splitlines() == ["500001,500000,0"] -def test_storage_s3_put_uncompressed(started_cluster): +def _test_storage_s3_put_uncompressed(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = "test_put_uncompressed.bin" @@ -684,7 +684,7 @@ def test_storage_s3_put_uncompressed(started_cluster): pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz") ]) -def test_storage_s3_put_gzip(started_cluster, extension, method): +def _test_storage_s3_put_gzip(started_cluster, extension, method): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = f"test_put_gzip.{extension}" @@ -721,7 +721,7 @@ def test_storage_s3_put_gzip(started_cluster, extension, method): assert sum([ int(i.split(',')[1]) for i in uncompressed_content.splitlines() ]) == 708 -def test_truncate_table(started_cluster): +def _test_truncate_table(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance name = "truncate" @@ -745,7 +745,7 @@ def test_truncate_table(started_cluster): assert instance.query("SELECT * FROM {}".format(name)) == "" -def test_predefined_connection_configuration(started_cluster): +def _test_predefined_connection_configuration(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance name = "test_table" @@ -762,7 +762,7 @@ def test_predefined_connection_configuration(started_cluster): result = "" -def test_url_reconnect_in_the_middle(started_cluster): +def _test_url_reconnect_in_the_middle(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] table_format = "id String, data String" @@ -783,7 +783,7 @@ def test_url_reconnect_in_the_middle(started_cluster): f"""select sum(cityHash64(x)) from (select 
toUInt64(id) + sleep(0.1) as x from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'TSV', '{table_format}') settings http_max_tries = 10, http_retry_max_backoff_ms=2000, http_send_timeout=1, http_receive_timeout=1)""") - assert(int(result), 3914219105369203805) + assert(int(result) == 3914219105369203805) thread = threading.Thread(target=select) thread.start() @@ -796,10 +796,10 @@ def test_url_reconnect_in_the_middle(started_cluster): thread.join() - assert(int(result), 3914219105369203805) + assert(int(result) == 3914219105369203805) -def test_seekable_formats(started_cluster): +def _test_seekable_formats(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance @@ -821,7 +821,7 @@ def test_seekable_formats(started_cluster): assert(int(result[:3]) < 200) -def test_seekable_formats_url(started_cluster): +def _test_seekable_formats_url(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] @@ -865,3 +865,53 @@ def test_insert_with_path_with_globs(started_cluster): table_function_3 = f"s3('http://minio1:9001/root/test_parquet*', 'minio', 'minio123', 'Parquet', 'a Int32, b String')" instance.query_and_get_error(f"insert into table function {table_function_3} SELECT number, randomString(100) FROM numbers(500)") + + +def test_s3_schema_inference(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + instance.query(f"insert into table function s3(s3_native, structure='a Int32, b String', format='Native') select number, randomString(100) from numbers(5000000)") + result = instance.query(f"desc s3(s3_native, format='Native')") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from s3(s3_native, format='Native')") + assert(int(result) == 5000000) + + instance.query(f"create table schema_inference engine=S3(s3_native, format='Native')") + result = instance.query(f"desc schema_inference") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from schema_inference") + assert(int(result) == 5000000) + + + table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_native', 'Native')" + result = instance.query(f"desc {table_function}") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from {table_function}") + assert(int(result) == 5000000) + + instance.query(f"create table schema_inference_2 engine=URL('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_native', 'Native')") + result = instance.query(f"desc schema_inference_2") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from schema_inference_2") + assert(int(result) == 5000000) + + +def test_empty_file(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + name = "empty" + url = f'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{name}' + + minio = started_cluster.minio_client + minio.put_object(bucket, name, io.BytesIO(b""), 0) + + table_function = f"s3('{url}', 'CSV', 'id Int32')" + result = instance.query(f"SELECT count() FROM {table_function}") + assert(int(result) == 0) + diff --git a/tests/queries/0_stateless/00646_url_engine.python 
b/tests/queries/0_stateless/00646_url_engine.python index 85ae3e776ed..4f47e819328 100644 --- a/tests/queries/0_stateless/00646_url_engine.python +++ b/tests/queries/0_stateless/00646_url_engine.python @@ -156,6 +156,7 @@ def test_select(table_name="", schema="str String,numuint UInt32,numint Int32,do if table_name: get_ch_answer("drop table if exists {}".format(table_name)) + def test_insert(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests_insert=[], requests_select=[], answers=[]): with open(CSV_DATA, 'w') as f: # flush test file f.write('') diff --git a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh index 2731e4bcce3..8d9e2689e26 100755 --- a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh +++ b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh @@ -25,15 +25,15 @@ SELECT * FROM enum_mapping_protobuf_00825; EOF BINARY_FILE_PATH=$(mktemp "$CURDIR/00825_protobuf_format_enum_mapping.XXXXXX.binary") -$CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$SCHEMADIR/00825_protobuf_format_enum_mapping:Message'" > "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage'" > "$BINARY_FILE_PATH" # Check the output in the protobuf format echo -$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$SCHEMADIR/00825_protobuf_format_enum_mapping:Message" --input "$BINARY_FILE_PATH" +$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage" --input "$BINARY_FILE_PATH" # Check the input in the protobuf format (now the table contains the same data twice). 
echo -$CLICKHOUSE_CLIENT --query "INSERT INTO enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$SCHEMADIR/00825_protobuf_format_enum_mapping:Message'" < "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "INSERT INTO enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage'" < "$BINARY_FILE_PATH" $CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825" rm "$BINARY_FILE_PATH" diff --git a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary new file mode 100644 index 00000000000..4b7b97a300f Binary files /dev/null and b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary differ diff --git a/tests/queries/0_stateless/02149_external_schema_inference.reference b/tests/queries/0_stateless/02149_external_schema_inference.reference new file mode 100644 index 00000000000..875659c7fb6 --- /dev/null +++ b/tests/queries/0_stateless/02149_external_schema_inference.reference @@ -0,0 +1,168 @@ +Protobuf + +a_b_c Array(Array(Array(Int32))) + +a String +b_c Array(Array(Float64)) + +x Enum8(\'FIRST\' = 0, \'SECOND\' = 1, \'TEN\' = 10, \'HUNDRED\' = 100) + +a Map(String, UInt32) + +x_y_z Array(Array(Int32)) + +uuid String +name String +surname String +gender Enum8(\'female\' = 0, \'male\' = 1) +birthDate UInt32 +photo String +phoneNumber String +isOnline UInt8 +visitTime UInt32 +age UInt32 +zodiacSign Enum8(\'aries\' = 0, \'taurus\' = 1, \'gemini\' = 2, \'cancer\' = 3, \'leo\' = 4, \'virgo\' = 5, \'libra\' = 6, \'scorpius\' = 7, \'sagittarius\' = 8, \'capricorn\' = 9, \'aquarius\' = 10, \'pisces\' = 11) +songs Array(String) +color Array(UInt32) +hometown String +location Array(Float32) +pi Float64 +lotteryWin Float64 +someRatio Float32 +temperature Float32 +randomBigNumber Int64 +measureUnits Array(Tuple(unit String, coef Float32)) +nestiness_a_b_c Tuple(d UInt32, e Array(UInt32)) + +location Array(Int32) +pi Float32 +uuid String +newFieldBool UInt8 +name String +gender Enum8(\'male\' = 0, \'female\' = 1) +zodiacSign Int32 +birthDate Int64 +age String +isOnline Enum8(\'offline\' = 0, \'online\' = 1) +someRatio Float64 +visitTime UInt64 +newMessage Tuple(empty Array(Tuple()), z Float32) +randomBigNumber Int64 +newFieldInt Array(Int32) +color Array(Float32) +lotteryWin UInt64 +surname String +phoneNumber UInt64 +temperature Int32 +newFieldStr String +measureUnits_unit Array(String) +measureUnits_coef Array(Float32) +nestiness_a_b_c_d UInt32 +nestiness_a_b_c_e Array(UInt32) + +uuid String +name String +surname String +gender String +birthDate String +phoneNumber String +isOnline String +visitTime String +age String +zodiacSign String +songs Array(String) +color Array(String) +hometown String +location Array(String) +pi String +lotteryWin String +someRatio String +temperature String +randomBigNumber String +measureUnits Tuple(unit Array(String), coef Array(String)) +nestiness_a_b_c Tuple(d String, e Array(String)) + +uuid String +name String +surname String +gender Enum8(\'female\' = 0, \'male\' = 1) +birthDate UInt32 +photo String +phoneNumber String +isOnline UInt8 +visitTime UInt32 +age UInt32 +zodiacSign Enum8(\'aries\' = 0, \'taurus\' = 1, \'gemini\' = 2, \'cancer\' = 3, \'leo\' = 4, \'virgo\' = 5, \'libra\' = 6, \'scorpius\' = 7, \'sagittarius\' = 8, \'capricorn\' = 9, \'aquarius\' = 10, \'pisces\' = 11) +songs Array(String) +color Array(UInt32) +hometown String +location Array(Float32) 
+pi Float64 +lotteryWin Float64 +someRatio Float32 +temperature Float32 +randomBigNumber Int64 +measureunits Tuple(coef Array(Float32), unit Array(String)) +nestiness_a_b_c Tuple(d UInt32, e Array(UInt32)) +newFieldStr String +newFieldInt Int32 +newBool UInt8 + +identifier String +modules Array(Tuple(module_id UInt32, supply UInt32, temp UInt32, nodes Array(Tuple(node_id UInt32, opening_time UInt32, closing_time UInt32, current UInt32, coords_y Float32)))) + +Capnproto + +value Enum8(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2) + +value UInt64 +list1 Array(UInt64) +list2 Array(Array(Array(UInt64))) + +lc1 String +lc2 Nullable(String) +lc3 Array(Nullable(String)) + +value UInt64 +nested Tuple(a Tuple(b UInt64, c Array(Array(UInt64))), d Array(Tuple(e Array(Array(Tuple(f UInt64, g UInt64))), h Array(Tuple(k Array(UInt64)))))) + +nested Tuple(value Array(UInt64), array Array(Array(UInt64)), tuple Array(Tuple(one UInt64, two UInt64))) + +a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(f UInt64))) + +nullable Nullable(UInt64) +array Array(Nullable(UInt64)) +tuple Tuple(nullable Nullable(UInt64)) + +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +float32 Float32 +float64 Float64 +string String +fixed String +data String +date UInt16 +datetime UInt32 +datetime64 Int64 + +value UInt64 +tuple1 Tuple(one UInt64, two Tuple(three UInt64, four UInt64)) +tuple2 Tuple(nested1 Tuple(nested2 Tuple(x UInt64))) + +RawBLOB + +raw_blob String + +LineAsString + +line String + +JSONAsString + +json String diff --git a/tests/queries/0_stateless/02149_external_schema_inference.sh b/tests/queries/0_stateless/02149_external_schema_inference.sh new file mode 100755 index 00000000000..df2b9a43565 --- /dev/null +++ b/tests/queries/0_stateless/02149_external_schema_inference.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +touch $DATA_FILE + +SCHEMADIR=$(clickhouse-client --query "select * from file('$FILE_NAME', 'CapnProto', 'val1 char') settings format_schema='nonexist:Message'" 2>&1 | grep Exception | grep -oP "file \K.*(?=/nonexist.capnp)") +CLIENT_SCHEMADIR=$CURDIR/format_schemas +SERVER_SCHEMADIR=test_02149 +mkdir -p $SCHEMADIR/$SERVER_SCHEMADIR +cp -r $CLIENT_SCHEMADIR/* $SCHEMADIR/$SERVER_SCHEMADIR/ + +echo -e "Protobuf\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_array_3dim:ABC'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_array_of_arrays:AA'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_enum_mapping.proto:EnumMessage'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_map:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_nested_in_nested:MessageType'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:Person'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:AltPerson'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:StrPerson'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons_syntax2:Syntax2Person'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_skipped_column_in_nested:UpdateMessage'" + + +echo -e "\nCapnproto\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_lists:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_low_cardinality:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_lists_and_tuples:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_table:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nullable:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_simple_types:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings 
format_schema='$SERVER_SCHEMADIR/02030_capnp_tuples:Message'" + +echo -e "\nRawBLOB\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'RawBLOB')" + +echo -e "\nLineAsString\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'LineAsString')" + +echo -e "\nJSONAsString\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONAsString')" + + + +rm -rf ${SCHEMADIR:?}/$SERVER_SCHEMADIR +rm $DATA_FILE diff --git a/tests/queries/0_stateless/02149_schema_inference.reference b/tests/queries/0_stateless/02149_schema_inference.reference new file mode 100644 index 00000000000..f46e3bee101 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference.reference @@ -0,0 +1,170 @@ +TSV +c1 Nullable(String) +c2 Nullable(String) +c3 Nullable(String) +c4 Nullable(String) +42 Some string [1, 2, 3, 4] (1, 2, 3) +42 abcd [] (4, 5, 6) +TSVWithNames +number Nullable(String) +string Nullable(String) +array Nullable(String) +tuple Nullable(String) +42 Some string [1, 2, 3, 4] (1, 2, 3) +42 abcd [] (4, 5, 6) +CSV +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c4 Array(Nullable(Float64)) +\N Some string [([1,2.3],'String'),([],NULL)] [1,NULL,3] +42 \N [([1,2.3],'String'),([3],'abcd')] [4,5,6] +c1 Nullable(String) +c2 Nullable(String) +42 String +String 42 +c1 Nullable(String) +c2 Nullable(String) +\N [NULL, NULL] +\N [] +CSVWithNames +a Nullable(Float64) +b Nullable(String) +c Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +d Array(Nullable(Float64)) +\N Some string [([1,2.3],'String'),([],NULL)] [1,NULL,3] +42 \N [([1,2.3],'String'),([3],'abcd')] [4,5,6] +JSONCompactEachRow +c1 Nullable(Float64) +c2 Array(Tuple(Nullable(Float64), Nullable(String))) +c3 Map(String, Nullable(Float64)) +c4 Nullable(UInt8) +42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 +c1 Nullable(Float64) +c2 Array(Tuple(Nullable(Float64), Nullable(String))) +c3 Map(String, Nullable(Float64)) +c4 Nullable(UInt8) +\N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N +32 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 1 +JSONCompactEachRowWithNames +a Nullable(Float64) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c Map(String, Nullable(Float64)) +d Nullable(UInt8) +42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 +JSONEachRow +d Nullable(UInt8) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c Map(String, Nullable(Float64)) +a Nullable(Float64) +1 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 42.42 +d Nullable(UInt8) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c Map(String, Nullable(Float64)) +a Nullable(Float64) +\N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N +1 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 32 +b Nullable(String) +c Array(Nullable(Float64)) +a Nullable(Float64) +s1 [] 1 +\N [2] 2 +\N [] \N +\N [] \N +\N [3] \N +TSKV +b Nullable(String) +c Nullable(String) +a Nullable(String) +s1 \N 1 +} [2] 2 +\N \N \N +\N \N \N +\N [3] \N +Values +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(String)) +c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(String)))) +42.42 Some string [1,2,3] (1,'2') ([1,2],[(3,'4'),(5,'6')]) +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(Float64)) +c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(String)))) +42.42 \N [1,NULL,3] (1,NULL) ([1,2],[(3,'4'),(5,'6')]) +\N Some string [10] (1,2) ([],[]) +Regexp +c1 
Nullable(String) +c2 Nullable(String) +c3 Nullable(String) +42 Some string 1 [([1, 2, 3], String 1), ([], String 1)] +2 Some string 2 [([4, 5, 6], String 2), ([], String 2)] +312 Some string 3 [([1, 2, 3], String 2), ([], String 2)] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([],'String 1')] +3 Some string 2 [([3,5,1],'String 2'),([],'String 2')] +244 Some string 3 [([],'String 3'),([],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([],'String 1')] +2 Some string 2 [([],'String 2'),([],'String 2')] +43 Some string 3 [([1,5,3],'String 3'),([],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +52 Some string 2 [([],'String 2'),([1],'String 2')] +24 Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +CustomSeparated +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +Template +column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +MsgPack +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Float32) +c4 Nullable(String) +c5 Array(Array(Nullable(Int64))) +c6 Map(Int64, Array(Nullable(Int64))) +\N 0 0 Str: 0 [[0,1],[0]] {0:[0,1]} +1 \N 1 Str: 1 [[1,2],[1]] {1:[1,2]} +\N 2 2 Str: 2 [[2,3],[2]] {2:[2,3]} diff --git a/tests/queries/0_stateless/02149_schema_inference.sh b/tests/queries/0_stateless/02149_schema_inference.sh new file mode 100755 index 00000000000..1ccec240627 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference.sh @@ -0,0 +1,251 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME + +touch $DATA_FILE + +SCHEMADIR=$(clickhouse-client --query "select * from file('$FILE_NAME', 'Template', 'val1 char') settings format_template_row='nonexist'" 2>&1 | grep Exception | grep -oP "file \K.*(?=/nonexist)") + +echo "TSV" + +echo -e "42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3) +42\tabcd\t[]\t(4, 5, 6)" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo "TSVWithNames" + +echo -e "number\tstring\tarray\ttuple +42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3) +42\tabcd\t[]\t(4, 5, 6)" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSVWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSVWithNames')" + +echo "CSV" + +echo -e "\N,\"Some string\",\"[([1, 2.3], 'String'), ([], NULL)]\",\"[1, NULL, 3]\" +42,\N,\"[([1, 2.3], 'String'), ([3.], 'abcd')]\",\"[4, 5, 6]\"" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "42,\"String\" +\"String\",42" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "\N,\"[NULL, NULL]\" +\N,[]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo "CSVWithNames" + +echo -e "a,b,c,d +\N,\"Some string\",\"[([1, 2.3], 'String'), ([], NULL)]\",\"[1, NULL, 3]\" +42,\N,\"[([1, 2.3], 'String'), ([3.], 'abcd')]\",\"[4, 5, 6]\"" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSVWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSVWithNames')" + +echo "JSONCompactEachRow" + +echo -e "[42.42, [[1, \"String\"], [2, \"abcd\"]], {\"key\" : 42, \"key2\" : 24}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" + +echo -e "[null, [[1, \"String\"], [2, null]], {\"key\" : null, \"key2\" : 24}, null] +[32, [[2, \"String 2\"], [3, \"hello\"]], {\"key3\" : 4242, \"key4\" : 2424}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" + +echo "JSONCompactEachRowWithNames" + +echo -e "[\"a\", \"b\", \"c\", \"d\"] +[42.42, [[1, \"String\"], [2, \"abcd\"]], {\"key\" : 42, \"key2\" : 24}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRowWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRowWithNames')" + + +echo "JSONEachRow" +echo -e '{"a" : 42.42, "b" : [[1, "String"], [2, "abcd"]], "c" : {"key" : 42, "key2" : 24}, "d" : true}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"a" : null, "b" : [[1, "String"], [2, null]], "c" : {"key" : null, "key2" : 24}, "d" : null} +{"a" : 32, "b" : [[2, "String 2"], [3, "hello"]], "c" : {"key3" : 4242, "key4" : 2424}, "d" : true}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" 
+$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"a" : 1, "b" : "s1", "c" : null} +{"c" : [2], "a" : 2, "b" : null} +{} +{"a" : null} +{"c" : [3], "a" : null}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + + +echo "TSKV" + +echo -e 'a=1\tb=s1\tc=\N +c=[2]\ta=2\tb=\N} + +a=\N +c=[3]\ta=\N' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')" + + +echo "Values" + +echo -e "(42.42, 'Some string', [1, 2, 3], (1, '2'), ([1, 2], [(3, '4'), (5, '6')]))" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Values')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Values')" + +echo -e "(42.42, NULL, [1, NULL, 3], (1, NULL), ([1, 2], [(3, '4'), (5, '6')])), (NULL, 'Some string', [10], (1, 2), ([], []))" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Values')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Values')" + + +echo "Regexp" + +REGEXP="^Line: value_1=(.+?), value_2=(.+?), value_3=(.+?)" + +echo "Line: value_1=42, value_2=Some string 1, value_3=[([1, 2, 3], String 1), ([], String 1)] +Line: value_1=2, value_2=Some string 2, value_3=[([4, 5, 6], String 2), ([], String 2)] +Line: value_1=312, value_2=Some string 3, value_3=[([1, 2, 3], String 2), ([], String 2)]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Escaped'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Escaped'" + + +echo "Line: value_1=42, value_2=\"Some string 1\", value_3=\"[([1, 2, 3], 'String 1'), ([], 'String 1')]\" +Line: value_1=3, value_2=\"Some string 2\", value_3=\"[([3, 5, 1], 'String 2'), ([], 'String 2')]\" +Line: value_1=244, value_2=\"Some string 3\", value_3=\"[([], 'String 3'), ([], 'String 3')]\"" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='CSV'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='CSV'" + + +echo "Line: value_1=42, value_2='Some string 1', value_3=[([1, 2, 3], 'String 1'), ([], 'String 1')] +Line: value_1=2, value_2='Some string 2', value_3=[([], 'String 2'), ([], 'String 2')] +Line: value_1=43, value_2='Some string 3', value_3=[([1, 5, 3], 'String 3'), ([], 'String 3')]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Quoted'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Quoted'" + + +echo "Line: value_1=42, value_2=\"Some string 1\", value_3=[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]] +Line: value_1=52, value_2=\"Some string 2\", value_3=[[[], \"String 2\"], [[1], \"String 2\"]] +Line: value_1=24, value_2=\"Some string 3\", value_3=[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='JSON'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='JSON'" + + +echo "CustomSeparated" + +CUSTOM_SETTINGS="SETTINGS 
format_custom_row_before_delimiter='<row_before_delimiter>', format_custom_row_after_delimiter='<row_after_delimiter>\n', format_custom_row_between_delimiter='<row_between_delimiter>\n', format_custom_result_before_delimiter='<result_before_delimiter>\n', format_custom_result_after_delimiter='<result_after_delimiter>\n', format_custom_field_delimiter='<field_delimiter>'"
+
+echo -e "<result_before_delimiter>
+<row_before_delimiter>42.42<field_delimiter>\"Some string 1\"<field_delimiter>\"[([1, 2, 3], 'String 1'), ([1], 'String 1')]\"<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>42<field_delimiter>\"Some string 2\"<field_delimiter>\"[([], 'String 2'), ([], 'String 2')]\"<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>\N<field_delimiter>\"Some string 3\"<field_delimiter>\"[([1, 2, 3], 'String 3'), ([1], 'String 3')]\"<row_after_delimiter>
+<result_after_delimiter>" > $DATA_FILE
+
+$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='CSV'"
+$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='CSV'"
+
+echo -e "<result_before_delimiter>
+<row_before_delimiter>42.42<field_delimiter>'Some string 1'<field_delimiter>[([1, 2, 3], 'String 1'), ([1], 'String 1')]<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>42<field_delimiter>'Some string 2'<field_delimiter>[([], 'String 2'), ([], 'String 2')]<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>NULL<field_delimiter>'Some string 3'<field_delimiter>[([1, 2, 3], 'String 3'), ([1], 'String 3')]<row_after_delimiter>
+<result_after_delimiter>" > $DATA_FILE
+
+$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='Quoted'"
+$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='Quoted'"
+
+echo -e "<result_before_delimiter>
+<row_before_delimiter>42.42<field_delimiter>\"Some string 1\"<field_delimiter>[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]]<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>42<field_delimiter>\"Some string 2\"<field_delimiter>[[[], \"String 2\"], [[], \"String 2\"]]<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>null<field_delimiter>\"Some string 3\"<field_delimiter>[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]]<row_after_delimiter>
+<result_after_delimiter>" > $DATA_FILE
+
+$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='JSON'"
+$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='JSON'"
+
+
+echo "Template"
+
+echo -e "<result_before_delimiter>
+\${data}<result_after_delimiter>" > $SCHEMADIR/resultset_format_02149
+
+echo -e "<row_before_delimiter>\${column_1:CSV}<field_delimiter>\${column_2:CSV}<field_delimiter>\${column_3:CSV}<row_after_delimiter>" > $SCHEMADIR/row_format_02149
+
+TEMPLATE_SETTINGS="SETTINGS format_template_rows_between_delimiter='<row_between_delimiter>\n', format_template_row='row_format_02149', format_template_resultset='resultset_format_02149'"
+
+echo -e "<result_before_delimiter>
+<row_before_delimiter>42.42<field_delimiter>\"Some string 1\"<field_delimiter>\"[([1, 2, 3], 'String 1'), ([1], 'String 1')]\"<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>42<field_delimiter>\"Some string 2\"<field_delimiter>\"[([], 'String 2'), ([], 'String 2')]\"<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>\N<field_delimiter>\"Some string 3\"<field_delimiter>\"[([1, 2, 3], 'String 3'), ([1], 'String 3')]\"<row_after_delimiter>
+<result_after_delimiter>" > $DATA_FILE
+
+$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS"
+$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS"
+
+echo -e "<row_before_delimiter>\${column_1:Quoted}<field_delimiter>\${column_2:Quoted}<field_delimiter>\${column_3:Quoted}<row_after_delimiter>" > $SCHEMADIR/row_format_02149
+
+echo -e "<result_before_delimiter>
+<row_before_delimiter>42.42<field_delimiter>'Some string 1'<field_delimiter>[([1, 2, 3], 'String 1'), ([1], 'String 1')]<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>42<field_delimiter>'Some string 2'<field_delimiter>[([], 'String 2'), ([], 'String 2')]<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>NULL<field_delimiter>'Some string 3'<field_delimiter>[([1, 2, 3], 'String 3'), ([1], 'String 3')]<row_after_delimiter>
+<result_after_delimiter>" > $DATA_FILE
+
+$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS"
+$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS"
+
+echo -e "<row_before_delimiter>\${column_1:JSON}<field_delimiter>\${column_2:JSON}<field_delimiter>\${column_3:JSON}<row_after_delimiter>" > $SCHEMADIR/row_format_02149
+
+echo -e "<result_before_delimiter>
+<row_before_delimiter>42.42<field_delimiter>\"Some string 1\"<field_delimiter>[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]]<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>42<field_delimiter>\"Some string 2\"<field_delimiter>[[[], \"String 2\"], [[], \"String 2\"]]<row_after_delimiter>
+<row_between_delimiter>
+<row_before_delimiter>null<field_delimiter>\"Some string 3\"<field_delimiter>[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]]<row_after_delimiter>
+<result_after_delimiter>" > $DATA_FILE
+
+$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS"
+$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS"
+
+
+echo "MsgPack"
+
+$CLICKHOUSE_CLIENT -q "select toInt32(number % 2 ? number : NULL) as int, toUInt64(number % 2 ?
NULL : number) as uint, toFloat32(number) as float, concat('Str: ', toString(number)) as str, [[number, number + 1], [number]] as arr, map(number, [number, number + 1]) as map from numbers(3) format MsgPack" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'MsgPack') settings input_format_msgpack_number_of_columns=6" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'MsgPack') settings input_format_msgpack_number_of_columns=6" + + +rm $SCHEMADIR/resultset_format_02149 $SCHEMADIR/row_format_02149 +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference new file mode 100644 index 00000000000..dae12318ce0 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference @@ -0,0 +1,40 @@ +0 Str: 0 [0,1] +1 Str: 1 [1,2] +2 Str: 2 [2,3] +3 Str: 3 [3,4] +4 Str: 4 [4,5] +5 Str: 5 [5,6] +6 Str: 6 [6,7] +7 Str: 7 [7,8] +8 Str: 8 [8,9] +9 Str: 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] diff --git a/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh new file mode 100755 index 00000000000..f00f2531dd0 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +mkdir $USER_FILES_PATH/test_02149 +FILE_NAME=test_02149/data.Parquet +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +$CLICKHOUSE_CLIENT -q "select number as num, concat('Str: ', toString(number)) as str, [number, number + 1] as arr from numbers(10) format Parquet" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "drop table if exists test_02149" +$CLICKHOUSE_CLIENT -q "create table test_02149 engine=File('Parquet', '$FILE_NAME')" +$CLICKHOUSE_CLIENT -q "select * from test_02149" +$CLICKHOUSE_CLIENT -q "drop table test_02149" + +$CLICKHOUSE_CLIENT -q "create table test_02149 (x UInt32, s String, a Array(UInt32)) engine=Memory" +$CLICKHOUSE_CLIENT -q "insert into test_02149 select number, toString(number), [number, number + 1] from numbers(10)" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_merge" +$CLICKHOUSE_CLIENT -q "create table test_merge engine=Merge(currentDatabase(), 'test_02149')" +$CLICKHOUSE_CLIENT -q "select * from test_merge" +$CLICKHOUSE_CLIENT -q "drop table test_merge" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_distributed" +$CLICKHOUSE_CLIENT -q "create table test_distributed engine=Distributed(test_shard_localhost, currentDatabase(), 'test_02149')" +$CLICKHOUSE_CLIENT -q "select * from test_distributed" +$CLICKHOUSE_CLIENT -q "drop table test_distributed" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_buffer" +$CLICKHOUSE_CLIENT -q "create table test_buffer engine=Buffer(currentDatabase(), 'test_02149', 16, 10, 100, 10000, 1000000, 10000000, 100000000)" +$CLICKHOUSE_CLIENT -q "select * from test_buffer" +$CLICKHOUSE_CLIENT -q "drop table test_buffer" + +rm -rf ${USER_FILES_PATH:?}/test_02149 + diff --git a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference new file mode 100644 index 00000000000..d3d2d86d696 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference @@ -0,0 +1,435 @@ +Arrow +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +ArrowStream +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String 
+fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Parquet +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 Int64 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +ORC +int8 Int8 +uint8 Int8 +int16 Int16 +uint16 Int16 +int32 Int32 +uint32 Int32 +int64 Int64 +uint64 Int64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date32 +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(Int64) +tuple Tuple(`tuple.0` Int64, `tuple.1` String) +map Map(String, Int64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(Int64), `nested1.1` Map(String, Int64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(Int64)), `nested2.0.1` Map(Int64, Array(Tuple(`nested2.0.1.0` Int64, `nested2.0.1.1` String)))), `nested2.1` Int8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Native +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +TSVWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 
+uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +TSVRawWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +CSVWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +JSONCompactEachRowWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +JSONCompactStringsEachRowWithNamesAndTypes +int8 Int8 
+uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +RowBinaryWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +CustomSeparatedWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Avro +CustomSeparatedWithNamesAndTypes +int8 Int32 +uint8 Int32 +int16 Int32 +uint16 Int32 +int32 Int32 +uint32 Int32 +int64 Int64 +uint64 Int64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +0 0 +1.2 0.7692307692307692 +date Int32 +0 +1 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(Int64) +nested Array(Array(Array(Int64))) +[0,1] [[[0],[1]]] +[1,2] [[[1],[2]]] diff --git a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh new file mode 100755 index 00000000000..d263ef63681 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh 
+. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +for format in Arrow ArrowStream Parquet ORC Native TSVWithNamesAndTypes TSVRawWithNamesAndTypes CSVWithNamesAndTypes JSONCompactEachRowWithNamesAndTypes JSONCompactStringsEachRowWithNamesAndTypes RowBinaryWithNamesAndTypes CustomSeparatedWithNamesAndTypes +do + echo $format + $CLICKHOUSE_CLIENT -q "select toInt8(-number) as int8, toUInt8(number) as uint8, toInt16(-number) as int16, toUInt16(number) as uint16, toInt32(-number) as int32, toUInt32(number) as uint32, toInt64(-number) as int64, toUInt64(number) as uint64 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select toFloat32(number * 1.2) as float32, toFloat64(number / 1.3) as float64, toDecimal32(number / 0.3, 5) as decimal32, toDecimal64(number / 0.003, 5) as decimal64 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select toDate(number) as date, toDate32(number) as date32 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select concat('Str: ', toString(number)) as str, toFixedString(toString((number + 1) * 100 % 1000), 3) as fixed_string from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select [number, number + 1] as array, (number, toString(number)) as tuple, map(toString(number), number) as map from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select [([number, number + 1], map('42', number)), ([], map()), ([42], map('42', 42))] as nested1, (([[number], [number + 1], []], map(number, [(number, '42'), (number + 1, '42')])), 42) as nested2 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" +done + +echo "Avro" + +echo $format +$CLICKHOUSE_CLIENT -q "select toInt8(-number) as int8, toUInt8(number) as uint8, toInt16(-number) as int16, toUInt16(number) as uint16, toInt32(-number) as int32, toUInt32(number) as uint32, toInt64(-number) as int64, toUInt64(number) as uint64 from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select toFloat32(number * 1.2) as float32, toFloat64(number / 1.3) as float64 from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select toDate(number) as date from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q 
"select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select concat('Str: ', toString(number)) as str, toFixedString(toString((number + 1) * 100 % 1000), 3) as fixed_string from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select [number, number + 1] as array, [[[number], [number + 1]]] as nested from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto b/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto index ba558dbbadb..048a689d021 100644 --- a/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto +++ b/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto @@ -1,6 +1,6 @@ syntax = "proto3"; -message Message +message EnumMessage { enum Enum { @@ -10,4 +10,4 @@ message Message HUNDRED = 100; }; Enum x = 1; -}; \ No newline at end of file +};