diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index 5c93d6719fa..a7142ef7f2e 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -13,6 +13,7 @@ #include #include +#include namespace DB @@ -269,8 +270,18 @@ const ColumnWithTypeAndName & Block::safeGetByPosition(size_t position) const } -const ColumnWithTypeAndName * Block::findByName(const std::string & name) const +const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool case_insensitive) const { + if (case_insensitive) + { + auto found = std::find_if(data.begin(), data.end(), [&](const auto & column) { return boost::iequals(column.name, name); }); + if (found == data.end()) + { + return nullptr; + } + return &*found; + } + auto it = index_by_name.find(name); if (index_by_name.end() == it) { @@ -280,19 +291,23 @@ const ColumnWithTypeAndName * Block::findByName(const std::string & name) const } -const ColumnWithTypeAndName & Block::getByName(const std::string & name) const +const ColumnWithTypeAndName & Block::getByName(const std::string & name, bool case_insensitive) const { - const auto * result = findByName(name); + const auto * result = findByName(name, case_insensitive); if (!result) - throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames() - , ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + throw Exception( + "Not found column " + name + " in block. There are only columns: " + dumpNames(), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); return *result; } -bool Block::has(const std::string & name) const +bool Block::has(const std::string & name, bool case_insensitive) const { + if (case_insensitive) + return std::find_if(data.begin(), data.end(), [&](const auto & column) { return boost::iequals(column.name, name); }) + != data.end(); + return index_by_name.end() != index_by_name.find(name); } @@ -301,8 +316,8 @@ size_t Block::getPositionByName(const std::string & name) const { auto it = index_by_name.find(name); if (index_by_name.end() == it) - throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames() - , ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + throw Exception( + "Not found column " + name + " in block. There are only columns: " + dumpNames(), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); return it->second; } diff --git a/src/Core/Block.h b/src/Core/Block.h index 66e16b70f47..c5d3e1ae35a 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -60,21 +60,21 @@ public: ColumnWithTypeAndName & safeGetByPosition(size_t position); const ColumnWithTypeAndName & safeGetByPosition(size_t position) const; - ColumnWithTypeAndName* findByName(const std::string & name) + ColumnWithTypeAndName* findByName(const std::string & name, bool case_insensitive = false) { return const_cast( - const_cast(this)->findByName(name)); + const_cast(this)->findByName(name, case_insensitive)); } - const ColumnWithTypeAndName * findByName(const std::string & name) const; + const ColumnWithTypeAndName * findByName(const std::string & name, bool case_insensitive = false) const; - ColumnWithTypeAndName & getByName(const std::string & name) + ColumnWithTypeAndName & getByName(const std::string & name, bool case_insensitive = false) { return const_cast( - const_cast(this)->getByName(name)); + const_cast(this)->getByName(name, case_insensitive)); } - const ColumnWithTypeAndName & getByName(const std::string & name) const; + const ColumnWithTypeAndName & getByName(const std::string & name, bool case_insensitive = false) const; Container::iterator begin() { return data.begin(); } Container::iterator end() { return data.end(); } @@ -83,7 +83,7 @@ public: Container::const_iterator cbegin() const { return data.cbegin(); } Container::const_iterator cend() const { return data.cend(); } - bool has(const std::string & name) const; + bool has(const std::string & name, bool case_insensitive = false) const; size_t getPositionByName(const std::string & name) const; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ca2e9f12e66..f81b61ea648 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -616,11 +616,13 @@ class IColumn; M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \ M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \ M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \ - M(Bool, input_format_use_lowercase_column_name, false, "Use lowercase column name while reading input formats", 0) \ M(Bool, input_format_arrow_import_nested, false, "Allow to insert array of structs into Nested table in Arrow input format.", 0) \ + M(Bool, input_format_arrow_case_insensitive_column_matching, false, "Ignore case when matching Arrow columns with CH columns.", 0) \ M(Bool, input_format_orc_import_nested, false, "Allow to insert array of structs into Nested table in ORC input format.", 0) \ M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \ + M(Bool, input_format_orc_case_insensitive_column_matching, false, "Ignore case when matching ORC columns with CH columns.", 0) \ M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \ + M(Bool, input_format_parquet_case_insensitive_column_matching, false, "Ignore case when matching Parquet columns with CH columns.", 0) \ M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \ M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \ M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \ diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index df504bc34a8..8f5e40de5b8 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -15,6 +15,8 @@ #include +#include + namespace DB { @@ -227,14 +229,17 @@ void validateArraySizes(const Block & block) } -std::unordered_set getAllTableNames(const Block & block) +std::unordered_set getAllTableNames(const Block & block, bool to_lower_case) { std::unordered_set nested_table_names; - for (auto & name : block.getNames()) + for (const auto & name : block.getNames()) { auto nested_table_name = Nested::extractTableName(name); + if (to_lower_case) + boost::to_lower(nested_table_name); + if (!nested_table_name.empty()) - nested_table_names.insert(nested_table_name); + nested_table_names.insert(std::move(nested_table_name)); } return nested_table_names; } diff --git a/src/DataTypes/NestedUtils.h b/src/DataTypes/NestedUtils.h index 2ca5c17dc74..f6dc42d5c58 100644 --- a/src/DataTypes/NestedUtils.h +++ b/src/DataTypes/NestedUtils.h @@ -32,7 +32,7 @@ namespace Nested void validateArraySizes(const Block & block); /// Get all nested tables names from a block. - std::unordered_set getAllTableNames(const Block & block); + std::unordered_set getAllTableNames(const Block & block, bool to_lower_case = false); } } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 08554cf7e07..3aa82cb79b4 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -89,10 +89,10 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers; format_settings.json.quote_denormals = settings.output_format_json_quote_denormals; format_settings.null_as_default = settings.input_format_null_as_default; - format_settings.use_lowercase_column_name = settings.input_format_use_lowercase_column_name; format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size; format_settings.parquet.import_nested = settings.input_format_parquet_import_nested; + format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching; format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns; format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8; format_settings.pretty.color = settings.output_format_pretty_color; @@ -123,9 +123,11 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary; format_settings.arrow.import_nested = settings.input_format_arrow_import_nested; format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns; + format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching; format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; + format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.seekable_read = settings.input_format_allow_seeks; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 4881c1a43c8..bd0a84d9ded 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -32,7 +32,6 @@ struct FormatSettings bool null_as_default = true; bool decimal_trailing_zeros = false; bool defaults_for_omitted_fields = true; - bool use_lowercase_column_name = false; bool seekable_read = true; UInt64 max_rows_to_read_for_schema_inference = 100; @@ -75,6 +74,7 @@ struct FormatSettings bool low_cardinality_as_dictionary = false; bool import_nested = false; bool allow_missing_columns = false; + bool case_insensitive_column_matching = false; } arrow; struct @@ -137,6 +137,7 @@ struct FormatSettings UInt64 row_group_size = 1000000; bool import_nested = false; bool allow_missing_columns = false; + bool case_insensitive_column_matching = false; } parquet; struct Pretty @@ -217,6 +218,7 @@ struct FormatSettings bool import_nested = false; bool allow_missing_columns = false; int64_t row_batch_size = 100'000; + bool case_insensitive_column_matching = false; } orc; /// For capnProto format we should determine how to diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp index cf5cfa681a1..37a107ae367 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp @@ -139,7 +139,11 @@ void ArrowBlockInputFormat::prepareReader() } arrow_column_to_ch_column = std::make_unique( - getPort().getHeader(), "Arrow", format_settings.arrow.import_nested, format_settings.arrow.allow_missing_columns); + getPort().getHeader(), + "Arrow", + format_settings.arrow.import_nested, + format_settings.arrow.allow_missing_columns, + format_settings.arrow.case_insensitive_column_matching); missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema); if (stream) diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 14c81a0d90d..0a72e561e4e 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -31,6 +31,7 @@ #include #include #include +#include /// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn. #define FOR_ARROW_NUMERIC_TYPES(M) \ @@ -484,19 +485,22 @@ static void checkStatus(const arrow::Status & status, const String & column_name throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()}; } -Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, const Block * hint_header) +Block ArrowColumnToCHColumn::arrowSchemaToCHHeader( + const arrow::Schema & schema, const std::string & format_name, const Block * hint_header, bool ignore_case) { ColumnsWithTypeAndName sample_columns; std::unordered_set nested_table_names; if (hint_header) - nested_table_names = Nested::getAllTableNames(*hint_header); + nested_table_names = Nested::getAllTableNames(*hint_header, ignore_case); + for (const auto & field : schema.fields()) { - if (hint_header && !hint_header->has(field->name()) && !nested_table_names.contains(field->name())) + if (hint_header && !hint_header->has(field->name(), ignore_case) + && !nested_table_names.contains(ignore_case ? boost::to_lower_copy(field->name()) : field->name())) continue; /// Create empty arrow column by it's type and convert it to ClickHouse column. - arrow::MemoryPool* pool = arrow::default_memory_pool(); + arrow::MemoryPool * pool = arrow::default_memory_pool(); std::unique_ptr array_builder; arrow::Status status = MakeBuilder(pool, field->type(), &array_builder); checkStatus(status, field->name(), format_name); @@ -516,20 +520,31 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, } ArrowColumnToCHColumn::ArrowColumnToCHColumn( - const Block & header_, const std::string & format_name_, bool import_nested_, bool allow_missing_columns_) - : header(header_), format_name(format_name_), import_nested(import_nested_), allow_missing_columns(allow_missing_columns_) + const Block & header_, + const std::string & format_name_, + bool import_nested_, + bool allow_missing_columns_, + bool case_insensitive_matching_) + : header(header_) + , format_name(format_name_) + , import_nested(import_nested_) + , allow_missing_columns(allow_missing_columns_) + , case_insensitive_matching(case_insensitive_matching_) { } void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr & table) { NameToColumnPtr name_to_column_ptr; - for (const auto & column_name : table->ColumnNames()) + for (auto column_name : table->ColumnNames()) { std::shared_ptr arrow_column = table->GetColumnByName(column_name); if (!arrow_column) throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Column '{}' is duplicated", column_name); - name_to_column_ptr[column_name] = arrow_column; + + if (case_insensitive_matching) + boost::to_lower(column_name); + name_to_column_ptr[std::move(column_name)] = arrow_column; } arrowColumnsToCHChunk(res, name_to_column_ptr); @@ -548,22 +563,31 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & { const ColumnWithTypeAndName & header_column = header.getByPosition(column_i); + auto search_column_name = header_column.name; + if (case_insensitive_matching) + boost::to_lower(search_column_name); + bool read_from_nested = false; String nested_table_name = Nested::extractTableName(header_column.name); - if (!name_to_column_ptr.contains(header_column.name)) + String search_nested_table_name = nested_table_name; + if (case_insensitive_matching) + boost::to_lower(search_nested_table_name); + + if (!name_to_column_ptr.contains(search_column_name)) { /// Check if it's a column from nested table. - if (import_nested && name_to_column_ptr.contains(nested_table_name)) + if (import_nested && name_to_column_ptr.contains(search_nested_table_name)) { - if (!nested_tables.contains(nested_table_name)) + if (!nested_tables.contains(search_nested_table_name)) { - std::shared_ptr arrow_column = name_to_column_ptr[nested_table_name]; - ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)}; + std::shared_ptr arrow_column = name_to_column_ptr[search_nested_table_name]; + ColumnsWithTypeAndName cols + = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)}; Block block(cols); - nested_tables[nested_table_name] = std::make_shared(Nested::flatten(block)); + nested_tables[search_nested_table_name] = std::make_shared(Nested::flatten(block)); } - read_from_nested = nested_tables[nested_table_name]->has(header_column.name); + read_from_nested = nested_tables[search_nested_table_name]->has(header_column.name, case_insensitive_matching); } if (!read_from_nested) @@ -580,13 +604,19 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & } } - std::shared_ptr arrow_column = name_to_column_ptr[header_column.name]; ColumnWithTypeAndName column; if (read_from_nested) - column = nested_tables[nested_table_name]->getByName(header_column.name); + { + column = nested_tables[search_nested_table_name]->getByName(header_column.name, case_insensitive_matching); + if (case_insensitive_matching) + column.name = header_column.name; + } else + { + auto arrow_column = name_to_column_ptr[search_column_name]; column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true); + } try { @@ -594,8 +624,11 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & } catch (Exception & e) { - e.addMessage(fmt::format("while converting column {} from type {} to type {}", - backQuote(header_column.name), column.type->getName(), header_column.type->getName())); + e.addMessage(fmt::format( + "while converting column {} from type {} to type {}", + backQuote(header_column.name), + column.type->getName(), + header_column.type->getName())); throw; } @@ -609,22 +642,23 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & std::vector ArrowColumnToCHColumn::getMissingColumns(const arrow::Schema & schema) const { std::vector missing_columns; - auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header); + auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header, case_insensitive_matching); auto flatten_block_from_arrow = Nested::flatten(block_from_arrow); + for (size_t i = 0, columns = header.columns(); i < columns; ++i) { - const auto & column = header.getByPosition(i); + const auto & header_column = header.getByPosition(i); bool read_from_nested = false; - String nested_table_name = Nested::extractTableName(column.name); - if (!block_from_arrow.has(column.name)) + String nested_table_name = Nested::extractTableName(header_column.name); + if (!block_from_arrow.has(header_column.name, case_insensitive_matching)) { - if (import_nested && block_from_arrow.has(nested_table_name)) - read_from_nested = flatten_block_from_arrow.has(column.name); + if (import_nested && block_from_arrow.has(nested_table_name, case_insensitive_matching)) + read_from_nested = flatten_block_from_arrow.has(header_column.name, case_insensitive_matching); if (!read_from_nested) { if (!allow_missing_columns) - throw Exception{ErrorCodes::THERE_IS_NO_COLUMN, "Column '{}' is not presented in input data.", column.name}; + throw Exception{ErrorCodes::THERE_IS_NO_COLUMN, "Column '{}' is not presented in input data.", header_column.name}; missing_columns.push_back(i); } diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index cf4f6bb3ff3..0a712326941 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -25,7 +25,8 @@ public: const Block & header_, const std::string & format_name_, bool import_nested_, - bool allow_missing_columns_); + bool allow_missing_columns_, + bool case_insensitive_matching_ = false); void arrowTableToCHChunk(Chunk & res, std::shared_ptr & table); @@ -36,7 +37,8 @@ public: /// Transform arrow schema to ClickHouse header. If hint_header is provided, /// we will skip columns in schema that are not in hint_header. - static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr); + static Block arrowSchemaToCHHeader( + const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr, bool ignore_case = false); private: const Block & header; @@ -44,6 +46,7 @@ private: bool import_nested; /// If false, throw exception if some columns in header not exists in arrow table. bool allow_missing_columns; + bool case_insensitive_matching; /// Map {column name : dictionary column}. /// To avoid converting dictionary from Arrow Dictionary diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index aa9f7874ae8..c68b59833db 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -53,9 +53,6 @@ Chunk ORCBlockInputFormat::generate() if (!table || !table->num_rows()) return res; - if (format_settings.use_lowercase_column_name) - table = *table->RenameColumns(include_column_names); - arrow_column_to_ch_column->arrowTableToCHChunk(res, table); /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields. /// Otherwise fill the missing columns with zero values of its type. @@ -73,7 +70,6 @@ void ORCBlockInputFormat::resetParser() file_reader.reset(); include_indices.clear(); - include_column_names.clear(); block_missing_values.clear(); } @@ -125,20 +121,6 @@ static void getFileReaderAndSchema( if (!read_schema_result.ok()) throw Exception(read_schema_result.status().ToString(), ErrorCodes::BAD_ARGUMENTS); schema = std::move(read_schema_result).ValueOrDie(); - - if (format_settings.use_lowercase_column_name) - { - std::vector> fields; - fields.reserve(schema->num_fields()); - for (int i = 0; i < schema->num_fields(); ++i) - { - const auto& field = schema->field(i); - auto name = field->name(); - boost::to_lower(name); - fields.push_back(field->WithName(name)); - } - schema = arrow::schema(fields, schema->metadata()); - } } void ORCBlockInputFormat::prepareReader() @@ -149,12 +131,17 @@ void ORCBlockInputFormat::prepareReader() return; arrow_column_to_ch_column = std::make_unique( - getPort().getHeader(), "ORC", format_settings.orc.import_nested, format_settings.orc.allow_missing_columns); + getPort().getHeader(), + "ORC", + format_settings.orc.import_nested, + format_settings.orc.allow_missing_columns, + format_settings.orc.case_insensitive_column_matching); missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema); + const bool ignore_case = format_settings.orc.case_insensitive_column_matching; std::unordered_set nested_table_names; if (format_settings.orc.import_nested) - nested_table_names = Nested::getAllTableNames(getPort().getHeader()); + nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case); /// In ReadStripe column indices should be started from 1, /// because 0 indicates to select all columns. @@ -165,19 +152,18 @@ void ORCBlockInputFormat::prepareReader() /// so we should recursively count the number of indices we need for this type. int indexes_count = countIndicesForType(schema->field(i)->type()); const auto & name = schema->field(i)->name(); - if (getPort().getHeader().has(name) || nested_table_names.contains(name)) + if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name)) { for (int j = 0; j != indexes_count; ++j) - { include_indices.push_back(index + j); - include_column_names.push_back(name); - } } + index += indexes_count; } } -ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : ISchemaReader(in_), format_settings(format_settings_) { } diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.h b/src/Processors/Formats/Impl/ORCBlockInputFormat.h index bd2151d78ff..b7a771730ea 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.h @@ -47,7 +47,6 @@ private: // indices of columns to read from ORC file std::vector include_indices; - std::vector include_column_names; std::vector missing_columns; BlockMissingValues block_missing_values; diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 548bf0138f5..13582ce5019 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -53,11 +53,7 @@ Chunk ParquetBlockInputFormat::generate() std::shared_ptr table; arrow::Status read_status = file_reader->ReadRowGroup(row_group_current, column_indices, &table); if (!read_status.ok()) - throw ParsingException{"Error while reading Parquet data: " + read_status.ToString(), - ErrorCodes::CANNOT_READ_ALL_DATA}; - - if (format_settings.use_lowercase_column_name) - table = *table->RenameColumns(column_names); + throw ParsingException{"Error while reading Parquet data: " + read_status.ToString(), ErrorCodes::CANNOT_READ_ALL_DATA}; ++row_group_current; @@ -78,7 +74,6 @@ void ParquetBlockInputFormat::resetParser() file_reader.reset(); column_indices.clear(); - column_names.clear(); row_group_current = 0; block_missing_values.clear(); } @@ -123,20 +118,6 @@ static void getFileReaderAndSchema( return; THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), arrow::default_memory_pool(), &file_reader)); THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); - - if (format_settings.use_lowercase_column_name) - { - std::vector> fields; - fields.reserve(schema->num_fields()); - for (int i = 0; i < schema->num_fields(); ++i) - { - const auto& field = schema->field(i); - auto name = field->name(); - boost::to_lower(name); - fields.push_back(field->WithName(name)); - } - schema = arrow::schema(fields, schema->metadata()); - } } void ParquetBlockInputFormat::prepareReader() @@ -149,12 +130,18 @@ void ParquetBlockInputFormat::prepareReader() row_group_total = file_reader->num_row_groups(); row_group_current = 0; - arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested, format_settings.parquet.allow_missing_columns); + arrow_column_to_ch_column = std::make_unique( + getPort().getHeader(), + "Parquet", + format_settings.parquet.import_nested, + format_settings.parquet.allow_missing_columns, + format_settings.parquet.case_insensitive_column_matching); missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema); + const bool ignore_case = format_settings.parquet.case_insensitive_column_matching; std::unordered_set nested_table_names; if (format_settings.parquet.import_nested) - nested_table_names = Nested::getAllTableNames(getPort().getHeader()); + nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case); int index = 0; for (int i = 0; i < schema->num_fields(); ++i) @@ -164,19 +151,19 @@ void ParquetBlockInputFormat::prepareReader() /// count the number of indices we need for this type. int indexes_count = countIndicesForType(schema->field(i)->type()); const auto & name = schema->field(i)->name(); - if (getPort().getHeader().has(name) || nested_table_names.contains(name)) + + if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name)) { for (int j = 0; j != indexes_count; ++j) - { column_indices.push_back(index + j); - column_names.push_back(name); - } } + index += indexes_count; } } -ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : ISchemaReader(in_), format_settings(format_settings_) { } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index eba9aac29f2..1faadaa3d21 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -40,7 +40,6 @@ private: int row_group_total = 0; // indices of columns to read from Parquet file std::vector column_indices; - std::vector column_names; std::unique_ptr arrow_column_to_ch_column; int row_group_current = 0; std::vector missing_columns; diff --git a/tests/queries/0_stateless/00900_long_parquet_load.reference b/tests/queries/0_stateless/00900_long_parquet_load.reference index 6ecff505b2e..b295a226853 100644 --- a/tests/queries/0_stateless/00900_long_parquet_load.reference +++ b/tests/queries/0_stateless/00900_long_parquet_load.reference @@ -88,6 +88,9 @@ idx10 ['This','is','a','test'] 22 23 24 +=== Try load data from case_insensitive_column_matching.parquet +123 1 +456 2 === Try load data from datapage_v2.snappy.parquet Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unknown encoding type.: While executing ParquetBlockInputFormat: data for INSERT was parsed from stdin: (in query: INSERT INTO parquet_load FORMAT Parquet). (CANNOT_READ_ALL_DATA) @@ -339,9 +342,6 @@ Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unkno (NULL) === Try load data from single_nan.parquet \N -=== Try load data from test_setting_input_format_use_lowercase_column_name.parquet -123 1 -456 2 === Try load data from userdata1.parquet 1454486129 1 Amanda Jordan ajordan0@com.com Female 1.197.201.2 6759521864920116 Indonesia 3/8/1971 49756.53 Internal Auditor 1E+02 1454519043 2 Albert Freeman afreeman1@is.gd Male 218.111.175.34 Canada 1/16/1968 150280.17 Accountant IV diff --git a/tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.sh b/tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.sh deleted file mode 100755 index b946addd01c..00000000000 --- a/tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash -# Tags: no-ubsan, no-fasttest - -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CUR_DIR"/../shell_config.sh - -echo "Parquet" -DATA_FILE=$CUR_DIR/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet -${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load" -${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load (id String, score Int32) ENGINE = Memory" -cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO parquet_load FORMAT Parquet SETTINGS input_format_use_lowercase_column_name=true" -${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load" -${CLICKHOUSE_CLIENT} --query="drop table parquet_load" - -echo "ORC" -DATA_FILE=$CUR_DIR/data_orc/test_setting_input_format_use_lowercase_column_name.orc -${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load" -${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (id String, score Int32) ENGINE = Memory" -cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_load FORMAT ORC SETTINGS input_format_use_lowercase_column_name=true" -${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_load" -${CLICKHOUSE_CLIENT} --query="drop table orc_load" diff --git a/tests/queries/0_stateless/02241_parquet_bad_column.reference b/tests/queries/0_stateless/02241_parquet_bad_column.reference index f599e28b8ab..b2f7f08c170 100644 --- a/tests/queries/0_stateless/02241_parquet_bad_column.reference +++ b/tests/queries/0_stateless/02241_parquet_bad_column.reference @@ -1 +1,2 @@ 10 +10 diff --git a/tests/queries/0_stateless/02241_parquet_bad_column.sh b/tests/queries/0_stateless/02241_parquet_bad_column.sh index a160671a088..9efd11cbbe1 100755 --- a/tests/queries/0_stateless/02241_parquet_bad_column.sh +++ b/tests/queries/0_stateless/02241_parquet_bad_column.sh @@ -5,23 +5,25 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -q "drop table if exists test_02241" -$CLICKHOUSE_CLIENT -q "create table test_02241 (image_path Nullable(String), - caption Nullable(String), - NSFW Nullable(String), - similarity Nullable(Float64), - LICENSE Nullable(String), - url Nullable(String), - key Nullable(UInt64), - shard_id Nullable(UInt64), - status Nullable(String), - width Nullable(UInt32), - height Nullable(UInt32), - exif Nullable(String), - original_width Nullable(UInt32), - original_height Nullable(UInt32)) engine=Memory" +for case_insensitive in "true" "false"; do + $CLICKHOUSE_CLIENT -q "drop table if exists test_02241" + $CLICKHOUSE_CLIENT -q "create table test_02241 (image_path Nullable(String), + caption Nullable(String), + NSFW Nullable(String), + similarity Nullable(Float64), + LICENSE Nullable(String), + url Nullable(String), + key Nullable(UInt64), + shard_id Nullable(UInt64), + status Nullable(String), + width Nullable(UInt32), + height Nullable(UInt32), + exif Nullable(String), + original_width Nullable(UInt32), + original_height Nullable(UInt32)) engine=Memory" -cat $CUR_DIR/data_parquet_bad_column/metadata_0.parquet | $CLICKHOUSE_CLIENT -q "insert into test_02241 format Parquet" + cat $CUR_DIR/data_parquet_bad_column/metadata_0.parquet | $CLICKHOUSE_CLIENT -q "insert into test_02241 format Parquet SETTINGS input_format_parquet_case_insensitive_column_matching=$case_insensitive" -$CLICKHOUSE_CLIENT -q "select count() from test_02241" -$CLICKHOUSE_CLIENT -q "drop table test_02241" + $CLICKHOUSE_CLIENT -q "select count() from test_02241" + $CLICKHOUSE_CLIENT -q "drop table test_02241" +done diff --git a/tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.reference b/tests/queries/0_stateless/02242_case_insensitive_column_matching.reference similarity index 66% rename from tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.reference rename to tests/queries/0_stateless/02242_case_insensitive_column_matching.reference index 5c383cb3035..9732211a286 100644 --- a/tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.reference +++ b/tests/queries/0_stateless/02242_case_insensitive_column_matching.reference @@ -4,3 +4,6 @@ Parquet ORC 123 1 456 2 +Arrow +123 1 +456 2 diff --git a/tests/queries/0_stateless/02242_case_insensitive_column_matching.sh b/tests/queries/0_stateless/02242_case_insensitive_column_matching.sh new file mode 100755 index 00000000000..8ebf2952ab3 --- /dev/null +++ b/tests/queries/0_stateless/02242_case_insensitive_column_matching.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Tags: no-ubsan, no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +echo "Parquet" +DATA_FILE=$CUR_DIR/data_parquet/case_insensitive_column_matching.parquet +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load (iD String, scOre Int32) ENGINE = Memory" +cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO parquet_load FORMAT Parquet SETTINGS input_format_parquet_case_insensitive_column_matching=true" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load" +${CLICKHOUSE_CLIENT} --query="drop table parquet_load" + +echo "ORC" +DATA_FILE=$CUR_DIR/data_orc/case_insensitive_column_matching.orc +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (iD String, sCorE Int32) ENGINE = Memory" +cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_load FORMAT ORC SETTINGS input_format_orc_case_insensitive_column_matching=true" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_load" +${CLICKHOUSE_CLIENT} --query="drop table orc_load" + +echo "Arrow" +DATA_FILE=$CUR_DIR/data_arrow/case_insensitive_column_matching.arrow +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_load" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_load (iD String, sCorE Int32) ENGINE = Memory" +cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_load FORMAT Arrow SETTINGS input_format_arrow_case_insensitive_column_matching=true" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_load" +${CLICKHOUSE_CLIENT} --query="drop table arrow_load" diff --git a/tests/queries/0_stateless/02242_case_insensitive_nested.reference b/tests/queries/0_stateless/02242_case_insensitive_nested.reference new file mode 100644 index 00000000000..58d66d3230a --- /dev/null +++ b/tests/queries/0_stateless/02242_case_insensitive_nested.reference @@ -0,0 +1,12 @@ +Arrow +[1,2,3] ['123','456','789'] [9.8,10.12,11.14] +[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414] +[7,8,9] ['101','415','118'] [13.08,1.12,0.414] +Parquet +[1,2,3] ['123','456','789'] [9.8,10.12,11.14] +[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414] +[7,8,9] ['101','415','118'] [13.08,1.12,0.414] +ORC +[1,2,3] ['123','456','789'] [9.8,10.12,11.14] +[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414] +[7,8,9] ['101','415','118'] [13.08,1.12,0.414] diff --git a/tests/queries/0_stateless/02242_case_insensitive_nested.sh b/tests/queries/0_stateless/02242_case_insensitive_nested.sh new file mode 100755 index 00000000000..c22f5695dc3 --- /dev/null +++ b/tests/queries/0_stateless/02242_case_insensitive_nested.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS nested_table" +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS nested_nested_table" + +${CLICKHOUSE_CLIENT} --query="CREATE TABLE nested_table (table Nested(eLeM1 Int32, elEm2 String, ELEM3 Float32)) engine=Memory" + +formats=('Arrow' 'Parquet' 'ORC') +format_files=('arrow' 'parquet' 'orc') + +for ((i = 0; i < 3; i++)) do + echo ${formats[i]} + + ${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE nested_table" + cat $CUR_DIR/data_orc_arrow_parquet_nested/nested_table.${format_files[i]} | ${CLICKHOUSE_CLIENT} -q "INSERT INTO nested_table FORMAT ${formats[i]} SETTINGS input_format_${format_files[i]}_import_nested = 1, input_format_${format_files[i]}_case_insensitive_column_matching = true" + + ${CLICKHOUSE_CLIENT} --query="SELECT * FROM nested_table" + +done + +${CLICKHOUSE_CLIENT} --query="DROP TABLE nested_table" diff --git a/tests/queries/0_stateless/data_arrow/case_insensitive_column_matching.arrow b/tests/queries/0_stateless/data_arrow/case_insensitive_column_matching.arrow new file mode 100644 index 00000000000..4350d5c3e49 Binary files /dev/null and b/tests/queries/0_stateless/data_arrow/case_insensitive_column_matching.arrow differ diff --git a/tests/queries/0_stateless/data_orc/case_insensitive_column_matching.orc b/tests/queries/0_stateless/data_orc/case_insensitive_column_matching.orc new file mode 100644 index 00000000000..136f9980064 Binary files /dev/null and b/tests/queries/0_stateless/data_orc/case_insensitive_column_matching.orc differ diff --git a/tests/queries/0_stateless/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet b/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet similarity index 100% rename from tests/queries/0_stateless/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet rename to tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet diff --git a/tests/queries/0_stateless/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet.columns b/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.columns similarity index 100% rename from tests/queries/0_stateless/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet.columns rename to tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.columns