Merge pull request #35459 from ClickHouse/case-insensitive-column-matching

Support for case insensitive column matching for ORC/Arrow/Parquet files
This commit is contained in:
Antonio Andelic 2022-03-25 08:09:29 +01:00 committed by GitHub
commit f6439efcad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 238 additions and 148 deletions

View File

@ -13,6 +13,7 @@
#include <iterator>
#include <base/sort.h>
#include <boost/algorithm/string.hpp>
namespace DB
@ -269,8 +270,18 @@ const ColumnWithTypeAndName & Block::safeGetByPosition(size_t position) const
}
const ColumnWithTypeAndName * Block::findByName(const std::string & name) const
const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool case_insensitive) const
{
if (case_insensitive)
{
auto found = std::find_if(data.begin(), data.end(), [&](const auto & column) { return boost::iequals(column.name, name); });
if (found == data.end())
{
return nullptr;
}
return &*found;
}
auto it = index_by_name.find(name);
if (index_by_name.end() == it)
{
@ -280,19 +291,23 @@ const ColumnWithTypeAndName * Block::findByName(const std::string & name) const
}
const ColumnWithTypeAndName & Block::getByName(const std::string & name) const
const ColumnWithTypeAndName & Block::getByName(const std::string & name, bool case_insensitive) const
{
const auto * result = findByName(name);
const auto * result = findByName(name, case_insensitive);
if (!result)
throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames()
, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK);
throw Exception(
"Not found column " + name + " in block. There are only columns: " + dumpNames(), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK);
return *result;
}
bool Block::has(const std::string & name) const
bool Block::has(const std::string & name, bool case_insensitive) const
{
if (case_insensitive)
return std::find_if(data.begin(), data.end(), [&](const auto & column) { return boost::iequals(column.name, name); })
!= data.end();
return index_by_name.end() != index_by_name.find(name);
}
@ -301,8 +316,8 @@ size_t Block::getPositionByName(const std::string & name) const
{
auto it = index_by_name.find(name);
if (index_by_name.end() == it)
throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames()
, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK);
throw Exception(
"Not found column " + name + " in block. There are only columns: " + dumpNames(), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK);
return it->second;
}

View File

@ -60,21 +60,21 @@ public:
ColumnWithTypeAndName & safeGetByPosition(size_t position);
const ColumnWithTypeAndName & safeGetByPosition(size_t position) const;
ColumnWithTypeAndName* findByName(const std::string & name)
ColumnWithTypeAndName* findByName(const std::string & name, bool case_insensitive = false)
{
return const_cast<ColumnWithTypeAndName *>(
const_cast<const Block *>(this)->findByName(name));
const_cast<const Block *>(this)->findByName(name, case_insensitive));
}
const ColumnWithTypeAndName * findByName(const std::string & name) const;
const ColumnWithTypeAndName * findByName(const std::string & name, bool case_insensitive = false) const;
ColumnWithTypeAndName & getByName(const std::string & name)
ColumnWithTypeAndName & getByName(const std::string & name, bool case_insensitive = false)
{
return const_cast<ColumnWithTypeAndName &>(
const_cast<const Block *>(this)->getByName(name));
const_cast<const Block *>(this)->getByName(name, case_insensitive));
}
const ColumnWithTypeAndName & getByName(const std::string & name) const;
const ColumnWithTypeAndName & getByName(const std::string & name, bool case_insensitive = false) const;
Container::iterator begin() { return data.begin(); }
Container::iterator end() { return data.end(); }
@ -83,7 +83,7 @@ public:
Container::const_iterator cbegin() const { return data.cbegin(); }
Container::const_iterator cend() const { return data.cend(); }
bool has(const std::string & name) const;
bool has(const std::string & name, bool case_insensitive = false) const;
size_t getPositionByName(const std::string & name) const;

View File

@ -616,11 +616,13 @@ class IColumn;
M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \
M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \
M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \
M(Bool, input_format_use_lowercase_column_name, false, "Use lowercase column name while reading input formats", 0) \
M(Bool, input_format_arrow_import_nested, false, "Allow to insert array of structs into Nested table in Arrow input format.", 0) \
M(Bool, input_format_arrow_case_insensitive_column_matching, false, "Ignore case when matching Arrow columns with CH columns.", 0) \
M(Bool, input_format_orc_import_nested, false, "Allow to insert array of structs into Nested table in ORC input format.", 0) \
M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \
M(Bool, input_format_orc_case_insensitive_column_matching, false, "Ignore case when matching ORC columns with CH columns.", 0) \
M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \
M(Bool, input_format_parquet_case_insensitive_column_matching, false, "Ignore case when matching Parquet columns with CH columns.", 0) \
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \
M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \

View File

@ -15,6 +15,8 @@
#include <Parsers/IAST.h>
#include <boost/algorithm/string/case_conv.hpp>
namespace DB
{
@ -227,14 +229,17 @@ void validateArraySizes(const Block & block)
}
std::unordered_set<String> getAllTableNames(const Block & block)
std::unordered_set<String> getAllTableNames(const Block & block, bool to_lower_case)
{
std::unordered_set<String> nested_table_names;
for (auto & name : block.getNames())
for (const auto & name : block.getNames())
{
auto nested_table_name = Nested::extractTableName(name);
if (to_lower_case)
boost::to_lower(nested_table_name);
if (!nested_table_name.empty())
nested_table_names.insert(nested_table_name);
nested_table_names.insert(std::move(nested_table_name));
}
return nested_table_names;
}

View File

@ -32,7 +32,7 @@ namespace Nested
void validateArraySizes(const Block & block);
/// Get all nested tables names from a block.
std::unordered_set<String> getAllTableNames(const Block & block);
std::unordered_set<String> getAllTableNames(const Block & block, bool to_lower_case = false);
}
}

View File

@ -89,10 +89,10 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
format_settings.json.quote_denormals = settings.output_format_json_quote_denormals;
format_settings.null_as_default = settings.input_format_null_as_default;
format_settings.use_lowercase_column_name = settings.input_format_use_lowercase_column_name;
format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros;
format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size;
format_settings.parquet.import_nested = settings.input_format_parquet_import_nested;
format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching;
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
format_settings.pretty.color = settings.output_format_pretty_color;
@ -123,9 +123,11 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary;
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns;
format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching;
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
format_settings.seekable_read = settings.input_format_allow_seeks;

View File

@ -32,7 +32,6 @@ struct FormatSettings
bool null_as_default = true;
bool decimal_trailing_zeros = false;
bool defaults_for_omitted_fields = true;
bool use_lowercase_column_name = false;
bool seekable_read = true;
UInt64 max_rows_to_read_for_schema_inference = 100;
@ -75,6 +74,7 @@ struct FormatSettings
bool low_cardinality_as_dictionary = false;
bool import_nested = false;
bool allow_missing_columns = false;
bool case_insensitive_column_matching = false;
} arrow;
struct
@ -137,6 +137,7 @@ struct FormatSettings
UInt64 row_group_size = 1000000;
bool import_nested = false;
bool allow_missing_columns = false;
bool case_insensitive_column_matching = false;
} parquet;
struct Pretty
@ -217,6 +218,7 @@ struct FormatSettings
bool import_nested = false;
bool allow_missing_columns = false;
int64_t row_batch_size = 100'000;
bool case_insensitive_column_matching = false;
} orc;
/// For capnProto format we should determine how to

View File

@ -139,7 +139,11 @@ void ArrowBlockInputFormat::prepareReader()
}
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(
getPort().getHeader(), "Arrow", format_settings.arrow.import_nested, format_settings.arrow.allow_missing_columns);
getPort().getHeader(),
"Arrow",
format_settings.arrow.import_nested,
format_settings.arrow.allow_missing_columns,
format_settings.arrow.case_insensitive_column_matching);
missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema);
if (stream)

View File

@ -31,6 +31,7 @@
#include <algorithm>
#include <arrow/builder.h>
#include <arrow/array.h>
#include <boost/algorithm/string/case_conv.hpp>
/// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn.
#define FOR_ARROW_NUMERIC_TYPES(M) \
@ -484,19 +485,22 @@ static void checkStatus(const arrow::Status & status, const String & column_name
throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()};
}
Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, const Block * hint_header)
Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(
const arrow::Schema & schema, const std::string & format_name, const Block * hint_header, bool ignore_case)
{
ColumnsWithTypeAndName sample_columns;
std::unordered_set<String> nested_table_names;
if (hint_header)
nested_table_names = Nested::getAllTableNames(*hint_header);
nested_table_names = Nested::getAllTableNames(*hint_header, ignore_case);
for (const auto & field : schema.fields())
{
if (hint_header && !hint_header->has(field->name()) && !nested_table_names.contains(field->name()))
if (hint_header && !hint_header->has(field->name(), ignore_case)
&& !nested_table_names.contains(ignore_case ? boost::to_lower_copy(field->name()) : field->name()))
continue;
/// Create empty arrow column by it's type and convert it to ClickHouse column.
arrow::MemoryPool* pool = arrow::default_memory_pool();
arrow::MemoryPool * pool = arrow::default_memory_pool();
std::unique_ptr<arrow::ArrayBuilder> array_builder;
arrow::Status status = MakeBuilder(pool, field->type(), &array_builder);
checkStatus(status, field->name(), format_name);
@ -516,20 +520,31 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema,
}
ArrowColumnToCHColumn::ArrowColumnToCHColumn(
const Block & header_, const std::string & format_name_, bool import_nested_, bool allow_missing_columns_)
: header(header_), format_name(format_name_), import_nested(import_nested_), allow_missing_columns(allow_missing_columns_)
const Block & header_,
const std::string & format_name_,
bool import_nested_,
bool allow_missing_columns_,
bool case_insensitive_matching_)
: header(header_)
, format_name(format_name_)
, import_nested(import_nested_)
, allow_missing_columns(allow_missing_columns_)
, case_insensitive_matching(case_insensitive_matching_)
{
}
void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table)
{
NameToColumnPtr name_to_column_ptr;
for (const auto & column_name : table->ColumnNames())
for (auto column_name : table->ColumnNames())
{
std::shared_ptr<arrow::ChunkedArray> arrow_column = table->GetColumnByName(column_name);
if (!arrow_column)
throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Column '{}' is duplicated", column_name);
name_to_column_ptr[column_name] = arrow_column;
if (case_insensitive_matching)
boost::to_lower(column_name);
name_to_column_ptr[std::move(column_name)] = arrow_column;
}
arrowColumnsToCHChunk(res, name_to_column_ptr);
@ -548,22 +563,31 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
{
const ColumnWithTypeAndName & header_column = header.getByPosition(column_i);
auto search_column_name = header_column.name;
if (case_insensitive_matching)
boost::to_lower(search_column_name);
bool read_from_nested = false;
String nested_table_name = Nested::extractTableName(header_column.name);
if (!name_to_column_ptr.contains(header_column.name))
String search_nested_table_name = nested_table_name;
if (case_insensitive_matching)
boost::to_lower(search_nested_table_name);
if (!name_to_column_ptr.contains(search_column_name))
{
/// Check if it's a column from nested table.
if (import_nested && name_to_column_ptr.contains(nested_table_name))
if (import_nested && name_to_column_ptr.contains(search_nested_table_name))
{
if (!nested_tables.contains(nested_table_name))
if (!nested_tables.contains(search_nested_table_name))
{
std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[nested_table_name];
ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)};
std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[search_nested_table_name];
ColumnsWithTypeAndName cols
= {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)};
Block block(cols);
nested_tables[nested_table_name] = std::make_shared<Block>(Nested::flatten(block));
nested_tables[search_nested_table_name] = std::make_shared<Block>(Nested::flatten(block));
}
read_from_nested = nested_tables[nested_table_name]->has(header_column.name);
read_from_nested = nested_tables[search_nested_table_name]->has(header_column.name, case_insensitive_matching);
}
if (!read_from_nested)
@ -580,13 +604,19 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
}
}
std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[header_column.name];
ColumnWithTypeAndName column;
if (read_from_nested)
column = nested_tables[nested_table_name]->getByName(header_column.name);
{
column = nested_tables[search_nested_table_name]->getByName(header_column.name, case_insensitive_matching);
if (case_insensitive_matching)
column.name = header_column.name;
}
else
{
auto arrow_column = name_to_column_ptr[search_column_name];
column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true);
}
try
{
@ -594,8 +624,11 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
}
catch (Exception & e)
{
e.addMessage(fmt::format("while converting column {} from type {} to type {}",
backQuote(header_column.name), column.type->getName(), header_column.type->getName()));
e.addMessage(fmt::format(
"while converting column {} from type {} to type {}",
backQuote(header_column.name),
column.type->getName(),
header_column.type->getName()));
throw;
}
@ -609,22 +642,23 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
std::vector<size_t> ArrowColumnToCHColumn::getMissingColumns(const arrow::Schema & schema) const
{
std::vector<size_t> missing_columns;
auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header);
auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header, case_insensitive_matching);
auto flatten_block_from_arrow = Nested::flatten(block_from_arrow);
for (size_t i = 0, columns = header.columns(); i < columns; ++i)
{
const auto & column = header.getByPosition(i);
const auto & header_column = header.getByPosition(i);
bool read_from_nested = false;
String nested_table_name = Nested::extractTableName(column.name);
if (!block_from_arrow.has(column.name))
String nested_table_name = Nested::extractTableName(header_column.name);
if (!block_from_arrow.has(header_column.name, case_insensitive_matching))
{
if (import_nested && block_from_arrow.has(nested_table_name))
read_from_nested = flatten_block_from_arrow.has(column.name);
if (import_nested && block_from_arrow.has(nested_table_name, case_insensitive_matching))
read_from_nested = flatten_block_from_arrow.has(header_column.name, case_insensitive_matching);
if (!read_from_nested)
{
if (!allow_missing_columns)
throw Exception{ErrorCodes::THERE_IS_NO_COLUMN, "Column '{}' is not presented in input data.", column.name};
throw Exception{ErrorCodes::THERE_IS_NO_COLUMN, "Column '{}' is not presented in input data.", header_column.name};
missing_columns.push_back(i);
}

View File

@ -25,7 +25,8 @@ public:
const Block & header_,
const std::string & format_name_,
bool import_nested_,
bool allow_missing_columns_);
bool allow_missing_columns_,
bool case_insensitive_matching_ = false);
void arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table);
@ -36,7 +37,8 @@ public:
/// Transform arrow schema to ClickHouse header. If hint_header is provided,
/// we will skip columns in schema that are not in hint_header.
static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr);
static Block arrowSchemaToCHHeader(
const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr, bool ignore_case = false);
private:
const Block & header;
@ -44,6 +46,7 @@ private:
bool import_nested;
/// If false, throw exception if some columns in header not exists in arrow table.
bool allow_missing_columns;
bool case_insensitive_matching;
/// Map {column name : dictionary column}.
/// To avoid converting dictionary from Arrow Dictionary

View File

@ -53,9 +53,6 @@ Chunk ORCBlockInputFormat::generate()
if (!table || !table->num_rows())
return res;
if (format_settings.use_lowercase_column_name)
table = *table->RenameColumns(include_column_names);
arrow_column_to_ch_column->arrowTableToCHChunk(res, table);
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
/// Otherwise fill the missing columns with zero values of its type.
@ -73,7 +70,6 @@ void ORCBlockInputFormat::resetParser()
file_reader.reset();
include_indices.clear();
include_column_names.clear();
block_missing_values.clear();
}
@ -125,20 +121,6 @@ static void getFileReaderAndSchema(
if (!read_schema_result.ok())
throw Exception(read_schema_result.status().ToString(), ErrorCodes::BAD_ARGUMENTS);
schema = std::move(read_schema_result).ValueOrDie();
if (format_settings.use_lowercase_column_name)
{
std::vector<std::shared_ptr<::arrow::Field>> fields;
fields.reserve(schema->num_fields());
for (int i = 0; i < schema->num_fields(); ++i)
{
const auto& field = schema->field(i);
auto name = field->name();
boost::to_lower(name);
fields.push_back(field->WithName(name));
}
schema = arrow::schema(fields, schema->metadata());
}
}
void ORCBlockInputFormat::prepareReader()
@ -149,12 +131,17 @@ void ORCBlockInputFormat::prepareReader()
return;
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(
getPort().getHeader(), "ORC", format_settings.orc.import_nested, format_settings.orc.allow_missing_columns);
getPort().getHeader(),
"ORC",
format_settings.orc.import_nested,
format_settings.orc.allow_missing_columns,
format_settings.orc.case_insensitive_column_matching);
missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema);
const bool ignore_case = format_settings.orc.case_insensitive_column_matching;
std::unordered_set<String> nested_table_names;
if (format_settings.orc.import_nested)
nested_table_names = Nested::getAllTableNames(getPort().getHeader());
nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case);
/// In ReadStripe column indices should be started from 1,
/// because 0 indicates to select all columns.
@ -165,19 +152,18 @@ void ORCBlockInputFormat::prepareReader()
/// so we should recursively count the number of indices we need for this type.
int indexes_count = countIndicesForType(schema->field(i)->type());
const auto & name = schema->field(i)->name();
if (getPort().getHeader().has(name) || nested_table_names.contains(name))
if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name))
{
for (int j = 0; j != indexes_count; ++j)
{
include_indices.push_back(index + j);
include_column_names.push_back(name);
}
}
index += indexes_count;
}
}
ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_)
ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
: ISchemaReader(in_), format_settings(format_settings_)
{
}

View File

@ -47,7 +47,6 @@ private:
// indices of columns to read from ORC file
std::vector<int> include_indices;
std::vector<String> include_column_names;
std::vector<size_t> missing_columns;
BlockMissingValues block_missing_values;

View File

@ -53,11 +53,7 @@ Chunk ParquetBlockInputFormat::generate()
std::shared_ptr<arrow::Table> table;
arrow::Status read_status = file_reader->ReadRowGroup(row_group_current, column_indices, &table);
if (!read_status.ok())
throw ParsingException{"Error while reading Parquet data: " + read_status.ToString(),
ErrorCodes::CANNOT_READ_ALL_DATA};
if (format_settings.use_lowercase_column_name)
table = *table->RenameColumns(column_names);
throw ParsingException{"Error while reading Parquet data: " + read_status.ToString(), ErrorCodes::CANNOT_READ_ALL_DATA};
++row_group_current;
@ -78,7 +74,6 @@ void ParquetBlockInputFormat::resetParser()
file_reader.reset();
column_indices.clear();
column_names.clear();
row_group_current = 0;
block_missing_values.clear();
}
@ -123,20 +118,6 @@ static void getFileReaderAndSchema(
return;
THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), arrow::default_memory_pool(), &file_reader));
THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema));
if (format_settings.use_lowercase_column_name)
{
std::vector<std::shared_ptr<::arrow::Field>> fields;
fields.reserve(schema->num_fields());
for (int i = 0; i < schema->num_fields(); ++i)
{
const auto& field = schema->field(i);
auto name = field->name();
boost::to_lower(name);
fields.push_back(field->WithName(name));
}
schema = arrow::schema(fields, schema->metadata());
}
}
void ParquetBlockInputFormat::prepareReader()
@ -149,12 +130,18 @@ void ParquetBlockInputFormat::prepareReader()
row_group_total = file_reader->num_row_groups();
row_group_current = 0;
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested, format_settings.parquet.allow_missing_columns);
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(
getPort().getHeader(),
"Parquet",
format_settings.parquet.import_nested,
format_settings.parquet.allow_missing_columns,
format_settings.parquet.case_insensitive_column_matching);
missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema);
const bool ignore_case = format_settings.parquet.case_insensitive_column_matching;
std::unordered_set<String> nested_table_names;
if (format_settings.parquet.import_nested)
nested_table_names = Nested::getAllTableNames(getPort().getHeader());
nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case);
int index = 0;
for (int i = 0; i < schema->num_fields(); ++i)
@ -164,19 +151,19 @@ void ParquetBlockInputFormat::prepareReader()
/// count the number of indices we need for this type.
int indexes_count = countIndicesForType(schema->field(i)->type());
const auto & name = schema->field(i)->name();
if (getPort().getHeader().has(name) || nested_table_names.contains(name))
if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name))
{
for (int j = 0; j != indexes_count; ++j)
{
column_indices.push_back(index + j);
column_names.push_back(name);
}
}
index += indexes_count;
}
}
ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_)
ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
: ISchemaReader(in_), format_settings(format_settings_)
{
}

View File

@ -40,7 +40,6 @@ private:
int row_group_total = 0;
// indices of columns to read from Parquet file
std::vector<int> column_indices;
std::vector<String> column_names;
std::unique_ptr<ArrowColumnToCHColumn> arrow_column_to_ch_column;
int row_group_current = 0;
std::vector<size_t> missing_columns;

View File

@ -88,6 +88,9 @@ idx10 ['This','is','a','test']
22
23
24
=== Try load data from case_insensitive_column_matching.parquet
123 1
456 2
=== Try load data from datapage_v2.snappy.parquet
Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unknown encoding type.: While executing ParquetBlockInputFormat: data for INSERT was parsed from stdin: (in query: INSERT INTO parquet_load FORMAT Parquet). (CANNOT_READ_ALL_DATA)
@ -339,9 +342,6 @@ Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unkno
(NULL)
=== Try load data from single_nan.parquet
\N
=== Try load data from test_setting_input_format_use_lowercase_column_name.parquet
123 1
456 2
=== Try load data from userdata1.parquet
1454486129 1 Amanda Jordan ajordan0@com.com Female 1.197.201.2 6759521864920116 Indonesia 3/8/1971 49756.53 Internal Auditor 1E+02
1454519043 2 Albert Freeman afreeman1@is.gd Male 218.111.175.34 Canada 1/16/1968 150280.17 Accountant IV

View File

@ -1,22 +0,0 @@
#!/usr/bin/env bash
# Tags: no-ubsan, no-fasttest
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
echo "Parquet"
DATA_FILE=$CUR_DIR/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load (id String, score Int32) ENGINE = Memory"
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO parquet_load FORMAT Parquet SETTINGS input_format_use_lowercase_column_name=true"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load"
${CLICKHOUSE_CLIENT} --query="drop table parquet_load"
echo "ORC"
DATA_FILE=$CUR_DIR/data_orc/test_setting_input_format_use_lowercase_column_name.orc
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (id String, score Int32) ENGINE = Memory"
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_load FORMAT ORC SETTINGS input_format_use_lowercase_column_name=true"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_load"
${CLICKHOUSE_CLIENT} --query="drop table orc_load"

View File

@ -5,23 +5,25 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -q "drop table if exists test_02241"
$CLICKHOUSE_CLIENT -q "create table test_02241 (image_path Nullable(String),
caption Nullable(String),
NSFW Nullable(String),
similarity Nullable(Float64),
LICENSE Nullable(String),
url Nullable(String),
key Nullable(UInt64),
shard_id Nullable(UInt64),
status Nullable(String),
width Nullable(UInt32),
height Nullable(UInt32),
exif Nullable(String),
original_width Nullable(UInt32),
original_height Nullable(UInt32)) engine=Memory"
for case_insensitive in "true" "false"; do
$CLICKHOUSE_CLIENT -q "drop table if exists test_02241"
$CLICKHOUSE_CLIENT -q "create table test_02241 (image_path Nullable(String),
caption Nullable(String),
NSFW Nullable(String),
similarity Nullable(Float64),
LICENSE Nullable(String),
url Nullable(String),
key Nullable(UInt64),
shard_id Nullable(UInt64),
status Nullable(String),
width Nullable(UInt32),
height Nullable(UInt32),
exif Nullable(String),
original_width Nullable(UInt32),
original_height Nullable(UInt32)) engine=Memory"
cat $CUR_DIR/data_parquet_bad_column/metadata_0.parquet | $CLICKHOUSE_CLIENT -q "insert into test_02241 format Parquet"
cat $CUR_DIR/data_parquet_bad_column/metadata_0.parquet | $CLICKHOUSE_CLIENT -q "insert into test_02241 format Parquet SETTINGS input_format_parquet_case_insensitive_column_matching=$case_insensitive"
$CLICKHOUSE_CLIENT -q "select count() from test_02241"
$CLICKHOUSE_CLIENT -q "drop table test_02241"
$CLICKHOUSE_CLIENT -q "select count() from test_02241"
$CLICKHOUSE_CLIENT -q "drop table test_02241"
done

View File

@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Tags: no-ubsan, no-fasttest
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
echo "Parquet"
DATA_FILE=$CUR_DIR/data_parquet/case_insensitive_column_matching.parquet
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load (iD String, scOre Int32) ENGINE = Memory"
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO parquet_load FORMAT Parquet SETTINGS input_format_parquet_case_insensitive_column_matching=true"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load"
${CLICKHOUSE_CLIENT} --query="drop table parquet_load"
echo "ORC"
DATA_FILE=$CUR_DIR/data_orc/case_insensitive_column_matching.orc
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (iD String, sCorE Int32) ENGINE = Memory"
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_load FORMAT ORC SETTINGS input_format_orc_case_insensitive_column_matching=true"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_load"
${CLICKHOUSE_CLIENT} --query="drop table orc_load"
echo "Arrow"
DATA_FILE=$CUR_DIR/data_arrow/case_insensitive_column_matching.arrow
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_load"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_load (iD String, sCorE Int32) ENGINE = Memory"
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_load FORMAT Arrow SETTINGS input_format_arrow_case_insensitive_column_matching=true"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_load"
${CLICKHOUSE_CLIENT} --query="drop table arrow_load"

View File

@ -0,0 +1,12 @@
Arrow
[1,2,3] ['123','456','789'] [9.8,10.12,11.14]
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414]
[7,8,9] ['101','415','118'] [13.08,1.12,0.414]
Parquet
[1,2,3] ['123','456','789'] [9.8,10.12,11.14]
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414]
[7,8,9] ['101','415','118'] [13.08,1.12,0.414]
ORC
[1,2,3] ['123','456','789'] [9.8,10.12,11.14]
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414]
[7,8,9] ['101','415','118'] [13.08,1.12,0.414]

View File

@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Tags: no-fasttest
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS nested_table"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS nested_nested_table"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE nested_table (table Nested(eLeM1 Int32, elEm2 String, ELEM3 Float32)) engine=Memory"
formats=('Arrow' 'Parquet' 'ORC')
format_files=('arrow' 'parquet' 'orc')
for ((i = 0; i < 3; i++)) do
echo ${formats[i]}
${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE nested_table"
cat $CUR_DIR/data_orc_arrow_parquet_nested/nested_table.${format_files[i]} | ${CLICKHOUSE_CLIENT} -q "INSERT INTO nested_table FORMAT ${formats[i]} SETTINGS input_format_${format_files[i]}_import_nested = 1, input_format_${format_files[i]}_case_insensitive_column_matching = true"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM nested_table"
done
${CLICKHOUSE_CLIENT} --query="DROP TABLE nested_table"