mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
Merge pull request #35459 from ClickHouse/case-insensitive-column-matching
Support for case insensitive column matching for ORC/Arrow/Parquet files
This commit is contained in:
commit
f6439efcad
@ -13,6 +13,7 @@
|
||||
|
||||
#include <iterator>
|
||||
#include <base/sort.h>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -269,8 +270,18 @@ const ColumnWithTypeAndName & Block::safeGetByPosition(size_t position) const
|
||||
}
|
||||
|
||||
|
||||
const ColumnWithTypeAndName * Block::findByName(const std::string & name) const
|
||||
const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool case_insensitive) const
|
||||
{
|
||||
if (case_insensitive)
|
||||
{
|
||||
auto found = std::find_if(data.begin(), data.end(), [&](const auto & column) { return boost::iequals(column.name, name); });
|
||||
if (found == data.end())
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
return &*found;
|
||||
}
|
||||
|
||||
auto it = index_by_name.find(name);
|
||||
if (index_by_name.end() == it)
|
||||
{
|
||||
@ -280,19 +291,23 @@ const ColumnWithTypeAndName * Block::findByName(const std::string & name) const
|
||||
}
|
||||
|
||||
|
||||
const ColumnWithTypeAndName & Block::getByName(const std::string & name) const
|
||||
const ColumnWithTypeAndName & Block::getByName(const std::string & name, bool case_insensitive) const
|
||||
{
|
||||
const auto * result = findByName(name);
|
||||
const auto * result = findByName(name, case_insensitive);
|
||||
if (!result)
|
||||
throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames()
|
||||
, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK);
|
||||
throw Exception(
|
||||
"Not found column " + name + " in block. There are only columns: " + dumpNames(), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK);
|
||||
|
||||
return *result;
|
||||
}
|
||||
|
||||
|
||||
bool Block::has(const std::string & name) const
|
||||
bool Block::has(const std::string & name, bool case_insensitive) const
|
||||
{
|
||||
if (case_insensitive)
|
||||
return std::find_if(data.begin(), data.end(), [&](const auto & column) { return boost::iequals(column.name, name); })
|
||||
!= data.end();
|
||||
|
||||
return index_by_name.end() != index_by_name.find(name);
|
||||
}
|
||||
|
||||
@ -301,8 +316,8 @@ size_t Block::getPositionByName(const std::string & name) const
|
||||
{
|
||||
auto it = index_by_name.find(name);
|
||||
if (index_by_name.end() == it)
|
||||
throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames()
|
||||
, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK);
|
||||
throw Exception(
|
||||
"Not found column " + name + " in block. There are only columns: " + dumpNames(), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK);
|
||||
|
||||
return it->second;
|
||||
}
|
||||
|
@ -60,21 +60,21 @@ public:
|
||||
ColumnWithTypeAndName & safeGetByPosition(size_t position);
|
||||
const ColumnWithTypeAndName & safeGetByPosition(size_t position) const;
|
||||
|
||||
ColumnWithTypeAndName* findByName(const std::string & name)
|
||||
ColumnWithTypeAndName* findByName(const std::string & name, bool case_insensitive = false)
|
||||
{
|
||||
return const_cast<ColumnWithTypeAndName *>(
|
||||
const_cast<const Block *>(this)->findByName(name));
|
||||
const_cast<const Block *>(this)->findByName(name, case_insensitive));
|
||||
}
|
||||
|
||||
const ColumnWithTypeAndName * findByName(const std::string & name) const;
|
||||
const ColumnWithTypeAndName * findByName(const std::string & name, bool case_insensitive = false) const;
|
||||
|
||||
ColumnWithTypeAndName & getByName(const std::string & name)
|
||||
ColumnWithTypeAndName & getByName(const std::string & name, bool case_insensitive = false)
|
||||
{
|
||||
return const_cast<ColumnWithTypeAndName &>(
|
||||
const_cast<const Block *>(this)->getByName(name));
|
||||
const_cast<const Block *>(this)->getByName(name, case_insensitive));
|
||||
}
|
||||
|
||||
const ColumnWithTypeAndName & getByName(const std::string & name) const;
|
||||
const ColumnWithTypeAndName & getByName(const std::string & name, bool case_insensitive = false) const;
|
||||
|
||||
Container::iterator begin() { return data.begin(); }
|
||||
Container::iterator end() { return data.end(); }
|
||||
@ -83,7 +83,7 @@ public:
|
||||
Container::const_iterator cbegin() const { return data.cbegin(); }
|
||||
Container::const_iterator cend() const { return data.cend(); }
|
||||
|
||||
bool has(const std::string & name) const;
|
||||
bool has(const std::string & name, bool case_insensitive = false) const;
|
||||
|
||||
size_t getPositionByName(const std::string & name) const;
|
||||
|
||||
|
@ -616,11 +616,13 @@ class IColumn;
|
||||
M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \
|
||||
M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \
|
||||
M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \
|
||||
M(Bool, input_format_use_lowercase_column_name, false, "Use lowercase column name while reading input formats", 0) \
|
||||
M(Bool, input_format_arrow_import_nested, false, "Allow to insert array of structs into Nested table in Arrow input format.", 0) \
|
||||
M(Bool, input_format_arrow_case_insensitive_column_matching, false, "Ignore case when matching Arrow columns with CH columns.", 0) \
|
||||
M(Bool, input_format_orc_import_nested, false, "Allow to insert array of structs into Nested table in ORC input format.", 0) \
|
||||
M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \
|
||||
M(Bool, input_format_orc_case_insensitive_column_matching, false, "Ignore case when matching ORC columns with CH columns.", 0) \
|
||||
M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \
|
||||
M(Bool, input_format_parquet_case_insensitive_column_matching, false, "Ignore case when matching Parquet columns with CH columns.", 0) \
|
||||
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
|
||||
M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \
|
||||
M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \
|
||||
|
@ -15,6 +15,8 @@
|
||||
|
||||
#include <Parsers/IAST.h>
|
||||
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -227,14 +229,17 @@ void validateArraySizes(const Block & block)
|
||||
}
|
||||
|
||||
|
||||
std::unordered_set<String> getAllTableNames(const Block & block)
|
||||
std::unordered_set<String> getAllTableNames(const Block & block, bool to_lower_case)
|
||||
{
|
||||
std::unordered_set<String> nested_table_names;
|
||||
for (auto & name : block.getNames())
|
||||
for (const auto & name : block.getNames())
|
||||
{
|
||||
auto nested_table_name = Nested::extractTableName(name);
|
||||
if (to_lower_case)
|
||||
boost::to_lower(nested_table_name);
|
||||
|
||||
if (!nested_table_name.empty())
|
||||
nested_table_names.insert(nested_table_name);
|
||||
nested_table_names.insert(std::move(nested_table_name));
|
||||
}
|
||||
return nested_table_names;
|
||||
}
|
||||
|
@ -32,7 +32,7 @@ namespace Nested
|
||||
void validateArraySizes(const Block & block);
|
||||
|
||||
/// Get all nested tables names from a block.
|
||||
std::unordered_set<String> getAllTableNames(const Block & block);
|
||||
std::unordered_set<String> getAllTableNames(const Block & block, bool to_lower_case = false);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -89,10 +89,10 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
|
||||
format_settings.json.quote_denormals = settings.output_format_json_quote_denormals;
|
||||
format_settings.null_as_default = settings.input_format_null_as_default;
|
||||
format_settings.use_lowercase_column_name = settings.input_format_use_lowercase_column_name;
|
||||
format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros;
|
||||
format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size;
|
||||
format_settings.parquet.import_nested = settings.input_format_parquet_import_nested;
|
||||
format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching;
|
||||
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
|
||||
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
|
||||
format_settings.pretty.color = settings.output_format_pretty_color;
|
||||
@ -123,9 +123,11 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary;
|
||||
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
|
||||
format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns;
|
||||
format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching;
|
||||
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
|
||||
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
||||
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
||||
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
|
||||
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
|
||||
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
|
||||
format_settings.seekable_read = settings.input_format_allow_seeks;
|
||||
|
@ -32,7 +32,6 @@ struct FormatSettings
|
||||
bool null_as_default = true;
|
||||
bool decimal_trailing_zeros = false;
|
||||
bool defaults_for_omitted_fields = true;
|
||||
bool use_lowercase_column_name = false;
|
||||
|
||||
bool seekable_read = true;
|
||||
UInt64 max_rows_to_read_for_schema_inference = 100;
|
||||
@ -75,6 +74,7 @@ struct FormatSettings
|
||||
bool low_cardinality_as_dictionary = false;
|
||||
bool import_nested = false;
|
||||
bool allow_missing_columns = false;
|
||||
bool case_insensitive_column_matching = false;
|
||||
} arrow;
|
||||
|
||||
struct
|
||||
@ -137,6 +137,7 @@ struct FormatSettings
|
||||
UInt64 row_group_size = 1000000;
|
||||
bool import_nested = false;
|
||||
bool allow_missing_columns = false;
|
||||
bool case_insensitive_column_matching = false;
|
||||
} parquet;
|
||||
|
||||
struct Pretty
|
||||
@ -217,6 +218,7 @@ struct FormatSettings
|
||||
bool import_nested = false;
|
||||
bool allow_missing_columns = false;
|
||||
int64_t row_batch_size = 100'000;
|
||||
bool case_insensitive_column_matching = false;
|
||||
} orc;
|
||||
|
||||
/// For capnProto format we should determine how to
|
||||
|
@ -139,7 +139,11 @@ void ArrowBlockInputFormat::prepareReader()
|
||||
}
|
||||
|
||||
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(
|
||||
getPort().getHeader(), "Arrow", format_settings.arrow.import_nested, format_settings.arrow.allow_missing_columns);
|
||||
getPort().getHeader(),
|
||||
"Arrow",
|
||||
format_settings.arrow.import_nested,
|
||||
format_settings.arrow.allow_missing_columns,
|
||||
format_settings.arrow.case_insensitive_column_matching);
|
||||
missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema);
|
||||
|
||||
if (stream)
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include <algorithm>
|
||||
#include <arrow/builder.h>
|
||||
#include <arrow/array.h>
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
|
||||
/// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn.
|
||||
#define FOR_ARROW_NUMERIC_TYPES(M) \
|
||||
@ -484,19 +485,22 @@ static void checkStatus(const arrow::Status & status, const String & column_name
|
||||
throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()};
|
||||
}
|
||||
|
||||
Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, const Block * hint_header)
|
||||
Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(
|
||||
const arrow::Schema & schema, const std::string & format_name, const Block * hint_header, bool ignore_case)
|
||||
{
|
||||
ColumnsWithTypeAndName sample_columns;
|
||||
std::unordered_set<String> nested_table_names;
|
||||
if (hint_header)
|
||||
nested_table_names = Nested::getAllTableNames(*hint_header);
|
||||
nested_table_names = Nested::getAllTableNames(*hint_header, ignore_case);
|
||||
|
||||
for (const auto & field : schema.fields())
|
||||
{
|
||||
if (hint_header && !hint_header->has(field->name()) && !nested_table_names.contains(field->name()))
|
||||
if (hint_header && !hint_header->has(field->name(), ignore_case)
|
||||
&& !nested_table_names.contains(ignore_case ? boost::to_lower_copy(field->name()) : field->name()))
|
||||
continue;
|
||||
|
||||
/// Create empty arrow column by it's type and convert it to ClickHouse column.
|
||||
arrow::MemoryPool* pool = arrow::default_memory_pool();
|
||||
arrow::MemoryPool * pool = arrow::default_memory_pool();
|
||||
std::unique_ptr<arrow::ArrayBuilder> array_builder;
|
||||
arrow::Status status = MakeBuilder(pool, field->type(), &array_builder);
|
||||
checkStatus(status, field->name(), format_name);
|
||||
@ -516,20 +520,31 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema,
|
||||
}
|
||||
|
||||
ArrowColumnToCHColumn::ArrowColumnToCHColumn(
|
||||
const Block & header_, const std::string & format_name_, bool import_nested_, bool allow_missing_columns_)
|
||||
: header(header_), format_name(format_name_), import_nested(import_nested_), allow_missing_columns(allow_missing_columns_)
|
||||
const Block & header_,
|
||||
const std::string & format_name_,
|
||||
bool import_nested_,
|
||||
bool allow_missing_columns_,
|
||||
bool case_insensitive_matching_)
|
||||
: header(header_)
|
||||
, format_name(format_name_)
|
||||
, import_nested(import_nested_)
|
||||
, allow_missing_columns(allow_missing_columns_)
|
||||
, case_insensitive_matching(case_insensitive_matching_)
|
||||
{
|
||||
}
|
||||
|
||||
void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table)
|
||||
{
|
||||
NameToColumnPtr name_to_column_ptr;
|
||||
for (const auto & column_name : table->ColumnNames())
|
||||
for (auto column_name : table->ColumnNames())
|
||||
{
|
||||
std::shared_ptr<arrow::ChunkedArray> arrow_column = table->GetColumnByName(column_name);
|
||||
if (!arrow_column)
|
||||
throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Column '{}' is duplicated", column_name);
|
||||
name_to_column_ptr[column_name] = arrow_column;
|
||||
|
||||
if (case_insensitive_matching)
|
||||
boost::to_lower(column_name);
|
||||
name_to_column_ptr[std::move(column_name)] = arrow_column;
|
||||
}
|
||||
|
||||
arrowColumnsToCHChunk(res, name_to_column_ptr);
|
||||
@ -548,22 +563,31 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
|
||||
{
|
||||
const ColumnWithTypeAndName & header_column = header.getByPosition(column_i);
|
||||
|
||||
auto search_column_name = header_column.name;
|
||||
if (case_insensitive_matching)
|
||||
boost::to_lower(search_column_name);
|
||||
|
||||
bool read_from_nested = false;
|
||||
String nested_table_name = Nested::extractTableName(header_column.name);
|
||||
if (!name_to_column_ptr.contains(header_column.name))
|
||||
String search_nested_table_name = nested_table_name;
|
||||
if (case_insensitive_matching)
|
||||
boost::to_lower(search_nested_table_name);
|
||||
|
||||
if (!name_to_column_ptr.contains(search_column_name))
|
||||
{
|
||||
/// Check if it's a column from nested table.
|
||||
if (import_nested && name_to_column_ptr.contains(nested_table_name))
|
||||
if (import_nested && name_to_column_ptr.contains(search_nested_table_name))
|
||||
{
|
||||
if (!nested_tables.contains(nested_table_name))
|
||||
if (!nested_tables.contains(search_nested_table_name))
|
||||
{
|
||||
std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[nested_table_name];
|
||||
ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)};
|
||||
std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[search_nested_table_name];
|
||||
ColumnsWithTypeAndName cols
|
||||
= {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)};
|
||||
Block block(cols);
|
||||
nested_tables[nested_table_name] = std::make_shared<Block>(Nested::flatten(block));
|
||||
nested_tables[search_nested_table_name] = std::make_shared<Block>(Nested::flatten(block));
|
||||
}
|
||||
|
||||
read_from_nested = nested_tables[nested_table_name]->has(header_column.name);
|
||||
read_from_nested = nested_tables[search_nested_table_name]->has(header_column.name, case_insensitive_matching);
|
||||
}
|
||||
|
||||
if (!read_from_nested)
|
||||
@ -580,13 +604,19 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[header_column.name];
|
||||
|
||||
ColumnWithTypeAndName column;
|
||||
if (read_from_nested)
|
||||
column = nested_tables[nested_table_name]->getByName(header_column.name);
|
||||
{
|
||||
column = nested_tables[search_nested_table_name]->getByName(header_column.name, case_insensitive_matching);
|
||||
if (case_insensitive_matching)
|
||||
column.name = header_column.name;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto arrow_column = name_to_column_ptr[search_column_name];
|
||||
column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true);
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
@ -594,8 +624,11 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
|
||||
}
|
||||
catch (Exception & e)
|
||||
{
|
||||
e.addMessage(fmt::format("while converting column {} from type {} to type {}",
|
||||
backQuote(header_column.name), column.type->getName(), header_column.type->getName()));
|
||||
e.addMessage(fmt::format(
|
||||
"while converting column {} from type {} to type {}",
|
||||
backQuote(header_column.name),
|
||||
column.type->getName(),
|
||||
header_column.type->getName()));
|
||||
throw;
|
||||
}
|
||||
|
||||
@ -609,22 +642,23 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
|
||||
std::vector<size_t> ArrowColumnToCHColumn::getMissingColumns(const arrow::Schema & schema) const
|
||||
{
|
||||
std::vector<size_t> missing_columns;
|
||||
auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header);
|
||||
auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header, case_insensitive_matching);
|
||||
auto flatten_block_from_arrow = Nested::flatten(block_from_arrow);
|
||||
|
||||
for (size_t i = 0, columns = header.columns(); i < columns; ++i)
|
||||
{
|
||||
const auto & column = header.getByPosition(i);
|
||||
const auto & header_column = header.getByPosition(i);
|
||||
bool read_from_nested = false;
|
||||
String nested_table_name = Nested::extractTableName(column.name);
|
||||
if (!block_from_arrow.has(column.name))
|
||||
String nested_table_name = Nested::extractTableName(header_column.name);
|
||||
if (!block_from_arrow.has(header_column.name, case_insensitive_matching))
|
||||
{
|
||||
if (import_nested && block_from_arrow.has(nested_table_name))
|
||||
read_from_nested = flatten_block_from_arrow.has(column.name);
|
||||
if (import_nested && block_from_arrow.has(nested_table_name, case_insensitive_matching))
|
||||
read_from_nested = flatten_block_from_arrow.has(header_column.name, case_insensitive_matching);
|
||||
|
||||
if (!read_from_nested)
|
||||
{
|
||||
if (!allow_missing_columns)
|
||||
throw Exception{ErrorCodes::THERE_IS_NO_COLUMN, "Column '{}' is not presented in input data.", column.name};
|
||||
throw Exception{ErrorCodes::THERE_IS_NO_COLUMN, "Column '{}' is not presented in input data.", header_column.name};
|
||||
|
||||
missing_columns.push_back(i);
|
||||
}
|
||||
|
@ -25,7 +25,8 @@ public:
|
||||
const Block & header_,
|
||||
const std::string & format_name_,
|
||||
bool import_nested_,
|
||||
bool allow_missing_columns_);
|
||||
bool allow_missing_columns_,
|
||||
bool case_insensitive_matching_ = false);
|
||||
|
||||
void arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table);
|
||||
|
||||
@ -36,7 +37,8 @@ public:
|
||||
|
||||
/// Transform arrow schema to ClickHouse header. If hint_header is provided,
|
||||
/// we will skip columns in schema that are not in hint_header.
|
||||
static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr);
|
||||
static Block arrowSchemaToCHHeader(
|
||||
const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr, bool ignore_case = false);
|
||||
|
||||
private:
|
||||
const Block & header;
|
||||
@ -44,6 +46,7 @@ private:
|
||||
bool import_nested;
|
||||
/// If false, throw exception if some columns in header not exists in arrow table.
|
||||
bool allow_missing_columns;
|
||||
bool case_insensitive_matching;
|
||||
|
||||
/// Map {column name : dictionary column}.
|
||||
/// To avoid converting dictionary from Arrow Dictionary
|
||||
|
@ -53,9 +53,6 @@ Chunk ORCBlockInputFormat::generate()
|
||||
if (!table || !table->num_rows())
|
||||
return res;
|
||||
|
||||
if (format_settings.use_lowercase_column_name)
|
||||
table = *table->RenameColumns(include_column_names);
|
||||
|
||||
arrow_column_to_ch_column->arrowTableToCHChunk(res, table);
|
||||
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
|
||||
/// Otherwise fill the missing columns with zero values of its type.
|
||||
@ -73,7 +70,6 @@ void ORCBlockInputFormat::resetParser()
|
||||
|
||||
file_reader.reset();
|
||||
include_indices.clear();
|
||||
include_column_names.clear();
|
||||
block_missing_values.clear();
|
||||
}
|
||||
|
||||
@ -125,20 +121,6 @@ static void getFileReaderAndSchema(
|
||||
if (!read_schema_result.ok())
|
||||
throw Exception(read_schema_result.status().ToString(), ErrorCodes::BAD_ARGUMENTS);
|
||||
schema = std::move(read_schema_result).ValueOrDie();
|
||||
|
||||
if (format_settings.use_lowercase_column_name)
|
||||
{
|
||||
std::vector<std::shared_ptr<::arrow::Field>> fields;
|
||||
fields.reserve(schema->num_fields());
|
||||
for (int i = 0; i < schema->num_fields(); ++i)
|
||||
{
|
||||
const auto& field = schema->field(i);
|
||||
auto name = field->name();
|
||||
boost::to_lower(name);
|
||||
fields.push_back(field->WithName(name));
|
||||
}
|
||||
schema = arrow::schema(fields, schema->metadata());
|
||||
}
|
||||
}
|
||||
|
||||
void ORCBlockInputFormat::prepareReader()
|
||||
@ -149,12 +131,17 @@ void ORCBlockInputFormat::prepareReader()
|
||||
return;
|
||||
|
||||
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(
|
||||
getPort().getHeader(), "ORC", format_settings.orc.import_nested, format_settings.orc.allow_missing_columns);
|
||||
getPort().getHeader(),
|
||||
"ORC",
|
||||
format_settings.orc.import_nested,
|
||||
format_settings.orc.allow_missing_columns,
|
||||
format_settings.orc.case_insensitive_column_matching);
|
||||
missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema);
|
||||
|
||||
const bool ignore_case = format_settings.orc.case_insensitive_column_matching;
|
||||
std::unordered_set<String> nested_table_names;
|
||||
if (format_settings.orc.import_nested)
|
||||
nested_table_names = Nested::getAllTableNames(getPort().getHeader());
|
||||
nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case);
|
||||
|
||||
/// In ReadStripe column indices should be started from 1,
|
||||
/// because 0 indicates to select all columns.
|
||||
@ -165,19 +152,18 @@ void ORCBlockInputFormat::prepareReader()
|
||||
/// so we should recursively count the number of indices we need for this type.
|
||||
int indexes_count = countIndicesForType(schema->field(i)->type());
|
||||
const auto & name = schema->field(i)->name();
|
||||
if (getPort().getHeader().has(name) || nested_table_names.contains(name))
|
||||
if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name))
|
||||
{
|
||||
for (int j = 0; j != indexes_count; ++j)
|
||||
{
|
||||
include_indices.push_back(index + j);
|
||||
include_column_names.push_back(name);
|
||||
}
|
||||
}
|
||||
|
||||
index += indexes_count;
|
||||
}
|
||||
}
|
||||
|
||||
ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_)
|
||||
ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
|
||||
: ISchemaReader(in_), format_settings(format_settings_)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -47,7 +47,6 @@ private:
|
||||
|
||||
// indices of columns to read from ORC file
|
||||
std::vector<int> include_indices;
|
||||
std::vector<String> include_column_names;
|
||||
|
||||
std::vector<size_t> missing_columns;
|
||||
BlockMissingValues block_missing_values;
|
||||
|
@ -53,11 +53,7 @@ Chunk ParquetBlockInputFormat::generate()
|
||||
std::shared_ptr<arrow::Table> table;
|
||||
arrow::Status read_status = file_reader->ReadRowGroup(row_group_current, column_indices, &table);
|
||||
if (!read_status.ok())
|
||||
throw ParsingException{"Error while reading Parquet data: " + read_status.ToString(),
|
||||
ErrorCodes::CANNOT_READ_ALL_DATA};
|
||||
|
||||
if (format_settings.use_lowercase_column_name)
|
||||
table = *table->RenameColumns(column_names);
|
||||
throw ParsingException{"Error while reading Parquet data: " + read_status.ToString(), ErrorCodes::CANNOT_READ_ALL_DATA};
|
||||
|
||||
++row_group_current;
|
||||
|
||||
@ -78,7 +74,6 @@ void ParquetBlockInputFormat::resetParser()
|
||||
|
||||
file_reader.reset();
|
||||
column_indices.clear();
|
||||
column_names.clear();
|
||||
row_group_current = 0;
|
||||
block_missing_values.clear();
|
||||
}
|
||||
@ -123,20 +118,6 @@ static void getFileReaderAndSchema(
|
||||
return;
|
||||
THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), arrow::default_memory_pool(), &file_reader));
|
||||
THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema));
|
||||
|
||||
if (format_settings.use_lowercase_column_name)
|
||||
{
|
||||
std::vector<std::shared_ptr<::arrow::Field>> fields;
|
||||
fields.reserve(schema->num_fields());
|
||||
for (int i = 0; i < schema->num_fields(); ++i)
|
||||
{
|
||||
const auto& field = schema->field(i);
|
||||
auto name = field->name();
|
||||
boost::to_lower(name);
|
||||
fields.push_back(field->WithName(name));
|
||||
}
|
||||
schema = arrow::schema(fields, schema->metadata());
|
||||
}
|
||||
}
|
||||
|
||||
void ParquetBlockInputFormat::prepareReader()
|
||||
@ -149,12 +130,18 @@ void ParquetBlockInputFormat::prepareReader()
|
||||
row_group_total = file_reader->num_row_groups();
|
||||
row_group_current = 0;
|
||||
|
||||
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested, format_settings.parquet.allow_missing_columns);
|
||||
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(
|
||||
getPort().getHeader(),
|
||||
"Parquet",
|
||||
format_settings.parquet.import_nested,
|
||||
format_settings.parquet.allow_missing_columns,
|
||||
format_settings.parquet.case_insensitive_column_matching);
|
||||
missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema);
|
||||
|
||||
const bool ignore_case = format_settings.parquet.case_insensitive_column_matching;
|
||||
std::unordered_set<String> nested_table_names;
|
||||
if (format_settings.parquet.import_nested)
|
||||
nested_table_names = Nested::getAllTableNames(getPort().getHeader());
|
||||
nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case);
|
||||
|
||||
int index = 0;
|
||||
for (int i = 0; i < schema->num_fields(); ++i)
|
||||
@ -164,19 +151,19 @@ void ParquetBlockInputFormat::prepareReader()
|
||||
/// count the number of indices we need for this type.
|
||||
int indexes_count = countIndicesForType(schema->field(i)->type());
|
||||
const auto & name = schema->field(i)->name();
|
||||
if (getPort().getHeader().has(name) || nested_table_names.contains(name))
|
||||
|
||||
if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name))
|
||||
{
|
||||
for (int j = 0; j != indexes_count; ++j)
|
||||
{
|
||||
column_indices.push_back(index + j);
|
||||
column_names.push_back(name);
|
||||
}
|
||||
}
|
||||
|
||||
index += indexes_count;
|
||||
}
|
||||
}
|
||||
|
||||
ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_)
|
||||
ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
|
||||
: ISchemaReader(in_), format_settings(format_settings_)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -40,7 +40,6 @@ private:
|
||||
int row_group_total = 0;
|
||||
// indices of columns to read from Parquet file
|
||||
std::vector<int> column_indices;
|
||||
std::vector<String> column_names;
|
||||
std::unique_ptr<ArrowColumnToCHColumn> arrow_column_to_ch_column;
|
||||
int row_group_current = 0;
|
||||
std::vector<size_t> missing_columns;
|
||||
|
@ -88,6 +88,9 @@ idx10 ['This','is','a','test']
|
||||
22
|
||||
23
|
||||
24
|
||||
=== Try load data from case_insensitive_column_matching.parquet
|
||||
123 1
|
||||
456 2
|
||||
=== Try load data from datapage_v2.snappy.parquet
|
||||
Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unknown encoding type.: While executing ParquetBlockInputFormat: data for INSERT was parsed from stdin: (in query: INSERT INTO parquet_load FORMAT Parquet). (CANNOT_READ_ALL_DATA)
|
||||
|
||||
@ -339,9 +342,6 @@ Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unkno
|
||||
(NULL)
|
||||
=== Try load data from single_nan.parquet
|
||||
\N
|
||||
=== Try load data from test_setting_input_format_use_lowercase_column_name.parquet
|
||||
123 1
|
||||
456 2
|
||||
=== Try load data from userdata1.parquet
|
||||
1454486129 1 Amanda Jordan ajordan0@com.com Female 1.197.201.2 6759521864920116 Indonesia 3/8/1971 49756.53 Internal Auditor 1E+02
|
||||
1454519043 2 Albert Freeman afreeman1@is.gd Male 218.111.175.34 Canada 1/16/1968 150280.17 Accountant IV
|
||||
|
@ -1,22 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-ubsan, no-fasttest
|
||||
|
||||
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CUR_DIR"/../shell_config.sh
|
||||
|
||||
echo "Parquet"
|
||||
DATA_FILE=$CUR_DIR/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet
|
||||
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load"
|
||||
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load (id String, score Int32) ENGINE = Memory"
|
||||
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO parquet_load FORMAT Parquet SETTINGS input_format_use_lowercase_column_name=true"
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load"
|
||||
${CLICKHOUSE_CLIENT} --query="drop table parquet_load"
|
||||
|
||||
echo "ORC"
|
||||
DATA_FILE=$CUR_DIR/data_orc/test_setting_input_format_use_lowercase_column_name.orc
|
||||
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load"
|
||||
${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (id String, score Int32) ENGINE = Memory"
|
||||
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_load FORMAT ORC SETTINGS input_format_use_lowercase_column_name=true"
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_load"
|
||||
${CLICKHOUSE_CLIENT} --query="drop table orc_load"
|
@ -1 +1,2 @@
|
||||
10
|
||||
10
|
||||
|
@ -5,23 +5,25 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CUR_DIR"/../shell_config.sh
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "drop table if exists test_02241"
|
||||
$CLICKHOUSE_CLIENT -q "create table test_02241 (image_path Nullable(String),
|
||||
caption Nullable(String),
|
||||
NSFW Nullable(String),
|
||||
similarity Nullable(Float64),
|
||||
LICENSE Nullable(String),
|
||||
url Nullable(String),
|
||||
key Nullable(UInt64),
|
||||
shard_id Nullable(UInt64),
|
||||
status Nullable(String),
|
||||
width Nullable(UInt32),
|
||||
height Nullable(UInt32),
|
||||
exif Nullable(String),
|
||||
original_width Nullable(UInt32),
|
||||
original_height Nullable(UInt32)) engine=Memory"
|
||||
for case_insensitive in "true" "false"; do
|
||||
$CLICKHOUSE_CLIENT -q "drop table if exists test_02241"
|
||||
$CLICKHOUSE_CLIENT -q "create table test_02241 (image_path Nullable(String),
|
||||
caption Nullable(String),
|
||||
NSFW Nullable(String),
|
||||
similarity Nullable(Float64),
|
||||
LICENSE Nullable(String),
|
||||
url Nullable(String),
|
||||
key Nullable(UInt64),
|
||||
shard_id Nullable(UInt64),
|
||||
status Nullable(String),
|
||||
width Nullable(UInt32),
|
||||
height Nullable(UInt32),
|
||||
exif Nullable(String),
|
||||
original_width Nullable(UInt32),
|
||||
original_height Nullable(UInt32)) engine=Memory"
|
||||
|
||||
cat $CUR_DIR/data_parquet_bad_column/metadata_0.parquet | $CLICKHOUSE_CLIENT -q "insert into test_02241 format Parquet"
|
||||
cat $CUR_DIR/data_parquet_bad_column/metadata_0.parquet | $CLICKHOUSE_CLIENT -q "insert into test_02241 format Parquet SETTINGS input_format_parquet_case_insensitive_column_matching=$case_insensitive"
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select count() from test_02241"
|
||||
$CLICKHOUSE_CLIENT -q "drop table test_02241"
|
||||
$CLICKHOUSE_CLIENT -q "select count() from test_02241"
|
||||
$CLICKHOUSE_CLIENT -q "drop table test_02241"
|
||||
done
|
||||
|
@ -4,3 +4,6 @@ Parquet
|
||||
ORC
|
||||
123 1
|
||||
456 2
|
||||
Arrow
|
||||
123 1
|
||||
456 2
|
30
tests/queries/0_stateless/02242_case_insensitive_column_matching.sh
Executable file
30
tests/queries/0_stateless/02242_case_insensitive_column_matching.sh
Executable file
@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-ubsan, no-fasttest
|
||||
|
||||
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CUR_DIR"/../shell_config.sh
|
||||
|
||||
echo "Parquet"
|
||||
DATA_FILE=$CUR_DIR/data_parquet/case_insensitive_column_matching.parquet
|
||||
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load"
|
||||
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load (iD String, scOre Int32) ENGINE = Memory"
|
||||
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO parquet_load FORMAT Parquet SETTINGS input_format_parquet_case_insensitive_column_matching=true"
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load"
|
||||
${CLICKHOUSE_CLIENT} --query="drop table parquet_load"
|
||||
|
||||
echo "ORC"
|
||||
DATA_FILE=$CUR_DIR/data_orc/case_insensitive_column_matching.orc
|
||||
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load"
|
||||
${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (iD String, sCorE Int32) ENGINE = Memory"
|
||||
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_load FORMAT ORC SETTINGS input_format_orc_case_insensitive_column_matching=true"
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_load"
|
||||
${CLICKHOUSE_CLIENT} --query="drop table orc_load"
|
||||
|
||||
echo "Arrow"
|
||||
DATA_FILE=$CUR_DIR/data_arrow/case_insensitive_column_matching.arrow
|
||||
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_load"
|
||||
${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_load (iD String, sCorE Int32) ENGINE = Memory"
|
||||
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_load FORMAT Arrow SETTINGS input_format_arrow_case_insensitive_column_matching=true"
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_load"
|
||||
${CLICKHOUSE_CLIENT} --query="drop table arrow_load"
|
@ -0,0 +1,12 @@
|
||||
Arrow
|
||||
[1,2,3] ['123','456','789'] [9.8,10.12,11.14]
|
||||
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414]
|
||||
[7,8,9] ['101','415','118'] [13.08,1.12,0.414]
|
||||
Parquet
|
||||
[1,2,3] ['123','456','789'] [9.8,10.12,11.14]
|
||||
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414]
|
||||
[7,8,9] ['101','415','118'] [13.08,1.12,0.414]
|
||||
ORC
|
||||
[1,2,3] ['123','456','789'] [9.8,10.12,11.14]
|
||||
[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414]
|
||||
[7,8,9] ['101','415','118'] [13.08,1.12,0.414]
|
26
tests/queries/0_stateless/02242_case_insensitive_nested.sh
Executable file
26
tests/queries/0_stateless/02242_case_insensitive_nested.sh
Executable file
@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-fasttest
|
||||
|
||||
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CUR_DIR"/../shell_config.sh
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS nested_table"
|
||||
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS nested_nested_table"
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query="CREATE TABLE nested_table (table Nested(eLeM1 Int32, elEm2 String, ELEM3 Float32)) engine=Memory"
|
||||
|
||||
formats=('Arrow' 'Parquet' 'ORC')
|
||||
format_files=('arrow' 'parquet' 'orc')
|
||||
|
||||
for ((i = 0; i < 3; i++)) do
|
||||
echo ${formats[i]}
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE nested_table"
|
||||
cat $CUR_DIR/data_orc_arrow_parquet_nested/nested_table.${format_files[i]} | ${CLICKHOUSE_CLIENT} -q "INSERT INTO nested_table FORMAT ${formats[i]} SETTINGS input_format_${format_files[i]}_import_nested = 1, input_format_${format_files[i]}_case_insensitive_column_matching = true"
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT * FROM nested_table"
|
||||
|
||||
done
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query="DROP TABLE nested_table"
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user