mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 16:50:48 +00:00
Fix reading columns that are not present in the input data in Parquet/ORC formats
This commit is contained in:
parent
a4525bb98f
commit
e262e375dc
@ -71,7 +71,7 @@ Chunk ArrowBlockInputFormat::generate()
|
||||
|
||||
++record_batch_current;
|
||||
|
||||
arrow_column_to_ch_column->arrowTableToCHChunk(res, *table_result);
|
||||
arrow_column_to_ch_column->arrowTableToCHChunk(res, *table_result, (*table_result)->num_rows());
|
||||
|
||||
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
|
||||
/// Otherwise fill the missing columns with zero values of their type.
|
||||
|
@ -810,7 +810,7 @@ ArrowColumnToCHColumn::ArrowColumnToCHColumn(
|
||||
{
|
||||
}
|
||||
|
||||
void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table)
|
||||
void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table, size_t num_rows)
|
||||
{
|
||||
NameToColumnPtr name_to_column_ptr;
|
||||
for (auto column_name : table->ColumnNames())
|
||||
@ -824,16 +824,12 @@ void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arr
|
||||
name_to_column_ptr[std::move(column_name)] = arrow_column;
|
||||
}
|
||||
|
||||
arrowColumnsToCHChunk(res, name_to_column_ptr);
|
||||
arrowColumnsToCHChunk(res, name_to_column_ptr, num_rows);
|
||||
}
|
||||
|
||||
void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr)
|
||||
void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows)
|
||||
{
|
||||
if (unlikely(name_to_column_ptr.empty()))
|
||||
throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Columns is empty");
|
||||
|
||||
Columns columns_list;
|
||||
UInt64 num_rows = name_to_column_ptr.begin()->second->length();
|
||||
columns_list.reserve(header.columns());
|
||||
std::unordered_map<String, std::pair<BlockPtr, std::shared_ptr<NestedColumnExtractHelper>>> nested_tables;
|
||||
bool skipped = false;
|
||||
|
@ -28,9 +28,9 @@ public:
|
||||
bool allow_missing_columns_,
|
||||
bool case_insensitive_matching_ = false);
|
||||
|
||||
void arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table);
|
||||
void arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table, size_t num_rows);
|
||||
|
||||
void arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr);
|
||||
void arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows);
|
||||
|
||||
/// Get missing columns that exist in header but not in arrow::Schema
|
||||
std::vector<size_t> getMissingColumns(const arrow::Schema & schema) const;
|
||||
|
@ -54,14 +54,19 @@ Chunk ORCBlockInputFormat::generate()
|
||||
throw ParsingException(
|
||||
ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of ORC data: {}", table_result.status().ToString());
|
||||
|
||||
/// We should extract the number of rows directly from the stripe, because when the
|
||||
/// record batch contains 0 columns (for example if we requested only columns that
|
||||
/// are not present in the data) the number of rows in the record batch will be 0.
|
||||
size_t num_rows = file_reader->GetRawORCReader()->getStripe(stripe_current)->getNumberOfRows();
|
||||
|
||||
auto table = table_result.ValueOrDie();
|
||||
if (!table || !table->num_rows())
|
||||
if (!table || !num_rows)
|
||||
return {};
|
||||
|
||||
++stripe_current;
|
||||
|
||||
Chunk res;
|
||||
arrow_column_to_ch_column->arrowTableToCHChunk(res, table);
|
||||
arrow_column_to_ch_column->arrowTableToCHChunk(res, table, num_rows);
|
||||
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
|
||||
/// Otherwise fill the missing columns with zero values of their type.
|
||||
if (format_settings.defaults_for_omitted_fields)
|
||||
|
@ -70,7 +70,7 @@ Chunk ParquetBlockInputFormat::generate()
|
||||
|
||||
++row_group_current;
|
||||
|
||||
arrow_column_to_ch_column->arrowTableToCHChunk(res, table);
|
||||
arrow_column_to_ch_column->arrowTableToCHChunk(res, table, table->num_rows());
|
||||
|
||||
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
|
||||
/// Otherwise fill the missing columns with zero values of their type.
|
||||
|
@ -0,0 +1,8 @@
|
||||
Hello
|
||||
Hello
|
||||
Hello
|
||||
6 6
|
||||
Hello
|
||||
Hello
|
||||
Hello
|
||||
6 6
|
18
tests/queries/0_stateless/02511_parquet_orc_missing_columns.sh
Executable file
18
tests/queries/0_stateless/02511_parquet_orc_missing_columns.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CUR_DIR"/../shell_config.sh
|
||||
|
||||
$CLICKHOUSE_LOCAL -q "select number as x from numbers(3) format Parquet" > 02511_data1.parquet
|
||||
$CLICKHOUSE_LOCAL -q "select y from file(02511_data1.parquet, auto, 'x UInt64, y String default \'Hello\'') settings input_format_parquet_allow_missing_columns=1"
|
||||
$CLICKHOUSE_LOCAL -q "select number as x, 'Hello' as y from numbers(3) format Parquet" > 02511_data2.parquet
|
||||
$CLICKHOUSE_LOCAL -q "select count(*), count(y) from file('02511_data*.parquet', auto, 'x UInt64, y String') settings input_format_parquet_allow_missing_columns=1"
|
||||
|
||||
$CLICKHOUSE_LOCAL -q "select number as x from numbers(3) format ORC" > 02511_data1.orc
|
||||
$CLICKHOUSE_LOCAL -q "select y from file(02511_data1.orc, auto, 'x UInt64, y String default \'Hello\'') settings input_format_orc_allow_missing_columns=1"
|
||||
$CLICKHOUSE_LOCAL -q "select number as x, 'Hello' as y from numbers(3) format ORC" > 02511_data2.orc
|
||||
$CLICKHOUSE_LOCAL -q "select count(*), count(y) from file('02511_data*.orc', auto, 'x UInt64, y String') settings input_format_orc_allow_missing_columns=1"
|
||||
|
||||
rm 02511_data*
|
||||
|
Loading…
Reference in New Issue
Block a user