Merge pull request #44405 from Avogar/fix-parquet-orc

Fix reading columns that are not present in input data in Parquet/ORC formats
This commit is contained in:
Kruglov Pavel 2022-12-27 16:58:35 +01:00 committed by GitHub
commit 6dea7336f7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 41 additions and 14 deletions

View File

@ -71,7 +71,7 @@ Chunk ArrowBlockInputFormat::generate()
++record_batch_current;
arrow_column_to_ch_column->arrowTableToCHChunk(res, *table_result);
arrow_column_to_ch_column->arrowTableToCHChunk(res, *table_result, (*table_result)->num_rows());
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
/// Otherwise fill the missing columns with zero values of its type.

View File

@ -69,7 +69,6 @@ namespace ErrorCodes
extern const int DUPLICATE_COLUMN;
extern const int THERE_IS_NO_COLUMN;
extern const int UNKNOWN_EXCEPTION;
extern const int INCORRECT_NUMBER_OF_COLUMNS;
extern const int INCORRECT_DATA;
}
@ -810,7 +809,7 @@ ArrowColumnToCHColumn::ArrowColumnToCHColumn(
{
}
void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table)
void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table, size_t num_rows)
{
NameToColumnPtr name_to_column_ptr;
for (auto column_name : table->ColumnNames())
@ -824,16 +823,12 @@ void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arr
name_to_column_ptr[std::move(column_name)] = arrow_column;
}
arrowColumnsToCHChunk(res, name_to_column_ptr);
arrowColumnsToCHChunk(res, name_to_column_ptr, num_rows);
}
void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr)
void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows)
{
if (unlikely(name_to_column_ptr.empty()))
throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Columns is empty");
Columns columns_list;
UInt64 num_rows = name_to_column_ptr.begin()->second->length();
columns_list.reserve(header.columns());
std::unordered_map<String, std::pair<BlockPtr, std::shared_ptr<NestedColumnExtractHelper>>> nested_tables;
bool skipped = false;

View File

@ -28,9 +28,9 @@ public:
bool allow_missing_columns_,
bool case_insensitive_matching_ = false);
void arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table);
void arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table, size_t num_rows);
void arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr);
void arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows);
/// Get missing columns that exist in header but not in arrow::Schema
std::vector<size_t> getMissingColumns(const arrow::Schema & schema) const;

View File

@ -54,14 +54,19 @@ Chunk ORCBlockInputFormat::generate()
throw ParsingException(
ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of ORC data: {}", table_result.status().ToString());
/// We should extract the number of rows directly from the stripe, because when the
/// record batch contains 0 columns (for example, if we requested only columns that
/// are not present in the data), the number of rows in the record batch will be 0.
size_t num_rows = file_reader->GetRawORCReader()->getStripe(stripe_current)->getNumberOfRows();
auto table = table_result.ValueOrDie();
if (!table || !table->num_rows())
if (!table || !num_rows)
return {};
++stripe_current;
Chunk res;
arrow_column_to_ch_column->arrowTableToCHChunk(res, table);
arrow_column_to_ch_column->arrowTableToCHChunk(res, table, num_rows);
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
/// Otherwise fill the missing columns with zero values of its type.
if (format_settings.defaults_for_omitted_fields)

View File

@ -70,7 +70,7 @@ Chunk ParquetBlockInputFormat::generate()
++row_group_current;
arrow_column_to_ch_column->arrowTableToCHChunk(res, table);
arrow_column_to_ch_column->arrowTableToCHChunk(res, table, table->num_rows());
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
/// Otherwise fill the missing columns with zero values of its type.

View File

@ -0,0 +1,8 @@
Hello
Hello
Hello
6 6
Hello
Hello
Hello
6 6

View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash
#Tags: no-fasttest, no-parallel

# Checks reading a column that is declared in the table structure but absent
# from the input file, for both the Parquet and the ORC input formats.

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

# Runs the missing-column scenario for a single format.
#   $1 - format name used in the FORMAT clause (Parquet / ORC)
#   $2 - file extension; also the infix of the input_format_*_allow_missing_columns setting
function check_missing_columns()
{
    local format_name=$1
    local ext=$2
    local allow_missing="input_format_${ext}_allow_missing_columns=1"

    # First file holds only column x; select only y, which is not in the file.
    $CLICKHOUSE_LOCAL -q "select number as x from numbers(3) format ${format_name}" > "02511_data1.${ext}"
    $CLICKHOUSE_LOCAL -q "select y from file(02511_data1.${ext}, auto, 'x UInt64, y String default \'Hello\'') settings ${allow_missing}"

    # Second file holds both x and y; count rows across both files via the glob.
    $CLICKHOUSE_LOCAL -q "select number as x, 'Hello' as y from numbers(3) format ${format_name}" > "02511_data2.${ext}"
    $CLICKHOUSE_LOCAL -q "select count(*), count(y) from file('02511_data*.${ext}', auto, 'x UInt64, y String') settings ${allow_missing}"
}

# Order matters: the reference output lists the Parquet results first, then ORC.
check_missing_columns Parquet parquet
check_missing_columns ORC orc

# Remove every data file created above.
rm 02511_data*