Merge pull request #47492 from ClickHouse/revert-45878

Revert #45878 and add a test

Commit 278b8f74c2 in ClickHouse/ClickHouse (https://github.com/ClickHouse/ClickHouse.git)
src/Core/Settings.h
@@ -803,7 +803,6 @@ class IColumn;
     M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \
     M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \
     M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \
-    M(UInt64, input_format_parquet_max_block_size, 8192, "Max block size for parquet reader.", 0) \
     M(Bool, input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format Protobuf", 0) \
     M(Bool, input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format CapnProto", 0) \
     M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format ORC", 0) \
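For context, the M(...) entries above are an X-macro list: each entry is expanded under several different definitions of M to generate the setting's member declaration, name, default, and documentation from a single source of truth. A minimal self-contained sketch of the pattern, with hypothetical setting names and without ClickHouse's actual macro machinery or the trailing flags argument:

    #include <cstdint>
    #include <iostream>

    // Hypothetical X-macro list in the style of the M(...) entries above.
    #define APPLY_FOR_SETTINGS(M) \
        M(bool, detect_header, true, "Automatically detect header") \
        M(uint64_t, max_block_size, 8192, "Max block size for the reader")

    // Expansion 1: declare one struct member per setting.
    struct Settings
    {
    #define DECLARE(TYPE, NAME, DEFAULT, DESCRIPTION) TYPE NAME = DEFAULT;
        APPLY_FOR_SETTINGS(DECLARE)
    #undef DECLARE
    };

    // Expansion 2: reuse the same list to print name/description pairs.
    void dumpSettingsDocs()
    {
    #define PRINT(TYPE, NAME, DEFAULT, DESCRIPTION) std::cout << #NAME << ": " << DESCRIPTION << '\n';
        APPLY_FOR_SETTINGS(PRINT)
    #undef PRINT
    }

    int main()
    {
        Settings settings;
        std::cout << settings.max_block_size << '\n'; // 8192
        dumpSettingsDocs();
    }

Removing one M(...) line, as this revert does for input_format_parquet_max_block_size, removes the setting from every expansion at once.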
src/Formats/FormatFactory.cpp
@@ -117,7 +117,6 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
     format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference;
     format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string;
     format_settings.parquet.output_fixed_string_as_fixed_byte_array = settings.output_format_parquet_fixed_string_as_fixed_byte_array;
-    format_settings.parquet.max_block_size = settings.input_format_parquet_max_block_size;
     format_settings.parquet.output_compression_method = settings.output_format_parquet_compression_method;
     format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
     format_settings.pretty.color = settings.output_format_pretty_color;
src/Formats/FormatSettings.h
@@ -211,7 +211,6 @@ struct FormatSettings
        std::unordered_set<int> skip_row_groups = {};
        bool output_string_as_string = false;
        bool output_fixed_string_as_fixed_byte_array = true;
-       UInt64 max_block_size = 8192;
        ParquetVersion output_version;
        ParquetCompression output_compression_method = ParquetCompression::SNAPPY;
    } parquet;
src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp
@@ -45,44 +45,38 @@ Chunk ParquetBlockInputFormat::generate()
     block_missing_values.clear();

     if (!file_reader)
-    {
         prepareReader();
-        file_reader->set_batch_size(format_settings.parquet.max_block_size);
-        std::vector<int> row_group_indices;
-        for (int i = 0; i < row_group_total; ++i)
-        {
-            if (!skip_row_groups.contains(i))
-                row_group_indices.emplace_back(i);
-        }
-        auto read_status = file_reader->GetRecordBatchReader(row_group_indices, column_indices, &current_record_batch_reader);
-        if (!read_status.ok())
-            throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading Parquet data: {}", read_status.ToString());
-    }

     if (is_stopped)
         return {};

-    auto batch = current_record_batch_reader->Next();
-    if (!batch.ok())
-    {
-        throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading Parquet data: {}",
-            batch.status().ToString());
-    }
-    if (*batch)
-    {
-        auto tmp_table = arrow::Table::FromRecordBatches({*batch});
-        /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
-        /// Otherwise fill the missing columns with zero values of its type.
-        BlockMissingValues * block_missing_values_ptr = format_settings.defaults_for_omitted_fields ? &block_missing_values : nullptr;
-        arrow_column_to_ch_column->arrowTableToCHChunk(res, *tmp_table, (*tmp_table)->num_rows(), block_missing_values_ptr);
-    }
-    else
-    {
-        current_record_batch_reader.reset();
-        file_reader.reset();
-        return {};
-    }
+    while (row_group_current < row_group_total && skip_row_groups.contains(row_group_current))
+        ++row_group_current;
+
+    if (row_group_current >= row_group_total)
+        return res;
+
+    std::shared_ptr<arrow::Table> table;
+
+    std::unique_ptr<::arrow::RecordBatchReader> rbr;
+    std::vector<int> row_group_indices { row_group_current };
+    arrow::Status get_batch_reader_status = file_reader->GetRecordBatchReader(row_group_indices, column_indices, &rbr);
+
+    if (!get_batch_reader_status.ok())
+        throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading Parquet data: {}",
+            get_batch_reader_status.ToString());
+
+    arrow::Status read_status = rbr->ReadAll(&table);
+
+    if (!read_status.ok())
+        throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading Parquet data: {}", read_status.ToString());
+
+    ++row_group_current;
+
+    /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
+    /// Otherwise fill the missing columns with zero values of its type.
+    BlockMissingValues * block_missing_values_ptr = format_settings.defaults_for_omitted_fields ? &block_missing_values : nullptr;
+    arrow_column_to_ch_column->arrowTableToCHChunk(res, table, table->num_rows(), block_missing_values_ptr);

     return res;
 }
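The restored code reads one Parquet row group per generate() call (creating a fresh RecordBatchReader each time and draining it with ReadAll), rather than keeping a single batch reader over all row groups as #45878 did. For illustration, here is a standalone sketch of the same Arrow API sequence outside ClickHouse; the file name, the two-argument GetRecordBatchReader overload (no column projection), and the error handling are assumptions for the sketch, and exact signatures vary across Arrow versions:

    #include <arrow/api.h>
    #include <arrow/io/file.h>
    #include <parquet/arrow/reader.h>
    #include <iostream>

    // Minimal sketch: read a Parquet file one row group at a time, mirroring
    // the GetRecordBatchReader + ReadAll sequence in the restored generate().
    arrow::Status readByRowGroup(const std::string & path)
    {
        ARROW_ASSIGN_OR_RAISE(auto input, arrow::io::ReadableFile::Open(path));

        std::unique_ptr<parquet::arrow::FileReader> file_reader;
        ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input, arrow::default_memory_pool(), &file_reader));

        int row_group_total = file_reader->num_row_groups();
        for (int row_group = 0; row_group < row_group_total; ++row_group)
        {
            // One fresh batch reader per row group, as in the code above.
            std::unique_ptr<arrow::RecordBatchReader> rbr;
            ARROW_RETURN_NOT_OK(file_reader->GetRecordBatchReader({row_group}, &rbr));

            // Drain the whole row group into a single table.
            std::shared_ptr<arrow::Table> table;
            ARROW_RETURN_NOT_OK(rbr->ReadAll(&table));
            std::cout << "row group " << row_group << ": " << table->num_rows() << " rows\n";
        }
        return arrow::Status::OK();
    }

    int main()
    {
        auto status = readByRowGroup("02588_data.parquet");
        if (!status.ok())
            std::cerr << status.ToString() << '\n';
    }

Reading per row group trades the configurable batch size for simpler state: there is no long-lived reader whose position can fall out of sync with the skip_row_groups bookkeeping, which is the class of bug the test below covers.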
tests/queries/0_stateless/02588_parquet_bug.reference (new file, 3 lines)
@@ -0,0 +1,3 @@
+cta 224.0.90.10 1670964058771367936 64066044 NYSE cqs_pillar quote \N \N \N 82.92 1 R 82.97 2 R
+1670964058771367936
+1670946478544048640 ARCA cqs_pillar
tests/queries/0_stateless/02588_parquet_bug.sh (new executable file, 11 lines)
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+# Tags: no-fasttest
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_parquet/02588_data.parquet') where exchange_ts = 1670964058771367936"
+$CLICKHOUSE_LOCAL -q "select exchange_ts from file('$CURDIR/data_parquet/02588_data.parquet') where exchange_ts = 1670964058771367936"
+$CLICKHOUSE_LOCAL -q "select exchange_ts, market, product from file('$CURDIR/data_parquet/02588_data.parquet') where exchange_ts = 1670946478544048640"
tests/queries/0_stateless/data_parquet/02588_data.parquet (new binary file, not shown)