Support skip splits in ORC and Parquet

taiyang-li 2022-04-06 16:40:22 +08:00
parent 43e8af697a
commit acb9f1632e
8 changed files with 58 additions and 29 deletions

View File

@@ -138,6 +138,7 @@ struct FormatSettings
         bool import_nested = false;
         bool allow_missing_columns = false;
         bool case_insensitive_column_matching = false;
+        std::unordered_set<int> skip_row_groups = {};
     } parquet;

     struct Pretty
@@ -219,6 +220,7 @@ struct FormatSettings
         bool allow_missing_columns = false;
         int64_t row_batch_size = 100'000;
         bool case_insensitive_column_matching = false;
+        std::unordered_set<int> skip_stripes = {};
     } orc;

     /// For capnProto format we should determine how to
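Both new members are plain std::unordered_set<int> with empty defaults, so existing callers are unaffected; a caller that wants pruning fills the set before the input format is created. A minimal standalone sketch of that producer side (only the field names skip_row_groups and skip_stripes come from the diff above; the surrounding structs are illustrative stand-ins, not the real FormatSettings):

#include <unordered_set>

// Illustrative stand-ins for the two settings blocks extended above.
struct ParquetSettings { std::unordered_set<int> skip_row_groups = {}; };
struct OrcSettings     { std::unordered_set<int> skip_stripes = {}; };

int main()
{
    ParquetSettings parquet;
    OrcSettings orc;

    // A storage layer that has already pruned splits (e.g. by min/max index)
    // records the pruned indices here before creating the input format.
    parquet.skip_row_groups = {0, 3};
    orc.skip_stripes = {1};

    return 0;
}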

View File

@@ -20,13 +20,12 @@ namespace ErrorCodes
 }

 ORCBlockInputFormat::ORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_)
-    : IInputFormat(std::move(header_), in_), format_settings(format_settings_)
+    : IInputFormat(std::move(header_), in_), format_settings(format_settings_), skip_stripes(format_settings.orc.skip_stripes)
 {
 }

 Chunk ORCBlockInputFormat::generate()
 {
-    Chunk res;
     block_missing_values.clear();

     if (!file_reader)
@@ -35,24 +34,32 @@ Chunk ORCBlockInputFormat::generate()
     if (is_stopped)
         return {};

-    std::shared_ptr<arrow::RecordBatchReader> batch_reader;
-    auto result = file_reader->NextStripeReader(format_settings.orc.row_batch_size, include_indices);
-    if (!result.ok())
-        throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Failed to create batch reader: {}", result.status().ToString());
-    batch_reader = std::move(result).ValueOrDie();
-    if (!batch_reader)
-    {
-        return res;
-    }
+    for (; stripe_current < stripe_total && skip_stripes.contains(stripe_current); ++stripe_current)
+        ;

-    std::shared_ptr<arrow::Table> table;
-    arrow::Status table_status = batch_reader->ReadAll(&table);
-    if (!table_status.ok())
-        throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of ORC data: {}", table_status.ToString());
+    if (stripe_current >= stripe_total)
+        return {};
+
+    auto batch_result = file_reader->ReadStripe(stripe_current, include_indices);
+    if (!batch_result.ok())
+        throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Failed to create batch reader: {}", batch_result.status().ToString());
+
+    auto batch = batch_result.ValueOrDie();
+    if (!batch)
+        return {};
+
+    auto table_result = arrow::Table::FromRecordBatches({batch});
+    if (!table_result.ok())
+        throw ParsingException(
+            ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of ORC data: {}", table_result.status().ToString());

+    auto table = table_result.ValueOrDie();
     if (!table || !table->num_rows())
-        return res;
+        return {};

+    ++stripe_current;
+
+    Chunk res;
     arrow_column_to_ch_column->arrowTableToCHChunk(res, table);

     /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
     /// Otherwise fill the missing columns with zero values of its type.
@@ -130,6 +137,9 @@ void ORCBlockInputFormat::prepareReader()
     if (is_stopped)
         return;

+    stripe_total = file_reader->NumberOfStripes();
+    stripe_current = 0;
+
     arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(
         getPort().getHeader(),
         "ORC",

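With this change, generate() first advances stripe_current past every stripe listed in skip_stripes, returns an empty chunk once the cursor runs off the end, and otherwise reads exactly one whole stripe per call via ReadStripe() instead of going through a row-batch reader. A self-contained sketch of that skip-ahead idiom (only the names stripe_current, stripe_total and skip_stripes are taken from the diff; the rest is illustrative, and unordered_set::contains requires C++20):

#include <iostream>
#include <unordered_set>

// Minimal standalone sketch: advance the cursor past every index in the skip
// set, then either stop (cursor ran off the end) or process one unit and advance.
int main()
{
    int stripe_total = 5;                           // e.g. file_reader->NumberOfStripes()
    std::unordered_set<int> skip_stripes = {1, 3};  // e.g. format_settings.orc.skip_stripes

    int stripe_current = 0;
    while (true)
    {
        // Same loop shape as in the diff above.
        for (; stripe_current < stripe_total && skip_stripes.contains(stripe_current); ++stripe_current)
            ;

        if (stripe_current >= stripe_total)
            break;                                  // generate() would return an empty Chunk here

        std::cout << "read stripe " << stripe_current << '\n';  // generate() would call ReadStripe()
        ++stripe_current;
    }
    // Prints: read stripe 0, 2, 4 -- stripes 1 and 3 are never touched.
}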
View File

@@ -38,6 +38,7 @@ protected:
 }

 private:
+    void prepareReader();

     // TODO: check that this class implements every part of its parent
@@ -52,8 +53,10 @@ private:
     BlockMissingValues block_missing_values;
     const FormatSettings format_settings;
+    const std::unordered_set<int> & skip_stripes;

-    void prepareReader();
+    int stripe_total = 0;
+    int stripe_current = 0;

     std::atomic<int> is_stopped{0};
 };
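Note that skip_stripes is a const reference initialized from the format's own format_settings copy (see the constructor change above), so it stays valid for the object's lifetime and avoids copying the set a second time. A small standalone sketch of that pattern with illustrative types (not the real ClickHouse classes):

#include <unordered_set>

// Illustrative sketch of the member-reference pattern from the header above:
// the reader keeps its own copy of the settings and a const reference into a
// sub-member of that copy, so the reference never dangles.
struct Settings
{
    struct { std::unordered_set<int> skip_stripes; } orc;
};

class Reader
{
public:
    explicit Reader(const Settings & settings_)
        : settings(settings_)                       // copy the whole settings block
        , skip_stripes(settings.orc.skip_stripes)   // reference into our own copy, not the caller's
    {
    }

    bool shouldSkip(int stripe) const { return skip_stripes.contains(stripe); }  // C++20

private:
    const Settings settings;                        // declared before skip_stripes, so initialized first
    const std::unordered_set<int> & skip_stripes;
};

int main()
{
    Settings s;
    s.orc.skip_stripes = {2};
    Reader r(s);
    return r.shouldSkip(2) ? 0 : 1;
}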

View File

@@ -32,7 +32,7 @@ namespace ErrorCodes
 } while (false)

 ParquetBlockInputFormat::ParquetBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_)
-    : IInputFormat(std::move(header_), in_), format_settings(format_settings_)
+    : IInputFormat(std::move(header_), in_), format_settings(format_settings_), skip_row_groups(format_settings.parquet.skip_row_groups)
 {
 }

@@ -47,6 +47,9 @@ Chunk ParquetBlockInputFormat::generate()
     if (is_stopped)
         return {};

+    for (; row_group_current < row_group_total && skip_row_groups.contains(row_group_current); ++row_group_current)
+        ;
+
     if (row_group_current >= row_group_total)
         return res;
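The Parquet side only needs the skip-ahead loop, since it already reads one row group per generate() call; skipped row groups are simply never decoded. For readers unfamiliar with the underlying library, here is a hedged, standalone sketch of reading a Parquet file row group by row group while skipping some of them; it assumes Arrow's C++ parquet::arrow::FileReader API (OpenFile, num_row_groups, ReadRowGroup) roughly as bundled by ClickHouse at the time, and the file name is just an example:

#include <iostream>
#include <memory>
#include <unordered_set>

#include <arrow/api.h>
#include <arrow/io/file.h>
#include <parquet/arrow/reader.h>

// Hedged sketch: iterate the row groups of a Parquet file, skipping a given
// set of indices, roughly mirroring what ParquetBlockInputFormat does per call.
int main()
{
    std::unordered_set<int> skip_row_groups = {1};   // analogous to format_settings.parquet.skip_row_groups

    auto file = arrow::io::ReadableFile::Open("data.parquet").ValueOrDie();

    std::unique_ptr<parquet::arrow::FileReader> reader;
    if (!parquet::arrow::OpenFile(file, arrow::default_memory_pool(), &reader).ok())
        return 1;

    int row_group_total = reader->num_row_groups();
    for (int row_group = 0; row_group < row_group_total; ++row_group)
    {
        if (skip_row_groups.contains(row_group))
            continue;                                // pruned row groups are never decoded

        std::shared_ptr<arrow::Table> table;
        if (!reader->ReadRowGroup(row_group, &table).ok())
            return 1;

        std::cout << "row group " << row_group << ": " << table->num_rows() << " rows\n";
    }
    return 0;
}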

View File

@@ -38,13 +38,14 @@ private:
     std::unique_ptr<parquet::arrow::FileReader> file_reader;
     int row_group_total = 0;
+    int row_group_current = 0;
     // indices of columns to read from Parquet file
     std::vector<int> column_indices;
     std::unique_ptr<ArrowColumnToCHColumn> arrow_column_to_ch_column;
-    int row_group_current = 0;
     std::vector<size_t> missing_columns;
     BlockMissingValues block_missing_values;
     const FormatSettings format_settings;
+    const std::unordered_set<int> & skip_row_groups;

     std::atomic<int> is_stopped{0};
 };

View File

@@ -139,7 +139,7 @@ void HiveOrcFile::prepareColumnMapping()
     for (size_t pos = 0; pos < count; pos++)
     {
         /// Column names in hive is case-insensitive.
-        String columnn{type.getFieldName(pos)};
+        String column{type.getFieldName(pos)};
         boost::to_lower(column);
         orc_column_positions[column] = pos;
     }

View File

@@ -125,9 +125,9 @@ public:
     virtual const std::vector<MinMaxIndexPtr> & getSubMinMaxIndexes() const { return sub_minmax_idxes; }

-    virtual void setSkipSplits(const std::set<int> & splits) { skip_splits = splits; }
+    virtual void setSkipSplits(const std::unordered_set<int> & skip_splits_) { skip_splits = skip_splits_; }

-    virtual const std::set<int> & getSkipSplits() const { return skip_splits; }
+    virtual const std::unordered_set<int> & getSkipSplits() const { return skip_splits; }

     inline std::string describeMinMaxIndex(const MinMaxIndexPtr & idx) const
     {
@@ -157,7 +157,7 @@ protected:
     MinMaxIndexPtr minmax_idx;
     std::vector<MinMaxIndexPtr> sub_minmax_idxes;
     /// Skip splits for this file after applying minmax index (if any)
-    std::set<int> skip_splits;
+    std::unordered_set<int> skip_splits;
     std::shared_ptr<HiveSettings> storage_settings;
 };

View File

@@ -111,9 +111,9 @@ public:
         : SourceWithProgress(getHeader(sample_block_, source_info_))
         , WithContext(context_)
         , source_info(std::move(source_info_))
-        , hdfs_namenode_url(hdfs_namenode_url_)
+        , hdfs_namenode_url(std::move(hdfs_namenode_url_))
         , format(std::move(format_))
-        , compression_method(compression_method_)
+        , compression_method(std::move(compression_method_))
         , max_block_size(max_block_size_)
         , sample_block(std::move(sample_block_))
         , columns_description(getColumnsDescription(sample_block, source_info))
@@ -121,15 +121,25 @@ public:
         , format_settings(getFormatSettings(getContext()))
     {
         to_read_block = sample_block;

         /// Initialize to_read_block, which is used to read data from HDFS.
         for (const auto & name_type : source_info->partition_name_types)
         {
             if (to_read_block.has(name_type.name))
                 to_read_block.erase(name_type.name);
         }
+    }

-        /// Initialize format settings
-        format_settings.hive_text.input_field_names = text_input_field_names;
+    FormatSettings updateFormatSettings(const HiveFilePtr & hive_file)
+    {
+        auto updated = format_settings;
+        if (format == "HiveText")
+            updated.hive_text.input_field_names = text_input_field_names;
+        else if (format == "ORC")
+            updated.orc.skip_stripes = hive_file->getSkipSplits();
+        else if (format == "Parquet")
+            updated.parquet.skip_row_groups = hive_file->getSkipSplits();
+        return updated;
     }

     String getName() const override { return "Hive"; }
@@ -188,7 +198,7 @@ public:
         read_buf = std::move(remote_read_buf);

         auto input_format = FormatFactory::instance().getInputFormat(
-            format, *read_buf, to_read_block, getContext(), max_block_size, format_settings);
+            format, *read_buf, to_read_block, getContext(), max_block_size, updateFormatSettings(curr_file));

         QueryPipelineBuilder builder;
         builder.init(Pipe(input_format));
@@ -545,7 +555,7 @@ HiveFilePtr StorageHive::createHiveFileIfNeeded(
         /// Load sub-file level minmax index and apply
         if (hive_file->hasSubMinMaxIndex())
         {
-            std::set<int> skip_splits;
+            std::unordered_set<int> skip_splits;
             hive_file->loadSubMinMaxIndex();
             const auto & sub_minmax_idxes = hive_file->getSubMinMaxIndexes();
             for (size_t i = 0; i < sub_minmax_idxes.size(); ++i)
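Taken together, the flow is: createHiveFileIfNeeded() compares each sub-range's min/max index against the query condition, collects the indices that provably cannot match into skip_splits, stores them on the hive file via setSkipSplits(), and updateFormatSettings() copies them into the per-file FormatSettings so ORC stripes and Parquet row groups are skipped without ever being decoded. A standalone sketch of the pruning step with an illustrative single-column equality predicate (the MinMax struct and pruneSplits helper are hypothetical, not ClickHouse APIs):

#include <iostream>
#include <unordered_set>
#include <vector>

// Illustrative stand-in for a sub-file min/max index entry (one per ORC stripe
// or Parquet row group); the real index in HiveFile holds per-column ranges.
struct MinMax { int min; int max; };

// Collect split indices whose range cannot satisfy "value == lookup",
// mirroring how skip_splits is built before setSkipSplits() is called.
std::unordered_set<int> pruneSplits(const std::vector<MinMax> & sub_indexes, int lookup)
{
    std::unordered_set<int> skip_splits;
    for (size_t i = 0; i < sub_indexes.size(); ++i)
        if (lookup < sub_indexes[i].min || lookup > sub_indexes[i].max)
            skip_splits.insert(static_cast<int>(i));
    return skip_splits;
}

int main()
{
    std::vector<MinMax> sub_indexes = {{0, 9}, {10, 19}, {20, 29}};
    for (int split : pruneSplits(sub_indexes, 15))
        std::cout << "skip split " << split << '\n';   // prints 0 and 2 (order unspecified)
}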