Remove unnecessary `virtual` specifiers and rename some functions in HiveFile

This commit is contained in:
taiyang-li 2022-04-07 11:46:57 +08:00
parent 2ef316801c
commit acc7046d54
3 changed files with 56 additions and 68 deletions

View File

@ -145,7 +145,7 @@ void HiveOrcFile::prepareColumnMapping()
}
}
bool HiveOrcFile::hasMinMaxIndex() const
bool HiveOrcFile::useFileMinMaxIndex() const
{
return storage_settings->enable_orc_file_minmax_index;
}
@ -196,7 +196,7 @@ void HiveOrcFile::loadMinMaxIndex()
minmax_idx = buildMinMaxIndex(statistics.get());
}
bool HiveOrcFile::hasSubMinMaxIndex() const
bool HiveOrcFile::useSplitMinMaxIndex() const
{
return storage_settings->enable_orc_stripe_minmax_index;
}
@ -226,7 +226,7 @@ void HiveOrcFile::loadSubMinMaxIndex()
}
}
bool HiveParquetFile::hasSubMinMaxIndex() const
bool HiveParquetFile::useSplitMinMaxIndex() const
{
return storage_settings->enable_parquet_rowgroup_minmax_index;
}

View File

@ -76,7 +76,7 @@ public:
}
IHiveFile(
const FieldVector & values_,
const FieldVector & partition_values_,
const String & namenode_url_,
const String & path_,
UInt64 last_modify_time_,
@ -85,7 +85,7 @@ public:
const std::shared_ptr<HiveSettings> & storage_settings_,
ContextPtr context_)
: WithContext(context_)
, partition_values(values_)
, partition_values(partition_values_)
, namenode_url(namenode_url_)
, path(path_)
, last_modify_time(last_modify_time_)
@ -96,56 +96,47 @@ public:
}
virtual ~IHiveFile() = default;
virtual FileFormat getFormat() const = 0;
String getFormatName() const { return String(magic_enum::enum_name(getFormat())); }
const String & getPath() const { return path; }
UInt64 getLastModTs() const { return last_modify_time; }
size_t getSize() const { return size; }
const FieldVector & getPartitionValues() const { return partition_values; }
const String & getNamenodeUrl() { return namenode_url; }
MinMaxIndexPtr getMinMaxIndex() const { return minmax_idx; }
const std::vector<MinMaxIndexPtr> & getSubMinMaxIndexes() const { return sub_minmax_idxes; }
virtual String getName() const = 0;
const std::unordered_set<int> & getSkipSplits() const { return skip_splits; }
void setSkipSplits(const std::unordered_set<int> & skip_splits_) { skip_splits = skip_splits_; }
virtual String getPath() const { return path; }
virtual FieldVector getPartitionValues() const { return partition_values; }
virtual String getNamenodeUrl() { return namenode_url; }
virtual bool hasMinMaxIndex() const { return false; }
virtual void loadMinMaxIndex()
{
throw Exception("Method loadMinMaxIndex is not supported by hive file:" + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
virtual MinMaxIndexPtr getMinMaxIndex() const { return minmax_idx; }
// Does the hive file contain a sub-file level minmax index?
virtual bool hasSubMinMaxIndex() const { return false; }
virtual void loadSubMinMaxIndex()
{
throw Exception("Method loadSubMinMaxIndex is not supported by hive file:" + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
virtual const std::vector<MinMaxIndexPtr> & getSubMinMaxIndexes() const { return sub_minmax_idxes; }
virtual void setSkipSplits(const std::unordered_set<int> & skip_splits_) { skip_splits = skip_splits_; }
virtual const std::unordered_set<int> & getSkipSplits() const { return skip_splits; }
inline std::string describeMinMaxIndex(const MinMaxIndexPtr & idx) const
String describeMinMaxIndex(const MinMaxIndexPtr & idx) const
{
if (!idx)
return "";
std::vector<std::string> strs;
std::vector<String> strs;
strs.reserve(index_names_and_types.size());
size_t i = 0;
for (const auto & name_type : index_names_and_types)
{
strs.push_back(name_type.name + ":" + name_type.type->getName() + idx->hyperrectangle[i++].toString());
}
return boost::algorithm::join(strs, "|");
}
inline UInt64 getLastModTs() const { return last_modify_time; }
inline size_t getSize() const { return size; }
virtual FileFormat getFormat() const = 0;
virtual bool useFileMinMaxIndex() const { return false; }
virtual void loadMinMaxIndex()
{
throw Exception("Method loadMinMaxIndex is not supported by hive file:" + getFormatName(), ErrorCodes::NOT_IMPLEMENTED);
}
/// Whether a hive query may use the sub-file level (split) minmax index of this file, e.g. ORC stripe or Parquet row group.
virtual bool useSplitMinMaxIndex() const { return false; }
virtual void loadSubMinMaxIndex()
{
throw Exception("Method loadSubMinMaxIndex is not supported by hive file:" + getFormatName(), ErrorCodes::NOT_IMPLEMENTED);
}
protected:
FieldVector partition_values;
@ -168,7 +159,7 @@ class HiveTextFile : public IHiveFile
{
public:
HiveTextFile(
const FieldVector & values_,
const FieldVector & partition_values_,
const String & namenode_url_,
const String & path_,
UInt64 last_modify_time_,
@ -176,19 +167,18 @@ public:
const NamesAndTypesList & index_names_and_types_,
const std::shared_ptr<HiveSettings> & hive_settings_,
ContextPtr context_)
: IHiveFile(values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_)
: IHiveFile(partition_values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_)
{
}
virtual FileFormat getFormat() const override { return FileFormat::TEXT; }
virtual String getName() const override { return "TEXT"; }
};
class HiveOrcFile : public IHiveFile
{
public:
HiveOrcFile(
const FieldVector & values_,
const FieldVector & partition_values_,
const String & namenode_url_,
const String & path_,
UInt64 last_modify_time_,
@ -196,23 +186,22 @@ public:
const NamesAndTypesList & index_names_and_types_,
const std::shared_ptr<HiveSettings> & hive_settings_,
ContextPtr context_)
: IHiveFile(values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_)
: IHiveFile(partition_values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_)
{
}
virtual FileFormat getFormat() const override { return FileFormat::ORC; }
virtual String getName() const override { return "ORC"; }
virtual bool hasMinMaxIndex() const override;
virtual void loadMinMaxIndex() override;
FileFormat getFormat() const override { return FileFormat::ORC; }
bool useFileMinMaxIndex() const override;
void loadMinMaxIndex() override;
virtual bool hasSubMinMaxIndex() const override;
virtual void loadSubMinMaxIndex() override;
bool useSplitMinMaxIndex() const override;
void loadSubMinMaxIndex() override;
protected:
virtual std::unique_ptr<MinMaxIndex> buildMinMaxIndex(const orc::Statistics * statistics);
virtual Range buildRange(const orc::ColumnStatistics * col_stats);
virtual void prepareReader();
virtual void prepareColumnMapping();
private:
std::unique_ptr<MinMaxIndex> buildMinMaxIndex(const orc::Statistics * statistics);
Range buildRange(const orc::ColumnStatistics * col_stats);
void prepareReader();
void prepareColumnMapping();
std::unique_ptr<ReadBufferFromHDFS> in;
std::unique_ptr<arrow::adapters::orc::ORCFileReader> reader;
@ -223,7 +212,7 @@ class HiveParquetFile : public IHiveFile
{
public:
HiveParquetFile(
const FieldVector & values_,
const FieldVector & partition_values_,
const String & namenode_url_,
const String & path_,
UInt64 last_modify_time_,
@ -231,18 +220,17 @@ public:
const NamesAndTypesList & index_names_and_types_,
const std::shared_ptr<HiveSettings> & hive_settings_,
ContextPtr context_)
: IHiveFile(values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_)
: IHiveFile(partition_values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_)
{
}
virtual FileFormat getFormat() const override { return FileFormat::PARQUET; }
virtual String getName() const override { return "PARQUET"; }
FileFormat getFormat() const override { return FileFormat::PARQUET; }
virtual bool hasSubMinMaxIndex() const override;
virtual void loadSubMinMaxIndex() override;
bool useSplitMinMaxIndex() const override;
void loadSubMinMaxIndex() override;
protected:
virtual void prepareReader();
private:
void prepareReader();
std::unique_ptr<ReadBufferFromHDFS> in;
std::unique_ptr<parquet::arrow::FileReader> reader;

View File

@ -543,7 +543,7 @@ HiveFilePtr StorageHive::createHiveFileIfNeeded(
/// Load file level minmax index and apply
const KeyCondition hivefile_key_condition(query_info, getContext(), hivefile_name_types.getNames(), hivefile_minmax_idx_expr);
if (hive_file->hasMinMaxIndex())
if (hive_file->useFileMinMaxIndex())
{
hive_file->loadMinMaxIndex();
if (!hivefile_key_condition.checkInHyperrectangle(hive_file->getMinMaxIndex()->hyperrectangle, hivefile_name_types.getTypes())
@ -556,7 +556,7 @@ HiveFilePtr StorageHive::createHiveFileIfNeeded(
}
/// Load sub-file level minmax index and apply
if (hive_file->hasSubMinMaxIndex())
if (hive_file->useSplitMinMaxIndex())
{
std::unordered_set<int> skip_splits;
hive_file->loadSubMinMaxIndex();