ClickHouse/src/Storages/Hive/HiveFile.cpp

172 lines
4.5 KiB
C++
Raw Normal View History

#include <Storages/Hive/HiveFile.h>
2021-11-06 02:31:15 +00:00
2021-11-22 02:52:10 +00:00
#if USE_HIVE
2021-11-06 02:31:15 +00:00
#include <boost/algorithm/string/case_conv.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/trim.hpp>
2021-11-06 02:31:15 +00:00
#include <fmt/core.h>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <arrow/status.h>
#include <orc/OrcFile.hh>
#include <orc/Reader.hh>
#include <parquet/arrow/reader.h>
#include <parquet/file_reader.h>
#include <parquet/statistics.h>
#include <Core/Types.h>
#include <Common/Exception.h>
#include <Common/typeid_cast.h>
#include <Storages/MergeTree/IMergeTreeDataPart.h>
#include <Storages/MergeTree/KeyCondition.h>
namespace DB
{
2021-11-24 08:35:11 +00:00
/// Builds a Range from the minimum/maximum values exposed by an ORC column statistics object.
///
/// FieldType is the type each bound is converted to before it is handed to Range.
/// StatisticsType must provide hasMinimum(), hasMaximum(), getMinimum() and getMaximum().
/// `stats` is dereferenced unconditionally, so the caller must pass a non-null pointer.
///
/// Returns:
///  - a range closed on both sides when both bounds are present;
///  - a range bounded (inclusively) on one side when only one bound is present;
///  - a default-constructed Range when neither bound is present.
template <class FieldType, class StatisticsType>
Range createRangeFromOrcStatistics(const StatisticsType * stats)
{
    /// We must check if there are minimum or maximum values in statistics in case of
    /// null values or NaN/Inf values of double type.
    const bool has_min = stats->hasMinimum();
    const bool has_max = stats->hasMaximum();

    if (has_min && has_max)
        return Range(FieldType(stats->getMinimum()), true, FieldType(stats->getMaximum()), true);

    if (has_min)
        return Range::createLeftBounded(FieldType(stats->getMinimum()), true);

    if (has_max)
        return Range::createRightBounded(FieldType(stats->getMaximum()), true);

    return Range();
}
2021-11-24 08:35:11 +00:00
/// Builds a Range from the min/max values exposed by a Parquet typed statistics object.
///
/// FieldType is the type each bound is converted to before it is handed to Range.
/// StatisticsType must provide HasMinMax(), min() and max().
/// `stats` is dereferenced unconditionally, so the caller must pass a non-null pointer.
///
/// Returns a range closed on both sides when the statistics carry min/max values,
/// and a default-constructed Range otherwise.
template <class FieldType, class StatisticsType>
Range createRangeFromParquetStatistics(std::shared_ptr<StatisticsType> stats)
{
    /// We must check if there are minimum and maximum values in statistics in case of
    /// null values or NaN/Inf values of double type.
    if (stats->HasMinMax())
        return Range(FieldType(stats->min()), true, FieldType(stats->max()), true);

    return Range();
}
/// Overload for Parquet byte-array statistics: the min/max values are exposed as
/// (pointer, length) pairs, so each bound is first copied into a String.
///
/// `stats` is dereferenced unconditionally, so the caller must pass a non-null pointer.
/// Returns a range closed on both sides, or a default-constructed Range when the
/// statistics carry no min/max values.
Range createRangeFromParquetStatistics(std::shared_ptr<parquet::ByteArrayStatistics> stats)
{
    if (stats->HasMinMax())
    {
        const auto & lower_raw = stats->min();
        const auto & upper_raw = stats->max();

        /// The statistics store raw bytes; reinterpret them as chars to build the strings.
        String lower_bound(reinterpret_cast<const char *>(lower_raw.ptr), lower_raw.len);
        String upper_bound(reinterpret_cast<const char *>(upper_raw.ptr), upper_raw.len);
        return Range(lower_bound, true, upper_bound, true);
    }

    return Range();
}
/// Converts the statistics of a single ORC column into a Range.
///
/// The concrete statistics kind is discovered with dynamic_cast and the bounds are converted to:
///  - Int64   for integer statistics,
///  - Float64 for double statistics,
///  - String  for string statistics,
///  - UInt8   for boolean statistics (0 = false, 1 = true),
///  - UInt32  for timestamp statistics,
///  - UInt16  for date statistics.
///
/// Returns a default-constructed Range when `col_stats` is null, when the column is
/// reported to contain nulls, when boolean counts are unavailable or both zero, or when
/// the statistics kind is none of the above.
Range HiveOrcFile::buildRange(const orc::ColumnStatistics * col_stats)
{
    if (!col_stats || col_stats->hasNull())
        return {};

    if (const auto * int_stats = dynamic_cast<const orc::IntegerColumnStatistics *>(col_stats))
        return createRangeFromOrcStatistics<Int64>(int_stats);

    if (const auto * double_stats = dynamic_cast<const orc::DoubleColumnStatistics *>(col_stats))
        return createRangeFromOrcStatistics<Float64>(double_stats);

    if (const auto * string_stats = dynamic_cast<const orc::StringColumnStatistics *>(col_stats))
        return createRangeFromOrcStatistics<String>(string_stats);

    if (const auto * bool_stats = dynamic_cast<const orc::BooleanColumnStatistics *>(col_stats))
    {
        /// The count getters are only meaningful when the statistics actually carry counts.
        if (!bool_stats->hasCount())
            return {};

        const bool has_false = bool_stats->getFalseCount() != 0;
        const bool has_true = bool_stats->getTrueCount() != 0;
        if (!has_false && !has_true)
            return {};

        /// Nulls were excluded above, so the observed values are exactly the ones counted here:
        /// only false -> [0, 0], only true -> [1, 1], both -> [0, 1].
        return Range(UInt8(has_false ? 0 : 1), true, UInt8(has_true ? 1 : 0), true);
    }

    if (const auto * timestamp_stats = dynamic_cast<const orc::TimestampColumnStatistics *>(col_stats))
    {
        /// NOTE(review): the ORC minimum/maximum is narrowed to UInt32 here; confirm that the
        /// unit and value range of the ORC timestamp statistics match what the key condition expects.
        return createRangeFromOrcStatistics<UInt32>(timestamp_stats);
    }

    if (const auto * date_stats = dynamic_cast<const orc::DateColumnStatistics *>(col_stats))
    {
        /// NOTE(review): the ORC minimum/maximum is narrowed to UInt16 here; confirm that
        /// out-of-range dates cannot occur or are handled by the caller.
        return createRangeFromOrcStatistics<UInt16>(date_stats);
    }

    return {};
}
/// Placeholder: the body is intentionally empty, so calling it has no effect.
void HiveOrcFile::prepareReader()
{
    /// TODO: To be implemented.
}
/// Placeholder: the body is intentionally empty, so calling it has no effect.
void HiveOrcFile::prepareColumnMapping()
{
    /// TODO: To be implemented.
}
/// Reports whether a min-max index is available for this file.
/// This implementation unconditionally answers "no".
bool HiveOrcFile::hasMinMaxIndex() const
{
    return false;
}
/// Placeholder: ignores its argument and always returns an empty (null) pointer.
std::unique_ptr<IMergeTreeDataPart::MinMaxIndex> HiveOrcFile::buildMinMaxIndex(const orc::Statistics * /*statistics*/)
{
    /// TODO: To be implemented.
    return nullptr;
}
/// Placeholder: the body is intentionally empty, so calling it has no effect.
void HiveOrcFile::loadMinMaxIndex()
{
    /// TODO: To be implemented.
}
/// Reports whether a sub min-max index is available for this file.
/// This implementation unconditionally answers "no".
bool HiveOrcFile::hasSubMinMaxIndex() const
{
    return false;
}
/// Placeholder: the body is intentionally empty, so calling it has no effect.
void HiveOrcFile::loadSubMinMaxIndex()
{
    /// TODO: To be implemented.
}
/// Reports whether a sub min-max index is available for this file.
/// Placeholder: currently answers "no" unconditionally.
bool HiveParquetFile::hasSubMinMaxIndex() const
{
    /// TODO: To be implemented.
    return false;
}
/// Placeholder: the body is intentionally empty, so calling it has no effect.
void HiveParquetFile::prepareReader()
{
    /// TODO: To be implemented.
}
/// Placeholder: the body is intentionally empty, so calling it has no effect.
void HiveParquetFile::loadSubMinMaxIndex()
{
    /// TODO: To be implemented.
}
}
#endif