This commit is contained in:
李扬 2024-09-18 23:54:59 +03:00 committed by GitHub
commit 23fb4794f1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 223 additions and 24 deletions

View File

@ -1089,6 +1089,7 @@ class IColumn;
M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \ M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
M(Bool, input_format_orc_filter_push_down, true, "When reading ORC files, skip whole stripes or row groups based on the WHERE/PREWHERE expressions, min/max statistics or bloom filter in the ORC metadata.", 0) \ M(Bool, input_format_orc_filter_push_down, true, "When reading ORC files, skip whole stripes or row groups based on the WHERE/PREWHERE expressions, min/max statistics or bloom filter in the ORC metadata.", 0) \
M(String, input_format_orc_reader_time_zone_name, "GMT", "The time zone name for ORC row reader, the default ORC row reader's time zone is GMT.", 0) \ M(String, input_format_orc_reader_time_zone_name, "GMT", "The time zone name for ORC row reader, the default ORC row reader's time zone is GMT.", 0) \
M(Bool, input_format_orc_dictionary_as_low_cardinality, true, "Treat ORC dictionary encoded columns as LowCardinality columns while reading ORC files", 0) \
M(Bool, input_format_parquet_allow_missing_columns, true, "Allow missing columns while reading Parquet input formats", 0) \ M(Bool, input_format_parquet_allow_missing_columns, true, "Allow missing columns while reading Parquet input formats", 0) \
M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \ M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
M(Bool, input_format_arrow_allow_missing_columns, true, "Allow missing columns while reading Arrow input formats", 0) \ M(Bool, input_format_arrow_allow_missing_columns, true, "Allow missing columns while reading Arrow input formats", 0) \

View File

@ -77,6 +77,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."}, {"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."},
{"create_if_not_exists", false, false, "New setting."}, {"create_if_not_exists", false, false, "New setting."},
{"allow_materialized_view_with_bad_select", true, true, "Support (but not enable yet) stricter validation in CREATE MATERIALIZED VIEW"}, {"allow_materialized_view_with_bad_select", true, true, "Support (but not enable yet) stricter validation in CREATE MATERIALIZED VIEW"},
{"input_format_orc_dictionary_as_low_cardinality", true, true, "Treat ORC dictionary encoded columns as LowCardinality columns while reading ORC files"},
{"output_format_always_quote_identifiers", false, false, "New setting."}, {"output_format_always_quote_identifiers", false, false, "New setting."},
{"output_format_identifier_quoting_style", "Backticks", "Backticks", "New setting."}, {"output_format_identifier_quoting_style", "Backticks", "Backticks", "New setting."},
{"parallel_replicas_mark_segment_size", 128, 0, "Value for this setting now determined automatically"}, {"parallel_replicas_mark_segment_size", 128, 0, "Value for this setting now determined automatically"},

View File

@ -237,9 +237,6 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching; format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string; format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
format_settings.orc.output_compression_method = settings.output_format_orc_compression_method; format_settings.orc.output_compression_method = settings.output_format_orc_compression_method;
@ -248,6 +245,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder; format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder;
format_settings.orc.filter_push_down = settings.input_format_orc_filter_push_down; format_settings.orc.filter_push_down = settings.input_format_orc_filter_push_down;
format_settings.orc.reader_time_zone_name = settings.input_format_orc_reader_time_zone_name; format_settings.orc.reader_time_zone_name = settings.input_format_orc_reader_time_zone_name;
format_settings.orc.dictionary_as_low_cardinality = settings.input_format_orc_dictionary_as_low_cardinality;
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference; format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference;

View File

@ -415,6 +415,7 @@ struct FormatSettings
bool filter_push_down = true; bool filter_push_down = true;
UInt64 output_row_index_stride = 10'000; UInt64 output_row_index_stride = 10'000;
String reader_time_zone_name = "GMT"; String reader_time_zone_name = "GMT";
bool dictionary_as_low_cardinality = true;
double output_dictionary_key_size_threshold = 0.0; double output_dictionary_key_size_threshold = 0.0;
} orc{}; } orc{};

View File

@ -6,6 +6,7 @@
# include <Columns/ColumnMap.h> # include <Columns/ColumnMap.h>
# include <Columns/ColumnNullable.h> # include <Columns/ColumnNullable.h>
# include <Columns/ColumnString.h> # include <Columns/ColumnString.h>
# include <Columns/ColumnsCommon.h>
# include <Columns/ColumnsDateTime.h> # include <Columns/ColumnsDateTime.h>
# include <Columns/ColumnsNumber.h> # include <Columns/ColumnsNumber.h>
# include <DataTypes/DataTypeArray.h> # include <DataTypes/DataTypeArray.h>
@ -16,13 +17,13 @@
# include <DataTypes/DataTypeIPv4andIPv6.h> # include <DataTypes/DataTypeIPv4andIPv6.h>
# include <DataTypes/DataTypeLowCardinality.h> # include <DataTypes/DataTypeLowCardinality.h>
# include <DataTypes/DataTypeMap.h> # include <DataTypes/DataTypeMap.h>
# include <DataTypes/DataTypeNested.h>
# include <DataTypes/DataTypeNullable.h> # include <DataTypes/DataTypeNullable.h>
# include <DataTypes/DataTypeString.h> # include <DataTypes/DataTypeString.h>
# include <DataTypes/DataTypeTuple.h> # include <DataTypes/DataTypeTuple.h>
# include <DataTypes/DataTypesDecimal.h> # include <DataTypes/DataTypesDecimal.h>
# include <DataTypes/DataTypesNumber.h> # include <DataTypes/DataTypesNumber.h>
# include <DataTypes/NestedUtils.h> # include <DataTypes/NestedUtils.h>
# include <DataTypes/DataTypeNested.h>
# include <Formats/FormatFactory.h> # include <Formats/FormatFactory.h>
# include <Formats/SchemaInferenceUtils.h> # include <Formats/SchemaInferenceUtils.h>
# include <Formats/insertNullAsDefaultIfNeeded.h> # include <Formats/insertNullAsDefaultIfNeeded.h>
@ -35,6 +36,8 @@
# include <Common/FieldVisitorsAccurateComparison.h> # include <Common/FieldVisitorsAccurateComparison.h>
# include "ArrowBufferedStreams.h" # include "ArrowBufferedStreams.h"
# include <orc/Vector.hh>
namespace DB namespace DB
{ {
@ -110,7 +113,21 @@ static const orc::Type * getORCTypeByName(const orc::Type & schema, const String
return nullptr; return nullptr;
} }
static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_with_unsupported_types, bool & skipped) static bool isDictionaryEncoded(const orc::StripeInformation * stripe_info, const orc::Type * orc_type)
{
/// Reports whether the column described by orc_type is dictionary encoded in the given stripe.
/// Without stripe information there is nothing to inspect, so the column is reported as not dictionary encoded.
if (!stripe_info)
return false;
/// The encoding is looked up by the column id of orc_type within this stripe.
auto encoding = stripe_info->getColumnEncoding(orc_type->getColumnId());
/// Both dictionary encoding kinds (DICTIONARY and DICTIONARY_V2) count as dictionary encoded.
return encoding == orc::ColumnEncodingKind_DICTIONARY || encoding == orc::ColumnEncodingKind_DICTIONARY_V2;
}
static DataTypePtr parseORCType(
const orc::Type * orc_type,
bool skip_columns_with_unsupported_types,
bool dictionary_as_low_cardinality,
const orc::StripeInformation * stripe_info,
bool & skipped)
{ {
assert(orc_type != nullptr); assert(orc_type != nullptr);
@ -137,12 +154,25 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi
return std::make_shared<DataTypeDateTime64>(9); return std::make_shared<DataTypeDateTime64>(9);
case orc::TypeKind::TIMESTAMP_INSTANT: case orc::TypeKind::TIMESTAMP_INSTANT:
return std::make_shared<DataTypeDateTime64>(9, "UTC"); return std::make_shared<DataTypeDateTime64>(9, "UTC");
case orc::TypeKind::CHAR:
case orc::TypeKind::VARCHAR: case orc::TypeKind::VARCHAR:
case orc::TypeKind::BINARY: case orc::TypeKind::BINARY:
case orc::TypeKind::STRING: case orc::TypeKind::STRING: {
return std::make_shared<DataTypeString>(); DataTypePtr type;
case orc::TypeKind::CHAR: if (orc_type->getKind() == orc::TypeKind::CHAR)
return std::make_shared<DataTypeFixedString>(orc_type->getMaximumLength()); type = std::make_shared<DataTypeFixedString>(orc_type->getMaximumLength());
else
type = std::make_shared<DataTypeString>();
// std::cout << "type:" << type->getName() << std::endl;
// std::cout << "dictionary_as_low_cardinality:" << dictionary_as_low_cardinality << std::endl;
/// Wrap type in LowCardinality if orc column is dictionary encoded and dictionary_as_low_cardinality is true
if (dictionary_as_low_cardinality && isDictionaryEncoded(stripe_info, orc_type))
type = std::make_shared<DataTypeLowCardinality>(type);
return type;
}
case orc::TypeKind::DECIMAL: { case orc::TypeKind::DECIMAL: {
UInt64 precision = orc_type->getPrecision(); UInt64 precision = orc_type->getPrecision();
UInt64 scale = orc_type->getScale(); UInt64 scale = orc_type->getScale();
@ -158,7 +188,8 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi
if (subtype_count != 1) if (subtype_count != 1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Orc List type {}", orc_type->toString()); throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Orc List type {}", orc_type->toString());
DataTypePtr nested_type = parseORCType(orc_type->getSubtype(0), skip_columns_with_unsupported_types, skipped); DataTypePtr nested_type = parseORCType(
orc_type->getSubtype(0), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
if (skipped) if (skipped)
return {}; return {};
@ -168,11 +199,12 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi
if (subtype_count != 2) if (subtype_count != 2)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Orc Map type {}", orc_type->toString()); throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Orc Map type {}", orc_type->toString());
DataTypePtr key_type = parseORCType(orc_type->getSubtype(0), skip_columns_with_unsupported_types, skipped); DataTypePtr key_type = parseORCType(
orc_type->getSubtype(0), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
if (skipped) if (skipped)
return {}; return {};
DataTypePtr value_type = parseORCType(orc_type->getSubtype(1), skip_columns_with_unsupported_types, skipped); DataTypePtr value_type = parseORCType(orc_type->getSubtype(1), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
if (skipped) if (skipped)
return {}; return {};
@ -186,7 +218,8 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi
for (size_t i = 0; i < orc_type->getSubtypeCount(); ++i) for (size_t i = 0; i < orc_type->getSubtypeCount(); ++i)
{ {
auto parsed_type = parseORCType(orc_type->getSubtype(i), skip_columns_with_unsupported_types, skipped); auto parsed_type
= parseORCType(orc_type->getSubtype(i), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
if (skipped) if (skipped)
return {}; return {};
@ -489,7 +522,7 @@ static void buildORCSearchArgumentImpl(
/// For queries with where condition like "a > 10", if a column contains negative values such as "-1", pushing or not pushing /// For queries with where condition like "a > 10", if a column contains negative values such as "-1", pushing or not pushing
/// down filters would result in different outputs. /// down filters would result in different outputs.
bool skipped = false; bool skipped = false;
auto expect_type = makeNullableRecursively(parseORCType(orc_type, true, skipped)); auto expect_type = makeNullableRecursively(parseORCType(orc_type, true, false, nullptr, skipped));
const ColumnWithTypeAndName * column = header.findByName(column_name, format_settings.orc.case_insensitive_column_matching); const ColumnWithTypeAndName * column = header.findByName(column_name, format_settings.orc.case_insensitive_column_matching);
if (!expect_type || !column) if (!expect_type || !column)
{ {
@ -857,11 +890,18 @@ void NativeORCBlockInputFormat::prepareFileReader()
total_stripes = static_cast<int>(file_reader->getNumberOfStripes()); total_stripes = static_cast<int>(file_reader->getNumberOfStripes());
current_stripe = -1; current_stripe = -1;
std::unique_ptr<orc::StripeInformation> stripe_info;
if (file_reader->getNumberOfStripes())
stripe_info = file_reader->getStripe(0);
orc_column_to_ch_column = std::make_unique<ORCColumnToCHColumn>( orc_column_to_ch_column = std::make_unique<ORCColumnToCHColumn>(
getPort().getHeader(), getPort().getHeader(),
format_settings.orc.allow_missing_columns, format_settings.orc.allow_missing_columns,
format_settings.null_as_default, format_settings.null_as_default,
format_settings.orc.case_insensitive_column_matching); format_settings.orc.case_insensitive_column_matching,
format_settings.orc.dictionary_as_low_cardinality,
std::move(stripe_info));
const bool ignore_case = format_settings.orc.case_insensitive_column_matching; const bool ignore_case = format_settings.orc.case_insensitive_column_matching;
const auto & header = getPort().getHeader(); const auto & header = getPort().getHeader();
@ -899,6 +939,7 @@ bool NativeORCBlockInputFormat::prepareStripeReader()
throw Exception(ErrorCodes::INCORRECT_DATA, "ORC stripe {} has no rows", current_stripe); throw Exception(ErrorCodes::INCORRECT_DATA, "ORC stripe {} has no rows", current_stripe);
orc::RowReaderOptions row_reader_options; orc::RowReaderOptions row_reader_options;
row_reader_options.setEnableLazyDecoding(format_settings.orc.dictionary_as_low_cardinality);
row_reader_options.includeTypes(include_indices); row_reader_options.includeTypes(include_indices);
row_reader_options.setTimezoneName(format_settings.orc.reader_time_zone_name); row_reader_options.setTimezoneName(format_settings.orc.reader_time_zone_name);
row_reader_options.range(current_stripe_info->getOffset(), current_stripe_info->getLength()); row_reader_options.range(current_stripe_info->getOffset(), current_stripe_info->getLength());
@ -989,15 +1030,25 @@ NamesAndTypesList NativeORCSchemaReader::readSchema()
std::atomic<int> is_stopped = 0; std::atomic<int> is_stopped = 0;
getFileReader(in, file_reader, format_settings, is_stopped); getFileReader(in, file_reader, format_settings, is_stopped);
const auto & schema = file_reader->getType(); const auto & schema = file_reader->getType();
Block header; Block header;
std::unique_ptr<orc::StripeInformation> stripe_info;
if (file_reader->getNumberOfStripes())
stripe_info = file_reader->getStripe(0);
for (size_t i = 0; i < schema.getSubtypeCount(); ++i) for (size_t i = 0; i < schema.getSubtypeCount(); ++i)
{ {
const std::string & name = schema.getFieldName(i); const std::string & name = schema.getFieldName(i);
const orc::Type * orc_type = schema.getSubtype(i); const orc::Type * orc_type = schema.getSubtype(i);
bool skipped = false; bool skipped = false;
DataTypePtr type = parseORCType(orc_type, format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference, skipped); DataTypePtr type = parseORCType(
orc_type,
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference,
format_settings.orc.dictionary_as_low_cardinality,
stripe_info.get(),
skipped);
if (!skipped) if (!skipped)
header.insert(ColumnWithTypeAndName{type, name}); header.insert(ColumnWithTypeAndName{type, name});
} }
@ -1008,11 +1059,18 @@ NamesAndTypesList NativeORCSchemaReader::readSchema()
} }
ORCColumnToCHColumn::ORCColumnToCHColumn( ORCColumnToCHColumn::ORCColumnToCHColumn(
const Block & header_, bool allow_missing_columns_, bool null_as_default_, bool case_insensitive_matching_) const Block & header_,
bool allow_missing_columns_,
bool null_as_default_,
bool case_insensitive_matching_,
bool dictionary_as_low_cardinality_,
std::unique_ptr<orc::StripeInformation> stripe_info_)
: header(header_) : header(header_)
, allow_missing_columns(allow_missing_columns_) , allow_missing_columns(allow_missing_columns_)
, null_as_default(null_as_default_) , null_as_default(null_as_default_)
, case_insensitive_matching(case_insensitive_matching_) , case_insensitive_matching(case_insensitive_matching_)
, dictionary_as_low_cardinality(dictionary_as_low_cardinality_)
, stripe_info(std::move(stripe_info_))
{ {
} }
@ -1126,6 +1184,117 @@ readColumnWithNumericDataCast(const orc::ColumnVectorBatch * orc_column, const o
return {std::move(internal_column), std::move(internal_type), column_name}; return {std::move(internal_column), std::move(internal_type), column_name};
} }
/// Converts a dictionary-encoded ORC string batch into a ClickHouse LowCardinality column.
///
/// Template parameter:
///   fixed_string - if true, dictionary values are stored as FixedString(N) with N = orc_type->getMaximumLength(),
///                  otherwise they are stored as String.
/// Parameters:
///   orc_column  - batch to convert; it must be an orc::EncodedStringVectorBatch with a dictionary attached.
///   orc_type    - ORC type of the column; only its maximum length is read, and only when fixed_string is true.
///   column_name - name given to the resulting column.
///   nullable    - if true the result type is LowCardinality(Nullable(T)), otherwise LowCardinality(T).
/// Returns the converted column together with its type and name.
/// Throws LOGICAL_ERROR if the batch is not dictionary encoded or if the index column produced by the
/// ClickHouse dictionary has an unexpected type.
template <bool fixed_string>
static ColumnWithTypeAndName readColumnWithEncodedStringOrFixedStringData(
    const orc::ColumnVectorBatch * orc_column, const orc::Type * orc_type, const String & column_name, bool nullable)
{
    const auto * orc_str_column = dynamic_cast<const orc::EncodedStringVectorBatch *>(orc_column);
    /// The cast fails for plain (non-encoded) batches; fail loudly instead of dereferencing a null pointer.
    if (!orc_str_column || !orc_str_column->dictionary)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "ORC column {} is expected to be a dictionary encoded string batch", column_name);

    const size_t rows = orc_str_column->numElements;
    const auto & orc_dict = *orc_str_column->dictionary;

    /// dictionaryOffset holds dict_size + 1 boundaries; guard against an empty buffer so the subtraction cannot wrap around.
    const size_t offsets_count = orc_dict.dictionaryOffset.size();
    const size_t dict_size = offsets_count ? offsets_count - 1 : 0;

    /// Fill CH holder_column with orc dictionary
    /// Note that holder_column is always a ColumnString or ColumnFixedString whether nullable is true or false, because ORC dictionary doesn't contain null values.
    DataTypePtr holder_type;
    if constexpr (fixed_string)
        holder_type = std::make_shared<DataTypeFixedString>(orc_type->getMaximumLength());
    else
        holder_type = std::make_shared<DataTypeString>();

    auto holder_column = holder_type->createColumn();
    if constexpr (fixed_string)
    {
        const size_t n = orc_type->getMaximumLength();
        auto & concrete_holder_column = assert_cast<ColumnFixedString &>(*holder_column);
        PaddedPODArray<UInt8> & column_chars_t = concrete_holder_column.getChars();
        column_chars_t.resize_exact(dict_size * n);

        size_t curr_offset = 0;
        for (size_t i = 0; i < dict_size; ++i)
        {
            const auto * buf = orc_dict.dictionaryBlob.data() + orc_dict.dictionaryOffset[i];
            const size_t buf_size = orc_dict.dictionaryOffset[i + 1] - orc_dict.dictionaryOffset[i];

            /// Never write more than the n bytes reserved for this value, otherwise the copy would run into the next slot
            /// (or past the end of the buffer for the last value).
            const size_t copy_size = buf_size < n ? buf_size : n;
            memcpy(&column_chars_t[curr_offset], buf, copy_size);

            /// resize_exact does not initialize memory, so zero the tail of values shorter than n.
            if (copy_size < n)
                memset(&column_chars_t[curr_offset + copy_size], 0, n - copy_size);

            curr_offset += n;
        }
    }
    else
    {
        auto & concrete_holder_column = assert_cast<ColumnString &>(*holder_column);
        PaddedPODArray<UInt8> & column_chars_t = concrete_holder_column.getChars();
        PaddedPODArray<UInt64> & column_offsets = concrete_holder_column.getOffsets();

        /// Every dictionary value is followed by a terminating zero byte, hence the extra dict_size bytes.
        column_chars_t.resize_exact(orc_dict.dictionaryBlob.size() + dict_size);
        column_offsets.resize_exact(dict_size);

        size_t curr_offset = 0;
        for (size_t i = 0; i < dict_size; ++i)
        {
            const auto * buf = orc_dict.dictionaryBlob.data() + orc_dict.dictionaryOffset[i];
            const size_t buf_size = orc_dict.dictionaryOffset[i + 1] - orc_dict.dictionaryOffset[i];
            memcpy(&column_chars_t[curr_offset], buf, buf_size);
            curr_offset += buf_size;

            column_chars_t[curr_offset] = 0;
            ++curr_offset;

            column_offsets[i] = curr_offset;
        }
    }

    /// Insert CH dictionary_column from holder_column
    DataTypePtr nested_type = nullable ? std::make_shared<DataTypeNullable>(holder_type) : holder_type;
    auto internal_type = std::make_shared<DataTypeLowCardinality>(std::move(nested_type));
    auto tmp_internal_column = internal_type->createColumn();
    auto dictionary_column = IColumn::mutate(assert_cast<ColumnLowCardinality *>(tmp_internal_column.get())->getDictionaryPtr());

    /// index_column[k] is the position inside the CH dictionary of the k-th ORC dictionary value.
    auto index_column
        = dynamic_cast<IColumnUnique &>(*dictionary_column).uniqueInsertRangeFrom(*holder_column, 0, holder_column->size());

    const bool has_nulls = orc_str_column->hasNulls;

    /// Fill index_column and wrap it with LowCardinality
    auto call_by_type = [&](auto index_type) -> MutableColumnPtr
    {
        using IndexType = decltype(index_type);
        const ColumnVector<IndexType> * concrete_index_column = checkAndGetColumn<ColumnVector<IndexType>>(index_column.get());
        if (!concrete_index_column)
            return nullptr;

        const auto & index_data = concrete_index_column->getData();
        auto new_index_column = ColumnVector<IndexType>::create(rows);
        auto & new_index_data = new_index_column->getData();

        if (!has_nulls)
        {
            for (size_t i = 0; i < rows; ++i)
            {
                /// First map row index to orc dictionary index, then map orc dictionary index to CH dictionary index
                new_index_data[i] = index_data[orc_str_column->index[i]];
            }
        }
        else
        {
            for (size_t i = 0; i < rows; ++i)
            {
                /// Set index 0 if we meet null value. If dictionary_column is nullable, 0 represents null value.
                /// Otherwise 0 represents default string value, it is reasonable because null values are converted to default values when casting nullable column to non-nullable.
                new_index_data[i] = orc_str_column->notNull[i] ? index_data[orc_str_column->index[i]] : 0;
            }
        }
        return ColumnLowCardinality::create(std::move(dictionary_column), std::move(new_index_column));
    };

    /// The width of the index column depends on the dictionary size, so try every unsigned width in turn.
    MutableColumnPtr internal_column = call_by_type(UInt8());
    if (!internal_column)
        internal_column = call_by_type(UInt16());
    if (!internal_column)
        internal_column = call_by_type(UInt32());
    if (!internal_column)
        internal_column = call_by_type(UInt64());

    /// Returning a null column would only crash later in a less obvious place.
    if (!internal_column)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected index column type while reading dictionary encoded ORC column {}", column_name);

    return {std::move(internal_column), std::move(internal_type), column_name};
}
static ColumnWithTypeAndName static ColumnWithTypeAndName
readColumnWithStringData(const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name) readColumnWithStringData(const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name)
{ {
@ -1369,16 +1538,16 @@ readColumnWithTimestampData(const orc::ColumnVectorBatch * orc_column, const orc
return {std::move(internal_column), std::move(internal_type), column_name}; return {std::move(internal_column), std::move(internal_type), column_name};
} }
static ColumnWithTypeAndName readColumnFromORCColumn( ColumnWithTypeAndName ORCColumnToCHColumn::readColumnFromORCColumn(
const orc::ColumnVectorBatch * orc_column, const orc::ColumnVectorBatch * orc_column,
const orc::Type * orc_type, const orc::Type * orc_type,
const std::string & column_name, const std::string & column_name,
bool inside_nullable, bool inside_nullable,
DataTypePtr type_hint = nullptr) DataTypePtr type_hint) const
{ {
bool skipped = false; bool skipped = false;
if (!inside_nullable && (orc_column->hasNulls || (type_hint && type_hint->isNullable())) if (!inside_nullable && (orc_column->hasNulls || (type_hint && type_hint->isNullable())) && !orc_column->isEncoded
&& (orc_type->getKind() != orc::LIST && orc_type->getKind() != orc::MAP && orc_type->getKind() != orc::STRUCT)) && (orc_type->getKind() != orc::LIST && orc_type->getKind() != orc::MAP && orc_type->getKind() != orc::STRUCT))
{ {
DataTypePtr nested_type_hint; DataTypePtr nested_type_hint;
@ -1420,7 +1589,14 @@ static ColumnWithTypeAndName readColumnFromORCColumn(
default:; default:;
} }
} }
return readColumnWithStringData(orc_column, orc_type, column_name);
if (orc_column->isEncoded && dictionary_as_low_cardinality)
{
bool nullable = type_hint ? isNullableOrLowCardinalityNullable(type_hint) : true;
return readColumnWithEncodedStringOrFixedStringData<false>(orc_column, orc_type, column_name, nullable);
}
else
return readColumnWithStringData(orc_column, orc_type, column_name);
} }
case orc::CHAR: { case orc::CHAR: {
if (type_hint) if (type_hint)
@ -1438,7 +1614,14 @@ static ColumnWithTypeAndName readColumnFromORCColumn(
default:; default:;
} }
} }
return readColumnWithFixedStringData(orc_column, orc_type, column_name);
if (orc_column->isEncoded && dictionary_as_low_cardinality)
{
bool nullable = type_hint ? isNullableOrLowCardinalityNullable(type_hint) : true;
return readColumnWithEncodedStringOrFixedStringData<true>(orc_column, orc_type, column_name, nullable);
}
else
return readColumnWithFixedStringData(orc_column, orc_type, column_name);
} }
case orc::BOOLEAN: case orc::BOOLEAN:
return readColumnWithBooleanData(orc_column, orc_type, column_name); return readColumnWithBooleanData(orc_column, orc_type, column_name);
@ -1465,7 +1648,7 @@ static ColumnWithTypeAndName readColumnFromORCColumn(
case orc::TIMESTAMP_INSTANT: case orc::TIMESTAMP_INSTANT:
return readColumnWithTimestampData(orc_column, orc_type, column_name); return readColumnWithTimestampData(orc_column, orc_type, column_name);
case orc::DECIMAL: { case orc::DECIMAL: {
auto interal_type = parseORCType(orc_type, false, skipped); auto interal_type = parseORCType(orc_type, false, false, nullptr, skipped);
auto precision = orc_type->getPrecision(); auto precision = orc_type->getPrecision();
if (precision == 0) if (precision == 0)

View File

@ -111,7 +111,13 @@ public:
using ORCColumnWithType = std::pair<ORCColumnPtr, ORCTypePtr>; using ORCColumnWithType = std::pair<ORCColumnPtr, ORCTypePtr>;
using NameToColumnPtr = std::unordered_map<std::string, ORCColumnWithType>; using NameToColumnPtr = std::unordered_map<std::string, ORCColumnWithType>;
ORCColumnToCHColumn(const Block & header_, bool allow_missing_columns_, bool null_as_default_, bool case_insensitive_matching_ = false); ORCColumnToCHColumn(
const Block & header_,
bool allow_missing_columns_,
bool null_as_default_,
bool case_insensitive_matching_ = false,
bool dictionary_as_low_cardinality_ = false,
std::unique_ptr<orc::StripeInformation> stripe_info_ = {});
void orcTableToCHChunk( void orcTableToCHChunk(
Chunk & res, Chunk & res,
@ -124,11 +130,20 @@ public:
Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows, BlockMissingValues * block_missing_values = nullptr); Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows, BlockMissingValues * block_missing_values = nullptr);
private: private:
ColumnWithTypeAndName readColumnFromORCColumn(
const orc::ColumnVectorBatch * orc_column,
const orc::Type * orc_type,
const std::string & column_name,
bool inside_nullable,
DataTypePtr type_hint = nullptr) const;
const Block & header; const Block & header;
/// If false, throw an exception if some columns in the header do not exist in the arrow table. /// If false, throw an exception if some columns in the header do not exist in the arrow table.
bool allow_missing_columns; bool allow_missing_columns;
bool null_as_default; bool null_as_default;
bool case_insensitive_matching; bool case_insensitive_matching;
bool dictionary_as_low_cardinality;
std::unique_ptr<orc::StripeInformation> stripe_info;
}; };
} }
#endif #endif