Merge 4e155fae74 into b94a7167a8

2024-09-19 16:20:50 +00:00 · 2024-09-18 23:54:59 +03:00 · 2024-09-18 23:54:59 +03:00 · 23fb4794f1
commit 23fb4794f1
parent b94a7167a8 4e155fae74
6 changed files with 223 additions and 24 deletions
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -1089,6 +1089,7 @@ class IColumn;
    M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
    M(Bool, input_format_orc_filter_push_down, true, "When reading ORC files, skip whole stripes or row groups based on the WHERE/PREWHERE expressions, min/max statistics or bloom filter in the ORC metadata.", 0) \
    M(String, input_format_orc_reader_time_zone_name, "GMT", "The time zone name for ORC row reader, the default ORC row reader's time zone is GMT.", 0) \
+    M(Bool, input_format_orc_dictionary_as_low_cardinality, true, "Treat ORC dictionary encoded columns as LowCardinality columns while reading ORC files", 0) \
    M(Bool, input_format_parquet_allow_missing_columns, true, "Allow missing columns while reading Parquet input formats", 0) \
    M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
    M(Bool, input_format_arrow_allow_missing_columns, true, "Allow missing columns while reading Arrow input formats", 0) \
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@ -77,6 +77,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
            {"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."},
            {"create_if_not_exists", false, false, "New setting."},
            {"allow_materialized_view_with_bad_select", true, true, "Support (but not enable yet) stricter validation in CREATE MATERIALIZED VIEW"},
+            {"input_format_orc_dictionary_as_low_cardinality", true, true, "Treat ORC dictionary encoded columns as LowCardinality columns while reading ORC files"},
            {"output_format_always_quote_identifiers", false, false, "New setting."},
            {"output_format_identifier_quoting_style", "Backticks", "Backticks", "New setting."},
            {"parallel_replicas_mark_segment_size", 128, 0, "Value for this setting now determined automatically"},
--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@ -237,9 +237,6 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
    format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
    format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
    format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
-    format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
-    format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
-    format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
    format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
    format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
    format_settings.orc.output_compression_method = settings.output_format_orc_compression_method;
@ -248,6 +245,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
    format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder;
    format_settings.orc.filter_push_down = settings.input_format_orc_filter_push_down;
    format_settings.orc.reader_time_zone_name = settings.input_format_orc_reader_time_zone_name;
+    format_settings.orc.dictionary_as_low_cardinality = settings.input_format_orc_dictionary_as_low_cardinality;
    format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
    format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
    format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference;
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@ -415,6 +415,7 @@ struct FormatSettings
        bool filter_push_down = true;
        UInt64 output_row_index_stride = 10'000;
        String reader_time_zone_name = "GMT";
+        bool dictionary_as_low_cardinality = true;
        double output_dictionary_key_size_threshold = 0.0;
    } orc{};

--- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
@ -6,6 +6,7 @@
 #    include <Columns/ColumnMap.h>
 #    include <Columns/ColumnNullable.h>
 #    include <Columns/ColumnString.h>
+#    include <Columns/ColumnsCommon.h>
 #    include <Columns/ColumnsDateTime.h>
 #    include <Columns/ColumnsNumber.h>
 #    include <DataTypes/DataTypeArray.h>
@ -16,13 +17,13 @@
 #    include <DataTypes/DataTypeIPv4andIPv6.h>
 #    include <DataTypes/DataTypeLowCardinality.h>
 #    include <DataTypes/DataTypeMap.h>
+#    include <DataTypes/DataTypeNested.h>
 #    include <DataTypes/DataTypeNullable.h>
 #    include <DataTypes/DataTypeString.h>
 #    include <DataTypes/DataTypeTuple.h>
 #    include <DataTypes/DataTypesDecimal.h>
 #    include <DataTypes/DataTypesNumber.h>
 #    include <DataTypes/NestedUtils.h>
-#    include <DataTypes/DataTypeNested.h>
 #    include <Formats/FormatFactory.h>
 #    include <Formats/SchemaInferenceUtils.h>
 #    include <Formats/insertNullAsDefaultIfNeeded.h>
@ -35,6 +36,8 @@
 #    include <Common/FieldVisitorsAccurateComparison.h>
 #    include "ArrowBufferedStreams.h"

+#    include <orc/Vector.hh>
+

 namespace DB
 {
@ -110,7 +113,21 @@ static const orc::Type * getORCTypeByName(const orc::Type & schema, const String
    return nullptr;
 }

-static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_with_unsupported_types, bool & skipped)
+/// Tells whether the column described by `orc_type` is stored with a dictionary encoding
+/// (DICTIONARY or DICTIONARY_V2) in the stripe described by `stripe_info`.
+/// Returns false when `stripe_info` is null, i.e. when the caller has no stripe information to consult.
+static bool isDictionaryEncoded(const orc::StripeInformation * stripe_info, const orc::Type * orc_type)
+{
+    if (stripe_info == nullptr)
+        return false;
+
+    switch (stripe_info->getColumnEncoding(orc_type->getColumnId()))
+    {
+        case orc::ColumnEncodingKind_DICTIONARY:
+        case orc::ColumnEncodingKind_DICTIONARY_V2:
+            return true;
+        default:
+            return false;
+    }
+}
+
+static DataTypePtr parseORCType(
+    const orc::Type * orc_type,
+    bool skip_columns_with_unsupported_types,
+    bool dictionary_as_low_cardinality,
+    const orc::StripeInformation * stripe_info,
+    bool & skipped)
 {
    assert(orc_type != nullptr);

@ -137,12 +154,25 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi
            return std::make_shared<DataTypeDateTime64>(9);
        case orc::TypeKind::TIMESTAMP_INSTANT:
            return std::make_shared<DataTypeDateTime64>(9, "UTC");
+        case orc::TypeKind::CHAR:
        case orc::TypeKind::VARCHAR:
        case orc::TypeKind::BINARY:
-        case orc::TypeKind::STRING:
-            return std::make_shared<DataTypeString>();
-        case orc::TypeKind::CHAR:
-            return std::make_shared<DataTypeFixedString>(orc_type->getMaximumLength());
+        case orc::TypeKind::STRING: {
+            DataTypePtr type;
+            if (orc_type->getKind() == orc::TypeKind::CHAR)
+                type = std::make_shared<DataTypeFixedString>(orc_type->getMaximumLength());
+            else
+                type = std::make_shared<DataTypeString>();
+
+            /// Wrap the type in LowCardinality when the ORC column is dictionary encoded and dictionary_as_low_cardinality is enabled.
+            if (dictionary_as_low_cardinality && isDictionaryEncoded(stripe_info, orc_type))
+                type = std::make_shared<DataTypeLowCardinality>(type);
+
+            return type;
+        }
        case orc::TypeKind::DECIMAL: {
            UInt64 precision = orc_type->getPrecision();
            UInt64 scale = orc_type->getScale();
@ -158,7 +188,8 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi
            if (subtype_count != 1)
                throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Orc List type {}", orc_type->toString());

-            DataTypePtr nested_type = parseORCType(orc_type->getSubtype(0), skip_columns_with_unsupported_types, skipped);
+            DataTypePtr nested_type = parseORCType(
+                orc_type->getSubtype(0), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
            if (skipped)
                return {};

@ -168,11 +199,12 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi
            if (subtype_count != 2)
                throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Orc Map type {}", orc_type->toString());

-            DataTypePtr key_type = parseORCType(orc_type->getSubtype(0), skip_columns_with_unsupported_types, skipped);
+            DataTypePtr key_type = parseORCType(
+                orc_type->getSubtype(0), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
            if (skipped)
                return {};

-            DataTypePtr value_type = parseORCType(orc_type->getSubtype(1), skip_columns_with_unsupported_types, skipped);
+            DataTypePtr value_type = parseORCType(orc_type->getSubtype(1), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
            if (skipped)
                return {};

@ -186,7 +218,8 @@ static DataTypePtr parseORCType(const orc::Type * orc_type, bool skip_columns_wi

            for (size_t i = 0; i < orc_type->getSubtypeCount(); ++i)
            {
-                auto parsed_type = parseORCType(orc_type->getSubtype(i), skip_columns_with_unsupported_types, skipped);
+                auto parsed_type
+                    = parseORCType(orc_type->getSubtype(i), skip_columns_with_unsupported_types, dictionary_as_low_cardinality, stripe_info, skipped);
                if (skipped)
                    return {};

@ -489,7 +522,7 @@ static void buildORCSearchArgumentImpl(
            ///     For queries with where condition like "a > 10", if a column contains negative values such as "-1", pushing or not pushing
            ///     down filters would result in different outputs.
            bool skipped = false;
-            auto expect_type = makeNullableRecursively(parseORCType(orc_type, true, skipped));
+            auto expect_type = makeNullableRecursively(parseORCType(orc_type, true, false, nullptr, skipped));
            const ColumnWithTypeAndName * column = header.findByName(column_name, format_settings.orc.case_insensitive_column_matching);
            if (!expect_type || !column)
            {
@ -857,11 +890,18 @@ void NativeORCBlockInputFormat::prepareFileReader()
    total_stripes = static_cast<int>(file_reader->getNumberOfStripes());
    current_stripe = -1;

+
+    std::unique_ptr<orc::StripeInformation> stripe_info;
+    if (file_reader->getNumberOfStripes())
+        stripe_info = file_reader->getStripe(0);
+
    orc_column_to_ch_column = std::make_unique<ORCColumnToCHColumn>(
        getPort().getHeader(),
        format_settings.orc.allow_missing_columns,
        format_settings.null_as_default,
-        format_settings.orc.case_insensitive_column_matching);
+        format_settings.orc.case_insensitive_column_matching,
+        format_settings.orc.dictionary_as_low_cardinality,
+        std::move(stripe_info));

    const bool ignore_case = format_settings.orc.case_insensitive_column_matching;
    const auto & header = getPort().getHeader();
@ -899,6 +939,7 @@ bool NativeORCBlockInputFormat::prepareStripeReader()
        throw Exception(ErrorCodes::INCORRECT_DATA, "ORC stripe {} has no rows", current_stripe);

    orc::RowReaderOptions row_reader_options;
+    row_reader_options.setEnableLazyDecoding(format_settings.orc.dictionary_as_low_cardinality);
    row_reader_options.includeTypes(include_indices);
    row_reader_options.setTimezoneName(format_settings.orc.reader_time_zone_name);
    row_reader_options.range(current_stripe_info->getOffset(), current_stripe_info->getLength());
@ -989,15 +1030,25 @@ NamesAndTypesList NativeORCSchemaReader::readSchema()
    std::atomic<int> is_stopped = 0;
    getFileReader(in, file_reader, format_settings, is_stopped);

+
    const auto & schema = file_reader->getType();
    Block header;
+    std::unique_ptr<orc::StripeInformation> stripe_info;
+    if (file_reader->getNumberOfStripes())
+        stripe_info = file_reader->getStripe(0);
+
    for (size_t i = 0; i < schema.getSubtypeCount(); ++i)
    {
        const std::string & name = schema.getFieldName(i);
        const orc::Type * orc_type = schema.getSubtype(i);

        bool skipped = false;
-        DataTypePtr type = parseORCType(orc_type, format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference, skipped);
+        DataTypePtr type = parseORCType(
+            orc_type,
+            format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference,
+            format_settings.orc.dictionary_as_low_cardinality,
+            stripe_info.get(),
+            skipped);
        if (!skipped)
            header.insert(ColumnWithTypeAndName{type, name});
    }
@ -1008,11 +1059,18 @@ NamesAndTypesList NativeORCSchemaReader::readSchema()
 }

 ORCColumnToCHColumn::ORCColumnToCHColumn(
-    const Block & header_, bool allow_missing_columns_, bool null_as_default_, bool case_insensitive_matching_)
+    const Block & header_,
+    bool allow_missing_columns_,
+    bool null_as_default_,
+    bool case_insensitive_matching_,
+    bool dictionary_as_low_cardinality_,
+    std::unique_ptr<orc::StripeInformation> stripe_info_)
    : header(header_)
    , allow_missing_columns(allow_missing_columns_)
    , null_as_default(null_as_default_)
    , case_insensitive_matching(case_insensitive_matching_)
+    , dictionary_as_low_cardinality(dictionary_as_low_cardinality_)
+    , stripe_info(std::move(stripe_info_))
 {
 }

@ -1126,6 +1184,117 @@ readColumnWithNumericDataCast(const orc::ColumnVectorBatch * orc_column, const o
    return {std::move(internal_column), std::move(internal_type), column_name};
 }

+/// Converts a dictionary encoded ORC string batch (orc::EncodedStringVectorBatch) into a ClickHouse
+/// LowCardinality(String) or LowCardinality(FixedString(N)) column (N = orc_type->getMaximumLength()).
+///
+/// Steps:
+///  1. copy the ORC dictionary into a plain String / FixedString holder column;
+///  2. insert the holder into the LowCardinality dictionary, which yields the mapping
+///     "ORC dictionary index -> ClickHouse dictionary index";
+///  3. translate every row's ORC dictionary index through that mapping.
+///
+/// @param orc_column  batch to read; must be an orc::EncodedStringVectorBatch with a dictionary attached.
+/// @param orc_type    ORC type of the column; only its maximum length is used, and only when fixed_string is true.
+/// @param column_name name of the resulting column.
+/// @param nullable    if true the result type is LowCardinality(Nullable(T)), otherwise LowCardinality(T).
+/// @throws Exception with LOGICAL_ERROR if the batch is not dictionary encoded or the produced index column
+///         has an unexpected type; with INCORRECT_DATA if a dictionary entry doesn't fit into FixedString(N).
+template <bool fixed_string>
+static ColumnWithTypeAndName readColumnWithEncodedStringOrFixedStringData(
+    const orc::ColumnVectorBatch * orc_column, const orc::Type * orc_type, const String & column_name, bool nullable)
+{
+    const auto * orc_str_column = dynamic_cast<const orc::EncodedStringVectorBatch *>(orc_column);
+    if (!orc_str_column || !orc_str_column->dictionary)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected dictionary encoded ORC string batch for column {}", column_name);
+
+    const size_t rows = orc_str_column->numElements;
+    const auto & orc_dict = *orc_str_column->dictionary;
+    /// dictionaryOffset stores dict_size + 1 boundaries. Guard against an empty buffer to avoid unsigned underflow.
+    const size_t offsets_count = orc_dict.dictionaryOffset.size();
+    const size_t dict_size = offsets_count ? offsets_count - 1 : 0;
+
+    /// Fill CH holder_column with the ORC dictionary.
+    /// Note that holder_column is always a ColumnString or ColumnFixedString whether nullable is true or false,
+    /// because the ORC dictionary doesn't contain null values.
+    DataTypePtr holder_type;
+    if constexpr (fixed_string)
+        holder_type = std::make_shared<DataTypeFixedString>(orc_type->getMaximumLength());
+    else
+        holder_type = std::make_shared<DataTypeString>();
+
+    auto holder_column = holder_type->createColumn();
+    if constexpr (fixed_string)
+    {
+        const size_t n = orc_type->getMaximumLength();
+        auto & concrete_holder_column = assert_cast<ColumnFixedString &>(*holder_column);
+        PaddedPODArray<UInt8> & column_chars_t = concrete_holder_column.getChars();
+        const size_t reserve_size = dict_size * n;
+        column_chars_t.resize_exact(reserve_size);
+        /// resize_exact doesn't initialize memory: zero it so entries shorter than n are padded with '\0'.
+        if (reserve_size)
+            memset(column_chars_t.data(), 0, reserve_size);
+
+        size_t curr_offset = 0;
+        for (size_t i = 0; i < dict_size; ++i)
+        {
+            const auto * buf = orc_dict.dictionaryBlob.data() + orc_dict.dictionaryOffset[i];
+            const size_t buf_size = orc_dict.dictionaryOffset[i + 1] - orc_dict.dictionaryOffset[i];
+            /// Each entry owns exactly n bytes in the holder; a longer entry would overwrite its neighbour or overflow the buffer.
+            if (buf_size > n)
+                throw Exception(
+                    ErrorCodes::INCORRECT_DATA,
+                    "ORC dictionary entry of {} bytes doesn't fit into FixedString({}) while reading column {}",
+                    buf_size,
+                    n,
+                    column_name);
+
+            memcpy(&column_chars_t[curr_offset], buf, buf_size);
+            curr_offset += n;
+        }
+    }
+    else
+    {
+        auto & concrete_holder_column = assert_cast<ColumnString &>(*holder_column);
+        PaddedPODArray<UInt8> & column_chars_t = concrete_holder_column.getChars();
+        PaddedPODArray<UInt64> & column_offsets = concrete_holder_column.getOffsets();
+
+        /// Every value is followed by a terminating zero byte, hence "+ dict_size".
+        const size_t reserve_size = orc_dict.dictionaryBlob.size() + dict_size;
+        column_chars_t.resize_exact(reserve_size);
+        column_offsets.resize_exact(dict_size);
+        size_t curr_offset = 0;
+        for (size_t i = 0; i < dict_size; ++i)
+        {
+            const auto * buf = orc_dict.dictionaryBlob.data() + orc_dict.dictionaryOffset[i];
+            const size_t buf_size = orc_dict.dictionaryOffset[i + 1] - orc_dict.dictionaryOffset[i];
+            memcpy(&column_chars_t[curr_offset], buf, buf_size);
+            curr_offset += buf_size;
+
+            column_chars_t[curr_offset] = 0;
+            ++curr_offset;
+
+            column_offsets[i] = curr_offset;
+        }
+    }
+
+    /// Insert holder_column into the CH dictionary_column.
+    /// index_column[k] is the position in the CH dictionary of the k-th ORC dictionary entry.
+    DataTypePtr nested_type = nullable ? std::make_shared<DataTypeNullable>(holder_type) : holder_type;
+    auto internal_type = std::make_shared<DataTypeLowCardinality>(std::move(nested_type));
+    auto tmp_internal_column = internal_type->createColumn();
+    auto dictionary_column = IColumn::mutate(assert_cast<ColumnLowCardinality *>(tmp_internal_column.get())->getDictionaryPtr());
+    auto * unique_dictionary = dynamic_cast<IColumnUnique *>(dictionary_column.get());
+    if (!unique_dictionary)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary of LowCardinality column {} is not an IColumnUnique", column_name);
+
+    auto index_column = unique_dictionary->uniqueInsertRangeFrom(*holder_column, 0, holder_column->size());
+
+    /// Fill the per-row index column and wrap it with LowCardinality.
+    /// Returns nullptr when index_column is not a ColumnVector<IndexType>, so the caller can try the next width.
+    auto call_by_type = [&](auto index_type) -> MutableColumnPtr
+    {
+        using IndexType = decltype(index_type);
+        const ColumnVector<IndexType> * concrete_index_column = checkAndGetColumn<ColumnVector<IndexType>>(index_column.get());
+        if (!concrete_index_column)
+            return nullptr;
+
+        const auto & index_data = concrete_index_column->getData();
+        auto new_index_column = ColumnVector<IndexType>::create(rows);
+        auto & new_index_data = new_index_column->getData();
+
+        if (!orc_str_column->hasNulls)
+        {
+            for (size_t i = 0; i < rows; ++i)
+            {
+                /// First map row index to ORC dictionary index, then map ORC dictionary index to CH dictionary index.
+                new_index_data[i] = index_data[orc_str_column->index[i]];
+            }
+        }
+        else
+        {
+            for (size_t i = 0; i < rows; ++i)
+            {
+                /// Set index 0 if we meet a null value. If dictionary_column is nullable, 0 represents the null value.
+                /// Otherwise 0 represents the default string value, which is reasonable because null values are
+                /// converted to default values when casting a nullable column to a non-nullable one.
+                new_index_data[i] = orc_str_column->notNull[i] ? index_data[orc_str_column->index[i]] : 0;
+            }
+        }
+
+        return ColumnLowCardinality::create(std::move(dictionary_column), std::move(new_index_column));
+    };
+
+    /// Try the index widths from the narrowest to the widest one.
+    MutableColumnPtr internal_column = call_by_type(UInt8());
+    if (!internal_column)
+        internal_column = call_by_type(UInt16());
+    if (!internal_column)
+        internal_column = call_by_type(UInt32());
+    if (!internal_column)
+        internal_column = call_by_type(UInt64());
+    if (!internal_column)
+        throw Exception(
+            ErrorCodes::LOGICAL_ERROR,
+            "Unexpected index column type {} while reading dictionary encoded ORC column {}",
+            index_column->getName(),
+            column_name);
+
+    return {std::move(internal_column), std::move(internal_type), column_name};
+}
+
 static ColumnWithTypeAndName
 readColumnWithStringData(const orc::ColumnVectorBatch * orc_column, const orc::Type *, const String & column_name)
 {
@ -1369,16 +1538,16 @@ readColumnWithTimestampData(const orc::ColumnVectorBatch * orc_column, const orc
    return {std::move(internal_column), std::move(internal_type), column_name};
 }

-static ColumnWithTypeAndName readColumnFromORCColumn(
+ColumnWithTypeAndName ORCColumnToCHColumn::readColumnFromORCColumn(
    const orc::ColumnVectorBatch * orc_column,
    const orc::Type * orc_type,
    const std::string & column_name,
    bool inside_nullable,
-    DataTypePtr type_hint = nullptr)
+    DataTypePtr type_hint) const
 {
    bool skipped = false;

-    if (!inside_nullable && (orc_column->hasNulls || (type_hint && type_hint->isNullable()))
+    if (!inside_nullable && (orc_column->hasNulls || (type_hint && type_hint->isNullable())) && !orc_column->isEncoded
        && (orc_type->getKind() != orc::LIST && orc_type->getKind() != orc::MAP && orc_type->getKind() != orc::STRUCT))
    {
        DataTypePtr nested_type_hint;
@ -1420,7 +1589,14 @@ static ColumnWithTypeAndName readColumnFromORCColumn(
                    default:;
                }
            }
-            return readColumnWithStringData(orc_column, orc_type, column_name);
+
+            if (orc_column->isEncoded && dictionary_as_low_cardinality)
+            {
+                bool nullable = type_hint ? isNullableOrLowCardinalityNullable(type_hint) : true;
+                return readColumnWithEncodedStringOrFixedStringData<false>(orc_column, orc_type, column_name, nullable);
+            }
+            else
+                return readColumnWithStringData(orc_column, orc_type, column_name);
        }
        case orc::CHAR: {
            if (type_hint)
@ -1438,7 +1614,14 @@ static ColumnWithTypeAndName readColumnFromORCColumn(
                    default:;
                }
            }
-            return readColumnWithFixedStringData(orc_column, orc_type, column_name);
+
+            if (orc_column->isEncoded && dictionary_as_low_cardinality)
+            {
+                bool nullable = type_hint ? isNullableOrLowCardinalityNullable(type_hint) : true;
+                return readColumnWithEncodedStringOrFixedStringData<true>(orc_column, orc_type, column_name, nullable);
+            }
+            else
+                return readColumnWithFixedStringData(orc_column, orc_type, column_name);
        }
        case orc::BOOLEAN:
            return readColumnWithBooleanData(orc_column, orc_type, column_name);
@ -1465,7 +1648,7 @@ static ColumnWithTypeAndName readColumnFromORCColumn(
        case orc::TIMESTAMP_INSTANT:
            return readColumnWithTimestampData(orc_column, orc_type, column_name);
        case orc::DECIMAL: {
-            auto interal_type = parseORCType(orc_type, false, skipped);
+            auto interal_type = parseORCType(orc_type, false, false, nullptr, skipped);

            auto precision = orc_type->getPrecision();
            if (precision == 0)
--- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h
+++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h
@ -111,7 +111,13 @@ public:
    using ORCColumnWithType = std::pair<ORCColumnPtr, ORCTypePtr>;
    using NameToColumnPtr = std::unordered_map<std::string, ORCColumnWithType>;

-    ORCColumnToCHColumn(const Block & header_, bool allow_missing_columns_, bool null_as_default_, bool case_insensitive_matching_ = false);
+    ORCColumnToCHColumn(
+        const Block & header_,
+        bool allow_missing_columns_,
+        bool null_as_default_,
+        bool case_insensitive_matching_ = false,
+        bool dictionary_as_low_cardinality_ = false,
+        std::unique_ptr<orc::StripeInformation> stripe_info_ = {});

    void orcTableToCHChunk(
        Chunk & res,
@ -124,11 +130,20 @@ public:
        Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows, BlockMissingValues * block_missing_values = nullptr);

 private:
+    ColumnWithTypeAndName readColumnFromORCColumn(
+        const orc::ColumnVectorBatch * orc_column,
+        const orc::Type * orc_type,
+        const std::string & column_name,
+        bool inside_nullable,
+        DataTypePtr type_hint = nullptr) const;
+
    const Block & header;
    /// If false, throw exception if some columns in header not exists in arrow table.
    bool allow_missing_columns;
    bool null_as_default;
    bool case_insensitive_matching;
+    bool dictionary_as_low_cardinality;
+    std::unique_ptr<orc::StripeInformation> stripe_info;
 };
 }
 #endif