Merge pull request #40173 from Avogar/arrow-dict

Improve and fix dictionaries in Arrow format
2024-11-22 07:31:57 +00:00 · 2022-08-18 20:54:55 +02:00 · 2022-08-18 20:54:55 +02:00 · b67cb9e378
commit b67cb9e378
parent 2cfeff45ba a6318cecd5
13 changed files with 301 additions and 114 deletions
--- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
+++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
@ -70,6 +70,7 @@ namespace ErrorCodes
    extern const int THERE_IS_NO_COLUMN;
    extern const int UNKNOWN_EXCEPTION;
    extern const int INCORRECT_NUMBER_OF_COLUMNS;
+    extern const int INCORRECT_DATA;
 }

 /// Inserts numeric data right into internal column data to reduce an overhead
@ -266,6 +267,9 @@ static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr<arrow::Ch
 /// Creates a null bytemap from arrow's null bitmap
 static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
 {
+    if (!arrow_column->null_count())
+        return ColumnUInt8::create(arrow_column->length(), 0);
+
    auto nullmap_column = ColumnUInt8::create();
    PaddedPODArray<UInt8> & bytemap_data = assert_cast<ColumnVector<UInt8> &>(*nullmap_column).getData();
    bytemap_data.reserve(arrow_column->length());
@ -298,14 +302,121 @@ static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedAr
    return offsets_column;
 }

-static ColumnPtr readColumnWithIndexesData(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
+/*
+ * Arrow Dictionary and ClickHouse LowCardinality types are a bit different.
+ * Dictionary(Nullable(X)) in ArrowColumn format is composed of a nullmap, dictionary and an index.
+ * It doesn't have the concept of null or default values.
+ * An empty string is just a regular value appended at any position of the dictionary.
+ * Null values have an index of 0, but it should be ignored since the nullmap will return null.
+ * In ClickHouse LowCardinality, it's different. The dictionary contains null (if dictionary type is Nullable)
+ * and default values at the beginning. [default, ...] when default values have index of 0 or [null, default, ...]
+ * when null values have an index of 0 and default values have an index of 1.
+ * So, we should remap indexes while converting Arrow Dictionary to ClickHouse LowCardinality
+ * */
+template <typename NumericType, typename VectorType = ColumnVector<NumericType>>
+static ColumnWithTypeAndName readColumnWithIndexesDataImpl(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name, Int64 default_value_index, NumericType dict_size, bool is_nullable)
+{
+    auto internal_type = std::make_shared<DataTypeNumber<NumericType>>();
+    auto internal_column = internal_type->createColumn();
+    auto & column_data = static_cast<VectorType &>(*internal_column).getData();
+    column_data.reserve(arrow_column->length());
+    NumericType shift = is_nullable ? 2 : 1;
+
+    for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
+    {
+        std::shared_ptr<arrow::Array> chunk = arrow_column->chunk(chunk_i);
+        if (chunk->length() == 0)
+            continue;
+
+        /// buffers[0] is a null bitmap and buffers[1] are actual values
+        std::shared_ptr<arrow::Buffer> buffer = chunk->data()->buffers[1];
+        const auto * data = reinterpret_cast<const NumericType *>(buffer->data());
+
+        /// Check that indexes are correct (protection against corrupted files)
+        for (int64_t i = 0; i != chunk->length(); ++i)
+        {
+            if (data[i] < 0 || data[i] >= dict_size)
+                throw Exception(ErrorCodes::INCORRECT_DATA, "Index {} in Dictionary column is out of bounds, dictionary size is {}", Int64(data[i]), UInt64(dict_size));
+        }
+
+        /// If dictionary type is not nullable and arrow dictionary contains default type
+        /// at 0 index, we don't need to remap anything (it's the case when this data
+        /// was generated by ClickHouse)
+        if (!is_nullable && default_value_index == 0)
+        {
+            column_data.insert_assume_reserved(data, data + chunk->length());
+        }
+        /// If dictionary don't contain default value, we should move all indexes
+        /// to the right one or two (if dictionary is Nullable) positions
+        /// Example:
+        /// Dictionary:
+        ///     dict: ["one", "two"]
+        ///     indexes: [0, 1, 0]
+        /// LowCardinality:
+        ///     dict: ["", "one", "two"]
+        ///     indexes: [1, 2, 1]
+        /// LowCardinality(Nullable):
+        ///     dict: [null, "", "one", "two"]
+        ///     indexes: [2, 3, 2]
+        else if (default_value_index == -1)
+        {
+            for (int64_t i = 0; i != chunk->length(); ++i)
+            {
+                if (chunk->IsNull(i))
+                    column_data.push_back(0);
+                else
+                    column_data.push_back(data[i] + shift);
+            }
+        }
+        /// If dictionary contains default value, we change all indexes of it to
+        /// 0 or 1 (if dictionary type is Nullable) and move all indexes
+        /// that are less then default value index to the right one or two
+        /// (if dictionary is Nullable) position and all indexes that are
+        /// greater then default value index zero or one (if dictionary is Nullable)
+        /// positions.
+        /// Example:
+        /// Dictionary:
+        ///     dict: ["one", "two", "", "three"]
+        ///     indexes: [0, 1, 2, 3, 0]
+        /// LowCardinality :
+        ///     dict: ["", "one", "two", "three"]
+        ///     indexes: [1, 2, 0, 3, 1]
+        /// LowCardinality(Nullable):
+        ///     dict: [null, "", "one", "two", "three"]
+        ///     indexes: [2, 3, 1, 4, 2]
+        else
+        {
+            NumericType new_default_index = is_nullable ? 1 : 0;
+            NumericType default_index = NumericType(default_value_index);
+            for (int64_t i = 0; i != chunk->length(); ++i)
+            {
+                if (chunk->IsNull(i))
+                    column_data.push_back(0);
+                else
+                {
+                    NumericType value = data[i];
+                    if (value == default_index)
+                        value = new_default_index;
+                    else if (value < default_index)
+                        value += shift;
+                    else
+                        value += shift - 1;
+                    column_data.push_back(value);
+                }
+            }
+        }
+    }
+    return {std::move(internal_column), std::move(internal_type), column_name};
+}
+
+static ColumnPtr readColumnWithIndexesData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, Int64 default_value_index, UInt64 dict_size, bool is_nullable)
 {
    switch (arrow_column->type()->id())
    {
 #    define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
            case ARROW_NUMERIC_TYPE: \
            { \
-                    return readColumnWithNumericData<CPP_NUMERIC_TYPE>(arrow_column, "").column; \
+                    return readColumnWithIndexesDataImpl<CPP_NUMERIC_TYPE>(arrow_column, "", default_value_index, dict_size, is_nullable).column; \
            }
        FOR_ARROW_INDEXES_TYPES(DISPATCH)
 #    undef DISPATCH
@ -327,85 +438,25 @@ static std::shared_ptr<arrow::ChunkedArray> getNestedArrowColumn(std::shared_ptr
    return std::make_shared<arrow::ChunkedArray>(array_vector);
 }

-static ColumnWithTypeAndName createLCColumnFromArrowDictionaryValues(
-    const std::shared_ptr<ColumnWithTypeAndName> & dict_values,
-    const ColumnPtr & indexes_column,
-    const String & column_name
-)
-{
-    auto lc_type = std::make_shared<DataTypeLowCardinality>(dict_values->type);
-
-    auto lc_column = lc_type->createColumn();
-
-    for (auto i = 0u; i < indexes_column->size(); i++)
-    {
-        Field f;
-        dict_values->column->get(indexes_column->getUInt(i), f);
-        lc_column->insert(f);
-    }
-
-    return {std::move(lc_column), std::move(lc_type), column_name};
-}
-
-/*
- * Dictionary(Nullable(X)) in ArrowColumn format is composed of a nullmap, dictionary and an index.
- * It doesn't have the concept of null or default values.
- * An empty string is just a regular value appended at any position of the dictionary.
- * Null values have an index of 0, but it should be ignored since the nullmap will return null.
- * In ClickHouse LowCardinality, it's different. The dictionary contains null and default values at the beginning.
- * [null, default, ...]. Therefore, null values have an index of 0 and default values have an index of 1.
- * No nullmap is used.
- * */
-static ColumnWithTypeAndName createLCOfNullableColumnFromArrowDictionaryValues(
-    const std::shared_ptr<ColumnWithTypeAndName> & dict_values,
-    const ColumnPtr & indexes_column,
-    const ColumnPtr & nullmap_column,
-    const String & column_name
-)
-{
-    /*
-     * ArrowColumn format handles nulls by maintaining a nullmap column, there is no nullable type.
-     * Therefore, dict_values->type is the actual data type/ non-nullable. It needs to be transformed into nullable
-     * so LC column is created from nullable type and a null value at the beginning of the collection
-     * is automatically added.
-     * */
-    auto lc_type = std::make_shared<DataTypeLowCardinality>(makeNullable(dict_values->type));
-
-    auto lc_column = lc_type->createColumn();
-
-    for (auto i = 0u; i < indexes_column->size(); i++)
-    {
-        if (nullmap_column && nullmap_column->getBool(i))
-        {
-            lc_column->insertDefault();
-        }
-        else
-        {
-            Field f;
-            dict_values->column->get(indexes_column->getUInt(i), f);
-            lc_column->insert(f);
-        }
-    }
-
-    return {std::move(lc_column), std::move(lc_type), column_name};
-}
-
 static ColumnWithTypeAndName readColumnFromArrowColumn(
    std::shared_ptr<arrow::ChunkedArray> & arrow_column,
    const std::string & column_name,
    const std::string & format_name,
    bool is_nullable,
-    std::unordered_map<String, std::shared_ptr<ColumnWithTypeAndName>> & dictionary_values,
-    bool read_ints_as_dates,
+    std::unordered_map<String, ArrowColumnToCHColumn::DictionaryInfo> & dictionary_infos,
    bool allow_null_type,
    bool skip_columns_with_unsupported_types,
-    bool & skipped)
+    bool & skipped,
+    DataTypePtr type_hint = nullptr)
 {
-    if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST
+    if (!is_nullable && (arrow_column->null_count() || (type_hint && type_hint->isNullable())) && arrow_column->type()->id() != arrow::Type::LIST
        && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT &&
        arrow_column->type()->id() != arrow::Type::DICTIONARY)
    {
-        auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
+        DataTypePtr nested_type_hint;
+        if (type_hint)
+            nested_type_hint = removeNullable(type_hint);
+        auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
        if (skipped)
            return {};
        auto nullmap_column = readByteMapFromArrowColumn(arrow_column);
@ -435,14 +486,14 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
        case arrow::Type::UINT16:
        {
            auto column = readColumnWithNumericData<UInt16>(arrow_column, column_name);
-            if (read_ints_as_dates)
+            if (type_hint && (isDateOrDate32(type_hint) || isDateTime(type_hint) || isDateTime64(type_hint)))
                column.type = std::make_shared<DataTypeDate>();
            return column;
        }
        case arrow::Type::UINT32:
        {
            auto column = readColumnWithNumericData<UInt32>(arrow_column, column_name);
-            if (read_ints_as_dates)
+            if (type_hint && (isDateOrDate32(type_hint) || isDateTime(type_hint) || isDateTime64(type_hint)))
                column.type = std::make_shared<DataTypeDateTime>();
            return column;
        }
@ -454,8 +505,15 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
            return readColumnWithDecimalData<arrow::Decimal256Array>(arrow_column, column_name);
        case arrow::Type::MAP:
        {
+            DataTypePtr nested_type_hint;
+            if (type_hint)
+            {
+                const auto * map_type_hint = typeid_cast<const DataTypeMap *>(type_hint.get());
+                if (map_type_hint)
+                    nested_type_hint = assert_cast<const DataTypeArray *>(map_type_hint->getNestedType().get())->getNestedType();
+            }
            auto arrow_nested_column = getNestedArrowColumn(arrow_column);
-            auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
+            auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
            if (skipped)
                return {};

@ -469,8 +527,15 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
        }
        case arrow::Type::LIST:
        {
+            DataTypePtr nested_type_hint;
+            if (type_hint)
+            {
+                const auto * array_type_hint = typeid_cast<const DataTypeArray *>(type_hint.get());
+                if (array_type_hint)
+                    nested_type_hint = array_type_hint->getNestedType();
+            }
            auto arrow_nested_column = getNestedArrowColumn(arrow_column);
-            auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
+            auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
            if (skipped)
                return {};
            auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
@ -493,11 +558,25 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
            Columns tuple_elements;
            DataTypes tuple_types;
            std::vector<String> tuple_names;
+            const auto * tuple_type_hint = type_hint ? typeid_cast<const DataTypeTuple *>(type_hint.get()) : nullptr;

            for (int i = 0; i != arrow_struct_type->num_fields(); ++i)
            {
+                auto field_name = arrow_struct_type->field(i)->name();
+                DataTypePtr nested_type_hint;
+                if (tuple_type_hint)
+                {
+                    if (tuple_type_hint->haveExplicitNames())
+                    {
+                        auto pos = tuple_type_hint->tryGetPositionByName(field_name);
+                        if (pos)
+                            nested_type_hint = tuple_type_hint->getElement(*pos);
+                    }
+                    else if (size_t(i) < tuple_type_hint->getElements().size())
+                        nested_type_hint = tuple_type_hint->getElement(i);
+                }
                auto nested_arrow_column = std::make_shared<arrow::ChunkedArray>(nested_arrow_columns[i]);
-                auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
+                auto element = readColumnFromArrowColumn(nested_arrow_column, field_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
                if (skipped)
                    return {};
                tuple_elements.emplace_back(std::move(element.column));
@ -511,9 +590,11 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
        }
        case arrow::Type::DICTIONARY:
        {
-            auto & dict_values = dictionary_values[column_name];
+            auto & dict_info = dictionary_infos[column_name];
+            const auto is_lc_nullable = arrow_column->null_count() > 0 || (type_hint && type_hint->isLowCardinalityNullable());
+
            /// Load dictionary values only once and reuse it.
-            if (!dict_values)
+            if (!dict_info.values)
            {
                arrow::ArrayVector dict_array;
                for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@ -522,8 +603,22 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
                    dict_array.emplace_back(dict_chunk.dictionary());
                }
                auto arrow_dict_column = std::make_shared<arrow::ChunkedArray>(dict_array);
-                auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
-                dict_values = std::make_shared<ColumnWithTypeAndName>(std::move(dict_column));
+                auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped);
+                for (size_t i = 0; i != dict_column.column->size(); ++i)
+                {
+                    if (dict_column.column->isDefaultAt(i))
+                    {
+                        dict_info.default_value_index = i;
+                        break;
+                    }
+                }
+                auto lc_type = std::make_shared<DataTypeLowCardinality>(is_lc_nullable ? makeNullable(dict_column.type) : dict_column.type);
+                auto tmp_lc_column = lc_type->createColumn();
+                auto tmp_dict_column = IColumn::mutate(assert_cast<ColumnLowCardinality *>(tmp_lc_column.get())->getDictionaryPtr());
+                dynamic_cast<IColumnUnique *>(tmp_dict_column.get())->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size());
+                dict_column.column = std::move(tmp_dict_column);
+                dict_info.values = std::make_shared<ColumnWithTypeAndName>(std::move(dict_column));
+                dict_info.dictionary_size = arrow_dict_column->length();
            }

            arrow::ArrayVector indexes_array;
@ -534,20 +629,10 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
            }

            auto arrow_indexes_column = std::make_shared<arrow::ChunkedArray>(indexes_array);
-            auto indexes_column = readColumnWithIndexesData(arrow_indexes_column);
-
-            const auto contains_null = arrow_column->null_count() > 0;
-
-            if (contains_null)
-            {
-                auto nullmap_column = readByteMapFromArrowColumn(arrow_column);
-
-                return createLCOfNullableColumnFromArrowDictionaryValues(dict_values, indexes_column, nullmap_column, column_name);
-            }
-            else
-            {
-                return createLCColumnFromArrowDictionaryValues(dict_values, indexes_column, column_name);
-            }
+            auto indexes_column = readColumnWithIndexesData(arrow_indexes_column, dict_info.default_value_index, dict_info.dictionary_size, is_lc_nullable);
+            auto lc_column = ColumnLowCardinality::create(dict_info.values->column, indexes_column);
+            auto lc_type = std::make_shared<DataTypeLowCardinality>(is_lc_nullable ? makeNullable(dict_info.values->type) : dict_info.values->type);
+            return {std::move(lc_column), std::move(lc_type), column_name};
        }
 #    define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
        case ARROW_NUMERIC_TYPE: \
@ -623,13 +708,13 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(

        arrow::ArrayVector array_vector = {arrow_array};
        auto arrow_column = std::make_shared<arrow::ChunkedArray>(array_vector);
-        std::unordered_map<std::string, std::shared_ptr<ColumnWithTypeAndName>> dict_values;
+        std::unordered_map<std::string, DictionaryInfo> dict_infos;
        bool skipped = false;
        bool allow_null_type = false;
        if (hint_header && hint_header->has(field->name()) && hint_header->getByName(field->name()).type->isNullable())
            allow_null_type = true;
        ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(
-            arrow_column, field->name(), format_name, false, dict_values, false, allow_null_type, skip_columns_with_unsupported_types, skipped);
+            arrow_column, field->name(), format_name, false, dict_infos, allow_null_type, skip_columns_with_unsupported_types, skipped);
        if (!skipped)
            sample_columns.emplace_back(std::move(sample_column));
    }
@ -700,9 +785,17 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
                {
                    if (!nested_tables.contains(search_nested_table_name))
                    {
+                        NamesAndTypesList nested_columns;
+                        for (const auto & name_and_type : header.getNamesAndTypesList())
+                        {
+                            if (name_and_type.name.starts_with(nested_table_name + "."))
+                                nested_columns.push_back(name_and_type);
+                        }
+                        auto nested_table_type = Nested::collect(nested_columns).front().type;
+
                        std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[search_nested_table_name];
                        ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(
-                            arrow_column, nested_table_name, format_name, false, dictionary_values, true, true, false, skipped)};
+                            arrow_column, nested_table_name, format_name, false, dictionary_infos, true, false, skipped, nested_table_type)};
                        BlockPtr block_ptr = std::make_shared<Block>(cols);
                        auto column_extractor = std::make_shared<NestedColumnExtractHelper>(*block_ptr, case_insensitive_matching);
                        nested_tables[search_nested_table_name] = {block_ptr, column_extractor};
@ -735,7 +828,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
        {
            auto arrow_column = name_to_column_ptr[search_column_name];
            column = readColumnFromArrowColumn(
-                arrow_column, header_column.name, format_name, false, dictionary_values, true, true, false, skipped);
+                arrow_column, header_column.name, format_name, false, dictionary_infos, true, false, skipped, header_column.type);
        }

        try
--- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h
+++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h
@ -44,6 +44,14 @@ public:
        const Block * hint_header = nullptr,
        bool ignore_case = false);

+    struct DictionaryInfo
+    {
+        std::shared_ptr<ColumnWithTypeAndName> values;
+        Int64 default_value_index = -1;
+        UInt64 dictionary_size;
+    };
+
+
 private:
    const Block & header;
    const std::string format_name;
@ -55,7 +63,7 @@ private:
    /// Map {column name : dictionary column}.
    /// To avoid converting dictionary from Arrow Dictionary
    /// to LowCardinality every chunk we save it and reuse.
-    std::unordered_map<std::string, std::shared_ptr<ColumnWithTypeAndName>> dictionary_values;
+    std::unordered_map<std::string, DictionaryInfo> dictionary_infos;
 };

 }
--- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp
+++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp
@ -235,27 +235,30 @@ namespace DB
    }

    template<typename T>
-    static PaddedPODArray<Int64> extractIndexesImpl(ColumnPtr column, size_t start, size_t end)
+    static PaddedPODArray<Int64> extractIndexesImpl(ColumnPtr column, size_t start, size_t end, bool shift)
    {
        const PaddedPODArray<T> & data = assert_cast<const ColumnVector<T> *>(column.get())->getData();
        PaddedPODArray<Int64> result;
        result.reserve(end - start);
-        std::transform(data.begin() + start, data.begin() + end, std::back_inserter(result), [](T value) { return Int64(value); });
+        if (shift)
+            std::transform(data.begin() + start, data.begin() + end, std::back_inserter(result), [](T value) { return Int64(value) - 1; });
+        else
+            std::transform(data.begin() + start, data.begin() + end, std::back_inserter(result), [](T value) { return Int64(value); });
        return result;
    }

-    static PaddedPODArray<Int64> extractIndexesImpl(ColumnPtr column, size_t start, size_t end)
+    static PaddedPODArray<Int64> extractIndexesImpl(ColumnPtr column, size_t start, size_t end, bool shift)
    {
        switch (column->getDataType())
        {
            case TypeIndex::UInt8:
-                return extractIndexesImpl<UInt8>(column, start, end);
+                return extractIndexesImpl<UInt8>(column, start, end, shift);
            case TypeIndex::UInt16:
-                return extractIndexesImpl<UInt16>(column, start, end);
+                return extractIndexesImpl<UInt16>(column, start, end, shift);
            case TypeIndex::UInt32:
-                return extractIndexesImpl<UInt32>(column, start, end);
+                return extractIndexesImpl<UInt32>(column, start, end, shift);
            case TypeIndex::UInt64:
-                return extractIndexesImpl<UInt64>(column, start, end);
+                return extractIndexesImpl<UInt64>(column, start, end, shift);
            default:
                throw Exception(fmt::format("Indexes column must be ColumnUInt, got {}.", column->getName()),
                                ErrorCodes::LOGICAL_ERROR);
@ -267,7 +270,7 @@ namespace DB
        const String & column_name,
        ColumnPtr & column,
        const std::shared_ptr<const IDataType> & column_type,
-        const PaddedPODArray<UInt8> * null_bytemap,
+        const PaddedPODArray<UInt8> *,
        arrow::ArrayBuilder * array_builder,
        String format_name,
        size_t start,
@ -278,6 +281,7 @@ namespace DB
        const auto * column_lc = assert_cast<const ColumnLowCardinality *>(column.get());
        arrow::DictionaryBuilder<ValueType> * builder = assert_cast<arrow::DictionaryBuilder<ValueType> *>(array_builder);
        auto & dict_values = dictionary_values[column_name];
+        bool is_nullable = column_type->isLowCardinalityNullable();

        /// Convert dictionary from LowCardinality to Arrow dictionary only once and then reuse it.
        if (!dict_values)
@ -288,9 +292,9 @@ namespace DB
            arrow::Status status = MakeBuilder(pool, value_type, &values_builder);
            checkStatus(status, column->getName(), format_name);

-            auto dict_column = column_lc->getDictionary().getNestedColumn();
-            const auto & dict_type = assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType();
-            fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), output_string_as_string, dictionary_values);
+            auto dict_column = column_lc->getDictionary().getNestedNotNullableColumn();
+            const auto & dict_type = removeNullable(assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType());
+            fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, is_nullable, dict_column->size(), output_string_as_string, dictionary_values);
            status = values_builder->Finish(&dict_values);
            checkStatus(status, column->getName(), format_name);
        }
@ -300,15 +304,14 @@ namespace DB

        /// AppendIndices in DictionaryBuilder works only with int64_t data, so we cannot use
        /// fillArrowArray here and should copy all indexes to int64_t container.
-        auto indexes = extractIndexesImpl(column_lc->getIndexesPtr(), start, end);
+        auto indexes = extractIndexesImpl(column_lc->getIndexesPtr(), start, end, is_nullable);
        const uint8_t * arrow_null_bytemap_raw_ptr = nullptr;
        PaddedPODArray<uint8_t> arrow_null_bytemap;
-        if (null_bytemap)
+        if (column_type->isLowCardinalityNullable())
        {
-            /// Invert values since Arrow interprets 1 as a non-null value, while CH as a null
            arrow_null_bytemap.reserve(end - start);
            for (size_t i = start; i < end; ++i)
-                arrow_null_bytemap.emplace_back(!(*null_bytemap)[i]);
+                arrow_null_bytemap.emplace_back(!column_lc->isNullAt(i));

            arrow_null_bytemap_raw_ptr = arrow_null_bytemap.data();
        }
@ -680,7 +683,7 @@ namespace DB
        {
            auto nested_type = assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType();
            const auto * lc_column = assert_cast<const ColumnLowCardinality *>(column.get());
-            const auto & nested_column = lc_column->getDictionaryPtr();
+            const auto & nested_column = lc_column->getDictionary().getNestedColumn();
            const auto & indexes_column = lc_column->getIndexesPtr();
            return arrow::dictionary(
                getArrowTypeForLowCardinalityIndexes(indexes_column),
--- a/tests/performance/arrow_format.xml
+++ b/tests/performance/arrow_format.xml
@ -0,0 +1,10 @@
+<test>
+    <create_query>CREATE TABLE test (uint32 UInt32, n_uint32 Nullable(UInt32), lc LowCardinality(String)) ENGINE=File(Arrow) SETTINGS output_format_arrow_low_cardinality_as_dictionary=1</create_query>
+    <fill_query>insert into test select number, number, toString(number % 10000) from numbers(10000000)</fill_query>
+
+    <query>SELECT uint32 from test format Null</query>
+    <query>SELECT n_uint32 from test format Null</query>
+    <query>SELECT lc from test format Null</query>
+
+    <drop_query>DROP TABLE IF EXISTS test</drop_query>
+</test>
--- a/tests/queries/0_stateless/02383_arrow_dict_special_cases.reference
+++ b/tests/queries/0_stateless/02383_arrow_dict_special_cases.reference
@ -0,0 +1,32 @@
+dict	LowCardinality(Nullable(String))					
+one
+two
+three
+one
+two
+dict	LowCardinality(Nullable(String))					
+
+one
+two
+three
+
+one
+three
+dict	LowCardinality(Nullable(String))					
+one
+two
+
+three
+one
+two
+three
+lc	LowCardinality(Nullable(String))					
+OK
+dict	LowCardinality(Nullable(String))					
+one
+two
+
+three
+one
+\N
+three
--- a/tests/queries/0_stateless/02383_arrow_dict_special_cases.sh
+++ b/tests/queries/0_stateless/02383_arrow_dict_special_cases.sh
@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Tags: no-fasttest
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
+
+mkdir -p $USER_FILES_PATH/test_02383
+cp $CURDIR/data_arrow/dictionary*.arrow $USER_FILES_PATH/test_02383/
+cp $CURDIR/data_arrow/corrupted.arrow $USER_FILES_PATH/test_02383/
+cp $CURDIR/data_arrow/dict_with_nulls.arrow $USER_FILES_PATH/test_02383/
+
+$CLICKHOUSE_CLIENT -q "desc file('test_02383/dictionary1.arrow')"
+$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dictionary1.arrow')"
+$CLICKHOUSE_CLIENT -q "desc file('test_02383/dictionary2.arrow')"
+$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dictionary2.arrow')"
+$CLICKHOUSE_CLIENT -q "desc file('test_02383/dictionary3.arrow')"
+$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dictionary3.arrow')"
+
+$CLICKHOUSE_CLIENT -q "desc file('test_02383/corrupted.arrow')"
+$CLICKHOUSE_CLIENT -q "select * from file('test_02383/corrupted.arrow')" 2>&1 | grep -F -q "INCORRECT_DATA" && echo OK || echo FAIL
+
+$CLICKHOUSE_CLIENT -q "desc file('test_02383/dict_with_nulls.arrow')"
+$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dict_with_nulls.arrow')"
+
+
+rm -rf $USER_FILES_PATH/test_02383
--- a/tests/queries/0_stateless/02384_nullable_low_cardinality_as_dict_in_arrow.reference
+++ b/tests/queries/0_stateless/02384_nullable_low_cardinality_as_dict_in_arrow.reference
@ -0,0 +1,4 @@
+lc	LowCardinality(Nullable(String))					
+abc
+lc	LowCardinality(Nullable(String))					
+abc
--- a/tests/queries/0_stateless/02384_nullable_low_cardinality_as_dict_in_arrow.sql
+++ b/tests/queries/0_stateless/02384_nullable_low_cardinality_as_dict_in_arrow.sql
@ -0,0 +1,8 @@
+-- Tags: no-fasttest
+
+insert into function file(02384_data.arrow) select toLowCardinality(toNullable('abc')) as lc settings output_format_arrow_low_cardinality_as_dictionary=1, output_format_arrow_string_as_string=0, engine_file_truncate_on_insert=1;
+desc file(02384_data.arrow);
+select * from file(02384_data.arrow);
+insert into function file(02384_data.arrow) select toLowCardinality(toNullable('abc')) as lc settings output_format_arrow_low_cardinality_as_dictionary=1, output_format_arrow_string_as_string=1, engine_file_truncate_on_insert=1;
+desc file(02384_data.arrow);
+select * from file(02384_data.arrow);
--- a/tests/queries/0_stateless/data_arrow/corrupted.arrow
+++ b/tests/queries/0_stateless/data_arrow/corrupted.arrow
--- a/tests/queries/0_stateless/data_arrow/dict_with_nulls.arrow
+++ b/tests/queries/0_stateless/data_arrow/dict_with_nulls.arrow
--- a/tests/queries/0_stateless/data_arrow/dictionary1.arrow
+++ b/tests/queries/0_stateless/data_arrow/dictionary1.arrow
--- a/tests/queries/0_stateless/data_arrow/dictionary2.arrow
+++ b/tests/queries/0_stateless/data_arrow/dictionary2.arrow
--- a/tests/queries/0_stateless/data_arrow/dictionary3.arrow
+++ b/tests/queries/0_stateless/data_arrow/dictionary3.arrow