diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index cb24554f9f4..50e9ece0399 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -436,6 +436,22 @@ static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr return nullmap_column; } +template +struct ArrowOffsetArray; + +template <> +struct ArrowOffsetArray +{ + using type = arrow::Int32Array; +}; + +template <> +struct ArrowOffsetArray +{ + using type = arrow::Int64Array; +}; + +template static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr & arrow_column) { auto offsets_column = ColumnUInt64::create(); @@ -444,9 +460,9 @@ static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptrnum_chunks(); chunk_i < num_chunks; ++chunk_i) { - arrow::ListArray & list_chunk = dynamic_cast(*(arrow_column->chunk(chunk_i))); + ArrowListArray & list_chunk = dynamic_cast(*(arrow_column->chunk(chunk_i))); auto arrow_offsets_array = list_chunk.offsets(); - auto & arrow_offsets = dynamic_cast(*arrow_offsets_array); + auto & arrow_offsets = dynamic_cast::type &>(*arrow_offsets_array); /* * CH uses element size as "offsets", while arrow uses actual offsets as offsets. @@ -602,13 +618,14 @@ static ColumnPtr readColumnWithIndexesData(std::shared_ptr } } +template static std::shared_ptr getNestedArrowColumn(std::shared_ptr & arrow_column) { arrow::ArrayVector array_vector; array_vector.reserve(arrow_column->num_chunks()); for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i) { - arrow::ListArray & list_chunk = dynamic_cast(*(arrow_column->chunk(chunk_i))); + ArrowListArray & list_chunk = dynamic_cast(*(arrow_column->chunk(chunk_i))); /* * It seems like arrow::ListArray::values() (nested column data) might or might not be shared across chunks. @@ -819,12 +836,12 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( key_type_hint = map_type_hint->getKeyType(); } } - auto arrow_nested_column = getNestedArrowColumn(arrow_column); + auto arrow_nested_column = getNestedArrowColumn(arrow_column); auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, date_time_overflow_behavior, nested_type_hint, true); if (skipped) return {}; - auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); + auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); const auto * tuple_column = assert_cast(nested_column.column.get()); const auto * tuple_type = assert_cast(nested_column.type.get()); @@ -846,7 +863,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( return {std::move(map_column), std::move(map_type), column_name}; } case arrow::Type::LIST: + case arrow::Type::LARGE_LIST: { + bool is_large = arrow_column->type()->id() == arrow::Type::LARGE_LIST; DataTypePtr nested_type_hint; if (type_hint) { @@ -854,11 +873,11 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( if (array_type_hint) nested_type_hint = array_type_hint->getNestedType(); } - auto arrow_nested_column = getNestedArrowColumn(arrow_column); + auto arrow_nested_column = is_large ? getNestedArrowColumn(arrow_column) : getNestedArrowColumn(arrow_column); auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, date_time_overflow_behavior, nested_type_hint); if (skipped) return {}; - auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); + auto offsets_column = is_large ? readOffsetsFromArrowListColumn(arrow_column) : readOffsetsFromArrowListColumn(arrow_column); auto array_column = ColumnArray::create(nested_column.column, offsets_column); auto array_type = std::make_shared(nested_column.type); return {std::move(array_column), std::move(array_type), column_name}; diff --git a/tests/queries/0_stateless/02911_arrow_large_list.reference b/tests/queries/0_stateless/02911_arrow_large_list.reference new file mode 100644 index 00000000000..a6fbcce8c06 --- /dev/null +++ b/tests/queries/0_stateless/02911_arrow_large_list.reference @@ -0,0 +1,4 @@ +a +Array(Nullable(String)) +['00000','00001','00002'] +['10000','10001','10002'] diff --git a/tests/queries/0_stateless/02911_arrow_large_list.sh b/tests/queries/0_stateless/02911_arrow_large_list.sh new file mode 100755 index 00000000000..9b1c9a9d0ed --- /dev/null +++ b/tests/queries/0_stateless/02911_arrow_large_list.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# ## generate arrow file with python +# import pyarrow as pa +# schema = pa.schema([ pa.field('a', pa.large_list(pa.utf8())) ]) +# a = pa.array([["00000", "00001", "00002"], ["10000", "10001", "10002"]]) +# with pa.OSFile('arraydata.arrow', 'wb') as sink: +# with pa.ipc.new_file(sink, schema=schema) as writer: +# batch = pa.record_batch([a], schema=schema) +# writer.write(batch) + +# cat arraydata.arrow | base64 + +cat <