From 8ed6ad7c55d248768cd696af2f97eb7c246ad3ef Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Wed, 12 May 2021 19:06:08 +0300 Subject: [PATCH] Refactor, support all types in array, add nested arrays support, more tests --- src/Columns/IColumn.cpp | 6 + src/Columns/IColumn.h | 2 + .../Formats/Impl/ArrowColumnToCHColumn.cpp | 760 ++++-------------- .../Formats/Impl/CHColumnToArrowColumn.cpp | 338 ++++---- .../Formats/Impl/CHColumnToArrowColumn.h | 10 +- .../00900_orc_arrays_load.reference | 4 + .../0_stateless/00900_orc_arrays_load.sh | 15 + .../00900_orc_nested_arrays_load.reference | 2 + .../00900_orc_nested_arrays_load.sh | 15 + .../00900_orc_nullable_arrays_load.reference | 6 + .../00900_orc_nullable_arrays_load.sh | 15 + .../0_stateless/00900_parquet.reference | 12 + tests/queries/0_stateless/00900_parquet.sh | 31 + .../0_stateless/00900_parquet_load.reference | 20 +- .../0_stateless/data_orc/array_test.orc | Bin 0 -> 3870 bytes .../data_orc/nested_array_test.orc | Bin 0 -> 1344 bytes .../data_orc/nullable_array_test.orc | Bin 0 -> 714 bytes tests/queries/0_stateless/data_orc/tmp | Bin 0 -> 714 bytes .../data_parquet/00900_parquet_load.reference | 0 .../data_parquet/alltypes_list.parquet | Bin 0 -> 13405 bytes .../alltypes_list.parquet.columns | 1 + .../data_parquet/array_float.parquet | Bin 520 -> 668 bytes .../data_parquet/array_int.parquet | Bin 517 -> 724 bytes .../data_parquet/list_columns.parquet.columns | 2 +- .../data_parquet/nested_lists.parquet | Bin 0 -> 1755 bytes .../data_parquet/nested_lists.parquet.columns | 1 + .../nested_lists.snappy.parquet.columns | 2 +- .../data_parquet/nullable_list.parquet | Bin 0 -> 2138 bytes .../nullable_list.parquet.columns | 1 + 29 files changed, 441 insertions(+), 802 deletions(-) create mode 100644 tests/queries/0_stateless/00900_orc_arrays_load.reference create mode 100755 tests/queries/0_stateless/00900_orc_arrays_load.sh create mode 100644 tests/queries/0_stateless/00900_orc_nested_arrays_load.reference create mode 100755 tests/queries/0_stateless/00900_orc_nested_arrays_load.sh create mode 100644 tests/queries/0_stateless/00900_orc_nullable_arrays_load.reference create mode 100755 tests/queries/0_stateless/00900_orc_nullable_arrays_load.sh create mode 100644 tests/queries/0_stateless/data_orc/array_test.orc create mode 100644 tests/queries/0_stateless/data_orc/nested_array_test.orc create mode 100644 tests/queries/0_stateless/data_orc/nullable_array_test.orc create mode 100644 tests/queries/0_stateless/data_orc/tmp create mode 100644 tests/queries/0_stateless/data_parquet/00900_parquet_load.reference create mode 100644 tests/queries/0_stateless/data_parquet/alltypes_list.parquet create mode 100644 tests/queries/0_stateless/data_parquet/alltypes_list.parquet.columns create mode 100644 tests/queries/0_stateless/data_parquet/nested_lists.parquet create mode 100644 tests/queries/0_stateless/data_parquet/nested_lists.parquet.columns create mode 100644 tests/queries/0_stateless/data_parquet/nullable_list.parquet create mode 100644 tests/queries/0_stateless/data_parquet/nullable_list.parquet.columns diff --git a/src/Columns/IColumn.cpp b/src/Columns/IColumn.cpp index a3ed0885651..2e61dc50fa4 100644 --- a/src/Columns/IColumn.cpp +++ b/src/Columns/IColumn.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -40,4 +41,9 @@ bool isColumnConst(const IColumn & column) return checkColumn(column); } +bool isColumnArray(const IColumn & column) +{ + return checkColumn(column); +} + } diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 1dedd191e1d..ac8e9f9872a 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -530,4 +530,6 @@ bool isColumnConst(const IColumn & column); /// True if column's an ColumnNullable instance. It's just a syntax sugar for type check. bool isColumnNullable(const IColumn & column); +bool isColumnArray(const IColumn & column); + } diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 144ccef1fe3..cef9bd79b2a 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -17,6 +17,7 @@ #include #include + namespace DB { namespace ErrorCodes @@ -57,44 +58,11 @@ namespace DB // Full list of types: contrib/arrow/cpp/src/arrow/type.h }; - template - static void reserveArrayColumn(std::shared_ptr & arrow_column, ColumnArray & array_column, NestedColumnVector & nested_column) - { - size_t nested_column_length = 0; - for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) - { - arrow::ListArray & chunk = static_cast(*(arrow_column->chunk(chunk_i))); - - for (int64_t array_idx = 0; array_idx != chunk.length(); ++array_idx) - { - const std::shared_ptr array = chunk.value_slice(array_idx); - nested_column_length += array->length(); - } - } - array_column.reserve(arrow_column->length()); - nested_column.reserve(nested_column_length); - } - - /// Creates a null bytemap from arrow's null bitmap - static void fillByteMapFromArrowColumn(std::shared_ptr & arrow_column, MutableColumnPtr & bytemap) - { - PaddedPODArray & bytemap_data = assert_cast &>(*bytemap).getData(); - bytemap_data.reserve(arrow_column->length()); - - for (size_t chunk_i = 0; chunk_i != static_cast(arrow_column->num_chunks()); ++chunk_i) - { - std::shared_ptr chunk = arrow_column->chunk(chunk_i); - - for (size_t value_i = 0; value_i != static_cast(chunk->length()); ++value_i) - bytemap_data.emplace_back(chunk->IsNull(value_i)); - } - } - /// Inserts numeric data right into internal column data to reduce an overhead template > - static void fillColumnWithNumericData(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) + static void fillColumnWithNumericData(std::shared_ptr & arrow_column, IColumn & internal_column) { - auto & column_data = static_cast(*internal_column).getData(); + auto & column_data = static_cast(internal_column).getData(); column_data.reserve(arrow_column->length()); for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) @@ -108,70 +76,13 @@ namespace DB } } - template > - static void fillColumnWithArrayNumericData(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) +/// Inserts chars and offsets right into internal column data to reduce an overhead. +/// Internal offsets are shifted by one to the right in comparison with Arrow ones. So the last offset should map to the end of all chars. +/// Also internal strings are null terminated. + static void fillColumnWithStringData(std::shared_ptr & arrow_column, IColumn & internal_column) { - ColumnArray & column_array = assert_cast(*internal_column); - ColumnArray::Offsets & column_array_offsets = column_array.getOffsets(); - - const bool is_column_array_nullable = column_array.getData().isNullable(); - ColumnNullable * column_nullable = is_column_array_nullable ? static_cast(&column_array.getData()) : nullptr; - - IColumn & array_nested_column = - is_column_array_nullable ? static_cast(column_array.getData()).getNestedColumn() : - column_array.getData(); - VectorType & nested_column = static_cast(array_nested_column); - auto & nested_column_data = nested_column.getData(); - - reserveArrayColumn(arrow_column, column_array, nested_column); - - for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) - { - arrow::ListArray & list_chunk = static_cast(*(arrow_column->chunk(chunk_i))); - - const std::shared_ptr array = list_chunk.values(); - std::shared_ptr buffer = array->data()->buffers[1]; - const auto * raw_data = reinterpret_cast(buffer->data()); - nested_column_data.insert_assume_reserved(raw_data, raw_data + array->length()); - - for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx) - { - const std::shared_ptr chunk = list_chunk.value_slice(array_idx); - /// buffers[0] is a null bitmap and buffers[1] are actual values - std::shared_ptr inner_buffer = chunk->data()->buffers[1]; - - const size_t chunk_length = list_chunk.value_length(array_idx); - - for (size_t offset_i = 0; offset_i != chunk_length; ++offset_i) - { - if (!chunk->IsNull(offset_i) && inner_buffer) - { - if (is_column_array_nullable && column_nullable) - { - column_nullable->getNullMapData().push_back(0); - } - } - else - { - if (is_column_array_nullable && column_nullable) - { - column_nullable->getNullMapData().push_back(1); - } - } - } - - column_array_offsets.emplace_back(column_array_offsets.back() + chunk->length()); - } - } - } - - /// Inserts chars and offsets right into internal column data to reduce an overhead. - /// Internal offsets are shifted by one to the right in comparison with Arrow ones. So the last offset should map to the end of all chars. - /// Also internal strings are null terminated. - static void fillColumnWithStringData(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) - { - PaddedPODArray & column_chars_t = assert_cast(*internal_column).getChars(); - PaddedPODArray & column_offsets = assert_cast(*internal_column).getOffsets(); + PaddedPODArray & column_chars_t = assert_cast(internal_column).getChars(); + PaddedPODArray & column_offsets = assert_cast(internal_column).getOffsets(); size_t chars_t_size = 0; for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) @@ -206,83 +117,9 @@ namespace DB } } - static void fillColumnWithArrayStringData(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) + static void fillColumnWithBooleanData(std::shared_ptr & arrow_column, IColumn & internal_column) { - ColumnArray & column_array = assert_cast(*internal_column); - ColumnArray::Offsets & column_array_offsets = column_array.getOffsets(); - - const bool is_column_array_nullable = column_array.getData().isNullable(); - ColumnNullable * column_nullable = is_column_array_nullable ? static_cast(&column_array.getData()) : nullptr; - - IColumn & array_nested_column = - is_column_array_nullable ? static_cast(column_array.getData()).getNestedColumn() : - column_array.getData(); - ColumnString & nested_column = static_cast(array_nested_column); - PaddedPODArray & nested_column_chars = nested_column.getChars(); - PaddedPODArray & nested_column_offsets = nested_column.getOffsets(); - - size_t chars_t_size = 0; - size_t number_size = 0; - for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) - { - arrow::ListArray & chunk = static_cast(*(arrow_column->chunk(chunk_i))); - - for (int64_t array_idx = 0; array_idx != chunk.length(); ++array_idx) - { - const std::shared_ptr array = chunk.value_slice(array_idx); - arrow::BinaryArray & binary_array = static_cast(*(array)); - const size_t binary_array_length = binary_array.length(); - - chars_t_size += binary_array.value_offset(binary_array_length - 1) + binary_array.value_length(binary_array_length - 1); - chars_t_size += binary_array_length; /// additional space for null bytes - number_size += binary_array_length; - } - } - column_array.reserve(arrow_column->length()); - - nested_column_chars.reserve(chars_t_size); - nested_column_offsets.reserve(number_size); - - for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) - { - arrow::ListArray & list_chunk = static_cast(*(arrow_column->chunk(chunk_i))); - - for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx) - { - const std::shared_ptr array = list_chunk.value_slice(array_idx); - arrow::BinaryArray & chunk = static_cast(*(array)); - std::shared_ptr buffer = chunk.value_data(); - const size_t chunk_length = chunk.length(); - - for (size_t offset_i = 0; offset_i != chunk_length; ++offset_i) - { - if (!chunk.IsNull(offset_i) && buffer) - { - const auto * raw_data = buffer->data() + chunk.value_offset(offset_i); - nested_column_chars.insert_assume_reserved(raw_data, raw_data + chunk.value_length(offset_i)); - if (is_column_array_nullable && column_nullable) - { - column_nullable->getNullMapData().push_back(0); - } - } - else - { - if (is_column_array_nullable && column_nullable) - { - column_nullable->getNullMapData().push_back(1); - } - } - nested_column_chars.emplace_back('\0'); - nested_column_offsets.emplace_back(nested_column_chars.size()); - } - column_array_offsets.emplace_back(column_array_offsets.back() + chunk_length); - } - } - } - - static void fillColumnWithBooleanData(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) - { - auto & column_data = assert_cast &>(*internal_column).getData(); + auto & column_data = assert_cast &>(internal_column).getData(); column_data.reserve(arrow_column->length()); for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) @@ -296,58 +133,10 @@ namespace DB } } - static void fillColumnWithArrayBooleanData(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) - { - ColumnArray & column_array = assert_cast(*internal_column); - ColumnArray::Offsets & column_array_offsets = column_array.getOffsets(); - - const bool is_column_array_nullable = column_array.getData().isNullable(); - ColumnNullable * column_nullable = is_column_array_nullable ? static_cast(&column_array.getData()) : nullptr; - - IColumn & array_nested_column = - is_column_array_nullable ? static_cast(column_array.getData()).getNestedColumn() : - column_array.getData(); - ColumnVector & nested_column = assert_cast &>(array_nested_column); - auto & nested_column_data = nested_column.getData(); - - reserveArrayColumn(arrow_column, column_array, nested_column); - - for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) - { - arrow::ListArray & list_chunk = static_cast(*(arrow_column->chunk(chunk_i))); - const std::shared_ptr list_array = list_chunk.values(); - auto & chunk = static_cast(*(list_array)); - for (size_t bool_i = 0; bool_i != static_cast(list_array->length()); ++bool_i) - { - nested_column_data.emplace_back(chunk.Value(bool_i)); - } - - if (is_column_array_nullable && column_nullable) - { - for (size_t bool_i = 0; bool_i != static_cast(list_array->length()); ++bool_i) - { - if (!chunk.IsNull(bool_i)) - { - column_nullable->getNullMapData().push_back(0); - } - else - { - column_nullable->getNullMapData().push_back(1); - } - } - } - - for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx) - { - column_array_offsets.emplace_back(column_array_offsets.back() + list_chunk.value_offset(array_idx)); - } - } - } - /// Arrow stores Parquet::DATE in Int32, while ClickHouse stores Date in UInt16. Therefore, it should be checked before saving - static void fillColumnWithDate32Data(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) + static void fillColumnWithDate32Data(std::shared_ptr & arrow_column, IColumn & internal_column) { - PaddedPODArray & column_data = assert_cast &>(*internal_column).getData(); + PaddedPODArray & column_data = assert_cast &>(internal_column).getData(); column_data.reserve(arrow_column->length()); for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) @@ -360,7 +149,7 @@ namespace DB if (days_num > DATE_LUT_MAX_DAY_NUM) { // TODO: will it rollback correctly? - throw Exception{"Input value " + std::to_string(days_num) + " of a column \"" + internal_column->getName() + throw Exception{"Input value " + std::to_string(days_num) + " of a column \"" + internal_column.getName() + "\" is greater than " "max allowed Date value, which is " + std::to_string(DATE_LUT_MAX_DAY_NUM), @@ -372,69 +161,10 @@ namespace DB } } - static void fillColumnWithArrayDate32Data(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) - { - ColumnArray & array_column = assert_cast(*internal_column); - ColumnArray::Offsets & column_array_offsets = array_column.getOffsets(); - - const bool is_column_array_nullable = array_column.getData().isNullable(); - ColumnNullable * column_nullable = is_column_array_nullable ? static_cast(&array_column.getData()) : nullptr; - - IColumn & array_nested_column = - is_column_array_nullable ? static_cast(array_column.getData()).getNestedColumn() : - array_column.getData(); - ColumnVector & nested_column = assert_cast &>(array_nested_column); - auto & nested_column_data = nested_column.getData(); - - reserveArrayColumn(arrow_column, array_column, nested_column); - - for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) - { - arrow::ListArray & list_chunk = static_cast(*(arrow_column->chunk(chunk_i))); - const std::shared_ptr list_array = list_chunk.values(); - auto & chunk = static_cast(*(list_array)); - for (size_t value_i = 0, length = static_cast(chunk.length()); value_i < length; ++value_i) - { - UInt32 days_num = static_cast(chunk.Value(value_i)); - if (days_num > DATE_LUT_MAX_DAY_NUM) - { - // TODO: will it rollback correctly? - throw Exception{ - "Input value " + std::to_string(days_num) + " of a column \"" + internal_column->getName() - + "\" is greater than " - "max allowed Date value, which is " - + std::to_string(DATE_LUT_MAX_DAY_NUM), - ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE}; - } - nested_column_data.emplace_back(days_num); - } - - if (is_column_array_nullable && column_nullable) - { - for (size_t bool_i = 0; bool_i != static_cast(list_array->length()); ++bool_i) - { - if (!chunk.IsNull(bool_i)) - { - column_nullable->getNullMapData().push_back(0); - } - else - { - column_nullable->getNullMapData().push_back(1); - } - } - } - - for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx) - { - column_array_offsets.emplace_back(column_array_offsets.back() + list_chunk.value_offset(array_idx)); - } - } - } - /// Arrow stores Parquet::DATETIME in Int64, while ClickHouse stores DateTime in UInt32. Therefore, it should be checked before saving - static void fillColumnWithDate64Data(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) + static void fillColumnWithDate64Data(std::shared_ptr & arrow_column, IColumn & internal_column) { - auto & column_data = assert_cast &>(*internal_column).getData(); + auto & column_data = assert_cast &>(internal_column).getData(); column_data.reserve(arrow_column->length()); for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) @@ -448,58 +178,9 @@ namespace DB } } - static void fillColumnWithArrayDate64Data(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) + static void fillColumnWithTimestampData(std::shared_ptr & arrow_column, IColumn & internal_column) { - ColumnArray & array_column = assert_cast(*internal_column); - ColumnArray::Offsets & column_array_offsets = array_column.getOffsets(); - - const bool is_column_array_nullable = array_column.getData().isNullable(); - ColumnNullable * column_nullable = is_column_array_nullable ? static_cast(&array_column.getData()) : nullptr; - - IColumn & array_nested_column = - is_column_array_nullable ? static_cast(array_column.getData()).getNestedColumn() : - array_column.getData(); - ColumnVector & nested_column = assert_cast &>(array_nested_column); - auto & nested_column_data = nested_column.getData(); - - reserveArrayColumn(arrow_column, array_column, nested_column); - - for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) - { - arrow::ListArray & list_chunk = static_cast(*(arrow_column->chunk(chunk_i))); - const std::shared_ptr list_array = list_chunk.values(); - auto & chunk = static_cast(*(list_array)); - for (size_t value_i = 0, length = static_cast(chunk.length()); value_i < length; ++value_i) - { - auto timestamp = static_cast(chunk.Value(value_i) / 1000); // Always? in ms - nested_column_data.emplace_back(timestamp); - } - - if (is_column_array_nullable && column_nullable) - { - for (size_t bool_i = 0; bool_i != static_cast(list_array->length()); ++bool_i) - { - if (!chunk.IsNull(bool_i)) - { - column_nullable->getNullMapData().push_back(0); - } - else - { - column_nullable->getNullMapData().push_back(1); - } - } - } - - for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx) - { - column_array_offsets.emplace_back(column_array_offsets.back() + list_chunk.value_offset(array_idx)); - } - } - } - - static void fillColumnWithTimestampData(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) - { - auto & column_data = assert_cast &>(*internal_column).getData(); + auto & column_data = assert_cast &>(internal_column).getData(); column_data.reserve(arrow_column->length()); for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) @@ -533,77 +214,9 @@ namespace DB } } - static void fillColumnWithArrayTimestampData(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) + static void fillColumnWithDecimalData(std::shared_ptr & arrow_column, IColumn & internal_column) { - ColumnArray & column_array = assert_cast(*internal_column); - ColumnArray::Offsets & column_array_offsets = column_array.getOffsets(); - - const bool is_column_array_nullable = column_array.getData().isNullable(); - ColumnNullable * column_nullable = is_column_array_nullable ? static_cast(&column_array.getData()) : nullptr; - - IColumn & array_nested_column = - is_column_array_nullable ? static_cast(column_array.getData()).getNestedColumn() : - column_array.getData(); - ColumnVector & nested_column = assert_cast &>(array_nested_column); - auto & nested_column_data = nested_column.getData(); - - reserveArrayColumn(arrow_column, column_array, nested_column); - - for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) - { - arrow::ListArray & list_chunk = static_cast(*(arrow_column->chunk(chunk_i))); - const auto & type = static_cast(*list_chunk.type()); - const std::shared_ptr list_array = list_chunk.values(); - auto & chunk = static_cast(*(list_array)); - UInt32 divide = 1; - const auto unit = type.unit(); - switch (unit) - { - case arrow::TimeUnit::SECOND: - divide = 1; - break; - case arrow::TimeUnit::MILLI: - divide = 1000; - break; - case arrow::TimeUnit::MICRO: - divide = 1000000; - break; - case arrow::TimeUnit::NANO: - divide = 1000000000; - break; - } - - for (size_t value_i = 0, length = static_cast(chunk.length()); value_i < length; ++value_i) - { - auto timestamp = static_cast(chunk.Value(value_i) / divide); // ms! TODO: check other 's' 'ns' ... - nested_column_data.emplace_back(timestamp); - } - - if (is_column_array_nullable && column_nullable) - { - for (size_t bool_i = 0; bool_i != static_cast(list_array->length()); ++bool_i) - { - if (!chunk.IsNull(bool_i)) - { - column_nullable->getNullMapData().push_back(0); - } - else - { - column_nullable->getNullMapData().push_back(1); - } - } - } - - for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx) - { - column_array_offsets.emplace_back(column_array_offsets.back() + list_chunk.value_offset(array_idx)); - } - } - } - - static void fillColumnWithDecimalData(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) - { - auto & column = assert_cast &>(*internal_column); + auto & column = assert_cast &>(internal_column); auto & column_data = column.getData(); column_data.reserve(arrow_column->length()); @@ -617,54 +230,155 @@ namespace DB } } - static void fillColumnWithArrayDecimalData(std::shared_ptr & arrow_column, MutableColumnPtr & internal_column) +/// Creates a null bytemap from arrow's null bitmap + static void fillByteMapFromArrowColumn(std::shared_ptr & arrow_column, IColumn & bytemap) { - ColumnArray & array_column = assert_cast(*internal_column); - ColumnArray::Offsets & column_array_offsets = array_column.getOffsets(); + PaddedPODArray & bytemap_data = assert_cast &>(bytemap).getData(); + bytemap_data.reserve(arrow_column->length()); - const bool is_column_array_nullable = array_column.getData().isNullable(); - ColumnNullable * column_nullable = is_column_array_nullable ? static_cast(&array_column.getData()) : nullptr; + for (size_t chunk_i = 0; chunk_i != static_cast(arrow_column->num_chunks()); ++chunk_i) + { + std::shared_ptr chunk = arrow_column->chunk(chunk_i); - IColumn & array_nested_column = - is_column_array_nullable ? static_cast(array_column.getData()).getNestedColumn() : - array_column.getData(); - ColumnDecimal & nested_column = assert_cast &>(array_nested_column); - auto & nested_column_data = nested_column.getData(); + for (size_t value_i = 0; value_i != static_cast(chunk->length()); ++value_i) + bytemap_data.emplace_back(chunk->IsNull(value_i)); + } + } - reserveArrayColumn(arrow_column, array_column, nested_column); + static void fillOffsetsFromArrowListColumn(std::shared_ptr & arrow_column, IColumn & offsets) + { + ColumnArray::Offsets & offsets_data = assert_cast &>(offsets).getData(); + offsets_data.reserve(arrow_column->length()); for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) { arrow::ListArray & list_chunk = static_cast(*(arrow_column->chunk(chunk_i))); - const std::shared_ptr list_array = list_chunk.values(); - auto & chunk = static_cast(*(list_array)); - for (size_t value_i = 0, length = static_cast(chunk.length()); value_i < length; ++value_i) - { - nested_column_data.emplace_back(*reinterpret_cast(chunk.Value(value_i))); // TODO: copy column - } - - if (is_column_array_nullable && column_nullable) - { - for (size_t bool_i = 0; bool_i != static_cast(list_array->length()); ++bool_i) - { - if (!chunk.IsNull(bool_i)) - { - column_nullable->getNullMapData().push_back(0); - } - else - { - column_nullable->getNullMapData().push_back(1); - } - } - } - - for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx) - { - column_array_offsets.emplace_back(column_array_offsets.back() + list_chunk.value_offset(array_idx)); - } + auto arrow_offsets_array = list_chunk.offsets(); + auto & arrow_offsets = static_cast(*arrow_offsets_array); + auto start = offsets_data.back(); + for (int64_t i = 1; i < arrow_offsets.length(); ++i) + offsets_data.emplace_back(start + arrow_offsets.Value(i)); } } + static void readColumnFromArrowColumn(std::shared_ptr & arrow_column, IColumn & internal_column, const std::string & column_name, const std::string format_name, bool is_nullable) + { + if (internal_column.isNullable()) + { + ColumnNullable & column_nullable = typeid_cast(internal_column); + readColumnFromArrowColumn(arrow_column, column_nullable.getNestedColumn(), column_name, format_name, true); + fillByteMapFromArrowColumn(arrow_column, column_nullable.getNullMapColumn()); + return; + } + + // TODO: check if a column is const? + if (!is_nullable && !isColumnArray(internal_column) && arrow_column->null_count()) + { + throw Exception{ + "Can not insert NULL data into non-nullable column \"" + column_name + "\"", + ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN}; + } + + switch (arrow_column->type()->id()) + { + case arrow::Type::STRING: + case arrow::Type::BINARY: + //case arrow::Type::FIXED_SIZE_BINARY: + fillColumnWithStringData(arrow_column, internal_column); + break; + case arrow::Type::BOOL: + fillColumnWithBooleanData(arrow_column, internal_column); + break; + case arrow::Type::DATE32: + fillColumnWithDate32Data(arrow_column, internal_column); + break; + case arrow::Type::DATE64: + fillColumnWithDate64Data(arrow_column, internal_column); + break; + case arrow::Type::TIMESTAMP: + fillColumnWithTimestampData(arrow_column, internal_column); + break; + case arrow::Type::DECIMAL: + //fillColumnWithNumericData>(arrow_column, read_column); // Have problems with trash values under NULL, but faster + fillColumnWithDecimalData(arrow_column, internal_column /*, internal_nested_type*/); + break; + case arrow::Type::LIST: + { + const auto * list_type = static_cast(arrow_column->type().get()); + auto list_nested_type = list_type->value_type(); + arrow::ArrayVector array_vector; + array_vector.reserve(arrow_column->num_chunks()); + for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) + { + arrow::ListArray & list_chunk = static_cast(*(arrow_column->chunk(chunk_i))); + std::shared_ptr chunk = list_chunk.values(); + array_vector.emplace_back(std::move(chunk)); + } + auto arrow_nested_column = std::make_shared(array_vector); + + ColumnArray & column_array = typeid_cast(internal_column); + readColumnFromArrowColumn(arrow_nested_column, column_array.getData(), column_name, format_name, false); + fillOffsetsFromArrowListColumn(arrow_column, column_array.getOffsetsColumn()); + break; + } +# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ + case ARROW_NUMERIC_TYPE: \ + fillColumnWithNumericData(arrow_column, internal_column); \ + break; + + FOR_ARROW_NUMERIC_TYPES(DISPATCH) +# undef DISPATCH + // TODO: support TIMESTAMP_MICROS and TIMESTAMP_MILLIS with truncated micro- and milliseconds? + // TODO: read JSON as a string? + // TODO: read UUID as a string? + default: + throw Exception + { + "Unsupported " + format_name + " type \"" + arrow_column->type()->name() + "\" of an input column \"" + + column_name + "\"", + ErrorCodes::UNKNOWN_TYPE + }; + } + } + + static DataTypePtr getInternalType(std::shared_ptr arrow_type, const DataTypePtr & column_type, const std::string & column_name, const std::string & format_name) + { + if (column_type->isNullable()) + { + DataTypePtr nested_type = typeid_cast(column_type.get())->getNestedType(); + return makeNullable(getInternalType(arrow_type, nested_type, column_name, format_name)); + } + + if (arrow_type->id() == arrow::Type::DECIMAL) + { + const auto * decimal_type = static_cast(arrow_type.get()); + return std::make_shared>(decimal_type->precision(), decimal_type->scale()); + } + + if (arrow_type->id() == arrow::Type::LIST) + { + const auto * list_type = static_cast(arrow_type.get()); + auto list_nested_type = list_type->value_type(); + + const DataTypeArray * array_type = typeid_cast(column_type.get()); + if (!array_type) + throw Exception{"Cannot convert arrow LIST type to a not Array ClickHouse type " + column_type->getName(), ErrorCodes::CANNOT_CONVERT_TYPE}; + + return std::make_shared(getInternalType(list_nested_type, array_type->getNestedType(), column_name, format_name)); + } + + if (const auto * internal_type_it = std::find_if(arrow_type_to_internal_type.begin(), arrow_type_to_internal_type.end(), + [=](auto && elem) { return elem.first == arrow_type->id(); }); + internal_type_it != arrow_type_to_internal_type.end()) + { + return DataTypeFactory::instance().get(internal_type_it->second); + } + throw Exception{ + "The type \"" + arrow_type->name() + "\" of an input column \"" + column_name + "\" is not supported for conversion from a " + + format_name + " data format", + ErrorCodes::CANNOT_CONVERT_TYPE}; + } + void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr & table, const Block & header, std::string format_name) { @@ -693,166 +407,16 @@ namespace DB ErrorCodes::THERE_IS_NO_COLUMN}; std::shared_ptr arrow_column = name_to_column_ptr[header_column.name]; - arrow::Type::type arrow_type = arrow_column->type()->id(); - std::shared_ptr list_nested_type; - // TODO: check if a column is const? - if (!column_type->isNullable() && arrow_column->null_count()) - { - throw Exception{"Can not insert NULL data into non-nullable column \"" + header_column.name + "\"", - ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN}; - } + DataTypePtr internal_type = getInternalType(arrow_column->type(), column_type, header_column.name, format_name); - const bool target_column_is_nullable = column_type->isNullable() || arrow_column->null_count(); - - DataTypePtr internal_nested_type; - DataTypePtr array_nested_type; - - if (arrow_type == arrow::Type::DECIMAL) - { - const auto * decimal_type = static_cast(arrow_column->type().get()); - internal_nested_type = std::make_shared>(decimal_type->precision(), - decimal_type->scale()); - } - else if (arrow_type == arrow::Type::LIST) - { - const auto * list_type = static_cast(arrow_column->type().get()); - list_nested_type = list_type->value_type(); - - const auto * column_array_type = static_cast(column_type.get()); - const bool is_column_array_nullable = column_array_type->getNestedType()->isNullable(); - - if (const auto * internal_type_it = std::find_if(arrow_type_to_internal_type.begin(), arrow_type_to_internal_type.end(), - [=](auto && elem) { return elem.first == list_nested_type->id(); }); - internal_type_it != arrow_type_to_internal_type.end()) - { - DataTypePtr array_instance_type = DataTypeFactory::instance().get(internal_type_it->second); - array_nested_type = is_column_array_nullable ? makeNullable(array_instance_type) - : array_instance_type; - internal_nested_type = std::make_shared(array_nested_type); - } - else - { - throw Exception{"The internal type \"" + list_type->value_type()->name() + "\" of an array column \"" + header_column.name - + "\" is not supported for conversion from a " + format_name + " data format", - ErrorCodes::CANNOT_CONVERT_TYPE}; - } - } - else if (const auto * internal_type_it = std::find_if(arrow_type_to_internal_type.begin(), arrow_type_to_internal_type.end(), - [=](auto && elem) { return elem.first == arrow_type; }); - internal_type_it != arrow_type_to_internal_type.end()) - { - internal_nested_type = DataTypeFactory::instance().get(internal_type_it->second); - } - else - { - throw Exception{"The type \"" + arrow_column->type()->name() + "\" of an input column \"" + header_column.name - + "\" is not supported for conversion from a " + format_name + " data format", - ErrorCodes::CANNOT_CONVERT_TYPE}; - } - - const DataTypePtr internal_type = target_column_is_nullable ? makeNullable(internal_nested_type) - : internal_nested_type; + MutableColumnPtr read_column = internal_type->createColumn(); + readColumnFromArrowColumn(arrow_column, *read_column, header_column.name, format_name, false); ColumnWithTypeAndName column; column.name = header_column.name; column.type = internal_type; - - /// Data - MutableColumnPtr read_column = internal_nested_type->createColumn(); - switch (arrow_type) - { - case arrow::Type::STRING: - case arrow::Type::BINARY: - //case arrow::Type::FIXED_SIZE_BINARY: - fillColumnWithStringData(arrow_column, read_column); - break; - case arrow::Type::BOOL: - fillColumnWithBooleanData(arrow_column, read_column); - break; - case arrow::Type::DATE32: - fillColumnWithDate32Data(arrow_column, read_column); - break; - case arrow::Type::DATE64: - fillColumnWithDate64Data(arrow_column, read_column); - break; - case arrow::Type::TIMESTAMP: - fillColumnWithTimestampData(arrow_column, read_column); - break; - case arrow::Type::DECIMAL: - //fillColumnWithNumericData>(arrow_column, read_column); // Have problems with trash values under NULL, but faster - fillColumnWithDecimalData(arrow_column, read_column /*, internal_nested_type*/); - break; - case arrow::Type::LIST: - if (array_nested_type && list_nested_type) { - switch (list_nested_type->id()) - { - case arrow::Type::STRING: - case arrow::Type::BINARY: - //case arrow::Type::FIXED_SIZE_BINARY: - fillColumnWithArrayStringData(arrow_column, read_column); - break; - case arrow::Type::BOOL: - fillColumnWithArrayBooleanData(arrow_column, read_column); - break; - case arrow::Type::DATE32: - fillColumnWithArrayDate32Data(arrow_column, read_column); - break; - case arrow::Type::DATE64: - fillColumnWithArrayDate64Data(arrow_column, read_column); - break; - case arrow::Type::TIMESTAMP: - fillColumnWithArrayTimestampData(arrow_column, read_column); - break; - case arrow::Type::DECIMAL: - //fillColumnWithNumericData>(arrow_column, read_column); // Have problems with trash values under NULL, but faster - fillColumnWithArrayDecimalData(arrow_column, read_column /*, internal_nested_type*/); - break; - # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ - case ARROW_NUMERIC_TYPE: \ - fillColumnWithArrayNumericData(arrow_column, read_column); \ - break; - - FOR_ARROW_NUMERIC_TYPES(DISPATCH) - # undef DISPATCH - default: - throw Exception - { - "Unsupported " + format_name + " type \"" + arrow_column->type()->name() + "\" of an input column \"" - + header_column.name + "\"", - ErrorCodes::UNKNOWN_TYPE - }; - } - } - break; -# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ - case ARROW_NUMERIC_TYPE: \ - fillColumnWithNumericData(arrow_column, read_column); \ - break; - - FOR_ARROW_NUMERIC_TYPES(DISPATCH) -# undef DISPATCH - // TODO: support TIMESTAMP_MICROS and TIMESTAMP_MILLIS with truncated micro- and milliseconds? - // TODO: read JSON as a string? - // TODO: read UUID as a string? - default: - throw Exception - { - "Unsupported " + format_name + " type \"" + arrow_column->type()->name() + "\" of an input column \"" - + header_column.name + "\"", - ErrorCodes::UNKNOWN_TYPE - }; - } - - - if (column.type->isNullable()) - { - MutableColumnPtr null_bytemap = DataTypeUInt8().createColumn(); - fillByteMapFromArrowColumn(arrow_column, null_bytemap); - column.column = ColumnNullable::create(std::move(read_column), std::move(null_bytemap)); - } - else - column.column = std::move(read_column); + column.column = std::move(read_column); column.column = castColumn(column, header_column.type); column.type = header_column.type; diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp index be98bf68bbd..522a3927bef 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp @@ -18,6 +18,8 @@ #include #include +#include + namespace DB { namespace ErrorCodes @@ -50,12 +52,6 @@ namespace DB {"FixedString", arrow::utf8()}, }; - static const PaddedPODArray * extractNullBytemapPtr(ColumnPtr column) - { - ColumnPtr null_column = assert_cast(*column).getNullMapColumnPtr(); - const PaddedPODArray & null_bytemap = assert_cast &>(*null_column).getData(); - return &null_bytemap; - } static void checkStatus(const arrow::Status & status, const String & column_name, const String & format_name) { @@ -68,10 +64,12 @@ namespace DB ColumnPtr write_column, const PaddedPODArray * null_bytemap, const String & format_name, - arrow::ArrayBuilder* abuilder) + arrow::ArrayBuilder* array_builder, + size_t start, + size_t end) { const PaddedPODArray & internal_data = assert_cast &>(*write_column).getData(); - ArrowBuilderType & builder = assert_cast(*abuilder); + ArrowBuilderType & builder = assert_cast(*array_builder); arrow::Status status; const UInt8 * arrow_null_bytemap_raw_ptr = nullptr; @@ -79,98 +77,48 @@ namespace DB if (null_bytemap) { /// Invert values since Arrow interprets 1 as a non-null value, while CH as a null - arrow_null_bytemap.reserve(null_bytemap->size()); - for (auto is_null : *null_bytemap) - arrow_null_bytemap.emplace_back(!is_null); + arrow_null_bytemap.reserve(end - start); + for (size_t i = start; i < end; ++i) + arrow_null_bytemap.template emplace_back(!(*null_bytemap)[i]); arrow_null_bytemap_raw_ptr = arrow_null_bytemap.data(); } if constexpr (std::is_same_v) status = builder.AppendValues( - reinterpret_cast(internal_data.data()), - internal_data.size(), + reinterpret_cast(internal_data.data() + start), + end - start, reinterpret_cast(arrow_null_bytemap_raw_ptr)); else - status = builder.AppendValues(internal_data.data(), internal_data.size(), reinterpret_cast(arrow_null_bytemap_raw_ptr)); + status = builder.AppendValues(internal_data.data() + start, end - start, reinterpret_cast(arrow_null_bytemap_raw_ptr)); checkStatus(status, write_column->getName(), format_name); } static void fillArrowArrayWithArrayColumnData( const String & column_name, - ColumnPtr & nested_column, + ColumnPtr & column, const std::shared_ptr & column_type, - std::shared_ptr arrow_array, const PaddedPODArray * null_bytemap, arrow::ArrayBuilder * array_builder, - String format_name) + String format_name, + size_t start, + size_t end) { - const auto * column_array = static_cast(nested_column.get()); - const bool is_column_array_nullable = column_array->getData().isNullable(); - const IColumn & array_nested_column = - is_column_array_nullable ? static_cast(column_array->getData()).getNestedColumn() : - column_array->getData(); - const String column_array_nested_type_name = array_nested_column.getFamilyName(); + const auto * column_array = static_cast(column.get()); + ColumnPtr nested_column = column_array->getDataPtr(); + DataTypePtr nested_type = typeid_cast(column_type.get())->getNestedType(); + const auto & offsets = column_array->getOffsets(); - const auto * column_array_type = static_cast(column_type.get()); - const DataTypePtr & array_nested_type = - is_column_array_nullable ? static_cast(column_array_type->getNestedType().get())->getNestedType() : - column_array_type->getNestedType(); + arrow::ListBuilder & builder = assert_cast(*array_builder); + arrow::ArrayBuilder * value_builder = builder.value_builder(); + arrow::Status components_status; - const PaddedPODArray * array_null_bytemap = - is_column_array_nullable ? extractNullBytemapPtr(assert_cast(*nested_column).getDataPtr()) : nullptr; - - const auto * arrow_type_it = std::find_if(internal_type_to_arrow_type.begin(), internal_type_to_arrow_type.end(), - [=](auto && elem) { return elem.first == column_array_nested_type_name; }); - if (arrow_type_it != internal_type_to_arrow_type.end()) + for (size_t array_idx = start; array_idx < end; ++array_idx) { - std::shared_ptr list_type = arrow::list(arrow_type_it->second); - - const auto & internal_column = assert_cast(*nested_column); - - arrow::ListBuilder & builder = assert_cast(*array_builder); - arrow::ArrayBuilder * value_builder = builder.value_builder(); - arrow::Status components_status; - - const auto & offsets = internal_column.getOffsets(); - ColumnPtr & data = is_column_array_nullable ? - const_cast(static_cast(internal_column.getData()).getNestedColumnPtr()) : - const_cast(internal_column.getDataPtr()); - - size_t array_start = 0; - size_t array_length = 0; - - for (size_t idx = 0, size = internal_column.size(); idx < size; ++idx) - { - if (null_bytemap && (*null_bytemap)[idx]) - { - components_status = builder.AppendNull(); - checkStatus(components_status, nested_column->getName(), format_name); - } - else - { - components_status = builder.Append(); - checkStatus(components_status, nested_column->getName(), format_name); - array_length = offsets[idx] - array_start; - auto cut_data = data->cut(array_start, array_length); - if (array_null_bytemap == nullptr) - { - CHColumnToArrowColumn::fillArrowArray(column_name, cut_data, array_nested_type, - column_array_nested_type_name, arrow_array, - nullptr, value_builder, format_name); - } - else - { - PaddedPODArray array_nested_null_bytemap; - array_nested_null_bytemap.insertByOffsets(*array_null_bytemap, array_start, array_start + array_length); - - CHColumnToArrowColumn::fillArrowArray(column_name, cut_data, array_nested_type, - column_array_nested_type_name, arrow_array, - &array_nested_null_bytemap, value_builder, format_name); - } - array_start = offsets[idx]; - } - } + /// Start new array + components_status = builder.Append(); + checkStatus(components_status, nested_column->getName(), format_name); + CHColumnToArrowColumn::fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx]); } } @@ -179,13 +127,15 @@ namespace DB ColumnPtr write_column, const PaddedPODArray * null_bytemap, const String & format_name, - arrow::ArrayBuilder* abuilder) + arrow::ArrayBuilder* array_builder, + size_t start, + size_t end) { const auto & internal_column = assert_cast(*write_column); - arrow::StringBuilder & builder = assert_cast(*abuilder); + arrow::StringBuilder & builder = assert_cast(*array_builder); arrow::Status status; - for (size_t string_i = 0, size = internal_column.size(); string_i < size; ++string_i) + for (size_t string_i = start; string_i < end; ++string_i) { if (null_bytemap && (*null_bytemap)[string_i]) { @@ -205,14 +155,16 @@ namespace DB ColumnPtr write_column, const PaddedPODArray * null_bytemap, const String & format_name, - arrow::ArrayBuilder* abuilder) + arrow::ArrayBuilder* array_builder, + size_t start, + size_t end) { const PaddedPODArray & internal_data = assert_cast &>(*write_column).getData(); //arrow::Date32Builder date_builder; - arrow::UInt16Builder & builder = assert_cast(*abuilder); + arrow::UInt16Builder & builder = assert_cast(*array_builder); arrow::Status status; - for (size_t value_i = 0, size = internal_data.size(); value_i < size; ++value_i) + for (size_t value_i = start; value_i < end; ++value_i) { if (null_bytemap && (*null_bytemap)[value_i]) status = builder.AppendNull(); @@ -227,14 +179,16 @@ namespace DB ColumnPtr write_column, const PaddedPODArray * null_bytemap, const String & format_name, - arrow::ArrayBuilder* abuilder) + arrow::ArrayBuilder* array_builder, + size_t start, + size_t end) { const auto & internal_data = assert_cast &>(*write_column).getData(); //arrow::Date64Builder builder; - arrow::UInt32Builder & builder = assert_cast(*abuilder); + arrow::UInt32Builder & builder = assert_cast(*array_builder); arrow::Status status; - for (size_t value_i = 0, size = internal_data.size(); value_i < size; ++value_i) + for (size_t value_i = start; value_i < end; ++value_i) { if (null_bytemap && (*null_bytemap)[value_i]) status = builder.AppendNull(); @@ -249,36 +203,46 @@ namespace DB void CHColumnToArrowColumn::fillArrowArray( const String & column_name, - ColumnPtr & nested_column, - const std::shared_ptr & column_nested_type, - const String column_nested_type_name, - std::shared_ptr arrow_array, + ColumnPtr & column, + const std::shared_ptr & column_type, const PaddedPODArray * null_bytemap, arrow::ArrayBuilder * array_builder, - String format_name) + String format_name, + size_t start, + size_t end) { - if ("String" == column_nested_type_name) + const String column_type_name = column_type->getFamilyName(); + + if ("Nullable" == column_type_name) { - fillArrowArrayWithStringColumnData(nested_column, null_bytemap, format_name, array_builder); + const ColumnNullable * column_nullable = checkAndGetColumn(column.get()); + ColumnPtr nested_column = column_nullable->getNestedColumnPtr(); + DataTypePtr nested_type = typeid_cast(column_type.get())->getNestedType(); + ColumnPtr null_column = column_nullable->getNullMapColumnPtr(); + const PaddedPODArray & bytemap = assert_cast &>(*null_column).getData(); + fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end); } - else if ("FixedString" == column_nested_type_name) + else if ("String" == column_type_name) { - fillArrowArrayWithStringColumnData(nested_column, null_bytemap, format_name, array_builder); + fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); } - else if ("Date" == column_nested_type_name) + else if ("FixedString" == column_type_name) { - fillArrowArrayWithDateColumnData(nested_column, null_bytemap, format_name, array_builder); + fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); } - else if ("DateTime" == column_nested_type_name) + else if ("Date" == column_type_name) { - fillArrowArrayWithDateTimeColumnData(nested_column, null_bytemap, format_name, array_builder); + fillArrowArrayWithDateColumnData(column, null_bytemap, format_name, array_builder, start, end); } - else if ("Array" == column_nested_type_name) + else if ("DateTime" == column_type_name) { - fillArrowArrayWithArrayColumnData(column_name, nested_column, column_nested_type, arrow_array, null_bytemap, - array_builder, format_name); + fillArrowArrayWithDateTimeColumnData(column, null_bytemap, format_name, array_builder, start, end); } - else if (isDecimal(column_nested_type)) + else if ("Array" == column_type_name) + { + fillArrowArrayWithArrayColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end); + } + else if (isDecimal(column_type)) { auto fill_decimal = [&](const auto & types) -> bool { @@ -289,24 +253,23 @@ namespace DB || std::is_same_v> || std::is_same_v>) { - const auto & decimal_type = static_cast(column_nested_type.get()); - fillArrowArrayWithDecimalColumnData(nested_column, arrow_array, null_bytemap, decimal_type, format_name); + fillArrowArrayWithDecimalColumnData(column, null_bytemap, array_builder, format_name, start, end); } return false; }; - callOnIndexAndDataType(column_nested_type->getTypeId(), fill_decimal); + callOnIndexAndDataType(column_type->getTypeId(), fill_decimal); } #define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \ - else if (#CPP_NUMERIC_TYPE == column_nested_type_name) \ + else if (#CPP_NUMERIC_TYPE == column_type_name) \ { \ - fillArrowArrayWithNumericColumnData(nested_column, null_bytemap, format_name, array_builder); \ + fillArrowArrayWithNumericColumnData(column, null_bytemap, format_name, array_builder, start, end); \ } FOR_INTERNAL_NUMERIC_TYPES(DISPATCH) #undef DISPATCH else { - throw Exception{"Internal type \"" + column_nested_type_name + "\" of a column \"" + column_name + "\"" + throw Exception{"Internal type \"" + column_type_name + "\" of a column \"" + column_name + "\"" " is not supported for conversion into a " + format_name + " data format", ErrorCodes::UNKNOWN_TYPE}; } @@ -315,16 +278,17 @@ namespace DB template static void fillArrowArrayWithDecimalColumnData( ColumnPtr write_column, - std::shared_ptr & arrow_array, const PaddedPODArray * null_bytemap, - const DataType * decimal_type, - const String & format_name) + arrow::ArrayBuilder * array_builder, + const String & format_name, + size_t start, + size_t end) { const auto & column = static_cast(*write_column); - arrow::DecimalBuilder builder(arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale())); + arrow::DecimalBuilder & builder = assert_cast(*array_builder); arrow::Status status; - for (size_t value_i = 0, size = column.size(); value_i < size; ++value_i) + for (size_t value_i = start; value_i < end; ++value_i) { if (null_bytemap && (*null_bytemap)[value_i]) status = builder.AppendNull(); @@ -334,10 +298,65 @@ namespace DB checkStatus(status, write_column->getName(), format_name); } - status = builder.Finish(&arrow_array); checkStatus(status, write_column->getName(), format_name); } + static std::shared_ptr getArrowType(DataTypePtr column_type, const std::string & column_name, const std::string & format_name, bool * is_column_nullable) + { + if (column_type->isNullable()) + { + DataTypePtr nested_type = typeid_cast(column_type.get())->getNestedType(); + auto arrow_type = getArrowType(nested_type, column_name, format_name, is_column_nullable); + *is_column_nullable = true; + return arrow_type; + } + + if (isDecimal(column_type)) + { + std::shared_ptr arrow_type; + const auto create_arrow_type = [&](const auto & types) -> bool { + using Types = std::decay_t; + using ToDataType = typename Types::LeftType; + + if constexpr ( + std::is_same_v> + || std::is_same_v> + || std::is_same_v>) + { + const auto & decimal_type = static_cast(column_type.get()); + arrow_type = arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()); + } + + return false; + }; + callOnIndexAndDataType(column_type->getTypeId(), create_arrow_type); + return arrow_type; + } + + if (isArray(column_type)) + { + auto nested_type = typeid_cast(column_type.get())->getNestedType(); + auto nested_arrow_type = getArrowType(nested_type, column_name, format_name, is_column_nullable); + return arrow::list(nested_arrow_type); + } + + const std::string type_name = column_type->getFamilyName(); + if (const auto * arrow_type_it = std::find_if( + internal_type_to_arrow_type.begin(), + internal_type_to_arrow_type.end(), + [=](auto && elem) { return elem.first == type_name; }); + arrow_type_it != internal_type_to_arrow_type.end()) + { + return arrow_type_it->second; + } + throw Exception{ + "The type \"" + type_name + "\" of a column \"" + column_name + + "\"" + " is not supported for conversion into a " + + format_name + " data format", + ErrorCodes::UNKNOWN_TYPE}; + } + void CHColumnToArrowColumn::chChunkToArrowTable( std::shared_ptr & res, const Block & header, @@ -358,86 +377,20 @@ namespace DB column.column = recursiveRemoveLowCardinality(chunk.getColumns()[column_i]); column.type = recursiveRemoveLowCardinality(column.type); - const bool is_column_nullable = column.type->isNullable(); - bool is_column_array_nullable = false; - const auto & column_nested_type - = is_column_nullable ? static_cast(column.type.get())->getNestedType() : column.type; - const String column_nested_type_name = column_nested_type->getFamilyName(); - - if (isDecimal(column_nested_type)) - { - const auto add_decimal_field = [&](const auto & types) -> bool { - using Types = std::decay_t; - using ToDataType = typename Types::LeftType; - - if constexpr ( - std::is_same_v> - || std::is_same_v> - || std::is_same_v>) - { - const auto & decimal_type = static_cast(column_nested_type.get()); - arrow_fields.emplace_back(std::make_shared( - column.name, arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()), is_column_nullable)); - } - - return false; - }; - callOnIndexAndDataType(column_nested_type->getTypeId(), add_decimal_field); - } - else if (isArray(column_nested_type)) - { - const auto * column_array_type = static_cast(column_nested_type.get()); - is_column_array_nullable = column_array_type->getNestedType()->isNullable(); - const DataTypePtr & column_array_nested_type = - is_column_array_nullable ? static_cast(column_array_type->getNestedType().get())->getNestedType() : - column_array_type->getNestedType(); - const String column_array_nested_type_name = column_array_nested_type->getFamilyName(); - - if (const auto * arrow_type_it = std::find_if(internal_type_to_arrow_type.begin(), internal_type_to_arrow_type.end(), - [=](auto && elem) { return elem.first == column_array_nested_type_name; }); - arrow_type_it != internal_type_to_arrow_type.end()) - { - arrow_fields.emplace_back(std::make_shared( - column.name, arrow::list(arrow_type_it->second), is_column_array_nullable)); - } else - { - throw Exception{"The type \"" + column_array_nested_type_name + "\" of a array column \"" + column.name + "\"" - " is not supported for conversion into a " + format_name + " data format", - ErrorCodes::UNKNOWN_TYPE}; - } - } - else - { - if (const auto * arrow_type_it = std::find_if(internal_type_to_arrow_type.begin(), internal_type_to_arrow_type.end(), - [=](auto && elem) { return elem.first == column_nested_type_name; }); - arrow_type_it != internal_type_to_arrow_type.end()) - { - arrow_fields.emplace_back(std::make_shared(column.name, arrow_type_it->second, is_column_nullable)); - } else - { - throw Exception{"The type \"" + column_nested_type_name + "\" of a column \"" + column.name + "\"" - " is not supported for conversion into a " + format_name + " data format", - ErrorCodes::UNKNOWN_TYPE}; - } - } - - ColumnPtr nested_column - = is_column_nullable ? assert_cast(*column.column).getNestedColumnPtr() : column.column; - - const PaddedPODArray * null_bytemap = - is_column_nullable ? extractNullBytemapPtr(column.column) : nullptr; + bool is_column_nullable = false; + auto arrow_type = getArrowType(column.type, column.name, format_name, &is_column_nullable); + arrow_fields.emplace_back(std::make_shared(column.name, arrow_type, is_column_nullable)); arrow::MemoryPool* pool = arrow::default_memory_pool(); std::unique_ptr array_builder; arrow::Status status = MakeBuilder(pool, arrow_fields[column_i]->type(), &array_builder); - checkStatus(status, nested_column->getName(), format_name); + checkStatus(status, column.column->getName(), format_name); + + fillArrowArray(column.name, column.column, column.type, nullptr, array_builder.get(), format_name, 0, column.column->size()); std::shared_ptr arrow_array; - - fillArrowArray(column.name, nested_column, column_nested_type, column_nested_type_name, arrow_array, null_bytemap, array_builder.get(), format_name); - status = array_builder->Finish(&arrow_array); - checkStatus(status, nested_column->getName(), format_name); + checkStatus(status, column.column->getName(), format_name); arrow_arrays.emplace_back(std::move(arrow_array)); } @@ -445,7 +398,6 @@ namespace DB res = arrow::Table::Make(arrow_schema, arrow_arrays); } - - } +} #endif diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.h b/src/Processors/Formats/Impl/CHColumnToArrowColumn.h index 07bceb6266c..9740063f110 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.h +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.h @@ -33,13 +33,13 @@ public: static void fillArrowArray( const String & column_name, - ColumnPtr & nested_column, - const std::shared_ptr & column_nested_type, - const String column_nested_type_name, - std::shared_ptr arrow_array, + ColumnPtr & column, + const std::shared_ptr & column_type, const PaddedPODArray * null_bytemap, arrow::ArrayBuilder * array_builder, - String format_name); + String format_name, + size_t start, + size_t end); }; } #endif diff --git a/tests/queries/0_stateless/00900_orc_arrays_load.reference b/tests/queries/0_stateless/00900_orc_arrays_load.reference new file mode 100644 index 00000000000..9b20ef98164 --- /dev/null +++ b/tests/queries/0_stateless/00900_orc_arrays_load.reference @@ -0,0 +1,4 @@ +[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01] +[] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] +[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01] +[] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] diff --git a/tests/queries/0_stateless/00900_orc_arrays_load.sh b/tests/queries/0_stateless/00900_orc_arrays_load.sh new file mode 100755 index 00000000000..ff3d4596b61 --- /dev/null +++ b/tests/queries/0_stateless/00900_orc_arrays_load.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +DATA_FILE=$CUR_DIR/data_orc/array_test.orc + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (a1 Array(Int8), a2 Array(UInt8), a3 Array(Int16), a4 Array(UInt16), a5 Array(Int32), a6 Array(UInt32), a7 Array(Int64), a8 Array(UInt64), a9 Array(String), a10 Array(FixedString(4)), a11 Array(Float32), a12 Array(Float64), a13 Array(Date), a14 Array(Datetime), a15 Array(Decimal(4, 2)), a16 Array(Decimal(10, 2)), a17 Array(Decimal(25, 2))) ENGINE=Memory()" +cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC" +timeout 3 ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC" < $DATA_FILE +${CLICKHOUSE_CLIENT} --query="select * from orc_load" + +${CLICKHOUSE_CLIENT} --query="drop table orc_load" diff --git a/tests/queries/0_stateless/00900_orc_nested_arrays_load.reference b/tests/queries/0_stateless/00900_orc_nested_arrays_load.reference new file mode 100644 index 00000000000..dd9c9900684 --- /dev/null +++ b/tests/queries/0_stateless/00900_orc_nested_arrays_load.reference @@ -0,0 +1,2 @@ +[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]] +[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]] diff --git a/tests/queries/0_stateless/00900_orc_nested_arrays_load.sh b/tests/queries/0_stateless/00900_orc_nested_arrays_load.sh new file mode 100755 index 00000000000..d9d5e5f1c57 --- /dev/null +++ b/tests/queries/0_stateless/00900_orc_nested_arrays_load.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +DATA_FILE=$CUR_DIR/data_orc/nested_array_test.orc + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (a1 Array(Array(Array(UInt32))), a2 Array(Array(Array(String))), a3 Array(Array(Nullable(UInt32))), a4 Array(Array(Nullable(String)))) engine=Memory()" +cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC" +timeout 3 ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC" < $DATA_FILE +${CLICKHOUSE_CLIENT} --query="select * from orc_load" + +${CLICKHOUSE_CLIENT} --query="drop table orc_load" diff --git a/tests/queries/0_stateless/00900_orc_nullable_arrays_load.reference b/tests/queries/0_stateless/00900_orc_nullable_arrays_load.reference new file mode 100644 index 00000000000..62e95652040 --- /dev/null +++ b/tests/queries/0_stateless/00900_orc_nullable_arrays_load.reference @@ -0,0 +1,6 @@ +[1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42] +[NULL] [NULL] [NULL] +[] [] [] +[1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42] +[NULL] [NULL] [NULL] +[] [] [] diff --git a/tests/queries/0_stateless/00900_orc_nullable_arrays_load.sh b/tests/queries/0_stateless/00900_orc_nullable_arrays_load.sh new file mode 100755 index 00000000000..ec2a8be2d07 --- /dev/null +++ b/tests/queries/0_stateless/00900_orc_nullable_arrays_load.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +DATA_FILE=$CUR_DIR/data_orc/nullable_array_test.orc + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (a1 Array(Nullable(UInt32)), a2 Array(Nullable(String)), a3 Array(Nullable(Decimal(4, 2)))) ENGINE=Memory()" +cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC" +timeout 3 ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC" < $DATA_FILE +${CLICKHOUSE_CLIENT} --query="select * from orc_load" + +${CLICKHOUSE_CLIENT} --query="drop table orc_load" diff --git a/tests/queries/0_stateless/00900_parquet.reference b/tests/queries/0_stateless/00900_parquet.reference index 0f4be2c74a0..230d1f5ca48 100644 --- a/tests/queries/0_stateless/00900_parquet.reference +++ b/tests/queries/0_stateless/00900_parquet.reference @@ -60,3 +60,15 @@ dest from null: -108 108 -1016 1116 -1032 1132 -1064 1164 -1.032 -1.064 string-0 fixedstring\0\0\0\0 2001-02-03 2002-02-03 04:05:06 127 255 32767 65535 2147483647 4294967295 9223372036854775807 9223372036854775807 -1.032 -1.064 string-2 fixedstring-2\0\0 2004-06-07 2004-02-03 04:05:06 \N \N \N \N \N \N \N \N \N \N \N \N \N \N +1 [1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01] +1 [1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01] +2 [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] +2 [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] +1 [1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42] +1 [1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42] +2 [NULL] [NULL] [NULL] +2 [NULL] [NULL] [NULL] +3 [] [] [] +3 [] [] [] +[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]] +[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]] diff --git a/tests/queries/0_stateless/00900_parquet.sh b/tests/queries/0_stateless/00900_parquet.sh index 4b06001429f..8c19c7cecab 100755 --- a/tests/queries/0_stateless/00900_parquet.sh +++ b/tests/queries/0_stateless/00900_parquet.sh @@ -127,6 +127,7 @@ ${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_types1 ORDER BY int8 FORMAT echo dest from null: ${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_types6 ORDER BY int8" + ${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types5" ${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types6" @@ -135,3 +136,33 @@ ${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types1" ${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types2" ${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types3" ${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types4" + + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_arrays" + +${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_arrays (id UInt32, a1 Array(Int8), a2 Array(UInt8), a3 Array(Int16), a4 Array(UInt16), a5 Array(Int32), a6 Array(UInt32), a7 Array(Int64), a8 Array(UInt64), a9 Array(String), a10 Array(FixedString(4)), a11 Array(Float32), a12 Array(Float64), a13 Array(Date), a14 Array(Datetime), a15 Array(Decimal(4, 2)), a16 Array(Decimal(10, 2)), a17 Array(Decimal(25, 2))) engine=Memory()" + +${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_arrays VALUES (1, [1,-2,3], [1,2,3], [100, -200, 300], [100, 200, 300], [10000000, -20000000, 30000000], [10000000, 2000000, 3000000], [100000000000000, -200000000000, 3000000000000], [100000000000000, 20000000000000, 3000000000000], ['Some string', 'Some string', 'Some string'], ['0000', '1111', '2222'], [42.42, 424.2, 0.4242], [424242.424242, 4242042420.242424, 42], ['2000-01-01', '2001-01-01', '2002-01-01'], ['2000-01-01', '2001-01-01', '2002-01-01'], [0.2, 10.003, 4.002], [4.000000001, 10000.10000, 10000.100001], [1000000000.000000001123, 90.0000000010010101, 0101001.0112341001])" + +${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_arrays VALUES (2, [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [])" + +${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_arrays FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_arrays FORMAT Parquet" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_arrays ORDER BY id" + +${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_arrays" + + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_nullable_arrays" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_nullable_arrays (id UInt32, a1 Array(Nullable(UInt32)), a2 Array(Nullable(String)), a3 Array(Nullable(Decimal(4, 2)))) engine=Memory()" +${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nullable_arrays VALUES (1, [1, Null, 2], [Null, 'Some string', Null], [0.001, Null, 42.42]), (2, [Null], [Null], [Null]), (3, [], [], [])" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nullable_arrays FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nullable_arrays FORMAT Parquet" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nullable_arrays ORDER BY id" +${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_nullable_arrays" + + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_nested_arrays" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_nested_arrays (a1 Array(Array(Array(UInt32))), a2 Array(Array(Array(String))), a3 Array(Array(Nullable(UInt32))), a4 Array(Array(Nullable(String)))) engine=Memory() " +${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nested_arrays VALUES ([[[1,2,3], [1,2,3]], [[1,2,3]], [[], [1,2,3]]], [[['Some string', 'Some string'], []], [['Some string']], [[]]], [[Null, 1, 2], [Null], [1, 2], []], [['Some string', Null, 'Some string'], [Null], []])" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nested_arrays FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nested_arrays FORMAT Parquet" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nested_arrays" +${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_nested_arrays" diff --git a/tests/queries/0_stateless/00900_parquet_load.reference b/tests/queries/0_stateless/00900_parquet_load.reference index 65973e05c24..4bd699f40fe 100644 --- a/tests/queries/0_stateless/00900_parquet_load.reference +++ b/tests/queries/0_stateless/00900_parquet_load.reference @@ -1,6 +1,10 @@ === Try load data from alltypes_dictionary.parquet 0 1 0 0 0 0 0 0 01/01/09 0 1230768000 1 0 1 1 1 10 1.1 10.1 01/01/09 1 1230768060 +=== Try load data from alltypes_list.parquet +[] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] +[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01] +[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01] === Try load data from alltypes_plain.parquet 4 1 0 0 0 0 0 0 03/01/09 0 1235865600 5 0 1 1 1 10 1.1 10.1 03/01/09 1 1235865660 @@ -258,8 +262,9 @@ Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Not y 23.00 24.00 === Try load data from list_columns.parquet -Code: 70. DB::Ex---tion: The type "list" of an input column "int64_list" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin - +[1,2,3] ['abc','efg','hij'] +[NULL,1] [] +[4] ['efg',NULL,'hij','xyz'] === Try load data from nation.dict-malformed.parquet 0 ALGERIA 0 haggle. carefully final deposits detect slyly agai 1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon @@ -286,9 +291,12 @@ Code: 70. DB::Ex---tion: The type "list" of an input column "int64_list" is not 22 RUSSIA 3 requests against the platelets use never according to the quickly regular pint 23 UNITED KINGDOM 3 eans boost carefully special requests. accounts are. carefull 24 UNITED STATES 1 y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be +=== Try load data from nested_lists.parquet +[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]] === Try load data from nested_lists.snappy.parquet -Code: 70. DB::Ex---tion: The type "list" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin - +[[['a','b'],['c']],[[],['d']]] 1 +[[['a','b'],['c','d']],[[],['e']]] 1 +[[['a','b'],['c','d'],['e']],[[],['f']]] 1 === Try load data from nested_maps.snappy.parquet Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin @@ -301,6 +309,10 @@ Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id()) === Try load data from nullable.impala.parquet ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id()) +=== Try load data from nullable_list.parquet +[1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42] +[NULL] [NULL] [NULL] +[] [] [] === Try load data from nulls.snappy.parquet Code: 70. DB::Ex---tion: The type "struct" of an input column "b_struct" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin diff --git a/tests/queries/0_stateless/data_orc/array_test.orc b/tests/queries/0_stateless/data_orc/array_test.orc new file mode 100644 index 0000000000000000000000000000000000000000..8de2ebd7054c8586c60f918bb268525bf6594664 GIT binary patch literal 3870 zcmds3+iM(E7@wIlGkbRSHaokUJv-TKb{CC_)iAqB8j}azrW8fcmRkRSf>H!8pgt9p zAtE8t8|qW55CkbGgrZd+l$zo#zSIze)@ww4P=q!}t@vQ$nKQqc>@C^$)nU(>^SgX! zcFs9`^X=Wg+lU!Nk|ax|%09_R7#as5%mpw`iKSCw>P+qFQn)ML>0-fLDLz+}9w+7{ zm7O3xjmxrUJQM9SmVy~`e8v>rcy~Tyy2maQnR|9V)6?$0d{s8xZ$FJPckXnu(>@f; zKEP+s(~XZWA6t!?uDd$-^GA0w*Ztw*+S+Jm*lj^riH9-jF8%Z4SBv~XLEO?`XFvOU zvNO&ybg}Z1MJbt4=uB@5>$Hp*s@T2?C3b(550~;o-`$hugT~kkhZ_Y+TF<@bJlD8^ zNFKANkzPMP_j%&fm0g7c59j{;o)0=B^T8V@DVe79xrR4r3 zb7wEUadfimuDWpG(N#xswi3ZB>N@{^0_+Vic^jGH(^WP@p zE_^WeyS8#fe(Bl@Kgg<~Q{9wULfmDR5_g5=#hn1Fz#6d5Y@tVhDR30nU}d4lfaAak z;3OLsdJ5QNBLb&^EtVBH1DplU0r#;TLeB#;-~zDC3PSG(F0zWi1HdIVB=8_`ncX39 z1-Qy4IhOdkDB>=QyJE_W2&@P!n+o&RPcjK~0$qi!Lf4>c&~@lKi-`CL^aykcogzL8 zJqq1`Zis!8%oy|-^f>f5^aS(-^d$5o%ZcZwpr@dl&`s7S^fdG|bPL!LST-%OzLJ>% zpMft6oP{sT;$prW_#AwF0?TF}^gQ%DbOxOquB6Gy-Ew;wOL0rW3OoN@eR}n zZAPBHwk}G)y!B!Ei(qq1^xy*wMld5FmuaAuam>BfbFsVE>2^ zk?kjJKmgLRP1#*-+@l2|G~IrFb-ZfVln)N~ZD|z%v>L?*Y;6l7mr*tL7DTTN!4+1m zLFtTNjiYuXcQiBwx7Ngh&?2fmiFi8(9@lPkm*SQqB}b$Gjyi_<Ud-PgAH90}^71XEX-ZI})V)5JT51+jq;PUk z{z1VSG!A?nQTQtnC8J{0{zkMH%njp&5&hNZpE9?MCFfPlYsMzV1J#w32TnwkjNl}5 z>2>rg=E|GMwO53~(Vq}`!TuhBMk45j(dhFh%qPBx@yHxm2mP&RpR&D2aF)5{yfSj( zO6C>&djwZAueol+yx}^HzfL1h=l&n-Og()3%pTePP7Q0AevRScjPJ|XA4dH5RTzB1 zT46r#gD0pW@=T+0U(Kv0_#yl!G5@TVVsVphrxwMun#t#P=)FttJ$moYVF@c(lxSNa zbE0yhnb1e&W|NaiPB0}5yI8^IF=dJwiPm{clVVtBCD!rOX{EE%o;tc7b#%v1y<4Y_ oCSS0QI+{E|HFfmC$0v^jevx!Rwp+cBP%e*G=EFCo4i5MIFLpbKG5`Po literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_orc/nullable_array_test.orc b/tests/queries/0_stateless/data_orc/nullable_array_test.orc new file mode 100644 index 0000000000000000000000000000000000000000..8a1ad838b502edc42110bf2bbb547c1425a74db6 GIT binary patch literal 714 zcma)(u};G<5Qfh_+d4TdL{R~gsj3j_RGH9;scHuppsfloz<>|~AOR0hv9d5Tb>Kmm zcnx?6RvL`shE|m-2H)@R_xTdv4Msg3X#{{o5Dr0CwRBM8#tbXO6Gc20MlEM0TKP+m z5V0<&&E=GxG9@LR)wHlJYZKYdTJ0FxDJ{r=(Z>gIBG z4zJy*5L%In!Bo96v+k9fEw30@n3`u9SeaeN;O=p+%qx$X`K<>r_spE4XAI{U&PCut znd?a8^JMU4$Wx(?@^xhF)b-eX_dhYk!;>?8tc#%(ad8BN_t)=&S2UL}e!;Zjndv0ZJ?bo3R78()hgz*4GIB$#6rg>YH_L|!X%Pk)G-wJWz0l_OD f5O~D{I?w$hM(@l`ZafwZ4pFmj)#Df9VAL}|T|~AOR0hv9d5Tb>Kmm zcnx?6RvL`shE|m-2H)@R_xTdv4Msg3X#{{o5Dr0CwRBM8#tbXO6Gc20MlEM0TKP+m z5V0<&&E=GxG9@LR)wHlJYZKYdTJ0FxDJ{r=(Z>gIBG z4zJy*5L%In!Bo96v+k9fEw30@n3`u9SeaeN;O=p+%qx$X`K<>r_spE4XAI{U&PCut znd?a8^JMU4$Wx(?@^xhF)b-eX_dhYk!;>?8tc#%(ad8BN_t)=&S2UL}e!;Zjndv0ZJ?bo3R78()hgz*4GIB$#6rg>YH_L|!X%Pk)G-wJWz0l_OD f5O~D{I?w$hM(@l`ZafwZ4pFmj)#Df9VAL}|TfG=^y3?6J2WH^Il$YK6_F1$ znLG+$^<;SDV54c7Vgy`F0eln-U9rwtSU1%O=%)a7#X(ct=m4?jgmuFI;uSG2Eh!2zWmWqh9aC1Hhs*;>T?D8sN zyzW(6ygBKGUfDUs=GRP!w1D$)CfuCq9Ae!p6Cy3*bScoJL`0NDC4fUN8HM9AS?moj zUcP+k!aEM0DPmeCsZ)n!nJ1+(PwFKm!|;T~3HhX1y9DwpaGWcRfzH`_3$qT_f^>kIXhy5r zhNiXy5&h^b;#Mm)nJ1+(PwI&`^(Q*b2Wdu{=I*L_vv)2u%|&nOx-6Q9NBKH3piw6F z2UQcp@S=FL<#i)Xb062d*#%Gssy7YVhNf0gcZOkp?|M=3%MCW%o1|3cNj$Wn>k0t!G1$Fsm?7xN0 znDIUSrWLzlxOtkS?ABz7dEup~i_)h4@)^xDM>y5IJ{H>EguVr6UbxN(NFy+B`QK-K zBu9y0UY^C2pcCk(JY8rs#XMURq&8OSV4*(;F6NN#VKQbwE0uP#m>M9N6Ox?fpMuCm z`%6nw%E}Mz-nSD!23G4)9Q95yRLjN(d~=%L7{^(GfxCJ7^ouJt&-Vn3;m8{9=0bfg zp={Jt>?{gTT#Og%&dkP(Jm%nKZB2;6lX96S!YD*(G1r#poWrJ!YX5Fwq1z9q z{Aw67sH4&}jW?c4^H$V*@2qU|{rR?)o6M6MnJ0A;^3Gka`uUao_guT-sT%U-hwATf8R0nr^f}z2Cr4i!}z;33G7J&G6+T~#Y&3bo8B2@ z8mo2H+P9(oE$phphN9YM`X;~5XP9M~4AY45)oK%IFQi=Ni4>jpxfS|w5H=ashOO2D zs4P(1i$NWgo5}Zu-cmn-Eo@l7l-z&2<;y2`D^J<2)yQt`9pd}&Qhb@>`^HszXbGEM zYre037g|@M@57ZwfTG$LCnjj4fZ$;~u3ok#%o@(sD|3to)eiiN&ieFf)AZFk64$a) zJqs7!hv)Apa=1z=JC7&_aLP)eVQdNZU*vEuliZvg}ItU ze7uM>BM&?<3`(L=5$o^;N+W7sXR^>b*4A)RH3$6xXbdPC(g$yD8`9Hj3((}2|JZoIU6bo~Zj$(MWZQ|!3&r6J^&~uQg5@^{$y2r%GLGJC+=kt+rkb67f z{7yo7g5)6AztlNL%t7w&fzx}`Ff?vC$Qj&Ki@4aWbUZ7<2Wk3B4=oXMklG{Aet06~AZL%6CcV~?IGBT6ItrCX zCqNGJ>}wNQdJgjRIJ6yqRdbNm3f5$zb+8R4AywZ((+LNY5YF4bD3_6ivhBM{N{hu* z1|KkEz-2WdK=SomURF0A$*BS^os-pr2kFUyi;3>(q?$knY^UrGo_Ws6xDETu^B7aD@DW3#BVaZh45@5~_5ZhMUd=l%S8uusH%* z`p0r>dWk)#k^YflEmA{$9HYBQq@&U_xe}%5UnQQLQ5k4YvG{Ly=2r3>asmA_#f!0~ zw>;`wp3qzL?-nmgrnhp`w{q|;N$}=-`={GDV*|Dh%z&-aFJS8^3)niB0=5pBfUOfF zVC(n?*gDGr_6}x%y;Bxo@5lt$JO2Ro4l{tglL=t&m;u;3Gaz+Ll{nakH;gtUWJGDB z9~|77hbV2IgM<6s5T#9N7^CfB7^8h!h{7gn^maMH$-PL3)`lUB)y^F_xzz?^wdVy+ zZ&O8eD+$J97f0&%;;7I$8}A-GfsMmLl=fc0!Oaqg(vAi=xCH@G+9!Z9TCs;QS~`a) ztZPHm7M{V$Bb5S`mw&eK3b$j!Q!TtWD_U~&+T^AHjXud^VELSbP;5F!aE0uq0PrLi^? z#>yXH=?}3o7XAol(ZU#GV=QK8C-ZjRdy{QM2R?GV)$pW_7V;FpzQX~ae|+gDOeiNz z9`I!e8&rf86^c!YYQoIbVxPcu1bQqzH31DYoC*{;O{htA`h;4^@mR9ln%&UdebpR{ z+aU+TmO25D2}@iv@vOO5nrnW4ZF@A8BLiYl2~?8k)QL(7j(b6njegHg1-J5V z<{532G=Q*d$M*id$$#@H=DM&*A7u-mkQued=NH{+u`?Q_XWj9ne|DPoOJT_`Hg}^& Tb7!w*dt_g2Agcj1=wH4A2TOA7 literal 520 zcmZWny-or_5Z?RYyhJo$mn5{~6JG0JtCP!J0%4B6xMUIgToH|sbT75!oqh)V^&TZsN?d8aDGUdYoErV%z zN>g~^Lld8x^AvI3*>4EfjL59x0Hi+Y_QTT-a==6e7H(yrVUs2*Dt=_BH>zsSpOE zgoL=7E|wsILPFV)QhE&{0TC;9XjRe237fJRn=gs3nU*=|sTQ=%P*I7wkpSG+zXXh) zdh+iHd*6bE4!VkL#&w%o{&(-udN!LjK?2cCVE4DnERrZ=yi}H4E zyTf?s+WmA@PIEh;DGhSUqBM2#faPgcM2Y89=K6k8cyW>YMO0+PA=woO8RG}|0T44{ A?EnA( diff --git a/tests/queries/0_stateless/data_parquet/list_columns.parquet.columns b/tests/queries/0_stateless/data_parquet/list_columns.parquet.columns index 86745c2f074..1a488ee6e00 100644 --- a/tests/queries/0_stateless/data_parquet/list_columns.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/list_columns.parquet.columns @@ -1 +1 @@ -`int64_list` Nullable(Int64), `utf8_list` Nullable(String) +`int64_list` Array(Nullable(Int64)), `utf8_list` Array(Nullable(String)) diff --git a/tests/queries/0_stateless/data_parquet/nested_lists.parquet b/tests/queries/0_stateless/data_parquet/nested_lists.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0e9b9998ab60d9dbb5c9a3144060f8d0eb81a1fe GIT binary patch literal 1755 zcmc(gL5tHs6vt;K)7jY8DvNK(EJ4cB9dhUv+uam}UKTtkL@iqn-n1-XTWEJ{Tg9^< z!Gi}+euzEE9{mi0A_yY*4P+4yzG<5#NGO|24I%Gk-sHXa%YPnib3@(hu7Tsx1W|kXH!^bR|(5i z?!O`WHFJS0pA(^8v2L9Y`jaW|Ps3rF2TX3s3o3t8eW>j$O{gk1&VdmGF@_j(6Sf0j zGwf@(3pmy}9fsw}blg8V#(5R+s*uNd2DnMQLL%%WZw%92^FFKr<;FB05GsiNkaMB9 zWNV}m#bd~pm*-S|r235OU)Ji1sXS$<1lX>T@p>y#B~mg1DlLEqvhWT;N53yF zB}1nDm?ZWRLcJ^KuX(q*s!j?8Q-73C95FfSg~Kk`1aENZ<4P1k3tzU+omn+prap6K zn;~lq_|D?VdGbcq1gi=|F-pEe0@pOmu>ZZRC6Yaw1pAp2{M(0Q%2mPt`=qRQ!>@n; ztz&n{A3`#e5bX^@yzNnU{QN~Y-8~wO$}hw5q?uaCwra K_#tlNZ|yJJxFX~L literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_parquet/nested_lists.parquet.columns b/tests/queries/0_stateless/data_parquet/nested_lists.parquet.columns new file mode 100644 index 00000000000..c4ca4443236 --- /dev/null +++ b/tests/queries/0_stateless/data_parquet/nested_lists.parquet.columns @@ -0,0 +1 @@ +`a1` Array(Array(Array(UInt32))), `a2` Array(Array(Array(String))), `a3` Array(Array(Nullable(UInt32))), `a4` Array(Array(Nullable(String))) diff --git a/tests/queries/0_stateless/data_parquet/nested_lists.snappy.parquet.columns b/tests/queries/0_stateless/data_parquet/nested_lists.snappy.parquet.columns index 6d55d46dd5b..99310769309 100644 --- a/tests/queries/0_stateless/data_parquet/nested_lists.snappy.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/nested_lists.snappy.parquet.columns @@ -1 +1 @@ -`a` Nullable(String), `b` Nullable(Int32) +`a` Array(Array(Array(Nullable(String)))), `b` Nullable(Int32) diff --git a/tests/queries/0_stateless/data_parquet/nullable_list.parquet b/tests/queries/0_stateless/data_parquet/nullable_list.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f42cf1beb466e640778d6e0809e2e3d199c693bf GIT binary patch literal 2138 zcmcguF>KR76ump24mWXKx9Xj2WK~f`P8rgOv~pAlFWov=5D`WOqRK!d(uO23GIU^M z=)l~e>eQ)XU_e#Xp+kp`OchgwIwD3U{>4qz8ixpMu&w*H@8AFby?_7P^_7QJPI-af z2|&;pge)JU=pdJpBQ;;<3^W*LuB{JeY<;LJNGahh7_fY$tT9cE_Yl^I{EA&p9giaB zER$xHzG(IOwAByWoCqd9WAO#cRjL1pAE&IM=LA8lN+2cX_r2c9#F*L+Uph$+DqR76@ z=gNxX!N#a5;wSH_k-!Nf@N_gWI@?4%5l`h0U7qPKEY8N^PJVP^B8O)=d63J_;oHg4 zBu5Sp47qJ&W$S0j~pM$BD&BU_|n(mui_`5l_Z=1 literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_parquet/nullable_list.parquet.columns b/tests/queries/0_stateless/data_parquet/nullable_list.parquet.columns new file mode 100644 index 00000000000..2f308030da6 --- /dev/null +++ b/tests/queries/0_stateless/data_parquet/nullable_list.parquet.columns @@ -0,0 +1 @@ +`a1` Array(Nullable(UInt32)), `a2` Array(Nullable(String)), `a3` Array(Nullable(Decimal(4, 2)))