From 235e3e2f5ba2e6061fbeac1e282f0fc9f7d3b9b5 Mon Sep 17 00:00:00 2001
From: Pavel Kruglov
Date: Thu, 20 May 2021 16:47:12 +0300
Subject: [PATCH] Support structs in Arrow/Parquet/ORC

---
 .../Formats/Impl/ArrowColumnToCHColumn.cpp    | 48 ++++++++++++-
 .../Formats/Impl/CHColumnToArrowColumn.cpp    | 51 ++++++++++++-
 .../Formats/Impl/ORCBlockInputFormat.cpp      |  2 +-
 .../Formats/Impl/ORCBlockOutputFormat.cpp     | 71 +++++++++++++++----
 .../Formats/Impl/ORCBlockOutputFormat.h       |  4 +-
 .../Formats/Impl/ParquetBlockInputFormat.cpp  | 23 +++++-
 .../00900_orc_tuples_load.reference           |  2 +
 .../0_stateless/00900_orc_tuples_load.sh      | 17 +++++
 .../00900_parquet_tuples_load.reference       |  2 +
 .../0_stateless/00900_parquet_tuples_load.sh  | 17 +++++
 .../0_stateless/01273_arrow_arrays_load.sh    |  2 +-
 .../01273_arrow_tuples_load.reference         |  2 +
 .../0_stateless/01273_arrow_tuples_load.sh    | 17 +++++
 13 files changed, 237 insertions(+), 21 deletions(-)
 create mode 100644 tests/queries/0_stateless/00900_orc_tuples_load.reference
 create mode 100755 tests/queries/0_stateless/00900_orc_tuples_load.sh
 create mode 100644 tests/queries/0_stateless/00900_parquet_tuples_load.reference
 create mode 100755 tests/queries/0_stateless/00900_parquet_tuples_load.sh
 create mode 100644 tests/queries/0_stateless/01273_arrow_tuples_load.reference
 create mode 100755 tests/queries/0_stateless/01273_arrow_tuples_load.sh

diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
index 29fff1fd4e0..23f356a77fc 100644
--- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
+++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
@@ -7,15 +7,17 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
-#include
 
 
 namespace DB
@@ -309,8 +311,6 @@ namespace DB
                 break;
             case arrow::Type::LIST:
             {
-                const auto * list_type = static_cast<arrow::ListType *>(arrow_column->type().get());
-                auto list_nested_type = list_type->value_type();
                 arrow::ArrayVector array_vector;
                 array_vector.reserve(arrow_column->num_chunks());
                 for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@@ -326,6 +326,25 @@ namespace DB
                 fillOffsetsFromArrowListColumn(arrow_column, column_array.getOffsetsColumn());
                 break;
             }
+            case arrow::Type::STRUCT:
+            {
+                ColumnTuple & column_tuple = typeid_cast<ColumnTuple &>(internal_column);
+                int fields_count = column_tuple.tupleSize();
+                std::vector<arrow::ArrayVector> nested_arrow_columns(fields_count);
+                for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
+                {
+                    arrow::StructArray & struct_chunk = static_cast<arrow::StructArray &>(*(arrow_column->chunk(chunk_i)));
+                    for (int i = 0; i < fields_count; ++i)
+                        nested_arrow_columns[i].emplace_back(struct_chunk.field(i));
+                }
+
+                for (int i = 0; i != fields_count; ++i)
+                {
+                    auto nested_arrow_column = std::make_shared<arrow::ChunkedArray>(nested_arrow_columns[i]);
+                    readColumnFromArrowColumn(nested_arrow_column, column_tuple.getColumn(i), column_name, format_name, false);
+                }
+                break;
+            }
 # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
             case ARROW_NUMERIC_TYPE: \
                 fillColumnWithNumericData<CPP_NUMERIC_TYPE>(arrow_column, internal_column); \
@@ -372,6 +391,29 @@ namespace DB
             return std::make_shared<DataTypeArray>(getInternalType(list_nested_type, array_type->getNestedType(), column_name, format_name));
         }
 
+        if (arrow_type->id() == arrow::Type::STRUCT)
+        {
+            const auto * struct_type = static_cast<arrow::StructType *>(arrow_type.get());
+            const DataTypeTuple * tuple_type = typeid_cast<const DataTypeTuple *>(column_type.get());
+            if (!tuple_type)
+                throw Exception{"Cannot convert arrow STRUCT type to a non-Tuple ClickHouse type " + column_type->getName(), ErrorCodes::CANNOT_CONVERT_TYPE};
+
+            const DataTypes & tuple_nested_types = tuple_type->getElements();
+            int internal_fields_num = tuple_nested_types.size();
+            /// If the internal column has fewer elements than the arrow struct, we take only the first internal_fields_num fields.
+            if (internal_fields_num > struct_type->num_fields())
+                throw Exception{
+                    "Cannot convert arrow STRUCT with " + std::to_string(struct_type->num_fields()) + " fields to a ClickHouse Tuple with "
+                        + std::to_string(internal_fields_num) + " elements " + column_type->getName(),
+                    ErrorCodes::CANNOT_CONVERT_TYPE};
+
+            DataTypes nested_types;
+            for (int i = 0; i < internal_fields_num; ++i)
+                nested_types.push_back(getInternalType(struct_type->field(i)->type(), tuple_nested_types[i], column_name, format_name));
+
+            return std::make_shared<DataTypeTuple>(std::move(nested_types));
+        }
+
         if (const auto * internal_type_it = std::find_if(arrow_type_to_internal_type.begin(), arrow_type_to_internal_type.end(),
                 [=](auto && elem) { return elem.first == arrow_type->id(); });
             internal_type_it != arrow_type_to_internal_type.end())
diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp
index 0e9968bec17..4626d87f0cc 100644
--- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp
+++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp
@@ -6,11 +6,13 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -113,7 +115,7 @@ namespace DB
         size_t start,
         size_t end)
     {
-        const auto * column_array = static_cast<const ColumnArray *>(column.get());
+        const auto * column_array = typeid_cast<const ColumnArray *>(column.get());
         ColumnPtr nested_column = column_array->getDataPtr();
         DataTypePtr nested_type = typeid_cast<const DataTypeArray *>(column_type.get())->getNestedType();
         const auto & offsets = column_array->getOffsets();
@@ -124,13 +126,41 @@ namespace DB
 
         for (size_t array_idx = start; array_idx < end; ++array_idx)
         {
-            /// Start new array
+            /// Start new array.
             components_status = builder.Append();
             checkStatus(components_status, nested_column->getName(), format_name);
             fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx]);
         }
     }
 
+    static void fillArrowArrayWithTupleColumnData(
+        const String & column_name,
+        ColumnPtr & column,
+        const std::shared_ptr<const IDataType> & column_type,
+        const PaddedPODArray<UInt8> * null_bytemap,
+        arrow::ArrayBuilder * array_builder,
+        String format_name,
+        size_t start,
+        size_t end)
+    {
+        const auto * column_tuple = typeid_cast<const ColumnTuple *>(column.get());
+        const auto & nested_types = typeid_cast<const DataTypeTuple *>(column_type.get())->getElements();
+
+        arrow::StructBuilder & builder = assert_cast<arrow::StructBuilder &>(*array_builder);
+
+        for (size_t i = 0; i != column_tuple->tupleSize(); ++i)
+        {
+            ColumnPtr nested_column = column_tuple->getColumnPtr(i);
+            fillArrowArray(column_name + "." + std::to_string(i), nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end);
+        }
+
+        for (size_t i = start; i != end; ++i)
+        {
+            auto status = builder.Append();
+            checkStatus(status, column->getName(), format_name);
+        }
+    }
+
     template
     static void fillArrowArrayWithStringColumnData(
         ColumnPtr write_column,
@@ -251,6 +281,10 @@ namespace DB
         {
             fillArrowArrayWithArrayColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end);
         }
+        else if ("Tuple" == column_type_name)
+        {
+            fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end);
+        }
         else if (isDecimal(column_type))
         {
             auto fill_decimal = [&](const auto & types) -> bool
@@ -351,6 +385,19 @@ namespace DB
             return arrow::list(nested_arrow_type);
         }
 
+        if (isTuple(column_type))
+        {
+            const auto & nested_types = typeid_cast<const DataTypeTuple *>(column_type.get())->getElements();
+            std::vector<std::shared_ptr<arrow::Field>> nested_fields;
+            for (size_t i = 0; i != nested_types.size(); ++i)
+            {
+                String name = column_name + "." + std::to_string(i);
+                auto nested_arrow_type = getArrowType(nested_types[i], name, format_name, is_column_nullable);
+                nested_fields.push_back(std::make_shared<arrow::Field>(name, nested_arrow_type, *is_column_nullable));
+            }
+            return arrow::struct_(std::move(nested_fields));
+        }
+
         const std::string type_name = column_type->getFamilyName();
         if (const auto * arrow_type_it = std::find_if(
                 internal_type_to_arrow_type.begin(),
diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp
index 24b524faeaf..572b1c130e3 100644
--- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp
@@ -67,7 +67,7 @@ void ORCBlockInputFormat::resetParser()
     stripe_current = 0;
 }
 
-size_t countIndicesForType(std::shared_ptr<arrow::DataType> type)
+static size_t countIndicesForType(std::shared_ptr<arrow::DataType> type)
 {
     if (type->id() == arrow::Type::LIST)
         return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type()) + 1;
diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp
index ec6a7a65573..fa6f386adf1 100644
--- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp
+++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp
@@ -10,12 +10,14 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 
 
 namespace DB
 {
@@ -46,15 +48,9 @@ void ORCOutputStream::write(const void* buf, size_t length)
 ORCBlockOutputFormat::ORCBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_)
     : IOutputFormat(header_, out_), format_settings{format_settings_}, output_stream(out_), data_types(header_.getDataTypes())
 {
-    schema = orc::createStructType();
-    options.setCompression(orc::CompressionKind::CompressionKind_NONE);
-    size_t columns_count = header_.columns();
-    for (size_t i = 0; i != columns_count; ++i)
-        schema->addStructField(header_.safeGetByPosition(i).name, getORCType(data_types[i]));
-    writer = orc::createWriter(*schema, &output_stream, options);
 }
 
-ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & type)
+ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & type, const std::string & column_name)
 {
     switch (type->getTypeId())
     {
@@ -102,12 +98,12 @@ ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & t
         }
         case TypeIndex::Nullable:
         {
-            return getORCType(removeNullable(type));
+            return getORCType(removeNullable(type), column_name);
         }
         case TypeIndex::Array:
         {
             const auto * array_type = typeid_cast<const DataTypeArray *>(type.get());
-            return orc::createListType(getORCType(array_type->getNestedType()));
+            return orc::createListType(getORCType(array_type->getNestedType(), column_name));
         }
         case TypeIndex::Decimal32:
         {
@@ -124,6 +120,18 @@ ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & t
             const auto * decimal_type = typeid_cast<const DataTypeDecimal<Decimal128> *>(type.get());
             return orc::createDecimalType(decimal_type->getPrecision(), decimal_type->getScale());
         }
+        case TypeIndex::Tuple:
+        {
+            const auto * tuple_type = typeid_cast<const DataTypeTuple *>(type.get());
+            const auto & nested_types = tuple_type->getElements();
+            auto struct_type = orc::createStructType();
+            for (size_t i = 0; i < nested_types.size(); ++i)
+            {
+                String name = column_name + "." + std::to_string(i);
+                struct_type->addStructField(name, getORCType(nested_types[i], name));
+            }
+            return struct_type;
+        }
         default:
         {
             throw Exception("Type " + type->getName() + " is not supported for ORC output format", ErrorCodes::ILLEGAL_COLUMN);
@@ -149,6 +157,8 @@ void ORCBlockOutputFormat::writeNumbers(
                 number_orc_column.notNull[i] = 0;
                 continue;
             }
+            else
+                number_orc_column.notNull[i] = 1;
             number_orc_column.data[i] = convert(number_column.getElement(i));
         }
         number_orc_column.numElements = number_column.size();
@@ -175,6 +185,9 @@ void ORCBlockOutputFormat::writeDecimals(
                 decimal_orc_column.notNull[i] = 0;
                 continue;
             }
+            else
+                decimal_orc_column.notNull[i] = 1;
+
             decimal_orc_column.values[i] = convert(decimal_column.getElement(i).value);
         }
         decimal_orc_column.numElements = decimal_column.size();
@@ -197,6 +210,9 @@ void ORCBlockOutputFormat::writeStrings(
                 string_orc_column.notNull[i] = 0;
                 continue;
             }
+            else
+                string_orc_column.notNull[i] = 1;
+
             const StringRef & string = string_column.getDataAt(i);
             string_orc_column.data[i] = const_cast<char *>(string.data);
             string_orc_column.length[i] = string.size;
@@ -223,6 +239,9 @@ void ORCBlockOutputFormat::writeDateTimes(
                 timestamp_orc_column.notNull[i] = 0;
                 continue;
             }
+            else
+                timestamp_orc_column.notNull[i] = 1;
+
             timestamp_orc_column.data[i] = get_seconds(timestamp_column.getElement(i));
             timestamp_orc_column.nanoseconds[i] = get_nanoseconds(timestamp_column.getElement(i));
         }
@@ -235,11 +254,10 @@ void ORCBlockOutputFormat::writeColumn(
     DataTypePtr & type,
     const PaddedPODArray<UInt8> * null_bytemap)
 {
+    orc_column.notNull.resize(column.size());
     if (null_bytemap)
-    {
         orc_column.hasNulls = true;
-        orc_column.notNull.resize(column.size());
-    }
+
     switch (type->getTypeId())
     {
         case TypeIndex::Int8:
@@ -374,12 +392,25 @@ void ORCBlockOutputFormat::writeColumn(
             for (size_t i = 0; i != list_column.size(); ++i)
             {
                 list_orc_column.offsets[i + 1] = offsets[i];
+                list_orc_column.notNull[i] = 1;
             }
             orc::ColumnVectorBatch & nested_orc_column = *list_orc_column.elements;
             writeColumn(nested_orc_column, list_column.getData(), nested_type, null_bytemap);
             list_orc_column.numElements = list_column.size();
             break;
         }
+        case TypeIndex::Tuple:
+        {
+            orc::StructVectorBatch & struct_orc_column = dynamic_cast<orc::StructVectorBatch &>(orc_column);
+            const auto & tuple_column = assert_cast<const ColumnTuple &>(column);
+            auto nested_types = typeid_cast<const DataTypeTuple *>(type.get())->getElements();
+            for (size_t i = 0; i != tuple_column.size(); ++i)
+                struct_orc_column.notNull[i] = 1;
+            for (size_t i = 0; i != tuple_column.tupleSize(); ++i)
+                writeColumn(*struct_orc_column.fields[i], tuple_column.getColumn(i), nested_types[i], null_bytemap);
+            break;
+        }
         default:
             throw Exception("Type " + type->getName() + " is not supported for ORC output format", ErrorCodes::ILLEGAL_COLUMN);
format", ErrorCodes::ILLEGAL_COLUMN); } @@ -409,6 +440,8 @@ size_t ORCBlockOutputFormat::getMaxColumnSize(Chunk & chunk) void ORCBlockOutputFormat::consume(Chunk chunk) { + if (!writer) + prepareWriter(); size_t columns_num = chunk.getNumColumns(); size_t rows_num = chunk.getNumRows(); /// getMaxColumnSize is needed to write arrays. @@ -425,9 +458,23 @@ void ORCBlockOutputFormat::consume(Chunk chunk) void ORCBlockOutputFormat::finalize() { + if (!writer) + prepareWriter(); + writer->close(); } +void ORCBlockOutputFormat::prepareWriter() +{ + const Block & header = getPort(PortKind::Main).getHeader(); + schema = orc::createStructType(); + options.setCompression(orc::CompressionKind::CompressionKind_NONE); + size_t columns_count = header.columns(); + for (size_t i = 0; i != columns_count; ++i) + schema->addStructField(header.safeGetByPosition(i).name, getORCType(data_types[i], header.safeGetByPosition(i).name)); + writer = orc::createWriter(*schema, &output_stream, options); +} + void registerOutputFormatProcessorORC(FormatFactory & factory) { factory.registerOutputFormatProcessor("ORC", []( diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.h b/src/Processors/Formats/Impl/ORCBlockOutputFormat.h index 05053317533..557bf6cc07a 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.h @@ -43,7 +43,7 @@ public: void finalize() override; private: - ORC_UNIQUE_PTR getORCType(const DataTypePtr & type); + ORC_UNIQUE_PTR getORCType(const DataTypePtr & type, const std::string & column_name); /// ConvertFunc is needed for type UInt8, because firstly UInt8 (char8_t) must be /// converted to unsigned char (bugprone-signed-char-misuse in clang). @@ -71,6 +71,8 @@ private: size_t getColumnSize(const IColumn & column, DataTypePtr & type); size_t getMaxColumnSize(Chunk & chunk); + void prepareWriter(); + const FormatSettings format_settings; ORCOutputStream output_stream; DataTypes data_types; diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index df264406cfe..969aaf6ff36 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -67,6 +67,24 @@ void ParquetBlockInputFormat::resetParser() row_group_current = 0; } +static size_t countIndicesForType(std::shared_ptr type) +{ + if (type->id() == arrow::Type::LIST) + return countIndicesForType(static_cast(type.get())->value_type()); + + int indices = 0; + if (type->id() == arrow::Type::STRUCT) + { + auto * struct_type = static_cast(type.get()); + for (int i = 0; i != struct_type->num_fields(); ++i) + indices += countIndicesForType(struct_type->field(i)->type()); + } + else + indices = 1; + + return indices; +} + void ParquetBlockInputFormat::prepareReader() { THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(asArrowFile(in), arrow::default_memory_pool(), &file_reader)); @@ -76,11 +94,14 @@ void ParquetBlockInputFormat::prepareReader() std::shared_ptr schema; THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); + int index = 0; for (int i = 0; i < schema->num_fields(); ++i) { if (getPort().getHeader().has(schema->field(i)->name())) { - column_indices.push_back(i); + int indexes_count = countIndicesForType(schema->field(i)->type()); + for (int j = 0; j != indexes_count; ++j) + column_indices.push_back(index++); } } } diff --git a/tests/queries/0_stateless/00900_orc_tuples_load.reference b/tests/queries/0_stateless/00900_orc_tuples_load.reference 
new file mode 100644
index 00000000000..cbf8ee819f7
--- /dev/null
+++ b/tests/queries/0_stateless/00900_orc_tuples_load.reference
@@ -0,0 +1,2 @@
+(1,2) ('1','2') ((1,'1'),1) ((1,2),('1','2')) ([1,2,3],1) (([1,2,3],[1,2,3]),([[1,2,3],[1,2,3]],1)) [([[1,2,3],[1,2,3]],([(1,2),(1,2)],1))]
+(1,2) ('1','2') ((1,'1'),1) ((1,2),('1','2')) ([1,2,3],1) (([1,2,3],[1,2,3]),([[1,2,3],[1,2,3]],1)) [([[1,2,3],[1,2,3]],([(1,2),(1,2)],1))]
diff --git a/tests/queries/0_stateless/00900_orc_tuples_load.sh b/tests/queries/0_stateless/00900_orc_tuples_load.sh
new file mode 100755
index 00000000000..f32be6af751
--- /dev/null
+++ b/tests/queries/0_stateless/00900_orc_tuples_load.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+
+${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_tuples"
+${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_tuples (t1 Tuple(UInt32, UInt32), t2 Tuple(String, String), t3 Tuple(Tuple(UInt32, String), UInt32), t4 Tuple(Tuple(UInt32, UInt32), Tuple(String, String)), t5 Tuple(Array(UInt32), UInt32), t6 Tuple(Tuple(Array(UInt32), Array(UInt32)), Tuple(Array(Array(UInt32)), UInt32)), t7 Array(Tuple(Array(Array(UInt32)), Tuple(Array(Tuple(UInt32, UInt32)), UInt32)))) ENGINE=Memory()"
+
+${CLICKHOUSE_CLIENT} --query="INSERT INTO orc_tuples VALUES ((1, 2), ('1', '2'), ((1, '1'), 1), ((1, 2), ('1', '2')), ([1,2,3], 1), (([1,2,3], [1,2,3]), ([[1,2,3], [1,2,3]], 1)), [([[1,2,3], [1,2,3]], ([(1, 2), (1, 2)], 1))])"
+
+${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_tuples FORMAT ORC" > "${CLICKHOUSE_TMP}"/tuples.orc
+
+cat "${CLICKHOUSE_TMP}"/tuples.orc | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_tuples FORMAT ORC"
+
+${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_tuples"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE orc_tuples"
diff --git a/tests/queries/0_stateless/00900_parquet_tuples_load.reference b/tests/queries/0_stateless/00900_parquet_tuples_load.reference
new file mode 100644
index 00000000000..cbf8ee819f7
--- /dev/null
+++ b/tests/queries/0_stateless/00900_parquet_tuples_load.reference
@@ -0,0 +1,2 @@
+(1,2) ('1','2') ((1,'1'),1) ((1,2),('1','2')) ([1,2,3],1) (([1,2,3],[1,2,3]),([[1,2,3],[1,2,3]],1)) [([[1,2,3],[1,2,3]],([(1,2),(1,2)],1))]
+(1,2) ('1','2') ((1,'1'),1) ((1,2),('1','2')) ([1,2,3],1) (([1,2,3],[1,2,3]),([[1,2,3],[1,2,3]],1)) [([[1,2,3],[1,2,3]],([(1,2),(1,2)],1))]
diff --git a/tests/queries/0_stateless/00900_parquet_tuples_load.sh b/tests/queries/0_stateless/00900_parquet_tuples_load.sh
new file mode 100755
index 00000000000..031e8111019
--- /dev/null
+++ b/tests/queries/0_stateless/00900_parquet_tuples_load.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
"$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_tuples" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_tuples (t1 Tuple(UInt32, UInt32), t2 Tuple(String, String), t3 Tuple(Tuple(UInt32, String), UInt32), t4 Tuple(Tuple(UInt32, UInt32), Tuple(String, String)), t5 Tuple(Array(UInt32), UInt32), t6 Tuple(Tuple(Array(UInt32), Array(UInt32)), Tuple(Array(Array(UInt32)), UInt32)), t7 Array(Tuple(Array(Array(UInt32)), Tuple(Array(Tuple(UInt32, UInt32)), UInt32)))) ENGINE=Memory()" + +${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_tuples VALUES ((1, 2), ('1', '2'), ((1, '1'), 1), ((1, 2), ('1', '2')), ([1,2,3], 1), (([1,2,3], [1,2,3]), ([[1,2,3], [1,2,3]], 1)), [([[1,2,3], [1,2,3]], ([(1, 2), (1, 2)], 1))])" + +${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_tuples FORMAT Parquet" > "${CLICKHOUSE_TMP}"/tuples.parquet + +cat "${CLICKHOUSE_TMP}"/tuples.parquet | ${CLICKHOUSE_CLIENT} -q "INSERT INTO parquet_tuples FORMAT Parquet" + +${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_tuples" +${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_tuples" diff --git a/tests/queries/0_stateless/01273_arrow_arrays_load.sh b/tests/queries/0_stateless/01273_arrow_arrays_load.sh index b8d1a85921d..bce653376a5 100755 --- a/tests/queries/0_stateless/01273_arrow_arrays_load.sh +++ b/tests/queries/0_stateless/01273_arrow_arrays_load.sh @@ -4,7 +4,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_arrays" +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_arrays" ${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_arrays (arr1 Array(Int8), arr2 Array(UInt8), arr3 Array(Int16), arr4 Array(UInt16), arr5 Array(Int32), arr6 Array(UInt32), arr7 Array(Int64), arr8 Array(UInt64), arr9 Array(String), arr10 Array(FixedString(4)), arr11 Array(Float32), arr12 Array(Float64), arr13 Array(Date), arr14 Array(Datetime)) ENGINE=Memory()" ${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_arrays VALUES ([1,-2,3],[1,2,3],[100,-200,300],[100,200,300],[10000000,-20000000,30000000],[10000000,2000000,3000000],[100000000000000,-200000000000,3000000000000],[100000000000000,20000000000000,3000000000000],['Some string','Some string','Some string'],['0000','1111','2222'],[42.42,424.2,0.4242],[424242.424242,4242042420.242424,42],['2000-01-01','2001-01-01','2002-01-01'],['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00']),([],[],[],[],[],[],[],[],[],[],[],[],[],[])" diff --git a/tests/queries/0_stateless/01273_arrow_tuples_load.reference b/tests/queries/0_stateless/01273_arrow_tuples_load.reference new file mode 100644 index 00000000000..cbf8ee819f7 --- /dev/null +++ b/tests/queries/0_stateless/01273_arrow_tuples_load.reference @@ -0,0 +1,2 @@ +(1,2) ('1','2') ((1,'1'),1) ((1,2),('1','2')) ([1,2,3],1) (([1,2,3],[1,2,3]),([[1,2,3],[1,2,3]],1)) [([[1,2,3],[1,2,3]],([(1,2),(1,2)],1))] +(1,2) ('1','2') ((1,'1'),1) ((1,2),('1','2')) ([1,2,3],1) (([1,2,3],[1,2,3]),([[1,2,3],[1,2,3]],1)) [([[1,2,3],[1,2,3]],([(1,2),(1,2)],1))] diff --git a/tests/queries/0_stateless/01273_arrow_tuples_load.sh b/tests/queries/0_stateless/01273_arrow_tuples_load.sh new file mode 100755 index 00000000000..311079afe50 --- /dev/null +++ b/tests/queries/0_stateless/01273_arrow_tuples_load.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_tuples" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_tuples (t1 Tuple(UInt32, UInt32), t2 Tuple(String, String), t3 Tuple(Tuple(UInt32, String), UInt32), t4 Tuple(Tuple(UInt32, UInt32), Tuple(String, String)), t5 Tuple(Array(UInt32), UInt32), t6 Tuple(Tuple(Array(UInt32), Array(UInt32)), Tuple(Array(Array(UInt32)), UInt32)), t7 Array(Tuple(Array(Array(UInt32)), Tuple(Array(Tuple(UInt32, UInt32)), UInt32)))) ENGINE=Memory()" + +${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_tuples VALUES ((1, 2), ('1', '2'), ((1, '1'), 1), ((1, 2), ('1', '2')), ([1,2,3], 1), (([1,2,3], [1,2,3]), ([[1,2,3], [1,2,3]], 1)), [([[1,2,3], [1,2,3]], ([(1, 2), (1, 2)], 1))])" + +${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_tuples FORMAT Arrow" > "${CLICKHOUSE_TMP}"/tuples.arrow + +cat "${CLICKHOUSE_TMP}"/tuples.arrow | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_tuples FORMAT Arrow" + +${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_tuples" +${CLICKHOUSE_CLIENT} --query="DROP TABLE arrow_tuples"