diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp
index f669f6695d1..24b524faeaf 100644
--- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp
@@ -42,7 +42,7 @@ Chunk ORCBlockInputFormat::generate()
         return res;
 
     std::shared_ptr<arrow::RecordBatch> batch_result;
-    arrow::Status batch_status = file_reader->ReadStripe(stripe_current, &batch_result);
+    arrow::Status batch_status = file_reader->ReadStripe(stripe_current, include_indices, &batch_result);
     if (!batch_status.ok())
         throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA,
                                "Error while reading batch of ORC data: {}", batch_status.ToString());
@@ -63,14 +63,44 @@ void ORCBlockInputFormat::resetParser()
     IInputFormat::resetParser();
 
     file_reader.reset();
+    include_indices.clear();
     stripe_current = 0;
 }
 
+/// Returns how many column indices a field of the given Arrow type occupies: one for the type itself
+/// plus, for LIST, the indices of its element type (recursively). Any non-LIST type counts as one index.
+static size_t countIndicesForType(const std::shared_ptr<arrow::DataType> & type)
+{
+    if (type->id() == arrow::Type::LIST)
+        return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type()) + 1;
+
+    return 1;
+}
+
 void ORCBlockInputFormat::prepareReader()
 {
     THROW_ARROW_NOT_OK(arrow::adapters::orc::ORCFileReader::Open(asArrowFile(in), arrow::default_memory_pool(), &file_reader));
     stripe_total = file_reader->NumberOfStripes();
     stripe_current = 0;
+
+    std::shared_ptr<arrow::Schema> schema;
+    THROW_ARROW_NOT_OK(file_reader->ReadSchema(&schema));
+
+    int index = 0;
+    for (int i = 0; i < schema->num_fields(); ++i)
+    {
+        /// LIST type requires 2 indices, so we should recursively
+        /// count the number of indices we need for this type.
+        int indexes_count = countIndicesForType(schema->field(i)->type());
+        if (getPort().getHeader().has(schema->field(i)->name()))
+        {
+            for (int j = 0; j != indexes_count; ++j)
+                include_indices.push_back(index + j);
+        }
+        /// Advance past this column's indices even when it is not requested,
+        /// otherwise the indices of all following columns would be shifted.
+        index += indexes_count;
+    }
 }
 
 void registerInputFormatProcessorORC(FormatFactory &factory)
diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.h b/src/Processors/Formats/Impl/ORCBlockInputFormat.h
index 5aec330a0c8..0c78290f3cc 100644
--- a/src/Processors/Formats/Impl/ORCBlockInputFormat.h
+++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.h
@@ -30,6 +30,9 @@ private:
 
     int stripe_current = 0;
 
+    // indices of columns to read from ORC file
+    std::vector<int> include_indices;
+
     void prepareReader();
 };
 
diff --git a/tests/queries/0_stateless/01273_arrow_arrays_load.reference b/tests/queries/0_stateless/01273_arrow_arrays_load.reference
new file mode 100644
index 00000000000..edb6ba617a1
--- /dev/null
+++ b/tests/queries/0_stateless/01273_arrow_arrays_load.reference
@@ -0,0 +1,4 @@
+[1,-2,3]	[1,2,3]	[100,-200,300]	[100,200,300]	[10000000,-20000000,30000000]	[10000000,2000000,3000000]	[100000000000000,-200000000000,3000000000000]	[100000000000000,20000000000000,3000000000000]	['Some string','Some string','Some string']	['0000','1111','2222']	[42.42,424.2,0.4242]	[424242.424242,4242042420.242424,42]	['2000-01-01','2001-01-01','2002-01-01']	['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00']
+[]	[]	[]	[]	[]	[]	[]	[]	[]	[]	[]	[]	[]	[]
+[1,-2,3]	[1,2,3]	[100,-200,300]	[100,200,300]	[10000000,-20000000,30000000]	[10000000,2000000,3000000]	[100000000000000,-200000000000,3000000000000]	[100000000000000,20000000000000,3000000000000]	['Some string','Some string','Some string']	['0000','1111','2222']	[42.42,424.2,0.4242]	[424242.424242,4242042420.242424,42]	['2000-01-01','2001-01-01','2002-01-01']	['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00']
+[]	[]	[]	[]	[]	[]	[]	[]	[]	[]	[]	[]	[]	[]
diff --git a/tests/queries/0_stateless/01273_arrow_arrays_load.sh b/tests/queries/0_stateless/01273_arrow_arrays_load.sh
new file mode 100755
index 00000000000..b8d1a85921d
--- /dev/null
+++ b/tests/queries/0_stateless/01273_arrow_arrays_load.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+
+# Round trip through the Arrow format: export the table, insert the export back, expect every row twice.
+${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_arrays"
+${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_arrays (arr1 Array(Int8), arr2 Array(UInt8), arr3 Array(Int16), arr4 Array(UInt16), arr5 Array(Int32), arr6 Array(UInt32), arr7 Array(Int64), arr8 Array(UInt64), arr9 Array(String), arr10 Array(FixedString(4)), arr11 Array(Float32), arr12 Array(Float64), arr13 Array(Date), arr14 Array(Datetime)) ENGINE=Memory()"
+
+${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_arrays VALUES ([1,-2,3],[1,2,3],[100,-200,300],[100,200,300],[10000000,-20000000,30000000],[10000000,2000000,3000000],[100000000000000,-200000000000,3000000000000],[100000000000000,20000000000000,3000000000000],['Some string','Some string','Some string'],['0000','1111','2222'],[42.42,424.2,0.4242],[424242.424242,4242042420.242424,42],['2000-01-01','2001-01-01','2002-01-01'],['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00']),([],[],[],[],[],[],[],[],[],[],[],[],[],[])"
+
+${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_arrays FORMAT Arrow" > "${CLICKHOUSE_TMP}"/arrays.arrow
+
+cat "${CLICKHOUSE_TMP}"/arrays.arrow | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_arrays FORMAT Arrow"
+
+${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_arrays"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE arrow_arrays"
diff --git a/tests/queries/0_stateless/01273_arrow_nested_arrays_load.reference b/tests/queries/0_stateless/01273_arrow_nested_arrays_load.reference
new file mode 100644
index 00000000000..dd9c9900684
--- /dev/null
+++ b/tests/queries/0_stateless/01273_arrow_nested_arrays_load.reference
@@ -0,0 +1,2 @@
+[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]]	[[['Some string','Some string'],[]],[['Some string']],[[]]]	[[NULL,1,2],[NULL],[1,2],[]]	[['Some string',NULL,'Some string'],[NULL],[]]
+[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]]	[[['Some string','Some string'],[]],[['Some string']],[[]]]	[[NULL,1,2],[NULL],[1,2],[]]	[['Some string',NULL,'Some string'],[NULL],[]]
diff --git a/tests/queries/0_stateless/01273_arrow_nested_arrays_load.sh b/tests/queries/0_stateless/01273_arrow_nested_arrays_load.sh
new file mode 100755
index 00000000000..cd2040260e9
--- /dev/null
+++ b/tests/queries/0_stateless/01273_arrow_nested_arrays_load.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+
+# Round trip of nested arrays through the Arrow format: export the table, insert the export back, expect every row twice.
+${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_nested_arrays"
+${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_nested_arrays (arr1 Array(Array(Array(UInt32))), arr2 Array(Array(Array(String))), arr3 Array(Array(Nullable(UInt32))), arr4 Array(Array(Nullable(String)))) ENGINE=Memory()"
+
+${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_nested_arrays VALUES ([[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]],[[['Some string','Some string'],[]],[['Some string']],[[]]],[[NULL,1,2],[NULL],[1,2],[]],[['Some string',NULL,'Some string'],[NULL],[]])"
+
+${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_nested_arrays FORMAT Arrow" > "${CLICKHOUSE_TMP}"/nested_arrays.arrow
+
+cat "${CLICKHOUSE_TMP}"/nested_arrays.arrow | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_nested_arrays FORMAT Arrow"
+
+${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_nested_arrays"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE arrow_nested_arrays"
diff --git a/tests/queries/0_stateless/01273_arrow_nullable_arrays_load.reference b/tests/queries/0_stateless/01273_arrow_nullable_arrays_load.reference
new file mode 100644
index 00000000000..44b19f616d0
--- /dev/null
+++ b/tests/queries/0_stateless/01273_arrow_nullable_arrays_load.reference
@@ -0,0 +1,6 @@
+[1,NULL,2]	[NULL,'Some string',NULL]	[0,NULL,42.42]
+[NULL]	[NULL]	[NULL]
+[]	[]	[]
+[1,NULL,2]	[NULL,'Some string',NULL]	[0,NULL,42.42]
+[NULL]	[NULL]	[NULL]
+[]	[]	[]
diff --git a/tests/queries/0_stateless/01273_arrow_nullable_arrays_load.sh b/tests/queries/0_stateless/01273_arrow_nullable_arrays_load.sh
new file mode 100755
index 00000000000..d780cc6d06a
--- /dev/null
+++ b/tests/queries/0_stateless/01273_arrow_nullable_arrays_load.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+
+# Round trip of arrays with Nullable elements through the Arrow format: export, insert back, expect every row twice.
+${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_nullable_arrays"
+${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_nullable_arrays (arr1 Array(Nullable(UInt32)), arr2 Array(Nullable(String)), arr3 Array(Nullable(Float32))) ENGINE=Memory()"
+${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_nullable_arrays VALUES ([1,NULL,2],[NULL,'Some string',NULL],[0.00,NULL,42.42]), ([NULL],[NULL],[NULL]), ([],[],[])"
+${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_nullable_arrays FORMAT Arrow" > "${CLICKHOUSE_TMP}"/nullable_arrays.arrow
+
+cat "${CLICKHOUSE_TMP}"/nullable_arrays.arrow | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_nullable_arrays FORMAT Arrow"
+
+${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_nullable_arrays"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE arrow_nullable_arrays"