Return include_indices, fix arrays and add more tests

This commit is contained in:
Pavel Kruglov 2021-05-14 13:48:31 +03:00
parent 46a7cc5f1d
commit 484cac6193
8 changed files with 90 additions and 1 deletions

View File

@ -42,7 +42,7 @@ Chunk ORCBlockInputFormat::generate()
return res;
std::shared_ptr<arrow::RecordBatch> batch_result;
arrow::Status batch_status = file_reader->ReadStripe(stripe_current, &batch_result);
arrow::Status batch_status = file_reader->ReadStripe(stripe_current, include_indices, &batch_result);
if (!batch_status.ok())
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA,
"Error while reading batch of ORC data: {}", batch_status.ToString());
@ -63,14 +63,39 @@ void ORCBlockInputFormat::resetParser()
IInputFormat::resetParser();
file_reader.reset();
include_indices.clear();
stripe_current = 0;
}
/// Returns how many consecutive column indices the given Arrow type takes up:
/// one for the type itself plus one for every LIST level wrapped around it.
/// Any type that is not a LIST counts as a single index.
size_t countIndicesForType(std::shared_ptr<arrow::DataType> type)
{
    /// Every type contributes at least its own index.
    size_t indices = 1;

    /// Peel off LIST wrappers one at a time; each of them adds one more index.
    while (type->id() == arrow::Type::LIST)
    {
        std::shared_ptr<arrow::DataType> element_type = static_cast<arrow::ListType *>(type.get())->value_type();
        type = element_type;
        ++indices;
    }

    return indices;
}
/// Opens an ORC file reader over the input buffer, resets the stripe counters and
/// fills `include_indices` with the column indices that should be passed to ReadStripe().
/// Only columns whose names are present in the header of this format are selected.
void ORCBlockInputFormat::prepareReader()
{
    THROW_ARROW_NOT_OK(arrow::adapters::orc::ORCFileReader::Open(asArrowFile(in), arrow::default_memory_pool(), &file_reader));
    stripe_total = file_reader->NumberOfStripes();
    stripe_current = 0;

    std::shared_ptr<arrow::Schema> schema;
    THROW_ARROW_NOT_OK(file_reader->ReadSchema(&schema));

    /// The header does not change while we walk the schema, so look it up once.
    const auto & header = getPort().getHeader();

    /// Running index of the first slot occupied by the current field.
    /// NOTE(review): numbering starts at 0 here, as before; confirm against the
    /// Arrow/ORC version in use whether index 0 refers to the first column or to the root type.
    int index = 0;
    for (int i = 0; i < schema->num_fields(); ++i)
    {
        const auto & field = schema->field(i);

        /// LIST type require 2 indices, so we should recursively
        /// count the number of indices we need for this type.
        const int indexes_count = static_cast<int>(countIndicesForType(field->type()));

        if (header.has(field->name()))
        {
            for (int j = 0; j != indexes_count; ++j)
                include_indices.push_back(index + j);
        }

        /// Advance past this field even when it is not requested: the indices refer to
        /// positions in the file, so skipped columns must still be counted. Otherwise
        /// every requested column that follows a skipped one gets the wrong index.
        index += indexes_count;
    }
}
void registerInputFormatProcessorORC(FormatFactory &factory)

View File

@ -30,6 +30,9 @@ private:
int stripe_current = 0;
// indices of columns to read from ORC file
std::vector<int> include_indices;
void prepareReader();
};

View File

@ -0,0 +1,4 @@
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00']
[] [] [] [] [] [] [] [] [] [] [] [] [] []
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00']
[] [] [] [] [] [] [] [] [] [] [] [] [] []

View File

@ -0,0 +1,17 @@
#!/usr/bin/env bash

# Round-trip test for Array columns of basic element types through the Arrow format:
# the table is filled, dumped as Arrow, loaded back from the dump and selected,
# so every inserted row is expected to appear twice in the output.

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

# Drop the same table that is created below, so that a leftover from an
# interrupted previous run cannot make CREATE TABLE fail.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_arrays (arr1 Array(Int8), arr2 Array(UInt8), arr3 Array(Int16), arr4 Array(UInt16), arr5 Array(Int32), arr6 Array(UInt32), arr7 Array(Int64), arr8 Array(UInt64), arr9 Array(String), arr10 Array(FixedString(4)), arr11 Array(Float32), arr12 Array(Float64), arr13 Array(Date), arr14 Array(Datetime)) ENGINE=Memory()"

# One row with filled arrays and one row with empty arrays.
${CLICKHOUSE_CLIENT} --query="INSERT INTO arrow_arrays VALUES ([1,-2,3],[1,2,3],[100,-200,300],[100,200,300],[10000000,-20000000,30000000],[10000000,2000000,3000000],[100000000000000,-200000000000,3000000000000],[100000000000000,20000000000000,3000000000000],['Some string','Some string','Some string'],['0000','1111','2222'],[42.42,424.2,0.4242],[424242.424242,4242042420.242424,42],['2000-01-01','2001-01-01','2002-01-01'],['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00']),([],[],[],[],[],[],[],[],[],[],[],[],[],[])"

# Dump the table as Arrow and feed the dump back into the same table.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_arrays FORMAT Arrow" > "${CLICKHOUSE_TMP}"/arrays.arrow
${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_arrays FORMAT Arrow" < "${CLICKHOUSE_TMP}"/arrays.arrow

${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_arrays"
${CLICKHOUSE_CLIENT} --query="DROP TABLE arrow_arrays"

View File

@ -0,0 +1,2 @@
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]

View File

@ -0,0 +1,17 @@
#!/usr/bin/env bash

# Round-trip test for nested Array columns (including Nullable elements) through
# the Arrow format: fill the table, dump it as Arrow, load the dump back and select.

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

TABLE="arrow_nested_arrays"
DUMP_FILE="${CLICKHOUSE_TMP}/nested_arrays.arrow"

# Start from a clean table.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS ${TABLE}"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE ${TABLE} (arr1 Array(Array(Array(UInt32))), arr2 Array(Array(Array(String))), arr3 Array(Array(Nullable(UInt32))), arr4 Array(Array(Nullable(String)))) engine=Memory()"

${CLICKHOUSE_CLIENT} --query="INSERT INTO ${TABLE} VALUES ([[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]],[[['Some string','Some string'],[]],[['Some string']],[[]]],[[NULL,1,2],[NULL],[1,2],[]],[['Some string',NULL,'Some string'],[NULL],[]])"

# Export to Arrow, then import the exported file into the same table.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM ${TABLE} FORMAT Arrow" > "${DUMP_FILE}"
${CLICKHOUSE_CLIENT} -q "INSERT INTO ${TABLE} FORMAT Arrow" < "${DUMP_FILE}"

# The inserted row is expected twice: the original and the re-imported copy.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM ${TABLE}"
${CLICKHOUSE_CLIENT} --query="DROP table ${TABLE}"

View File

@ -0,0 +1,6 @@
[1,NULL,2] [NULL,'Some string',NULL] [0,NULL,42.42]
[NULL] [NULL] [NULL]
[] [] []
[1,NULL,2] [NULL,'Some string',NULL] [0,NULL,42.42]
[NULL] [NULL] [NULL]
[] [] []

View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash

# Round-trip test for Array(Nullable(...)) columns through the Arrow format:
# fill the table, dump it as Arrow, load the dump back and select.

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

TABLE="arrow_nullable_arrays"
DUMP_FILE="${CLICKHOUSE_TMP}/nullable_arrays.arrow"

# Start from a clean table.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS ${TABLE}"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE ${TABLE} (arr1 Array(Nullable(UInt32)), arr2 Array(Nullable(String)), arr3 Array(Nullable(Float32))) ENGINE=Memory()"

# Rows cover mixed NULL/non-NULL elements, a single NULL element and empty arrays.
${CLICKHOUSE_CLIENT} --query="INSERT INTO ${TABLE} VALUES ([1,NULL,2],[NULL,'Some string',NULL],[0.00,NULL,42.42]), ([NULL],[NULL],[NULL]), ([],[],[])"

# Export to Arrow, then import the exported file into the same table.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM ${TABLE} FORMAT Arrow" > "${DUMP_FILE}"
${CLICKHOUSE_CLIENT} -q "INSERT INTO ${TABLE} FORMAT Arrow" < "${DUMP_FILE}"

# Every inserted row is expected twice: the original and the re-imported copy.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM ${TABLE}"
${CLICKHOUSE_CLIENT} --query="DROP TABLE ${TABLE}"