Refactor, support all types in array, add nested arrays support, more tests

This commit is contained in:
Pavel Kruglov 2021-05-12 19:06:08 +03:00
parent 8990120eb7
commit 8ed6ad7c55
29 changed files with 441 additions and 802 deletions

View File

@ -3,6 +3,7 @@
#include <Columns/IColumn.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnArray.h>
#include <Core/Field.h>
@ -40,4 +41,9 @@ bool isColumnConst(const IColumn & column)
return checkColumn<ColumnConst>(column);
}
/// Reports whether the given column is a ColumnArray (delegates to checkColumn).
bool isColumnArray(const IColumn & column)
{
    const bool is_array = checkColumn<ColumnArray>(column);
    return is_array;
}
}

View File

@ -530,4 +530,6 @@ bool isColumnConst(const IColumn & column);
/// True if column's a ColumnNullable instance. It's just syntactic sugar for type check.
bool isColumnNullable(const IColumn & column);
/// True if column's a ColumnArray instance. It's just syntactic sugar for type check.
bool isColumnArray(const IColumn & column);
}

View File

@ -17,6 +17,7 @@
#include <algorithm>
#include <DataTypes/DataTypeLowCardinality.h>
namespace DB
{
namespace ErrorCodes
@ -57,44 +58,11 @@ namespace DB
// Full list of types: contrib/arrow/cpp/src/arrow/type.h
};
/// Pre-allocates storage before an Array column is filled from an Arrow list column.
/// The outer array column is reserved for the total number of rows of the Arrow column,
/// the nested column for the sum of the lengths of all lists over all chunks.
template <typename NestedColumnVector>
static void reserveArrayColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column, ColumnArray & array_column, NestedColumnVector & nested_column)
{
    size_t nested_column_length = 0;
    for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
    {
        arrow::ListArray & chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));

        /// value_length() gives the size of a single list directly, so there is no need to
        /// materialize a slice (and allocate a shared_ptr) per row only to ask for its length.
        for (int64_t array_idx = 0, num_rows = chunk.length(); array_idx < num_rows; ++array_idx)
            nested_column_length += chunk.value_length(array_idx);
    }

    array_column.reserve(arrow_column->length());
    nested_column.reserve(nested_column_length);
}
/// Builds a null bytemap (one UInt8 per row) out of arrow's per-value null information.
/// For every value of every chunk the result of IsNull() is appended to the bytemap column.
static void fillByteMapFromArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & bytemap)
{
    auto & null_flags = assert_cast<ColumnVector<UInt8> &>(*bytemap).getData();
    null_flags.reserve(arrow_column->length());

    const size_t chunk_count = static_cast<size_t>(arrow_column->num_chunks());
    for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx)
    {
        const std::shared_ptr<arrow::Array> current_chunk = arrow_column->chunk(chunk_idx);
        const size_t rows_in_chunk = static_cast<size_t>(current_chunk->length());

        for (size_t row = 0; row < rows_in_chunk; ++row)
            null_flags.emplace_back(current_chunk->IsNull(row));
    }
}
/// Inserts numeric data right into internal column data to reduce overhead
template <typename NumericType, typename VectorType = ColumnVector<NumericType>>
static void fillColumnWithNumericData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
static void fillColumnWithNumericData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
{
auto & column_data = static_cast<VectorType &>(*internal_column).getData();
auto & column_data = static_cast<VectorType &>(internal_column).getData();
column_data.reserve(arrow_column->length());
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@ -108,70 +76,13 @@ namespace DB
}
}
template <typename NumericType, typename VectorType = ColumnVector<NumericType>>
static void fillColumnWithArrayNumericData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
/// Inserts chars and offsets right into internal column data to reduce overhead.
/// Internal offsets are shifted by one to the right in comparison with Arrow ones. So the last offset should map to the end of all chars.
/// Also internal strings are null terminated.
static void fillColumnWithStringData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
{
ColumnArray & column_array = assert_cast<ColumnArray &>(*internal_column);
ColumnArray::Offsets & column_array_offsets = column_array.getOffsets();
const bool is_column_array_nullable = column_array.getData().isNullable();
ColumnNullable * column_nullable = is_column_array_nullable ? static_cast<ColumnNullable *>(&column_array.getData()) : nullptr;
IColumn & array_nested_column =
is_column_array_nullable ? static_cast<ColumnNullable &>(column_array.getData()).getNestedColumn() :
column_array.getData();
VectorType & nested_column = static_cast<VectorType &>(array_nested_column);
auto & nested_column_data = nested_column.getData();
reserveArrayColumn(arrow_column, column_array, nested_column);
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
{
arrow::ListArray & list_chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
const std::shared_ptr<arrow::Array> array = list_chunk.values();
std::shared_ptr<arrow::Buffer> buffer = array->data()->buffers[1];
const auto * raw_data = reinterpret_cast<const NumericType *>(buffer->data());
nested_column_data.insert_assume_reserved(raw_data, raw_data + array->length());
for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx)
{
const std::shared_ptr<arrow::Array> chunk = list_chunk.value_slice(array_idx);
/// buffers[0] is a null bitmap and buffers[1] are actual values
std::shared_ptr<arrow::Buffer> inner_buffer = chunk->data()->buffers[1];
const size_t chunk_length = list_chunk.value_length(array_idx);
for (size_t offset_i = 0; offset_i != chunk_length; ++offset_i)
{
if (!chunk->IsNull(offset_i) && inner_buffer)
{
if (is_column_array_nullable && column_nullable)
{
column_nullable->getNullMapData().push_back(0);
}
}
else
{
if (is_column_array_nullable && column_nullable)
{
column_nullable->getNullMapData().push_back(1);
}
}
}
column_array_offsets.emplace_back(column_array_offsets.back() + chunk->length());
}
}
}
/// Inserts chars and offsets right into internal column data to reduce overhead.
/// Internal offsets are shifted by one to the right in comparison with Arrow ones. So the last offset should map to the end of all chars.
/// Also internal strings are null terminated.
static void fillColumnWithStringData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
{
PaddedPODArray<UInt8> & column_chars_t = assert_cast<ColumnString &>(*internal_column).getChars();
PaddedPODArray<UInt64> & column_offsets = assert_cast<ColumnString &>(*internal_column).getOffsets();
PaddedPODArray<UInt8> & column_chars_t = assert_cast<ColumnString &>(internal_column).getChars();
PaddedPODArray<UInt64> & column_offsets = assert_cast<ColumnString &>(internal_column).getOffsets();
size_t chars_t_size = 0;
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@ -206,83 +117,9 @@ namespace DB
}
}
static void fillColumnWithArrayStringData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
static void fillColumnWithBooleanData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
{
ColumnArray & column_array = assert_cast<ColumnArray &>(*internal_column);
ColumnArray::Offsets & column_array_offsets = column_array.getOffsets();
const bool is_column_array_nullable = column_array.getData().isNullable();
ColumnNullable * column_nullable = is_column_array_nullable ? static_cast<ColumnNullable *>(&column_array.getData()) : nullptr;
IColumn & array_nested_column =
is_column_array_nullable ? static_cast<ColumnNullable &>(column_array.getData()).getNestedColumn() :
column_array.getData();
ColumnString & nested_column = static_cast<ColumnString &>(array_nested_column);
PaddedPODArray<UInt8> & nested_column_chars = nested_column.getChars();
PaddedPODArray<UInt64> & nested_column_offsets = nested_column.getOffsets();
size_t chars_t_size = 0;
size_t number_size = 0;
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
{
arrow::ListArray & chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
for (int64_t array_idx = 0; array_idx != chunk.length(); ++array_idx)
{
const std::shared_ptr<arrow::Array> array = chunk.value_slice(array_idx);
arrow::BinaryArray & binary_array = static_cast<arrow::BinaryArray &>(*(array));
const size_t binary_array_length = binary_array.length();
chars_t_size += binary_array.value_offset(binary_array_length - 1) + binary_array.value_length(binary_array_length - 1);
chars_t_size += binary_array_length; /// additional space for null bytes
number_size += binary_array_length;
}
}
column_array.reserve(arrow_column->length());
nested_column_chars.reserve(chars_t_size);
nested_column_offsets.reserve(number_size);
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
{
arrow::ListArray & list_chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx)
{
const std::shared_ptr<arrow::Array> array = list_chunk.value_slice(array_idx);
arrow::BinaryArray & chunk = static_cast<arrow::BinaryArray &>(*(array));
std::shared_ptr<arrow::Buffer> buffer = chunk.value_data();
const size_t chunk_length = chunk.length();
for (size_t offset_i = 0; offset_i != chunk_length; ++offset_i)
{
if (!chunk.IsNull(offset_i) && buffer)
{
const auto * raw_data = buffer->data() + chunk.value_offset(offset_i);
nested_column_chars.insert_assume_reserved(raw_data, raw_data + chunk.value_length(offset_i));
if (is_column_array_nullable && column_nullable)
{
column_nullable->getNullMapData().push_back(0);
}
}
else
{
if (is_column_array_nullable && column_nullable)
{
column_nullable->getNullMapData().push_back(1);
}
}
nested_column_chars.emplace_back('\0');
nested_column_offsets.emplace_back(nested_column_chars.size());
}
column_array_offsets.emplace_back(column_array_offsets.back() + chunk_length);
}
}
}
static void fillColumnWithBooleanData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
{
auto & column_data = assert_cast<ColumnVector<UInt8> &>(*internal_column).getData();
auto & column_data = assert_cast<ColumnVector<UInt8> &>(internal_column).getData();
column_data.reserve(arrow_column->length());
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@ -296,58 +133,10 @@ namespace DB
}
}
/// Fills an Array column whose nested column is ColumnVector<UInt8> (optionally wrapped in Nullable)
/// from an Arrow list column with boolean values.
/// For every chunk: the boolean values are appended to the nested data, the null map is appended
/// when the nested column is Nullable, and one array offset is appended per list.
static void fillColumnWithArrayBooleanData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
{
    ColumnArray & column_array = assert_cast<ColumnArray &>(*internal_column);
    ColumnArray::Offsets & column_array_offsets = column_array.getOffsets();

    /// column_nullable is non-null exactly when the array elements are Nullable.
    const bool is_column_array_nullable = column_array.getData().isNullable();
    ColumnNullable * column_nullable = is_column_array_nullable ? static_cast<ColumnNullable *>(&column_array.getData()) : nullptr;

    IColumn & array_nested_column = column_nullable ? column_nullable->getNestedColumn() : column_array.getData();
    ColumnVector<UInt8> & nested_column = assert_cast<ColumnVector<UInt8> &>(array_nested_column);
    auto & nested_column_data = nested_column.getData();

    reserveArrayColumn(arrow_column, column_array, nested_column);

    for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
    {
        arrow::ListArray & list_chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
        const std::shared_ptr<arrow::Array> list_array = list_chunk.values();
        auto & chunk = static_cast<arrow::BooleanArray &>(*(list_array));
        const size_t values_count = static_cast<size_t>(list_array->length());

        for (size_t bool_i = 0; bool_i != values_count; ++bool_i)
            nested_column_data.emplace_back(chunk.Value(bool_i));

        if (column_nullable)
        {
            auto & null_map = column_nullable->getNullMapData();
            for (size_t bool_i = 0; bool_i != values_count; ++bool_i)
                null_map.push_back(chunk.IsNull(bool_i));
        }

        /// Array offsets are cumulative: the previous end plus the length of the current list.
        /// value_offset() is the start position of a list inside the values array, not its size,
        /// so it must not be used as the increment here.
        /// NOTE(review): like the original code, this relies on back() yielding 0 while the offsets are still empty — confirm.
        for (int64_t array_idx = 0, num_rows = list_chunk.length(); array_idx != num_rows; ++array_idx)
            column_array_offsets.emplace_back(column_array_offsets.back() + list_chunk.value_length(array_idx));
    }
}
/// Arrow stores Parquet::DATE in Int32, while ClickHouse stores Date in UInt16. Therefore, it should be checked before saving
static void fillColumnWithDate32Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
static void fillColumnWithDate32Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
{
PaddedPODArray<UInt16> & column_data = assert_cast<ColumnVector<UInt16> &>(*internal_column).getData();
PaddedPODArray<UInt16> & column_data = assert_cast<ColumnVector<UInt16> &>(internal_column).getData();
column_data.reserve(arrow_column->length());
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@ -360,7 +149,7 @@ namespace DB
if (days_num > DATE_LUT_MAX_DAY_NUM)
{
// TODO: will it rollback correctly?
throw Exception{"Input value " + std::to_string(days_num) + " of a column \"" + internal_column->getName()
throw Exception{"Input value " + std::to_string(days_num) + " of a column \"" + internal_column.getName()
+ "\" is greater than "
"max allowed Date value, which is "
+ std::to_string(DATE_LUT_MAX_DAY_NUM),
@ -372,69 +161,10 @@ namespace DB
}
}
/// Fills an Array column whose nested column is ColumnVector<UInt16> (optionally wrapped in Nullable)
/// from an Arrow list column with Date32 values.
/// Every day number is checked against DATE_LUT_MAX_DAY_NUM before it is stored;
/// a greater value throws an Exception with code VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE.
static void fillColumnWithArrayDate32Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
{
    ColumnArray & array_column = assert_cast<ColumnArray &>(*internal_column);
    ColumnArray::Offsets & column_array_offsets = array_column.getOffsets();

    /// column_nullable is non-null exactly when the array elements are Nullable.
    const bool is_column_array_nullable = array_column.getData().isNullable();
    ColumnNullable * column_nullable = is_column_array_nullable ? static_cast<ColumnNullable *>(&array_column.getData()) : nullptr;

    IColumn & array_nested_column = column_nullable ? column_nullable->getNestedColumn() : array_column.getData();
    ColumnVector<UInt16> & nested_column = assert_cast<ColumnVector<UInt16> &>(array_nested_column);
    auto & nested_column_data = nested_column.getData();

    reserveArrayColumn(arrow_column, array_column, nested_column);

    for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
    {
        arrow::ListArray & list_chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
        const std::shared_ptr<arrow::Array> list_array = list_chunk.values();
        auto & chunk = static_cast<arrow::Date32Array &>(*(list_array));

        for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
        {
            UInt32 days_num = static_cast<UInt32>(chunk.Value(value_i));
            if (days_num > DATE_LUT_MAX_DAY_NUM)
            {
                // TODO: will it rollback correctly?
                throw Exception{
                    "Input value " + std::to_string(days_num) + " of a column \"" + internal_column->getName()
                        + "\" is greater than "
                          "max allowed Date value, which is "
                        + std::to_string(DATE_LUT_MAX_DAY_NUM),
                    ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE};
            }

            nested_column_data.emplace_back(days_num);
        }

        if (column_nullable)
        {
            auto & null_map = column_nullable->getNullMapData();
            for (size_t value_i = 0, length = static_cast<size_t>(list_array->length()); value_i != length; ++value_i)
                null_map.push_back(chunk.IsNull(value_i));
        }

        /// Array offsets are cumulative: the previous end plus the length of the current list.
        /// value_offset() is the start position of a list inside the values array, not its size,
        /// so it must not be used as the increment here.
        /// NOTE(review): like the original code, this relies on back() yielding 0 while the offsets are still empty — confirm.
        for (int64_t array_idx = 0, num_rows = list_chunk.length(); array_idx != num_rows; ++array_idx)
            column_array_offsets.emplace_back(column_array_offsets.back() + list_chunk.value_length(array_idx));
    }
}
/// Arrow stores Parquet::DATETIME in Int64, while ClickHouse stores DateTime in UInt32. Therefore, it should be checked before saving
static void fillColumnWithDate64Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
static void fillColumnWithDate64Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
{
auto & column_data = assert_cast<ColumnVector<UInt32> &>(*internal_column).getData();
auto & column_data = assert_cast<ColumnVector<UInt32> &>(internal_column).getData();
column_data.reserve(arrow_column->length());
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@ -448,58 +178,9 @@ namespace DB
}
}
static void fillColumnWithArrayDate64Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
static void fillColumnWithTimestampData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
{
ColumnArray & array_column = assert_cast<ColumnArray &>(*internal_column);
ColumnArray::Offsets & column_array_offsets = array_column.getOffsets();
const bool is_column_array_nullable = array_column.getData().isNullable();
ColumnNullable * column_nullable = is_column_array_nullable ? static_cast<ColumnNullable *>(&array_column.getData()) : nullptr;
IColumn & array_nested_column =
is_column_array_nullable ? static_cast<ColumnNullable &>(array_column.getData()).getNestedColumn() :
array_column.getData();
ColumnVector<UInt32> & nested_column = assert_cast<ColumnVector<UInt32> &>(array_nested_column);
auto & nested_column_data = nested_column.getData();
reserveArrayColumn(arrow_column, array_column, nested_column);
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
{
arrow::ListArray & list_chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
const std::shared_ptr<arrow::Array> list_array = list_chunk.values();
auto & chunk = static_cast<arrow::Date64Array &>(*(list_array));
for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
{
auto timestamp = static_cast<UInt32>(chunk.Value(value_i) / 1000); // Always? in ms
nested_column_data.emplace_back(timestamp);
}
if (is_column_array_nullable && column_nullable)
{
for (size_t bool_i = 0; bool_i != static_cast<size_t>(list_array->length()); ++bool_i)
{
if (!chunk.IsNull(bool_i))
{
column_nullable->getNullMapData().push_back(0);
}
else
{
column_nullable->getNullMapData().push_back(1);
}
}
}
for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx)
{
column_array_offsets.emplace_back(column_array_offsets.back() + list_chunk.value_offset(array_idx));
}
}
}
static void fillColumnWithTimestampData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
{
auto & column_data = assert_cast<ColumnVector<UInt32> &>(*internal_column).getData();
auto & column_data = assert_cast<ColumnVector<UInt32> &>(internal_column).getData();
column_data.reserve(arrow_column->length());
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
@ -533,77 +214,9 @@ namespace DB
}
}
static void fillColumnWithArrayTimestampData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
static void fillColumnWithDecimalData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column)
{
ColumnArray & column_array = assert_cast<ColumnArray &>(*internal_column);
ColumnArray::Offsets & column_array_offsets = column_array.getOffsets();
const bool is_column_array_nullable = column_array.getData().isNullable();
ColumnNullable * column_nullable = is_column_array_nullable ? static_cast<ColumnNullable *>(&column_array.getData()) : nullptr;
IColumn & array_nested_column =
is_column_array_nullable ? static_cast<ColumnNullable &>(column_array.getData()).getNestedColumn() :
column_array.getData();
ColumnVector<UInt32> & nested_column = assert_cast<ColumnVector<UInt32> &>(array_nested_column);
auto & nested_column_data = nested_column.getData();
reserveArrayColumn(arrow_column, column_array, nested_column);
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
{
arrow::ListArray & list_chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
const auto & type = static_cast<const ::arrow::TimestampType &>(*list_chunk.type());
const std::shared_ptr<arrow::Array> list_array = list_chunk.values();
auto & chunk = static_cast<arrow::TimestampArray &>(*(list_array));
UInt32 divide = 1;
const auto unit = type.unit();
switch (unit)
{
case arrow::TimeUnit::SECOND:
divide = 1;
break;
case arrow::TimeUnit::MILLI:
divide = 1000;
break;
case arrow::TimeUnit::MICRO:
divide = 1000000;
break;
case arrow::TimeUnit::NANO:
divide = 1000000000;
break;
}
for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
{
auto timestamp = static_cast<UInt32>(chunk.Value(value_i) / divide); // ms! TODO: check other 's' 'ns' ...
nested_column_data.emplace_back(timestamp);
}
if (is_column_array_nullable && column_nullable)
{
for (size_t bool_i = 0; bool_i != static_cast<size_t>(list_array->length()); ++bool_i)
{
if (!chunk.IsNull(bool_i))
{
column_nullable->getNullMapData().push_back(0);
}
else
{
column_nullable->getNullMapData().push_back(1);
}
}
}
for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx)
{
column_array_offsets.emplace_back(column_array_offsets.back() + list_chunk.value_offset(array_idx));
}
}
}
static void fillColumnWithDecimalData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
{
auto & column = assert_cast<ColumnDecimal<Decimal128> &>(*internal_column);
auto & column = assert_cast<ColumnDecimal<Decimal128> &>(internal_column);
auto & column_data = column.getData();
column_data.reserve(arrow_column->length());
@ -617,54 +230,155 @@ namespace DB
}
}
static void fillColumnWithArrayDecimalData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, MutableColumnPtr & internal_column)
/// Creates a null bytemap from arrow's null bitmap
static void fillByteMapFromArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & bytemap)
{
ColumnArray & array_column = assert_cast<ColumnArray &>(*internal_column);
ColumnArray::Offsets & column_array_offsets = array_column.getOffsets();
PaddedPODArray<UInt8> & bytemap_data = assert_cast<ColumnVector<UInt8> &>(bytemap).getData();
bytemap_data.reserve(arrow_column->length());
const bool is_column_array_nullable = array_column.getData().isNullable();
ColumnNullable * column_nullable = is_column_array_nullable ? static_cast<ColumnNullable *>(&array_column.getData()) : nullptr;
for (size_t chunk_i = 0; chunk_i != static_cast<size_t>(arrow_column->num_chunks()); ++chunk_i)
{
std::shared_ptr<arrow::Array> chunk = arrow_column->chunk(chunk_i);
IColumn & array_nested_column =
is_column_array_nullable ? static_cast<ColumnNullable &>(array_column.getData()).getNestedColumn() :
array_column.getData();
ColumnDecimal<Decimal128> & nested_column = assert_cast<ColumnDecimal<Decimal128> &>(array_nested_column);
auto & nested_column_data = nested_column.getData();
for (size_t value_i = 0; value_i != static_cast<size_t>(chunk->length()); ++value_i)
bytemap_data.emplace_back(chunk->IsNull(value_i));
}
}
reserveArrayColumn(arrow_column, array_column, nested_column);
static void fillOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & offsets)
{
ColumnArray::Offsets & offsets_data = assert_cast<ColumnVector<UInt64> &>(offsets).getData();
offsets_data.reserve(arrow_column->length());
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
{
arrow::ListArray & list_chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
const std::shared_ptr<arrow::Array> list_array = list_chunk.values();
auto & chunk = static_cast<arrow::DecimalArray &>(*(list_array));
for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
{
nested_column_data.emplace_back(*reinterpret_cast<const Decimal128 *>(chunk.Value(value_i))); // TODO: copy column
}
if (is_column_array_nullable && column_nullable)
{
for (size_t bool_i = 0; bool_i != static_cast<size_t>(list_array->length()); ++bool_i)
{
if (!chunk.IsNull(bool_i))
{
column_nullable->getNullMapData().push_back(0);
}
else
{
column_nullable->getNullMapData().push_back(1);
}
}
}
for (int64_t array_idx = 0; array_idx != list_chunk.length(); ++array_idx)
{
column_array_offsets.emplace_back(column_array_offsets.back() + list_chunk.value_offset(array_idx));
}
auto arrow_offsets_array = list_chunk.offsets();
auto & arrow_offsets = static_cast<arrow::Int32Array &>(*arrow_offsets_array);
auto start = offsets_data.back();
for (int64_t i = 1; i < arrow_offsets.length(); ++i)
offsets_data.emplace_back(start + arrow_offsets.Value(i));
}
}
/// Recursively reads the data of an Arrow column into an already created internal column.
///  - Nullable column: the nested column is filled first (with is_nullable = true), then the null map
///    is filled from the Arrow column.
///  - LIST: the values of every list chunk are gathered into one chunked array which is read into
///    the nested data column; the array offsets are filled afterwards.
///  - any other supported Arrow type is dispatched to the matching fillColumnWith*Data function.
/// column_name and format_name are only used in exception messages.
/// is_nullable tells that NULLs in arrow_column are acceptable for internal_column.
/// Throws CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN when a non-array column that is not allowed to hold NULLs
/// receives them, and UNKNOWN_TYPE for an Arrow type without a reader.
static void readColumnFromArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column, IColumn & internal_column, const std::string & column_name, const std::string & format_name, bool is_nullable)
{
    if (internal_column.isNullable())
    {
        ColumnNullable & column_nullable = typeid_cast<ColumnNullable &>(internal_column);
        readColumnFromArrowColumn(arrow_column, column_nullable.getNestedColumn(), column_name, format_name, true);
        fillByteMapFromArrowColumn(arrow_column, column_nullable.getNullMapColumn());
        return;
    }

    // TODO: check if a column is const?
    if (!is_nullable && !isColumnArray(internal_column) && arrow_column->null_count())
    {
        throw Exception{
            "Can not insert NULL data into non-nullable column \"" + column_name + "\"",
            ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN};
    }

    switch (arrow_column->type()->id())
    {
        case arrow::Type::STRING:
        case arrow::Type::BINARY:
            //case arrow::Type::FIXED_SIZE_BINARY:
            fillColumnWithStringData(arrow_column, internal_column);
            break;
        case arrow::Type::BOOL:
            fillColumnWithBooleanData(arrow_column, internal_column);
            break;
        case arrow::Type::DATE32:
            fillColumnWithDate32Data(arrow_column, internal_column);
            break;
        case arrow::Type::DATE64:
            fillColumnWithDate64Data(arrow_column, internal_column);
            break;
        case arrow::Type::TIMESTAMP:
            fillColumnWithTimestampData(arrow_column, internal_column);
            break;
        case arrow::Type::DECIMAL:
            //fillColumnWithNumericData<Decimal128, ColumnDecimal<Decimal128>>(arrow_column, read_column); // Have problems with trash values under NULL, but faster
            fillColumnWithDecimalData(arrow_column, internal_column /*, internal_nested_type*/);
            break;
        case arrow::Type::LIST:
        {
            const auto * list_type = static_cast<arrow::ListType *>(arrow_column->type().get());
            auto list_nested_type = list_type->value_type();

            arrow::ArrayVector array_vector;
            array_vector.reserve(arrow_column->num_chunks());
            for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
            {
                arrow::ListArray & list_chunk = static_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
                std::shared_ptr<arrow::Array> chunk = list_chunk.values();
                array_vector.emplace_back(std::move(chunk));
            }

            /// The element type is passed explicitly: a ChunkedArray built from the chunk vector alone
            /// has to take its type from the first chunk, which does not exist when there are no chunks.
            auto arrow_nested_column = std::make_shared<arrow::ChunkedArray>(std::move(array_vector), std::move(list_nested_type));

            ColumnArray & column_array = typeid_cast<ColumnArray &>(internal_column);
            readColumnFromArrowColumn(arrow_nested_column, column_array.getData(), column_name, format_name, false);
            fillOffsetsFromArrowListColumn(arrow_column, column_array.getOffsetsColumn());
            break;
        }
#    define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
        case ARROW_NUMERIC_TYPE: \
            fillColumnWithNumericData<CPP_NUMERIC_TYPE>(arrow_column, internal_column); \
            break;

            FOR_ARROW_NUMERIC_TYPES(DISPATCH)
#    undef DISPATCH
            // TODO: support TIMESTAMP_MICROS and TIMESTAMP_MILLIS with truncated micro- and milliseconds?
            // TODO: read JSON as a string?
            // TODO: read UUID as a string?
        default:
            throw Exception
                {
                    "Unsupported " + format_name + " type \"" + arrow_column->type()->name() + "\" of an input column \""
                    + column_name + "\"",
                    ErrorCodes::UNKNOWN_TYPE
                };
    }
}
/// Derives the ClickHouse type that a column read from Arrow data of type arrow_type will have.
/// column_type is the type requested by the header; it decides where Nullable wrappers are placed
/// and must be an Array type wherever the Arrow type is LIST.
/// column_name and format_name are only used in exception messages.
/// Throws CANNOT_CONVERT_TYPE when LIST meets a non-Array column type or when the Arrow type has no mapping.
static DataTypePtr getInternalType(std::shared_ptr<arrow::DataType> arrow_type, const DataTypePtr & column_type, const std::string & column_name, const std::string & format_name)
{
    /// Peel the Nullable wrapper, resolve what is inside and wrap the result back.
    if (column_type->isNullable())
    {
        const auto * nullable_type = typeid_cast<const DataTypeNullable *>(column_type.get());
        DataTypePtr unwrapped_type = nullable_type->getNestedType();
        return makeNullable(getInternalType(arrow_type, unwrapped_type, column_name, format_name));
    }

    const auto arrow_type_id = arrow_type->id();

    /// Decimals carry their own precision and scale.
    if (arrow_type_id == arrow::Type::DECIMAL)
    {
        const auto & arrow_decimal = static_cast<const arrow::DecimalType &>(*arrow_type);
        return std::make_shared<DataTypeDecimal<Decimal128>>(arrow_decimal.precision(), arrow_decimal.scale());
    }

    /// Lists are resolved recursively through their element type.
    if (arrow_type_id == arrow::Type::LIST)
    {
        const auto & arrow_list = static_cast<const arrow::ListType &>(*arrow_type);
        auto arrow_element_type = arrow_list.value_type();

        const auto * target_array_type = typeid_cast<const DataTypeArray *>(column_type.get());
        if (target_array_type == nullptr)
            throw Exception{"Cannot convert arrow LIST type to a not Array ClickHouse type " + column_type->getName(), ErrorCodes::CANNOT_CONVERT_TYPE};

        auto element_type = getInternalType(arrow_element_type, target_array_type->getNestedType(), column_name, format_name);
        return std::make_shared<DataTypeArray>(element_type);
    }

    /// Everything else goes through the static arrow type -> internal type name table.
    for (const auto & mapping : arrow_type_to_internal_type)
    {
        if (mapping.first == arrow_type_id)
            return DataTypeFactory::instance().get(mapping.second);
    }

    throw Exception{
        "The type \"" + arrow_type->name() + "\" of an input column \"" + column_name + "\" is not supported for conversion from a "
            + format_name + " data format",
        ErrorCodes::CANNOT_CONVERT_TYPE};
}
void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table,
const Block & header, std::string format_name)
{
@ -693,166 +407,16 @@ namespace DB
ErrorCodes::THERE_IS_NO_COLUMN};
std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[header_column.name];
arrow::Type::type arrow_type = arrow_column->type()->id();
std::shared_ptr<arrow::DataType> list_nested_type;
// TODO: check if a column is const?
if (!column_type->isNullable() && arrow_column->null_count())
{
throw Exception{"Can not insert NULL data into non-nullable column \"" + header_column.name + "\"",
ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN};
}
DataTypePtr internal_type = getInternalType(arrow_column->type(), column_type, header_column.name, format_name);
const bool target_column_is_nullable = column_type->isNullable() || arrow_column->null_count();
DataTypePtr internal_nested_type;
DataTypePtr array_nested_type;
if (arrow_type == arrow::Type::DECIMAL)
{
const auto * decimal_type = static_cast<arrow::DecimalType *>(arrow_column->type().get());
internal_nested_type = std::make_shared<DataTypeDecimal<Decimal128>>(decimal_type->precision(),
decimal_type->scale());
}
else if (arrow_type == arrow::Type::LIST)
{
const auto * list_type = static_cast<arrow::ListType *>(arrow_column->type().get());
list_nested_type = list_type->value_type();
const auto * column_array_type = static_cast<const DataTypeArray *>(column_type.get());
const bool is_column_array_nullable = column_array_type->getNestedType()->isNullable();
if (const auto * internal_type_it = std::find_if(arrow_type_to_internal_type.begin(), arrow_type_to_internal_type.end(),
[=](auto && elem) { return elem.first == list_nested_type->id(); });
internal_type_it != arrow_type_to_internal_type.end())
{
DataTypePtr array_instance_type = DataTypeFactory::instance().get(internal_type_it->second);
array_nested_type = is_column_array_nullable ? makeNullable(array_instance_type)
: array_instance_type;
internal_nested_type = std::make_shared<DataTypeArray>(array_nested_type);
}
else
{
throw Exception{"The internal type \"" + list_type->value_type()->name() + "\" of an array column \"" + header_column.name
+ "\" is not supported for conversion from a " + format_name + " data format",
ErrorCodes::CANNOT_CONVERT_TYPE};
}
}
else if (const auto * internal_type_it = std::find_if(arrow_type_to_internal_type.begin(), arrow_type_to_internal_type.end(),
[=](auto && elem) { return elem.first == arrow_type; });
internal_type_it != arrow_type_to_internal_type.end())
{
internal_nested_type = DataTypeFactory::instance().get(internal_type_it->second);
}
else
{
throw Exception{"The type \"" + arrow_column->type()->name() + "\" of an input column \"" + header_column.name
+ "\" is not supported for conversion from a " + format_name + " data format",
ErrorCodes::CANNOT_CONVERT_TYPE};
}
const DataTypePtr internal_type = target_column_is_nullable ? makeNullable(internal_nested_type)
: internal_nested_type;
MutableColumnPtr read_column = internal_type->createColumn();
readColumnFromArrowColumn(arrow_column, *read_column, header_column.name, format_name, false);
ColumnWithTypeAndName column;
column.name = header_column.name;
column.type = internal_type;
/// Data
MutableColumnPtr read_column = internal_nested_type->createColumn();
switch (arrow_type)
{
case arrow::Type::STRING:
case arrow::Type::BINARY:
//case arrow::Type::FIXED_SIZE_BINARY:
fillColumnWithStringData(arrow_column, read_column);
break;
case arrow::Type::BOOL:
fillColumnWithBooleanData(arrow_column, read_column);
break;
case arrow::Type::DATE32:
fillColumnWithDate32Data(arrow_column, read_column);
break;
case arrow::Type::DATE64:
fillColumnWithDate64Data(arrow_column, read_column);
break;
case arrow::Type::TIMESTAMP:
fillColumnWithTimestampData(arrow_column, read_column);
break;
case arrow::Type::DECIMAL:
//fillColumnWithNumericData<Decimal128, ColumnDecimal<Decimal128>>(arrow_column, read_column); // Have problems with trash values under NULL, but faster
fillColumnWithDecimalData(arrow_column, read_column /*, internal_nested_type*/);
break;
case arrow::Type::LIST:
if (array_nested_type && list_nested_type) {
switch (list_nested_type->id())
{
case arrow::Type::STRING:
case arrow::Type::BINARY:
//case arrow::Type::FIXED_SIZE_BINARY:
fillColumnWithArrayStringData(arrow_column, read_column);
break;
case arrow::Type::BOOL:
fillColumnWithArrayBooleanData(arrow_column, read_column);
break;
case arrow::Type::DATE32:
fillColumnWithArrayDate32Data(arrow_column, read_column);
break;
case arrow::Type::DATE64:
fillColumnWithArrayDate64Data(arrow_column, read_column);
break;
case arrow::Type::TIMESTAMP:
fillColumnWithArrayTimestampData(arrow_column, read_column);
break;
case arrow::Type::DECIMAL:
//fillColumnWithNumericData<Decimal128, ColumnDecimal<Decimal128>>(arrow_column, read_column); // Have problems with trash values under NULL, but faster
fillColumnWithArrayDecimalData(arrow_column, read_column /*, internal_nested_type*/);
break;
# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
case ARROW_NUMERIC_TYPE: \
fillColumnWithArrayNumericData<CPP_NUMERIC_TYPE>(arrow_column, read_column); \
break;
FOR_ARROW_NUMERIC_TYPES(DISPATCH)
# undef DISPATCH
default:
throw Exception
{
"Unsupported " + format_name + " type \"" + arrow_column->type()->name() + "\" of an input column \""
+ header_column.name + "\"",
ErrorCodes::UNKNOWN_TYPE
};
}
}
break;
# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
case ARROW_NUMERIC_TYPE: \
fillColumnWithNumericData<CPP_NUMERIC_TYPE>(arrow_column, read_column); \
break;
FOR_ARROW_NUMERIC_TYPES(DISPATCH)
# undef DISPATCH
// TODO: support TIMESTAMP_MICROS and TIMESTAMP_MILLIS with truncated micro- and milliseconds?
// TODO: read JSON as a string?
// TODO: read UUID as a string?
default:
throw Exception
{
"Unsupported " + format_name + " type \"" + arrow_column->type()->name() + "\" of an input column \""
+ header_column.name + "\"",
ErrorCodes::UNKNOWN_TYPE
};
}
if (column.type->isNullable())
{
MutableColumnPtr null_bytemap = DataTypeUInt8().createColumn();
fillByteMapFromArrowColumn(arrow_column, null_bytemap);
column.column = ColumnNullable::create(std::move(read_column), std::move(null_bytemap));
}
else
column.column = std::move(read_column);
column.column = std::move(read_column);
column.column = castColumn(column, header_column.type);
column.type = header_column.type;

View File

@ -18,6 +18,8 @@
#include <arrow/util/decimal.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <common/logger_useful.h>
namespace DB
{
namespace ErrorCodes
@ -50,12 +52,6 @@ namespace DB
{"FixedString", arrow::utf8()},
};
/// Returns a pointer to the null byte map of a Nullable column.
/// `column` is cast to ColumnNullable with assert_cast, so the caller must pass a Nullable column.
/// The returned pointer refers to data held by the column's null map column.
static const PaddedPODArray<UInt8> * extractNullBytemapPtr(ColumnPtr column)
{
    const auto & nullable_column = assert_cast<const ColumnNullable &>(*column);
    const ColumnPtr & null_map_column = nullable_column.getNullMapColumnPtr();
    return &assert_cast<const ColumnVector<UInt8> &>(*null_map_column).getData();
}
static void checkStatus(const arrow::Status & status, const String & column_name, const String & format_name)
{
@ -68,10 +64,12 @@ namespace DB
ColumnPtr write_column,
const PaddedPODArray<UInt8> * null_bytemap,
const String & format_name,
arrow::ArrayBuilder* abuilder)
arrow::ArrayBuilder* array_builder,
size_t start,
size_t end)
{
const PaddedPODArray<NumericType> & internal_data = assert_cast<const ColumnVector<NumericType> &>(*write_column).getData();
ArrowBuilderType & builder = assert_cast<ArrowBuilderType &>(*abuilder);
ArrowBuilderType & builder = assert_cast<ArrowBuilderType &>(*array_builder);
arrow::Status status;
const UInt8 * arrow_null_bytemap_raw_ptr = nullptr;
@ -79,98 +77,48 @@ namespace DB
if (null_bytemap)
{
/// Invert values since Arrow interprets 1 as a non-null value, while CH as a null
arrow_null_bytemap.reserve(null_bytemap->size());
for (auto is_null : *null_bytemap)
arrow_null_bytemap.emplace_back(!is_null);
arrow_null_bytemap.reserve(end - start);
for (size_t i = start; i < end; ++i)
arrow_null_bytemap.template emplace_back(!(*null_bytemap)[i]);
arrow_null_bytemap_raw_ptr = arrow_null_bytemap.data();
}
if constexpr (std::is_same_v<NumericType, UInt8>)
status = builder.AppendValues(
reinterpret_cast<const uint8_t *>(internal_data.data()),
internal_data.size(),
reinterpret_cast<const uint8_t *>(internal_data.data() + start),
end - start,
reinterpret_cast<const uint8_t *>(arrow_null_bytemap_raw_ptr));
else
status = builder.AppendValues(internal_data.data(), internal_data.size(), reinterpret_cast<const uint8_t *>(arrow_null_bytemap_raw_ptr));
status = builder.AppendValues(internal_data.data() + start, end - start, reinterpret_cast<const uint8_t *>(arrow_null_bytemap_raw_ptr));
checkStatus(status, write_column->getName(), format_name);
}
static void fillArrowArrayWithArrayColumnData(
const String & column_name,
ColumnPtr & nested_column,
ColumnPtr & column,
const std::shared_ptr<const IDataType> & column_type,
std::shared_ptr<arrow::Array> arrow_array,
const PaddedPODArray<UInt8> * null_bytemap,
arrow::ArrayBuilder * array_builder,
String format_name)
String format_name,
size_t start,
size_t end)
{
const auto * column_array = static_cast<const ColumnArray *>(nested_column.get());
const bool is_column_array_nullable = column_array->getData().isNullable();
const IColumn & array_nested_column =
is_column_array_nullable ? static_cast<const ColumnNullable &>(column_array->getData()).getNestedColumn() :
column_array->getData();
const String column_array_nested_type_name = array_nested_column.getFamilyName();
const auto * column_array = static_cast<const ColumnArray *>(column.get());
ColumnPtr nested_column = column_array->getDataPtr();
DataTypePtr nested_type = typeid_cast<const DataTypeArray *>(column_type.get())->getNestedType();
const auto & offsets = column_array->getOffsets();
const auto * column_array_type = static_cast<const DataTypeArray *>(column_type.get());
const DataTypePtr & array_nested_type =
is_column_array_nullable ? static_cast<const DataTypeNullable *>(column_array_type->getNestedType().get())->getNestedType() :
column_array_type->getNestedType();
arrow::ListBuilder & builder = assert_cast<arrow::ListBuilder &>(*array_builder);
arrow::ArrayBuilder * value_builder = builder.value_builder();
arrow::Status components_status;
const PaddedPODArray<UInt8> * array_null_bytemap =
is_column_array_nullable ? extractNullBytemapPtr(assert_cast<const ColumnArray &>(*nested_column).getDataPtr()) : nullptr;
const auto * arrow_type_it = std::find_if(internal_type_to_arrow_type.begin(), internal_type_to_arrow_type.end(),
[=](auto && elem) { return elem.first == column_array_nested_type_name; });
if (arrow_type_it != internal_type_to_arrow_type.end())
for (size_t array_idx = start; array_idx < end; ++array_idx)
{
std::shared_ptr<arrow::DataType> list_type = arrow::list(arrow_type_it->second);
const auto & internal_column = assert_cast<const ColumnArray &>(*nested_column);
arrow::ListBuilder & builder = assert_cast<arrow::ListBuilder &>(*array_builder);
arrow::ArrayBuilder * value_builder = builder.value_builder();
arrow::Status components_status;
const auto & offsets = internal_column.getOffsets();
ColumnPtr & data = is_column_array_nullable ?
const_cast<ColumnPtr &>(static_cast<const ColumnNullable &>(internal_column.getData()).getNestedColumnPtr()) :
const_cast<ColumnPtr &>(internal_column.getDataPtr());
size_t array_start = 0;
size_t array_length = 0;
for (size_t idx = 0, size = internal_column.size(); idx < size; ++idx)
{
if (null_bytemap && (*null_bytemap)[idx])
{
components_status = builder.AppendNull();
checkStatus(components_status, nested_column->getName(), format_name);
}
else
{
components_status = builder.Append();
checkStatus(components_status, nested_column->getName(), format_name);
array_length = offsets[idx] - array_start;
auto cut_data = data->cut(array_start, array_length);
if (array_null_bytemap == nullptr)
{
CHColumnToArrowColumn::fillArrowArray(column_name, cut_data, array_nested_type,
column_array_nested_type_name, arrow_array,
nullptr, value_builder, format_name);
}
else
{
PaddedPODArray<UInt8> array_nested_null_bytemap;
array_nested_null_bytemap.insertByOffsets(*array_null_bytemap, array_start, array_start + array_length);
CHColumnToArrowColumn::fillArrowArray(column_name, cut_data, array_nested_type,
column_array_nested_type_name, arrow_array,
&array_nested_null_bytemap, value_builder, format_name);
}
array_start = offsets[idx];
}
}
/// Start new array
components_status = builder.Append();
checkStatus(components_status, nested_column->getName(), format_name);
CHColumnToArrowColumn::fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx]);
}
}
@ -179,13 +127,15 @@ namespace DB
ColumnPtr write_column,
const PaddedPODArray<UInt8> * null_bytemap,
const String & format_name,
arrow::ArrayBuilder* abuilder)
arrow::ArrayBuilder* array_builder,
size_t start,
size_t end)
{
const auto & internal_column = assert_cast<const ColumnType &>(*write_column);
arrow::StringBuilder & builder = assert_cast<arrow::StringBuilder &>(*abuilder);
arrow::StringBuilder & builder = assert_cast<arrow::StringBuilder &>(*array_builder);
arrow::Status status;
for (size_t string_i = 0, size = internal_column.size(); string_i < size; ++string_i)
for (size_t string_i = start; string_i < end; ++string_i)
{
if (null_bytemap && (*null_bytemap)[string_i])
{
@ -205,14 +155,16 @@ namespace DB
ColumnPtr write_column,
const PaddedPODArray<UInt8> * null_bytemap,
const String & format_name,
arrow::ArrayBuilder* abuilder)
arrow::ArrayBuilder* array_builder,
size_t start,
size_t end)
{
const PaddedPODArray<UInt16> & internal_data = assert_cast<const ColumnVector<UInt16> &>(*write_column).getData();
//arrow::Date32Builder date_builder;
arrow::UInt16Builder & builder = assert_cast<arrow::UInt16Builder &>(*abuilder);
arrow::UInt16Builder & builder = assert_cast<arrow::UInt16Builder &>(*array_builder);
arrow::Status status;
for (size_t value_i = 0, size = internal_data.size(); value_i < size; ++value_i)
for (size_t value_i = start; value_i < end; ++value_i)
{
if (null_bytemap && (*null_bytemap)[value_i])
status = builder.AppendNull();
@ -227,14 +179,16 @@ namespace DB
ColumnPtr write_column,
const PaddedPODArray<UInt8> * null_bytemap,
const String & format_name,
arrow::ArrayBuilder* abuilder)
arrow::ArrayBuilder* array_builder,
size_t start,
size_t end)
{
const auto & internal_data = assert_cast<const ColumnVector<UInt32> &>(*write_column).getData();
//arrow::Date64Builder builder;
arrow::UInt32Builder & builder = assert_cast<arrow::UInt32Builder &>(*abuilder);
arrow::UInt32Builder & builder = assert_cast<arrow::UInt32Builder &>(*array_builder);
arrow::Status status;
for (size_t value_i = 0, size = internal_data.size(); value_i < size; ++value_i)
for (size_t value_i = start; value_i < end; ++value_i)
{
if (null_bytemap && (*null_bytemap)[value_i])
status = builder.AppendNull();
@ -249,36 +203,46 @@ namespace DB
void CHColumnToArrowColumn::fillArrowArray(
const String & column_name,
ColumnPtr & nested_column,
const std::shared_ptr<const IDataType> & column_nested_type,
const String column_nested_type_name,
std::shared_ptr<arrow::Array> arrow_array,
ColumnPtr & column,
const std::shared_ptr<const IDataType> & column_type,
const PaddedPODArray<UInt8> * null_bytemap,
arrow::ArrayBuilder * array_builder,
String format_name)
String format_name,
size_t start,
size_t end)
{
if ("String" == column_nested_type_name)
const String column_type_name = column_type->getFamilyName();
if ("Nullable" == column_type_name)
{
fillArrowArrayWithStringColumnData<ColumnString>(nested_column, null_bytemap, format_name, array_builder);
const ColumnNullable * column_nullable = checkAndGetColumn<ColumnNullable>(column.get());
ColumnPtr nested_column = column_nullable->getNestedColumnPtr();
DataTypePtr nested_type = typeid_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
ColumnPtr null_column = column_nullable->getNullMapColumnPtr();
const PaddedPODArray<UInt8> & bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData();
fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end);
}
else if ("FixedString" == column_nested_type_name)
else if ("String" == column_type_name)
{
fillArrowArrayWithStringColumnData<ColumnFixedString>(nested_column, null_bytemap, format_name, array_builder);
fillArrowArrayWithStringColumnData<ColumnString>(column, null_bytemap, format_name, array_builder, start, end);
}
else if ("Date" == column_nested_type_name)
else if ("FixedString" == column_type_name)
{
fillArrowArrayWithDateColumnData(nested_column, null_bytemap, format_name, array_builder);
fillArrowArrayWithStringColumnData<ColumnFixedString>(column, null_bytemap, format_name, array_builder, start, end);
}
else if ("DateTime" == column_nested_type_name)
else if ("Date" == column_type_name)
{
fillArrowArrayWithDateTimeColumnData(nested_column, null_bytemap, format_name, array_builder);
fillArrowArrayWithDateColumnData(column, null_bytemap, format_name, array_builder, start, end);
}
else if ("Array" == column_nested_type_name)
else if ("DateTime" == column_type_name)
{
fillArrowArrayWithArrayColumnData(column_name, nested_column, column_nested_type, arrow_array, null_bytemap,
array_builder, format_name);
fillArrowArrayWithDateTimeColumnData(column, null_bytemap, format_name, array_builder, start, end);
}
else if (isDecimal(column_nested_type))
else if ("Array" == column_type_name)
{
fillArrowArrayWithArrayColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end);
}
else if (isDecimal(column_type))
{
auto fill_decimal = [&](const auto & types) -> bool
{
@ -289,24 +253,23 @@ namespace DB
|| std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>>
|| std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
{
const auto & decimal_type = static_cast<const ToDataType *>(column_nested_type.get());
fillArrowArrayWithDecimalColumnData(nested_column, arrow_array, null_bytemap, decimal_type, format_name);
fillArrowArrayWithDecimalColumnData<ToDataType>(column, null_bytemap, array_builder, format_name, start, end);
}
return false;
};
callOnIndexAndDataType<void>(column_nested_type->getTypeId(), fill_decimal);
callOnIndexAndDataType<void>(column_type->getTypeId(), fill_decimal);
}
#define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \
else if (#CPP_NUMERIC_TYPE == column_nested_type_name) \
else if (#CPP_NUMERIC_TYPE == column_type_name) \
{ \
fillArrowArrayWithNumericColumnData<CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE>(nested_column, null_bytemap, format_name, array_builder); \
fillArrowArrayWithNumericColumnData<CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE>(column, null_bytemap, format_name, array_builder, start, end); \
}
FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
else
{
throw Exception{"Internal type \"" + column_nested_type_name + "\" of a column \"" + column_name + "\""
throw Exception{"Internal type \"" + column_type_name + "\" of a column \"" + column_name + "\""
" is not supported for conversion into a " + format_name + " data format",
ErrorCodes::UNKNOWN_TYPE};
}
@ -315,16 +278,17 @@ namespace DB
template <typename DataType>
static void fillArrowArrayWithDecimalColumnData(
ColumnPtr write_column,
std::shared_ptr<arrow::Array> & arrow_array,
const PaddedPODArray<UInt8> * null_bytemap,
const DataType * decimal_type,
const String & format_name)
arrow::ArrayBuilder * array_builder,
const String & format_name,
size_t start,
size_t end)
{
const auto & column = static_cast<const typename DataType::ColumnType &>(*write_column);
arrow::DecimalBuilder builder(arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()));
arrow::DecimalBuilder & builder = assert_cast<arrow::DecimalBuilder &>(*array_builder);
arrow::Status status;
for (size_t value_i = 0, size = column.size(); value_i < size; ++value_i)
for (size_t value_i = start; value_i < end; ++value_i)
{
if (null_bytemap && (*null_bytemap)[value_i])
status = builder.AppendNull();
@ -334,10 +298,65 @@ namespace DB
checkStatus(status, write_column->getName(), format_name);
}
status = builder.Finish(&arrow_array);
checkStatus(status, write_column->getName(), format_name);
}
/// Maps a ClickHouse data type to the corresponding Arrow data type.
///
/// Nullable types are unwrapped and mapped by their nested type, Array types are mapped
/// recursively to arrow::list, Decimal32/64/128 are mapped to arrow::decimal with the same
/// precision and scale, and every other type is looked up by its family name in
/// internal_type_to_arrow_type.
///
/// column_name and format_name are only used in error messages.
/// *is_column_nullable is set to true when a Nullable wrapper is met at any nesting level
/// (including inside arrays); it is never reset to false, so the caller has to initialize it.
///
/// Throws Exception with ErrorCodes::UNKNOWN_TYPE if the type has no Arrow counterpart.
static std::shared_ptr<arrow::DataType> getArrowType(DataTypePtr column_type, const std::string & column_name, const std::string & format_name, bool * is_column_nullable)
{
    if (column_type->isNullable())
    {
        DataTypePtr nested_type = typeid_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
        auto arrow_type = getArrowType(nested_type, column_name, format_name, is_column_nullable);
        *is_column_nullable = true;
        return arrow_type;
    }

    if (isDecimal(column_type))
    {
        std::shared_ptr<arrow::DataType> arrow_type;
        const auto create_arrow_type = [&](const auto & types) -> bool
        {
            using Types = std::decay_t<decltype(types)>;
            using ToDataType = typename Types::LeftType;
            if constexpr (
                std::is_same_v<ToDataType, DataTypeDecimal<Decimal32>>
                || std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>>
                || std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
            {
                const auto * decimal_type = static_cast<const ToDataType *>(column_type.get());
                arrow_type = arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale());
            }
            return false;
        };
        callOnIndexAndDataType<void>(column_type->getTypeId(), create_arrow_type);

        /// Only Decimal32/64/128 are handled above. Any other decimal type leaves arrow_type
        /// empty, and returning it would hand a null type to the caller.
        if (!arrow_type)
            throw Exception{
                "The type \"" + column_type->getName() + "\" of a column \"" + column_name
                    + "\""
                      " is not supported for conversion into a "
                    + format_name + " data format",
                ErrorCodes::UNKNOWN_TYPE};

        return arrow_type;
    }

    if (isArray(column_type))
    {
        auto nested_type = typeid_cast<const DataTypeArray *>(column_type.get())->getNestedType();
        auto nested_arrow_type = getArrowType(nested_type, column_name, format_name, is_column_nullable);
        return arrow::list(nested_arrow_type);
    }

    const std::string type_name = column_type->getFamilyName();
    /// The lambda is used only within this call, so capturing by reference avoids copying type_name.
    if (const auto * arrow_type_it = std::find_if(
            internal_type_to_arrow_type.begin(),
            internal_type_to_arrow_type.end(),
            [&](auto && elem) { return elem.first == type_name; });
        arrow_type_it != internal_type_to_arrow_type.end())
    {
        return arrow_type_it->second;
    }

    throw Exception{
        "The type \"" + type_name + "\" of a column \"" + column_name
            + "\""
              " is not supported for conversion into a "
            + format_name + " data format",
        ErrorCodes::UNKNOWN_TYPE};
}
void CHColumnToArrowColumn::chChunkToArrowTable(
std::shared_ptr<arrow::Table> & res,
const Block & header,
@ -358,86 +377,20 @@ namespace DB
column.column = recursiveRemoveLowCardinality(chunk.getColumns()[column_i]);
column.type = recursiveRemoveLowCardinality(column.type);
const bool is_column_nullable = column.type->isNullable();
bool is_column_array_nullable = false;
const auto & column_nested_type
= is_column_nullable ? static_cast<const DataTypeNullable *>(column.type.get())->getNestedType() : column.type;
const String column_nested_type_name = column_nested_type->getFamilyName();
if (isDecimal(column_nested_type))
{
const auto add_decimal_field = [&](const auto & types) -> bool {
using Types = std::decay_t<decltype(types)>;
using ToDataType = typename Types::LeftType;
if constexpr (
std::is_same_v<ToDataType, DataTypeDecimal<Decimal32>>
|| std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>>
|| std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
{
const auto & decimal_type = static_cast<const ToDataType *>(column_nested_type.get());
arrow_fields.emplace_back(std::make_shared<arrow::Field>(
column.name, arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()), is_column_nullable));
}
return false;
};
callOnIndexAndDataType<void>(column_nested_type->getTypeId(), add_decimal_field);
}
else if (isArray(column_nested_type))
{
const auto * column_array_type = static_cast<const DataTypeArray *>(column_nested_type.get());
is_column_array_nullable = column_array_type->getNestedType()->isNullable();
const DataTypePtr & column_array_nested_type =
is_column_array_nullable ? static_cast<const DataTypeNullable *>(column_array_type->getNestedType().get())->getNestedType() :
column_array_type->getNestedType();
const String column_array_nested_type_name = column_array_nested_type->getFamilyName();
if (const auto * arrow_type_it = std::find_if(internal_type_to_arrow_type.begin(), internal_type_to_arrow_type.end(),
[=](auto && elem) { return elem.first == column_array_nested_type_name; });
arrow_type_it != internal_type_to_arrow_type.end())
{
arrow_fields.emplace_back(std::make_shared<arrow::Field>(
column.name, arrow::list(arrow_type_it->second), is_column_array_nullable));
} else
{
throw Exception{"The type \"" + column_array_nested_type_name + "\" of a array column \"" + column.name + "\""
" is not supported for conversion into a " + format_name + " data format",
ErrorCodes::UNKNOWN_TYPE};
}
}
else
{
if (const auto * arrow_type_it = std::find_if(internal_type_to_arrow_type.begin(), internal_type_to_arrow_type.end(),
[=](auto && elem) { return elem.first == column_nested_type_name; });
arrow_type_it != internal_type_to_arrow_type.end())
{
arrow_fields.emplace_back(std::make_shared<arrow::Field>(column.name, arrow_type_it->second, is_column_nullable));
} else
{
throw Exception{"The type \"" + column_nested_type_name + "\" of a column \"" + column.name + "\""
" is not supported for conversion into a " + format_name + " data format",
ErrorCodes::UNKNOWN_TYPE};
}
}
ColumnPtr nested_column
= is_column_nullable ? assert_cast<const ColumnNullable &>(*column.column).getNestedColumnPtr() : column.column;
const PaddedPODArray<UInt8> * null_bytemap =
is_column_nullable ? extractNullBytemapPtr(column.column) : nullptr;
bool is_column_nullable = false;
auto arrow_type = getArrowType(column.type, column.name, format_name, &is_column_nullable);
arrow_fields.emplace_back(std::make_shared<arrow::Field>(column.name, arrow_type, is_column_nullable));
arrow::MemoryPool* pool = arrow::default_memory_pool();
std::unique_ptr<arrow::ArrayBuilder> array_builder;
arrow::Status status = MakeBuilder(pool, arrow_fields[column_i]->type(), &array_builder);
checkStatus(status, nested_column->getName(), format_name);
checkStatus(status, column.column->getName(), format_name);
fillArrowArray(column.name, column.column, column.type, nullptr, array_builder.get(), format_name, 0, column.column->size());
std::shared_ptr<arrow::Array> arrow_array;
fillArrowArray(column.name, nested_column, column_nested_type, column_nested_type_name, arrow_array, null_bytemap, array_builder.get(), format_name);
status = array_builder->Finish(&arrow_array);
checkStatus(status, nested_column->getName(), format_name);
checkStatus(status, column.column->getName(), format_name);
arrow_arrays.emplace_back(std::move(arrow_array));
}
@ -445,7 +398,6 @@ namespace DB
res = arrow::Table::Make(arrow_schema, arrow_arrays);
}
}
}
#endif

View File

@ -33,13 +33,13 @@ public:
static void fillArrowArray(
const String & column_name,
ColumnPtr & nested_column,
const std::shared_ptr<const IDataType> & column_nested_type,
const String column_nested_type_name,
std::shared_ptr<arrow::Array> arrow_array,
ColumnPtr & column,
const std::shared_ptr<const IDataType> & column_type,
const PaddedPODArray<UInt8> * null_bytemap,
arrow::ArrayBuilder * array_builder,
String format_name);
String format_name,
size_t start,
size_t end);
};
}
#endif

View File

@ -0,0 +1,4 @@
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
[] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
[] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []

View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Loads data_orc/array_test.orc into a Memory table with Array columns and prints the table.
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
# Quoted so that a checkout path containing spaces does not break the script.
DATA_FILE="$CUR_DIR/data_orc/array_test.orc"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (a1 Array(Int8), a2 Array(UInt8), a3 Array(Int16), a4 Array(UInt16), a5 Array(Int32), a6 Array(UInt32), a7 Array(Int64), a8 Array(UInt64), a9 Array(String), a10 Array(FixedString(4)), a11 Array(Float32), a12 Array(Float64), a13 Array(Date), a14 Array(Datetime), a15 Array(Decimal(4, 2)), a16 Array(Decimal(10, 2)), a17 Array(Decimal(25, 2))) ENGINE=Memory()"
# First insert: the file is fed to the client through a pipe.
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC"
# Second insert: the same file through stdin redirection, bounded by a 3 second timeout.
timeout 3 ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC" < "$DATA_FILE"
${CLICKHOUSE_CLIENT} --query="select * from orc_load"
${CLICKHOUSE_CLIENT} --query="drop table orc_load"

View File

@ -0,0 +1,2 @@
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]

View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Loads data_orc/nested_array_test.orc into a Memory table with nested Array columns and prints the table.
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
# Quoted so that a checkout path containing spaces does not break the script.
DATA_FILE="$CUR_DIR/data_orc/nested_array_test.orc"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (a1 Array(Array(Array(UInt32))), a2 Array(Array(Array(String))), a3 Array(Array(Nullable(UInt32))), a4 Array(Array(Nullable(String)))) engine=Memory()"
# First insert: the file is fed to the client through a pipe.
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC"
# Second insert: the same file through stdin redirection, bounded by a 3 second timeout.
timeout 3 ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC" < "$DATA_FILE"
${CLICKHOUSE_CLIENT} --query="select * from orc_load"
${CLICKHOUSE_CLIENT} --query="drop table orc_load"

View File

@ -0,0 +1,6 @@
[1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42]
[NULL] [NULL] [NULL]
[] [] []
[1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42]
[NULL] [NULL] [NULL]
[] [] []

View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Loads data_orc/nullable_array_test.orc into a Memory table with Array(Nullable(...)) columns and prints the table.
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
# Quoted so that a checkout path containing spaces does not break the script.
DATA_FILE="$CUR_DIR/data_orc/nullable_array_test.orc"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (a1 Array(Nullable(UInt32)), a2 Array(Nullable(String)), a3 Array(Nullable(Decimal(4, 2)))) ENGINE=Memory()"
# First insert: the file is fed to the client through a pipe.
cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC"
# Second insert: the same file through stdin redirection, bounded by a 3 second timeout.
timeout 3 ${CLICKHOUSE_CLIENT} -q "insert into orc_load format ORC" < "$DATA_FILE"
${CLICKHOUSE_CLIENT} --query="select * from orc_load"
${CLICKHOUSE_CLIENT} --query="drop table orc_load"

View File

@ -60,3 +60,15 @@ dest from null:
-108 108 -1016 1116 -1032 1132 -1064 1164 -1.032 -1.064 string-0 fixedstring\0\0\0\0 2001-02-03 2002-02-03 04:05:06
127 255 32767 65535 2147483647 4294967295 9223372036854775807 9223372036854775807 -1.032 -1.064 string-2 fixedstring-2\0\0 2004-06-07 2004-02-03 04:05:06
\N \N \N \N \N \N \N \N \N \N \N \N \N \N
1 [1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
1 [1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
2 [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []
2 [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []
1 [1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42]
1 [1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42]
2 [NULL] [NULL] [NULL]
2 [NULL] [NULL] [NULL]
3 [] [] []
3 [] [] []
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]

View File

@ -127,6 +127,7 @@ ${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_types1 ORDER BY int8 FORMAT
echo dest from null:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_types6 ORDER BY int8"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types5"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types6"
@ -135,3 +136,33 @@ ${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types1"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types2"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types3"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_types4"
# --- Parquet round trip for flat arrays ---
# One Array column per element type (a1..a17): signed/unsigned integers, String,
# FixedString, floats, Date, Datetime and three Decimal precisions, plus an id
# column used to order the final SELECT.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_arrays (id UInt32, a1 Array(Int8), a2 Array(UInt8), a3 Array(Int16), a4 Array(UInt16), a5 Array(Int32), a6 Array(UInt32), a7 Array(Int64), a8 Array(UInt64), a9 Array(String), a10 Array(FixedString(4)), a11 Array(Float32), a12 Array(Float64), a13 Array(Date), a14 Array(Datetime), a15 Array(Decimal(4, 2)), a16 Array(Decimal(10, 2)), a17 Array(Decimal(25, 2))) engine=Memory()"
# Row id=1: three elements in every array. The Decimal literals carry more
# fractional digits than the declared scale of 2.
${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_arrays VALUES (1, [1,-2,3], [1,2,3], [100, -200, 300], [100, 200, 300], [10000000, -20000000, 30000000], [10000000, 2000000, 3000000], [100000000000000, -200000000000, 3000000000000], [100000000000000, 20000000000000, 3000000000000], ['Some string', 'Some string', 'Some string'], ['0000', '1111', '2222'], [42.42, 424.2, 0.4242], [424242.424242, 4242042420.242424, 42], ['2000-01-01', '2001-01-01', '2002-01-01'], ['2000-01-01', '2001-01-01', '2002-01-01'], [0.2, 10.003, 4.002], [4.000000001, 10000.10000, 10000.100001], [1000000000.000000001123, 90.0000000010010101, 0101001.0112341001])"
# Row id=2: every array is empty.
${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_arrays VALUES (2, [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [])"
# Export the table as Parquet and pipe that stream straight back into the same
# table, so each row is stored a second time after passing through the format.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_arrays FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_arrays FORMAT Parquet"
# Print original and re-imported rows together, ordered by id.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_arrays ORDER BY id"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_arrays"
# --- Parquet round trip for arrays with Nullable elements ---
# Three element types are covered: UInt32, String and Decimal(4, 2).
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_nullable_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_nullable_arrays (id UInt32, a1 Array(Nullable(UInt32)), a2 Array(Nullable(String)), a3 Array(Nullable(Decimal(4, 2)))) engine=Memory()"
# id=1 mixes values with NULLs, id=2 holds arrays containing a single NULL,
# id=3 holds empty arrays.
${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nullable_arrays VALUES (1, [1, Null, 2], [Null, 'Some string', Null], [0.001, Null, 42.42]), (2, [Null], [Null], [Null]), (3, [], [], [])"
# Export as Parquet and re-insert into the same table; each row ends up stored twice.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nullable_arrays FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nullable_arrays FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nullable_arrays ORDER BY id"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_nullable_arrays"
# --- Parquet round trip for nested arrays ---
# a1/a2 are three levels deep (UInt32 and String); a3/a4 are two levels deep
# with Nullable elements.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_nested_arrays"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_nested_arrays (a1 Array(Array(Array(UInt32))), a2 Array(Array(Array(String))), a3 Array(Array(Nullable(UInt32))), a4 Array(Array(Nullable(String)))) engine=Memory() "
# A single row whose values include empty inner arrays and NULL elements.
${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nested_arrays VALUES ([[[1,2,3], [1,2,3]], [[1,2,3]], [[], [1,2,3]]], [[['Some string', 'Some string'], []], [['Some string']], [[]]], [[Null, 1, 2], [Null], [1, 2], []], [['Some string', Null, 'Some string'], [Null], []])"
# Export as Parquet and re-insert into the same table.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nested_arrays FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_nested_arrays FORMAT Parquet"
# No ORDER BY here: the table has no id column, and it holds the inserted row
# plus its re-imported copy.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_nested_arrays"
${CLICKHOUSE_CLIENT} --query="DROP TABLE parquet_nested_arrays"

View File

@ -1,6 +1,10 @@
=== Try load data from alltypes_dictionary.parquet
0 1 0 0 0 0 0 0 01/01/09 0 1230768000
1 0 1 1 1 10 1.1 10.1 01/01/09 1 1230768060
=== Try load data from alltypes_list.parquet
[] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['2000-01-01 00:00:00','2001-01-01 00:00:00','2002-01-01 00:00:00'] [0.20,10.00,4.00] [4.00,10000.10,10000.10] [1000000000.00,90.00,101001.01]
=== Try load data from alltypes_plain.parquet
4 1 0 0 0 0 0 0 03/01/09 0 1235865600
5 0 1 1 1 10 1.1 10.1 03/01/09 1 1235865660
@ -258,8 +262,9 @@ Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Not y
23.00
24.00
=== Try load data from list_columns.parquet
Code: 70. DB::Ex---tion: The type "list" of an input column "int64_list" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
[1,2,3] ['abc','efg','hij']
[NULL,1] []
[4] ['efg',NULL,'hij','xyz']
=== Try load data from nation.dict-malformed.parquet
0 ALGERIA 0 haggle. carefully final deposits detect slyly agai
1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon
@ -286,9 +291,12 @@ Code: 70. DB::Ex---tion: The type "list" of an input column "int64_list" is not
22 RUSSIA 3 requests against the platelets use never according to the quickly regular pint
23 UNITED KINGDOM 3 eans boost carefully special requests. accounts are. carefull
24 UNITED STATES 1 y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be
=== Try load data from nested_lists.parquet
[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]]
=== Try load data from nested_lists.snappy.parquet
Code: 70. DB::Ex---tion: The type "list" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
[[['a','b'],['c']],[[],['d']]] 1
[[['a','b'],['c','d']],[[],['e']]] 1
[[['a','b'],['c','d'],['e']],[[],['f']]] 1
=== Try load data from nested_maps.snappy.parquet
Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
@ -301,6 +309,10 @@ Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported
../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
=== Try load data from nullable.impala.parquet
../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
=== Try load data from nullable_list.parquet
[1,NULL,2] [NULL,'Some string',NULL] [0.00,NULL,42.42]
[NULL] [NULL] [NULL]
[] [] []
=== Try load data from nulls.snappy.parquet
Code: 70. DB::Ex---tion: The type "struct" of an input column "b_struct" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
`a1` Array(Int8), `a2` Array(UInt8), `a3` Array(Int16), `a4` Array(UInt16), `a5` Array(Int32), `a6` Array(UInt32), `a7` Array(Int64), `a8` Array(UInt64), `a9` Array(String), `a10` Array(FixedString(4)), `a11` Array(Float32), `a12` Array(Float64), `a13` Array(Date), `a14` Array(Datetime), `a15` Array(Decimal(4, 2)), `a16` Array(Decimal(10, 2)), `a17` Array(Decimal(25, 2))

View File

@ -1 +1 @@
`int64_list` Nullable(Int64), `utf8_list` Nullable(String)
`int64_list` Array(Nullable(Int64)), `utf8_list` Array(Nullable(String))

View File

@ -0,0 +1 @@
`a1` Array(Array(Array(UInt32))), `a2` Array(Array(Array(String))), `a3` Array(Array(Nullable(UInt32))), `a4` Array(Array(Nullable(String)))

View File

@ -1 +1 @@
`a` Nullable(String), `b` Nullable(Int32)
`a` Array(Array(Array(Nullable(String)))), `b` Nullable(Int32)

View File

@ -0,0 +1 @@
`a1` Array(Nullable(UInt32)), `a2` Array(Nullable(String)), `a3` Array(Nullable(Decimal(4, 2)))