This commit is contained in:
Pavel Kruglov 2021-08-06 18:12:31 +03:00
parent a0c10b546f
commit 0a06470b69
2 changed files with 73 additions and 5 deletions

View File

@ -12,6 +12,7 @@
#include <DataTypes/DataTypeMap.h> #include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeDate32.h> #include <DataTypes/DataTypeDate32.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/NestedUtils.h> #include <DataTypes/NestedUtils.h>
#include <common/DateLUTImpl.h> #include <common/DateLUTImpl.h>
#include <common/types.h> #include <common/types.h>
@ -34,9 +35,7 @@
#define FOR_ARROW_NUMERIC_TYPES(M) \ #define FOR_ARROW_NUMERIC_TYPES(M) \
M(arrow::Type::UINT8, DB::UInt8) \ M(arrow::Type::UINT8, DB::UInt8) \
M(arrow::Type::INT8, DB::Int8) \ M(arrow::Type::INT8, DB::Int8) \
M(arrow::Type::UINT16, DB::UInt16) \
M(arrow::Type::INT16, DB::Int16) \ M(arrow::Type::INT16, DB::Int16) \
M(arrow::Type::UINT32, DB::UInt32) \
M(arrow::Type::INT32, DB::Int32) \ M(arrow::Type::INT32, DB::Int32) \
M(arrow::Type::UINT64, DB::UInt64) \ M(arrow::Type::UINT64, DB::UInt64) \
M(arrow::Type::INT64, DB::Int64) \ M(arrow::Type::INT64, DB::Int64) \
@ -156,6 +155,30 @@ static ColumnWithTypeAndName readColumnWithBooleanData(std::shared_ptr<arrow::Ch
return {std::move(internal_column), std::move(internal_type), column_name}; return {std::move(internal_column), std::move(internal_type), column_name};
} }
static ColumnWithTypeAndName readColumnWithDateData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
{
auto internal_type = std::make_shared<DataTypeDate>();
auto internal_column = internal_type->createColumn();
PaddedPODArray<UInt16> & column_data = assert_cast<ColumnVector<UInt16> &>(*internal_column).getData();
column_data.reserve(arrow_column->length());
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
{
arrow::UInt16Array & chunk = dynamic_cast<arrow::UInt16Array &>(*(arrow_column->chunk(chunk_i)));
for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
{
UInt16 days_num = static_cast<UInt16>(chunk.Value(value_i));
if (days_num > DATE_LUT_MAX_DAY_NUM)
throw Exception{ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE,
"Input value {} of a column \"{}\" is greater than max allowed Date value, which is {}", days_num, column_name, DATE_LUT_MAX_DAY_NUM};
column_data.emplace_back(days_num);
}
}
return {std::move(internal_column), std::move(internal_type), column_name};
}
static ColumnWithTypeAndName readColumnWithDate32Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name) static ColumnWithTypeAndName readColumnWithDate32Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
{ {
auto internal_type = std::make_shared<DataTypeDate32>(); auto internal_type = std::make_shared<DataTypeDate32>();
@ -183,7 +206,7 @@ static ColumnWithTypeAndName readColumnWithDate32Data(std::shared_ptr<arrow::Chu
/// Arrow stores Parquet::DATETIME in Int64, while ClickHouse stores DateTime in UInt32. Therefore, it should be checked before saving /// Arrow stores Parquet::DATETIME in Int64, while ClickHouse stores DateTime in UInt32. Therefore, it should be checked before saving
static ColumnWithTypeAndName readColumnWithDate64Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name) static ColumnWithTypeAndName readColumnWithDate64Data(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
{ {
auto internal_type = std::make_shared<DataTypeUInt32>(); auto internal_type = std::make_shared<DataTypeDateTime>();
auto internal_column = internal_type->createColumn(); auto internal_column = internal_type->createColumn();
auto & column_data = assert_cast<ColumnVector<UInt32> &>(*internal_column).getData(); auto & column_data = assert_cast<ColumnVector<UInt32> &>(*internal_column).getData();
column_data.reserve(arrow_column->length()); column_data.reserve(arrow_column->length());
@ -200,6 +223,22 @@ static ColumnWithTypeAndName readColumnWithDate64Data(std::shared_ptr<arrow::Chu
return {std::move(internal_column), std::move(internal_type), column_name}; return {std::move(internal_column), std::move(internal_type), column_name};
} }
static ColumnWithTypeAndName readColumnWithDateTimeData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
{
auto internal_type = std::make_shared<DataTypeDateTime>();
auto internal_column = internal_type->createColumn();
auto & column_data = assert_cast<ColumnVector<UInt32> &>(*internal_column).getData();
column_data.reserve(arrow_column->length());
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i)
{
auto & chunk = dynamic_cast<arrow::UInt32Array &>(*(arrow_column->chunk(chunk_i)));
for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
column_data.emplace_back(chunk.Value(value_i));
}
return {std::move(internal_column), std::move(internal_type), column_name};
}
static ColumnWithTypeAndName readColumnWithTimestampData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name) static ColumnWithTypeAndName readColumnWithTimestampData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
{ {
auto internal_type = std::make_shared<DataTypeUInt32>(); auto internal_type = std::make_shared<DataTypeUInt32>();
@ -353,6 +392,10 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
return readColumnWithDate32Data(arrow_column, column_name); return readColumnWithDate32Data(arrow_column, column_name);
case arrow::Type::DATE64: case arrow::Type::DATE64:
return readColumnWithDate64Data(arrow_column, column_name); return readColumnWithDate64Data(arrow_column, column_name);
case arrow::Type::UINT16:
return readColumnWithDateData(arrow_column, column_name);
case arrow::Type::UINT32:
return readColumnWithDateTimeData(arrow_column, column_name);
case arrow::Type::TIMESTAMP: case arrow::Type::TIMESTAMP:
return readColumnWithTimestampData(arrow_column, column_name); return readColumnWithTimestampData(arrow_column, column_name);
#if defined(ARCADIA_BUILD) #if defined(ARCADIA_BUILD)

View File

@ -72,6 +72,7 @@ namespace DB
{"Date", arrow::uint16()}, /// uint16 is used instead of date32, because Apache Arrow cannot correctly serialize Date32Array. {"Date", arrow::uint16()}, /// uint16 is used instead of date32, because Apache Arrow cannot correctly serialize Date32Array.
{"DateTime", arrow::uint32()}, /// uint32 is used instead of date64, because we don't need milliseconds. {"DateTime", arrow::uint32()}, /// uint32 is used instead of date64, because we don't need milliseconds.
{"Date32", arrow::date32()},
{"String", arrow::binary()}, {"String", arrow::binary()},
{"FixedString", arrow::binary()}, {"FixedString", arrow::binary()},
@ -335,7 +336,6 @@ namespace DB
size_t end) size_t end)
{ {
const PaddedPODArray<UInt16> & internal_data = assert_cast<const ColumnVector<UInt16> &>(*write_column).getData(); const PaddedPODArray<UInt16> & internal_data = assert_cast<const ColumnVector<UInt16> &>(*write_column).getData();
//arrow::Date32Builder date_builder;
arrow::UInt16Builder & builder = assert_cast<arrow::UInt16Builder &>(*array_builder); arrow::UInt16Builder & builder = assert_cast<arrow::UInt16Builder &>(*array_builder);
arrow::Status status; arrow::Status status;
@ -344,7 +344,6 @@ namespace DB
if (null_bytemap && (*null_bytemap)[value_i]) if (null_bytemap && (*null_bytemap)[value_i])
status = builder.AppendNull(); status = builder.AppendNull();
else else
/// Implicitly converts UInt16 to Int32
status = builder.Append(internal_data[value_i]); status = builder.Append(internal_data[value_i]);
checkStatus(status, write_column->getName(), format_name); checkStatus(status, write_column->getName(), format_name);
} }
@ -373,6 +372,28 @@ namespace DB
} }
} }
static void fillArrowArrayWithDate32ColumnData(
ColumnPtr write_column,
const PaddedPODArray<UInt8> * null_bytemap,
const String & format_name,
arrow::ArrayBuilder* array_builder,
size_t start,
size_t end)
{
const PaddedPODArray<Int32> & internal_data = assert_cast<const ColumnVector<Int32> &>(*write_column).getData();
arrow::Date32Builder & builder = assert_cast<arrow::Date32Builder &>(*array_builder);
arrow::Status status;
for (size_t value_i = start; value_i < end; ++value_i)
{
if (null_bytemap && (*null_bytemap)[value_i])
status = builder.AppendNull();
else
status = builder.Append(internal_data[value_i]);
checkStatus(status, write_column->getName(), format_name);
}
}
static void fillArrowArray( static void fillArrowArray(
const String & column_name, const String & column_name,
ColumnPtr & column, ColumnPtr & column,
@ -411,6 +432,10 @@ namespace DB
{ {
fillArrowArrayWithDateTimeColumnData(column, null_bytemap, format_name, array_builder, start, end); fillArrowArrayWithDateTimeColumnData(column, null_bytemap, format_name, array_builder, start, end);
} }
else if (isDate32(column_type))
{
fillArrowArrayWithDate32ColumnData(column, null_bytemap, format_name, array_builder, start, end);
}
else if (isArray(column_type)) else if (isArray(column_type))
{ {
fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values);