Small update to ORCBlockOutputFormat; add the ORC output format to the performance test.

This commit is contained in:
Avogar 2020-06-26 23:17:28 +03:00
parent 717f63923a
commit 81d459a5a8
4 changed files with 55 additions and 28 deletions

View File

@ -364,11 +364,13 @@ target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${DOUBLE_C
target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${MSGPACK_INCLUDE_DIR})
target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${ORC_INCLUDE_DIR})
configure_file (
"${ORC_INCLUDE_DIR}/orc/orc-config.hh.in"
"${ORC_INCLUDE_DIR}/orc/orc-config.hh"
)
if (USE_ORC)
target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${ORC_INCLUDE_DIR})
configure_file (
"${ORC_INCLUDE_DIR}/orc/orc-config.hh.in"
"${ORC_INCLUDE_DIR}/orc/orc-config.hh"
)
endif ()
if (ENABLE_TESTS AND USE_GTEST)
macro (grep_gtest_sources BASE_DIR DST_VAR)

View File

@ -1,5 +1,7 @@
#include <Processors/Formats/Impl/ORCBlockOutputFormat.h>
#if USE_ORC
#include <Common/assert_cast.h>
#include <Formats/FormatFactory.h>
@ -129,11 +131,12 @@ ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & t
}
}
template <typename NumberType, typename NumberVectorBatch>
void ORCBlockOutputFormat::ORCBlockOutputFormat::writeNumbers(
template <typename NumberType, typename NumberVectorBatch, typename ConvertFunc>
void ORCBlockOutputFormat::writeNumbers(
orc::ColumnVectorBatch * orc_column,
const IColumn & column,
const PaddedPODArray<UInt8> * null_bytemap)
const PaddedPODArray<UInt8> * null_bytemap,
ConvertFunc convert)
{
NumberVectorBatch * number_orc_column = dynamic_cast<NumberVectorBatch *>(orc_column);
const auto & number_column = assert_cast<const ColumnVector<NumberType> &>(column);
@ -146,16 +149,13 @@ void ORCBlockOutputFormat::ORCBlockOutputFormat::writeNumbers(
number_orc_column->notNull[i] = 0;
continue;
}
if constexpr (std::is_same<NumberType, UInt8>::value)
number_orc_column->data[i] = static_cast<unsigned char>(number_column.getElement(i));
else
number_orc_column->data[i] = number_column.getElement(i);
number_orc_column->data[i] = convert(number_column.getElement(i));
}
number_orc_column->numElements = number_column.size();
}
template <typename Decimal, typename DecimalVectorBatch, typename ConvertFunc>
void ORCBlockOutputFormat::ORCBlockOutputFormat::writeDecimals(
void ORCBlockOutputFormat::writeDecimals(
orc::ColumnVectorBatch * orc_column,
const IColumn & column,
DataTypePtr & type,
@ -181,7 +181,7 @@ void ORCBlockOutputFormat::ORCBlockOutputFormat::writeDecimals(
}
template <typename ColumnType>
void ORCBlockOutputFormat::ORCBlockOutputFormat::writeStrings(
void ORCBlockOutputFormat::writeStrings(
orc::ColumnVectorBatch * orc_column,
const IColumn & column,
const PaddedPODArray<UInt8> * null_bytemap)
@ -205,7 +205,7 @@ void ORCBlockOutputFormat::ORCBlockOutputFormat::writeStrings(
}
template <typename ColumnType, typename GetSecondsFunc, typename GetNanosecondsFunc>
void ORCBlockOutputFormat::ORCBlockOutputFormat::writeDateTimes(
void ORCBlockOutputFormat::writeDateTimes(
orc::ColumnVectorBatch * orc_column,
const IColumn & column,
const PaddedPODArray<UInt8> * null_bytemap,
@ -244,53 +244,53 @@ void ORCBlockOutputFormat::writeColumn(
{
case TypeIndex::Int8:
{
writeNumbers<Int8, orc::LongVectorBatch>(orc_column, column, null_bytemap);
writeNumbers<Int8, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const Int8 & value){ return value ;});
break;
}
case TypeIndex::UInt8:
{
writeNumbers<UInt8, orc::LongVectorBatch>(orc_column, column, null_bytemap);
writeNumbers<UInt8, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt8 & value){ return uint8_t(value) ;});
break;
}
case TypeIndex::Int16:
{
writeNumbers<Int16, orc::LongVectorBatch>(orc_column, column, null_bytemap);
writeNumbers<Int16, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const Int16 & value){ return value ;});
break;
}
case TypeIndex::Date: [[fallthrough]];
case TypeIndex::UInt16:
{
writeNumbers<UInt16, orc::LongVectorBatch>(orc_column, column, null_bytemap);
writeNumbers<UInt16, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt16 & value){ return value ;});
break;
}
case TypeIndex::Int32:
{
writeNumbers<Int32, orc::LongVectorBatch>(orc_column, column, null_bytemap);
writeNumbers<Int32, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const Int32 & value){ return value ;});
break;
}
case TypeIndex::UInt32:
{
writeNumbers<UInt32, orc::LongVectorBatch>(orc_column, column, null_bytemap);
writeNumbers<UInt32, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt32 & value){ return value ;});
break;
}
case TypeIndex::Int64:
{
writeNumbers<Int64, orc::LongVectorBatch>(orc_column, column, null_bytemap);
writeNumbers<Int64, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const Int64 & value){ return value ;});
break;
}
case TypeIndex::UInt64:
{
writeNumbers<UInt64,orc::LongVectorBatch>(orc_column, column, null_bytemap);
writeNumbers<UInt64,orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt64 & value){ return value ;});
break;
}
case TypeIndex::Float32:
{
writeNumbers<Float32, orc::DoubleVectorBatch>(orc_column, column, null_bytemap);
writeNumbers<Float32, orc::DoubleVectorBatch>(orc_column, column, null_bytemap, [](const Float32 & value){ return value ;});
break;
}
case TypeIndex::Float64:
{
writeNumbers<Float64, orc::DoubleVectorBatch>(orc_column, column, null_bytemap);
writeNumbers<Float64, orc::DoubleVectorBatch>(orc_column, column, null_bytemap, [](const Float64 & value){ return value ;});
break;
}
case TypeIndex::FixedString:
@ -368,6 +368,7 @@ void ORCBlockOutputFormat::writeColumn(
auto nested_type = assert_cast<const DataTypeArray &>(*type).getNestedType();
const ColumnArray::Offsets & offsets = list_column.getOffsets();
list_orc_column->resize(list_column.size());
/// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i].
list_orc_column->offsets[0] = 0;
for (size_t i = 0; i != list_column.size(); ++i)
{
@ -439,3 +440,15 @@ void registerOutputFormatProcessorORC(FormatFactory & factory)
}
}
#else
/// Fallback used when USE_ORC is not enabled (this is the #else branch of `#if USE_ORC`).
namespace DB
{
/// A forward declaration is sufficient here: the stub only takes the factory by reference.
class FormatFactory;
/// No-op stand-in for the ORC output format registration: the factory argument is ignored
/// and nothing is registered.
/// NOTE(review): presumably kept so that callers can invoke it unconditionally — confirm against the call site.
void registerOutputFormatProcessorORC(FormatFactory &)
{
}
}
#endif

View File

@ -1,5 +1,7 @@
#pragma once
#include "config_formats.h"
#if USE_ORC
#include <IO/WriteBuffer.h>
#include <Processors/Formats/IOutputFormat.h>
#include <Formats/FormatSettings.h>
@ -10,6 +12,7 @@ namespace DB
class WriteBuffer;
/// orc::Writer can write only to an orc::OutputStream.
class ORCOutputStream : public orc::OutputStream
{
public:
@ -39,22 +42,29 @@ public:
private:
ORC_UNIQUE_PTR<orc::Type> getORCType(const DataTypePtr & type);
/// ConvertFunc is needed for the UInt8 type: a UInt8 (char8_t) value must first be
/// converted to unsigned char (see bugprone-signed-char-misuse in clang-tidy).
template <typename NumberType, typename NumberVectorBatch, typename ConvertFunc>
void writeNumbers(orc::ColumnVectorBatch * orc_column, const IColumn & column, const PaddedPODArray<UInt8> * null_bytemap, ConvertFunc convert);
/// ConvertFunc is needed to convert ClickHouse Int128 to ORC Int128.
template <typename Decimal, typename DecimalVectorBatch, typename ConvertFunc>
void writeDecimals(orc::ColumnVectorBatch * orc_column, const IColumn & column, DataTypePtr & type,
const PaddedPODArray<UInt8> * null_bytemap, ConvertFunc convert);
template <typename NumberType, typename NumberVectorBatch>
void writeNumbers(orc::ColumnVectorBatch * orc_column, const IColumn & column, const PaddedPODArray<UInt8> * null_bytemap);
template <typename ColumnType>
void writeStrings(orc::ColumnVectorBatch * orc_column, const IColumn & column, const PaddedPODArray<UInt8> * null_bytemap);
/// ORC column TimestampVectorBatch stores only seconds and nanoseconds;
/// GetSecondsFunc and GetNanosecondsFunc are needed to extract them from the DateTime type.
template <typename ColumnType, typename GetSecondsFunc, typename GetNanosecondsFunc>
void writeDateTimes(orc::ColumnVectorBatch * orc_column, const IColumn & column, const PaddedPODArray<UInt8> * null_bytemap,
GetSecondsFunc get_seconds, GetNanosecondsFunc get_nanoseconds);
void writeColumn(orc::ColumnVectorBatch * orc_column, const IColumn & column, DataTypePtr & type, const PaddedPODArray<UInt8> * null_bytemap);
/// These two functions are needed to know the maximum nested size of arrays in order to
/// create an ORC batch of the appropriate size.
size_t getColumnSize(const IColumn & column, DataTypePtr & type);
size_t getMaxColumnSize(Chunk & chunk);
@ -67,3 +77,4 @@ private:
};
}
#endif

View File

@ -35,6 +35,7 @@
<value>ODBCDriver2</value>
<value>Avro</value>
<value>MsgPack</value>
<value>ORC</value>
</values>
</substitution>
</substitutions>