Fix very strange behavior of Apache ORC

This commit is contained in:
Alexey Milovidov 2022-09-18 08:25:25 +02:00
parent c8c728e740
commit 8764fa4439
4 changed files with 22 additions and 14 deletions

View File

@ -55,7 +55,7 @@ ORCBlockOutputFormat::ORCBlockOutputFormat(WriteBuffer & out_, const Block & hea
data_types.push_back(recursiveRemoveLowCardinality(type)); data_types.push_back(recursiveRemoveLowCardinality(type));
} }
ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & type) std::unique_ptr<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & type)
{ {
switch (type->getTypeId()) switch (type->getTypeId())
{ {
@ -398,17 +398,21 @@ void ORCBlockOutputFormat::writeColumn(
const auto & list_column = assert_cast<const ColumnArray &>(column); const auto & list_column = assert_cast<const ColumnArray &>(column);
auto nested_type = assert_cast<const DataTypeArray &>(*type).getNestedType(); auto nested_type = assert_cast<const DataTypeArray &>(*type).getNestedType();
const ColumnArray::Offsets & offsets = list_column.getOffsets(); const ColumnArray::Offsets & offsets = list_column.getOffsets();
list_orc_column.resize(list_column.size());
size_t column_size = list_column.size();
list_orc_column.resize(column_size);
/// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i]. /// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i].
list_orc_column.offsets[0] = 0; list_orc_column.offsets[0] = 0;
for (size_t i = 0; i != list_column.size(); ++i) for (size_t i = 0; i != column_size; ++i)
{ {
list_orc_column.offsets[i + 1] = offsets[i]; list_orc_column.offsets[i + 1] = offsets[i];
list_orc_column.notNull[i] = 1; list_orc_column.notNull[i] = 1;
} }
orc::ColumnVectorBatch & nested_orc_column = *list_orc_column.elements; orc::ColumnVectorBatch & nested_orc_column = *list_orc_column.elements;
writeColumn(nested_orc_column, list_column.getData(), nested_type, null_bytemap); writeColumn(nested_orc_column, list_column.getData(), nested_type, null_bytemap);
list_orc_column.numElements = list_column.size(); list_orc_column.numElements = column_size;
break; break;
} }
case TypeIndex::Tuple: case TypeIndex::Tuple:
@ -429,10 +433,12 @@ void ORCBlockOutputFormat::writeColumn(
const auto & map_type = assert_cast<const DataTypeMap &>(*type); const auto & map_type = assert_cast<const DataTypeMap &>(*type);
const ColumnArray::Offsets & offsets = list_column.getOffsets(); const ColumnArray::Offsets & offsets = list_column.getOffsets();
size_t column_size = list_column.size();
map_orc_column.resize(list_column.size()); map_orc_column.resize(list_column.size());
/// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i]. /// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i].
map_orc_column.offsets[0] = 0; map_orc_column.offsets[0] = 0;
for (size_t i = 0; i != list_column.size(); ++i) for (size_t i = 0; i != column_size; ++i)
{ {
map_orc_column.offsets[i + 1] = offsets[i]; map_orc_column.offsets[i + 1] = offsets[i];
map_orc_column.notNull[i] = 1; map_orc_column.notNull[i] = 1;
@ -447,7 +453,7 @@ void ORCBlockOutputFormat::writeColumn(
auto value_type = map_type.getValueType(); auto value_type = map_type.getValueType();
writeColumn(values_orc_column, *nested_columns[1], value_type, null_bytemap); writeColumn(values_orc_column, *nested_columns[1], value_type, null_bytemap);
map_orc_column.numElements = list_column.size(); map_orc_column.numElements = column_size;
break; break;
} }
default: default:
@ -461,8 +467,9 @@ size_t ORCBlockOutputFormat::getColumnSize(const IColumn & column, DataTypePtr &
{ {
auto nested_type = assert_cast<const DataTypeArray &>(*type).getNestedType(); auto nested_type = assert_cast<const DataTypeArray &>(*type).getNestedType();
const IColumn & nested_column = assert_cast<const ColumnArray &>(column).getData(); const IColumn & nested_column = assert_cast<const ColumnArray &>(column).getData();
return getColumnSize(nested_column, nested_type); return std::max(column.size(), getColumnSize(nested_column, nested_type));
} }
return column.size(); return column.size();
} }
@ -471,9 +478,7 @@ size_t ORCBlockOutputFormat::getMaxColumnSize(Chunk & chunk)
size_t columns_num = chunk.getNumColumns(); size_t columns_num = chunk.getNumColumns();
size_t max_column_size = 0; size_t max_column_size = 0;
for (size_t i = 0; i != columns_num; ++i) for (size_t i = 0; i != columns_num; ++i)
{
max_column_size = std::max(max_column_size, getColumnSize(*chunk.getColumns()[i], data_types[i])); max_column_size = std::max(max_column_size, getColumnSize(*chunk.getColumns()[i], data_types[i]));
}
return max_column_size; return max_column_size;
} }
@ -484,8 +489,9 @@ void ORCBlockOutputFormat::consume(Chunk chunk)
size_t columns_num = chunk.getNumColumns(); size_t columns_num = chunk.getNumColumns();
size_t rows_num = chunk.getNumRows(); size_t rows_num = chunk.getNumRows();
/// getMaxColumnSize is needed to write arrays. /// getMaxColumnSize is needed to write arrays.
/// The size of the batch must be no less than total amount of array elements. /// The size of the batch must be no less than total amount of array elements
ORC_UNIQUE_PTR<orc::ColumnVectorBatch> batch = writer->createRowBatch(getMaxColumnSize(chunk)); /// and no less than the number of rows (ORC writes a null bit for every row).
std::unique_ptr<orc::ColumnVectorBatch> batch = writer->createRowBatch(getMaxColumnSize(chunk));
orc::StructVectorBatch & root = dynamic_cast<orc::StructVectorBatch &>(*batch); orc::StructVectorBatch & root = dynamic_cast<orc::StructVectorBatch &>(*batch);
auto columns = chunk.detachColumns(); auto columns = chunk.detachColumns();
for (auto & column : columns) for (auto & column : columns)
@ -493,6 +499,7 @@ void ORCBlockOutputFormat::consume(Chunk chunk)
for (size_t i = 0; i != columns_num; ++i) for (size_t i = 0; i != columns_num; ++i)
writeColumn(*root.fields[i], *columns[i], data_types[i], nullptr); writeColumn(*root.fields[i], *columns[i], data_types[i], nullptr);
root.numElements = rows_num; root.numElements = rows_num;
writer->add(*batch); writer->add(*batch);
} }

View File

@ -42,7 +42,7 @@ private:
void consume(Chunk chunk) override; void consume(Chunk chunk) override;
void finalizeImpl() override; void finalizeImpl() override;
ORC_UNIQUE_PTR<orc::Type> getORCType(const DataTypePtr & type); std::unique_ptr<orc::Type> getORCType(const DataTypePtr & type);
/// ConvertFunc is needed for type UInt8, because firstly UInt8 (char8_t) must be /// ConvertFunc is needed for type UInt8, because firstly UInt8 (char8_t) must be
/// converted to unsigned char (bugprone-signed-char-misuse in clang). /// converted to unsigned char (bugprone-signed-char-misuse in clang).
@ -75,8 +75,8 @@ private:
const FormatSettings format_settings; const FormatSettings format_settings;
ORCOutputStream output_stream; ORCOutputStream output_stream;
DataTypes data_types; DataTypes data_types;
ORC_UNIQUE_PTR<orc::Writer> writer; std::unique_ptr<orc::Writer> writer;
ORC_UNIQUE_PTR<orc::Type> schema; std::unique_ptr<orc::Type> schema;
orc::WriterOptions options; orc::WriterOptions options;
}; };

Binary file not shown.

View File

@ -0,0 +1 @@
SELECT arrayJoin([[], [1]]) FORMAT ORC;