mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 09:02:00 +00:00
Fix very strange behavior of Apache ORC
This commit is contained in:
parent
c8c728e740
commit
8764fa4439
@ -55,7 +55,7 @@ ORCBlockOutputFormat::ORCBlockOutputFormat(WriteBuffer & out_, const Block & hea
|
|||||||
data_types.push_back(recursiveRemoveLowCardinality(type));
|
data_types.push_back(recursiveRemoveLowCardinality(type));
|
||||||
}
|
}
|
||||||
|
|
||||||
ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & type)
|
std::unique_ptr<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & type)
|
||||||
{
|
{
|
||||||
switch (type->getTypeId())
|
switch (type->getTypeId())
|
||||||
{
|
{
|
||||||
@ -398,17 +398,21 @@ void ORCBlockOutputFormat::writeColumn(
|
|||||||
const auto & list_column = assert_cast<const ColumnArray &>(column);
|
const auto & list_column = assert_cast<const ColumnArray &>(column);
|
||||||
auto nested_type = assert_cast<const DataTypeArray &>(*type).getNestedType();
|
auto nested_type = assert_cast<const DataTypeArray &>(*type).getNestedType();
|
||||||
const ColumnArray::Offsets & offsets = list_column.getOffsets();
|
const ColumnArray::Offsets & offsets = list_column.getOffsets();
|
||||||
list_orc_column.resize(list_column.size());
|
|
||||||
|
size_t column_size = list_column.size();
|
||||||
|
list_orc_column.resize(column_size);
|
||||||
|
|
||||||
/// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i].
|
/// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i].
|
||||||
list_orc_column.offsets[0] = 0;
|
list_orc_column.offsets[0] = 0;
|
||||||
for (size_t i = 0; i != list_column.size(); ++i)
|
for (size_t i = 0; i != column_size; ++i)
|
||||||
{
|
{
|
||||||
list_orc_column.offsets[i + 1] = offsets[i];
|
list_orc_column.offsets[i + 1] = offsets[i];
|
||||||
list_orc_column.notNull[i] = 1;
|
list_orc_column.notNull[i] = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
orc::ColumnVectorBatch & nested_orc_column = *list_orc_column.elements;
|
orc::ColumnVectorBatch & nested_orc_column = *list_orc_column.elements;
|
||||||
writeColumn(nested_orc_column, list_column.getData(), nested_type, null_bytemap);
|
writeColumn(nested_orc_column, list_column.getData(), nested_type, null_bytemap);
|
||||||
list_orc_column.numElements = list_column.size();
|
list_orc_column.numElements = column_size;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case TypeIndex::Tuple:
|
case TypeIndex::Tuple:
|
||||||
@ -429,10 +433,12 @@ void ORCBlockOutputFormat::writeColumn(
|
|||||||
const auto & map_type = assert_cast<const DataTypeMap &>(*type);
|
const auto & map_type = assert_cast<const DataTypeMap &>(*type);
|
||||||
const ColumnArray::Offsets & offsets = list_column.getOffsets();
|
const ColumnArray::Offsets & offsets = list_column.getOffsets();
|
||||||
|
|
||||||
|
size_t column_size = list_column.size();
|
||||||
|
|
||||||
map_orc_column.resize(list_column.size());
|
map_orc_column.resize(list_column.size());
|
||||||
/// The length of map i in MapVectorBatch is offsets[i+1] - offsets[i].
|
/// The length of map i in MapVectorBatch is offsets[i+1] - offsets[i].
|
||||||
map_orc_column.offsets[0] = 0;
|
map_orc_column.offsets[0] = 0;
|
||||||
for (size_t i = 0; i != list_column.size(); ++i)
|
for (size_t i = 0; i != column_size; ++i)
|
||||||
{
|
{
|
||||||
map_orc_column.offsets[i + 1] = offsets[i];
|
map_orc_column.offsets[i + 1] = offsets[i];
|
||||||
map_orc_column.notNull[i] = 1;
|
map_orc_column.notNull[i] = 1;
|
||||||
@ -447,7 +453,7 @@ void ORCBlockOutputFormat::writeColumn(
|
|||||||
auto value_type = map_type.getValueType();
|
auto value_type = map_type.getValueType();
|
||||||
writeColumn(values_orc_column, *nested_columns[1], value_type, null_bytemap);
|
writeColumn(values_orc_column, *nested_columns[1], value_type, null_bytemap);
|
||||||
|
|
||||||
map_orc_column.numElements = list_column.size();
|
map_orc_column.numElements = column_size;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
@ -461,8 +467,9 @@ size_t ORCBlockOutputFormat::getColumnSize(const IColumn & column, DataTypePtr &
|
|||||||
{
|
{
|
||||||
auto nested_type = assert_cast<const DataTypeArray &>(*type).getNestedType();
|
auto nested_type = assert_cast<const DataTypeArray &>(*type).getNestedType();
|
||||||
const IColumn & nested_column = assert_cast<const ColumnArray &>(column).getData();
|
const IColumn & nested_column = assert_cast<const ColumnArray &>(column).getData();
|
||||||
return getColumnSize(nested_column, nested_type);
|
return std::max(column.size(), getColumnSize(nested_column, nested_type));
|
||||||
}
|
}
|
||||||
|
|
||||||
return column.size();
|
return column.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -471,9 +478,7 @@ size_t ORCBlockOutputFormat::getMaxColumnSize(Chunk & chunk)
|
|||||||
size_t columns_num = chunk.getNumColumns();
|
size_t columns_num = chunk.getNumColumns();
|
||||||
size_t max_column_size = 0;
|
size_t max_column_size = 0;
|
||||||
for (size_t i = 0; i != columns_num; ++i)
|
for (size_t i = 0; i != columns_num; ++i)
|
||||||
{
|
|
||||||
max_column_size = std::max(max_column_size, getColumnSize(*chunk.getColumns()[i], data_types[i]));
|
max_column_size = std::max(max_column_size, getColumnSize(*chunk.getColumns()[i], data_types[i]));
|
||||||
}
|
|
||||||
return max_column_size;
|
return max_column_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -484,8 +489,9 @@ void ORCBlockOutputFormat::consume(Chunk chunk)
|
|||||||
size_t columns_num = chunk.getNumColumns();
|
size_t columns_num = chunk.getNumColumns();
|
||||||
size_t rows_num = chunk.getNumRows();
|
size_t rows_num = chunk.getNumRows();
|
||||||
/// getMaxColumnSize is needed to write arrays.
|
/// getMaxColumnSize is needed to write arrays.
|
||||||
/// The size of the batch must be no less than total amount of array elements.
|
/// The size of the batch must be no less than total amount of array elements
|
||||||
ORC_UNIQUE_PTR<orc::ColumnVectorBatch> batch = writer->createRowBatch(getMaxColumnSize(chunk));
|
/// and no less than the number of rows (ORC writes a null bit for every row).
|
||||||
|
std::unique_ptr<orc::ColumnVectorBatch> batch = writer->createRowBatch(getMaxColumnSize(chunk));
|
||||||
orc::StructVectorBatch & root = dynamic_cast<orc::StructVectorBatch &>(*batch);
|
orc::StructVectorBatch & root = dynamic_cast<orc::StructVectorBatch &>(*batch);
|
||||||
auto columns = chunk.detachColumns();
|
auto columns = chunk.detachColumns();
|
||||||
for (auto & column : columns)
|
for (auto & column : columns)
|
||||||
@ -493,6 +499,7 @@ void ORCBlockOutputFormat::consume(Chunk chunk)
|
|||||||
|
|
||||||
for (size_t i = 0; i != columns_num; ++i)
|
for (size_t i = 0; i != columns_num; ++i)
|
||||||
writeColumn(*root.fields[i], *columns[i], data_types[i], nullptr);
|
writeColumn(*root.fields[i], *columns[i], data_types[i], nullptr);
|
||||||
|
|
||||||
root.numElements = rows_num;
|
root.numElements = rows_num;
|
||||||
writer->add(*batch);
|
writer->add(*batch);
|
||||||
}
|
}
|
||||||
|
@ -42,7 +42,7 @@ private:
|
|||||||
void consume(Chunk chunk) override;
|
void consume(Chunk chunk) override;
|
||||||
void finalizeImpl() override;
|
void finalizeImpl() override;
|
||||||
|
|
||||||
ORC_UNIQUE_PTR<orc::Type> getORCType(const DataTypePtr & type);
|
std::unique_ptr<orc::Type> getORCType(const DataTypePtr & type);
|
||||||
|
|
||||||
/// ConvertFunc is needed for type UInt8, because firstly UInt8 (char8_t) must be
|
/// ConvertFunc is needed for type UInt8, because firstly UInt8 (char8_t) must be
|
||||||
/// converted to unsigned char (bugprone-signed-char-misuse in clang).
|
/// converted to unsigned char (bugprone-signed-char-misuse in clang).
|
||||||
@ -75,8 +75,8 @@ private:
|
|||||||
const FormatSettings format_settings;
|
const FormatSettings format_settings;
|
||||||
ORCOutputStream output_stream;
|
ORCOutputStream output_stream;
|
||||||
DataTypes data_types;
|
DataTypes data_types;
|
||||||
ORC_UNIQUE_PTR<orc::Writer> writer;
|
std::unique_ptr<orc::Writer> writer;
|
||||||
ORC_UNIQUE_PTR<orc::Type> schema;
|
std::unique_ptr<orc::Type> schema;
|
||||||
orc::WriterOptions options;
|
orc::WriterOptions options;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
BIN
tests/queries/0_stateless/02426_orc_bug.reference
Normal file
BIN
tests/queries/0_stateless/02426_orc_bug.reference
Normal file
Binary file not shown.
1
tests/queries/0_stateless/02426_orc_bug.sql
Normal file
1
tests/queries/0_stateless/02426_orc_bug.sql
Normal file
@ -0,0 +1 @@
|
|||||||
|
SELECT arrayJoin([[], [1]]) FORMAT ORC;
|
Loading…
Reference in New Issue
Block a user