2019-08-21 14:19:47 +00:00
|
|
|
#include "ORCBlockInputFormat.h"
|
2022-03-09 08:07:02 +00:00
|
|
|
#include <boost/algorithm/string/case_conv.hpp>
|
2019-08-21 14:19:47 +00:00
|
|
|
#if USE_ORC
|
|
|
|
|
|
|
|
#include <Formats/FormatFactory.h>
|
|
|
|
#include <IO/ReadBufferFromMemory.h>
|
|
|
|
#include <IO/WriteHelpers.h>
|
|
|
|
#include <IO/copyData.h>
|
2020-05-04 00:52:28 +00:00
|
|
|
#include "ArrowBufferedStreams.h"
|
2019-08-21 14:19:47 +00:00
|
|
|
#include "ArrowColumnToCHColumn.h"
|
2021-07-01 17:59:28 +00:00
|
|
|
#include <DataTypes/NestedUtils.h>
|
2019-08-21 14:19:47 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2020-02-25 18:20:08 +00:00
|
|
|
|
2020-02-25 18:10:48 +00:00
|
|
|
/// Error codes used in this file; they are only declared here and defined elsewhere.
namespace ErrorCodes
{
    /// Thrown when the ORC file cannot be opened or its schema cannot be read.
    extern const int BAD_ARGUMENTS;
    /// Thrown when a stripe reader cannot be created or a stripe cannot be read.
    extern const int CANNOT_READ_ALL_DATA;
}
|
2019-08-21 14:19:47 +00:00
|
|
|
|
2021-07-01 17:59:28 +00:00
|
|
|
/// Creates an ORC input format reading from `in_` with the block structure `header_`.
/// Only the settings are stored here; the ORC file reader is created lazily
/// by generate() through prepareReader().
ORCBlockInputFormat::ORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_)
    : IInputFormat(std::move(header_), in_)
    , format_settings(format_settings_)
{
}
|
2019-08-21 14:19:47 +00:00
|
|
|
|
2020-02-25 18:20:08 +00:00
|
|
|
/// Reads the next ORC stripe and converts it into a chunk.
///
/// Returns an empty chunk when reading was stopped, when Arrow returns no stripe reader,
/// or when the stripe holds no rows.
/// Throws ParsingException (CANNOT_READ_ALL_DATA) if the stripe reader cannot be created,
/// the stripe cannot be read, or its columns cannot be renamed.
Chunk ORCBlockInputFormat::generate()
{
    Chunk res;
    block_missing_values.clear();

    /// The reader is created lazily on the first call (and again after resetParser()).
    if (!file_reader)
        prepareReader();

    /// prepareReader() returns early without creating file_reader when reading was stopped.
    if (is_stopped)
        return {};

    auto result = file_reader->NextStripeReader(format_settings.orc.row_batch_size, include_indices);
    if (!result.ok())
        throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Failed to create batch reader: {}", result.status().ToString());

    std::shared_ptr<arrow::RecordBatchReader> batch_reader = std::move(result).ValueOrDie();
    if (!batch_reader)
        return res;

    std::shared_ptr<arrow::Table> table;
    arrow::Status table_status = batch_reader->ReadAll(&table);
    if (!table_status.ok())
        throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of ORC data: {}", table_status.ToString());

    if (!table || !table->num_rows())
        return res;

    if (format_settings.use_lowercase_column_name)
    {
        /// RenameColumns() returns an error when the number of names does not match the number
        /// of columns. Dereferencing a failed arrow::Result terminates the process, so the status
        /// must be checked and reported as a regular exception instead.
        auto renamed = table->RenameColumns(include_column_names);
        if (!renamed.ok())
            throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while renaming columns of ORC data: {}", renamed.status().ToString());
        table = std::move(renamed).ValueOrDie();
    }

    arrow_column_to_ch_column->arrowTableToCHChunk(res, table);

    /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
    /// Otherwise fill the missing columns with zero values of its type.
    if (format_settings.defaults_for_omitted_fields)
    {
        const size_t num_rows = res.getNumRows();
        for (size_t row_idx = 0; row_idx < num_rows; ++row_idx)
            for (const auto & column_idx : missing_columns)
                block_missing_values.setBit(column_idx, row_idx);
    }

    return res;
}
|
|
|
|
|
|
|
|
void ORCBlockInputFormat::resetParser()
|
|
|
|
{
|
|
|
|
IInputFormat::resetParser();
|
|
|
|
|
|
|
|
file_reader.reset();
|
2021-05-14 10:48:31 +00:00
|
|
|
include_indices.clear();
|
2022-03-11 09:26:13 +00:00
|
|
|
include_column_names.clear();
|
2021-11-30 07:44:59 +00:00
|
|
|
block_missing_values.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the missing-value bitmap that generate() filled for the most recent chunk.
/// generate() sets bits only when defaults_for_omitted_fields is enabled.
const BlockMissingValues & ORCBlockInputFormat::getMissingValues() const
{
    const BlockMissingValues & missing_values = block_missing_values;
    return missing_values;
}
|
|
|
|
|
2021-05-20 13:47:12 +00:00
|
|
|
/// Returns how many ORC column ids the given Arrow type occupies.
///
/// ORC assigns ids by a pre-order walk of the type tree, and every compound node
/// (list, struct, map) takes one id for itself in addition to the ids of its children.
/// A non-compound type takes exactly one id.
static size_t countIndicesForType(std::shared_ptr<arrow::DataType> type)
{
    if (type->id() == arrow::Type::LIST)
        return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type()) + 1;

    if (type->id() == arrow::Type::STRUCT)
    {
        /// One id for the struct itself plus the ids of all its members.
        size_t indices = 1;
        auto * struct_type = static_cast<arrow::StructType *>(type.get());
        for (int i = 0; i != struct_type->num_fields(); ++i)
            indices += countIndicesForType(struct_type->field(i)->type());
        return indices;
    }

    if (type->id() == arrow::Type::MAP)
    {
        /// One id for the map itself plus the ids of its key and value subtrees.
        /// Without the extra id every column that follows a Map gets an index that is one too small.
        auto * map_type = static_cast<arrow::MapType *>(type.get());
        return countIndicesForType(map_type->key_type()) + countIndicesForType(map_type->item_type()) + 1;
    }

    return 1;
}
|
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
static void getFileReaderAndSchema(
|
2021-12-27 17:30:40 +00:00
|
|
|
ReadBuffer & in,
|
|
|
|
std::unique_ptr<arrow::adapters::orc::ORCFileReader> & file_reader,
|
|
|
|
std::shared_ptr<arrow::Schema> & schema,
|
|
|
|
const FormatSettings & format_settings,
|
|
|
|
std::atomic<int> & is_stopped)
|
2021-04-15 04:01:15 +00:00
|
|
|
{
|
2021-12-27 17:30:40 +00:00
|
|
|
auto arrow_file = asArrowFile(in, format_settings, is_stopped);
|
2021-12-27 19:42:56 +00:00
|
|
|
if (is_stopped)
|
|
|
|
return;
|
|
|
|
|
2022-03-02 17:22:12 +00:00
|
|
|
auto result = arrow::adapters::orc::ORCFileReader::Open(arrow_file, arrow::default_memory_pool());
|
2021-11-30 16:39:29 +00:00
|
|
|
if (!result.ok())
|
|
|
|
throw Exception(result.status().ToString(), ErrorCodes::BAD_ARGUMENTS);
|
2021-12-01 11:19:24 +00:00
|
|
|
file_reader = std::move(result).ValueOrDie();
|
2021-05-14 10:48:31 +00:00
|
|
|
|
2021-11-30 16:39:29 +00:00
|
|
|
auto read_schema_result = file_reader->ReadSchema();
|
|
|
|
if (!read_schema_result.ok())
|
|
|
|
throw Exception(read_schema_result.status().ToString(), ErrorCodes::BAD_ARGUMENTS);
|
2021-12-15 11:30:57 +00:00
|
|
|
schema = std::move(read_schema_result).ValueOrDie();
|
2022-03-11 06:34:46 +00:00
|
|
|
|
|
|
|
if (format_settings.use_lowercase_column_name)
|
|
|
|
{
|
|
|
|
std::vector<std::shared_ptr<::arrow::Field>> fields;
|
|
|
|
fields.reserve(schema->num_fields());
|
|
|
|
for (int i = 0; i < schema->num_fields(); ++i)
|
|
|
|
{
|
|
|
|
const auto& field = schema->field(i);
|
|
|
|
auto name = field->name();
|
|
|
|
boost::to_lower(name);
|
|
|
|
fields.push_back(field->WithName(name));
|
|
|
|
}
|
|
|
|
schema = arrow::schema(fields, schema->metadata());
|
|
|
|
}
|
2021-12-15 11:30:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void ORCBlockInputFormat::prepareReader()
|
|
|
|
{
|
2022-03-11 09:26:13 +00:00
|
|
|
std::shared_ptr<arrow::Schema> schema;
|
2021-12-27 17:30:40 +00:00
|
|
|
getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped);
|
|
|
|
if (is_stopped)
|
|
|
|
return;
|
2021-05-14 10:48:31 +00:00
|
|
|
|
2021-11-30 06:52:26 +00:00
|
|
|
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(
|
2022-03-11 06:34:46 +00:00
|
|
|
getPort().getHeader(), "ORC", format_settings.orc.import_nested, format_settings.orc.allow_missing_columns);
|
2021-12-02 08:14:25 +00:00
|
|
|
missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema);
|
2021-07-01 17:59:28 +00:00
|
|
|
|
|
|
|
std::unordered_set<String> nested_table_names;
|
|
|
|
if (format_settings.orc.import_nested)
|
|
|
|
nested_table_names = Nested::getAllTableNames(getPort().getHeader());
|
2021-06-07 15:15:58 +00:00
|
|
|
|
2021-05-24 11:39:08 +00:00
|
|
|
/// In ReadStripe column indices should be started from 1,
|
|
|
|
/// because 0 indicates to select all columns.
|
|
|
|
int index = 1;
|
2021-05-14 10:48:31 +00:00
|
|
|
for (int i = 0; i < schema->num_fields(); ++i)
|
|
|
|
{
|
2021-05-25 12:01:28 +00:00
|
|
|
/// LIST type require 2 indices, STRUCT - the number of elements + 1,
|
|
|
|
/// so we should recursively count the number of indices we need for this type.
|
2021-05-24 11:39:08 +00:00
|
|
|
int indexes_count = countIndicesForType(schema->field(i)->type());
|
2022-03-11 06:34:46 +00:00
|
|
|
const auto & name = schema->field(i)->name();
|
2021-07-01 17:59:28 +00:00
|
|
|
if (getPort().getHeader().has(name) || nested_table_names.contains(name))
|
2021-05-14 10:48:31 +00:00
|
|
|
{
|
|
|
|
for (int j = 0; j != indexes_count; ++j)
|
2022-03-11 09:26:13 +00:00
|
|
|
{
|
2021-05-24 11:39:08 +00:00
|
|
|
include_indices.push_back(index + j);
|
2022-03-11 09:26:13 +00:00
|
|
|
include_column_names.push_back(name);
|
|
|
|
}
|
2021-05-14 10:48:31 +00:00
|
|
|
}
|
2021-05-24 11:39:08 +00:00
|
|
|
index += indexes_count;
|
2021-05-14 10:48:31 +00:00
|
|
|
}
|
2020-02-25 18:20:08 +00:00
|
|
|
}
|
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
/// Creates a schema reader for the ORC data in `in_`; the settings are stored for readSchema().
ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
    : ISchemaReader(in_)
    , format_settings(format_settings_)
{
}
|
|
|
|
|
|
|
|
/// Reads the schema of the ORC file and returns it as a list of column names and types.
NamesAndTypesList ORCSchemaReader::readSchema()
{
    /// getFileReaderAndSchema() needs a stop flag; here it is a fresh local initialised to 0.
    std::atomic<int> is_stopped = 0;

    /// The file reader is only needed to obtain the schema and is discarded afterwards.
    std::unique_ptr<arrow::adapters::orc::ORCFileReader> orc_reader;
    std::shared_ptr<arrow::Schema> arrow_schema;
    getFileReaderAndSchema(in, orc_reader, arrow_schema, format_settings, is_stopped);

    return ArrowColumnToCHColumn::arrowSchemaToCHHeader(*arrow_schema, "ORC").getNamesAndTypesList();
}
|
|
|
|
|
|
|
|
/// Registers the "ORC" input format in the factory and marks it as column-oriented.
void registerInputFormatORC(FormatFactory & factory)
{
    /// The row input format parameters are not used by this format.
    auto create_orc_input = [](ReadBuffer & buf, const Block & sample, const RowInputFormatParams &, const FormatSettings & settings)
    {
        return std::make_shared<ORCBlockInputFormat>(buf, sample, settings);
    };

    factory.registerInputFormat("ORC", create_orc_input);
    factory.markFormatAsColumnOriented("ORC");
}
|
2019-08-21 14:19:47 +00:00
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
/// Registers the schema reader for the "ORC" format in the factory.
void registerORCSchemaReader(FormatFactory & factory)
{
    /// The context argument is not used by this schema reader.
    auto create_orc_schema_reader = [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
    {
        return std::make_shared<ORCSchemaReader>(buf, settings);
    };

    factory.registerSchemaReader("ORC", create_orc_schema_reader);
}
|
|
|
|
|
2019-08-21 14:19:47 +00:00
|
|
|
}
|
|
|
|
#else
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
class FormatFactory;
|
2021-10-11 16:11:50 +00:00
|
|
|
/// Built without ORC support (USE_ORC is off): nothing is registered.
void registerInputFormatORC(FormatFactory &) {}
|
2021-12-15 11:30:57 +00:00
|
|
|
|
|
|
|
/// Built without ORC support (USE_ORC is off): nothing is registered.
void registerORCSchemaReader(FormatFactory &) {}
|
2019-08-21 14:19:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|