2019-08-21 14:19:47 +00:00
|
|
|
#include "ORCBlockInputFormat.h"
|
|
|
|
#if USE_ORC
|
|
|
|
|
|
|
|
#include <Formats/FormatFactory.h>
|
|
|
|
#include <IO/ReadBufferFromMemory.h>
|
|
|
|
#include <IO/WriteHelpers.h>
|
|
|
|
#include <IO/copyData.h>
|
2020-05-03 23:19:56 +00:00
|
|
|
#include <arrow/adapters/orc/adapter.h>
|
2019-08-21 14:19:47 +00:00
|
|
|
#include <arrow/io/memory.h>
|
2020-05-04 00:52:28 +00:00
|
|
|
#include "ArrowBufferedStreams.h"
|
2019-08-21 14:19:47 +00:00
|
|
|
#include "ArrowColumnToCHColumn.h"
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2020-02-25 18:20:08 +00:00
|
|
|
|
2020-02-25 18:10:48 +00:00
|
|
|
/// Error codes thrown from this file. They are only declared here (extern);
/// BAD_ARGUMENTS is used by THROW_ARROW_NOT_OK and CANNOT_READ_ALL_DATA by generate().
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int CANNOT_READ_ALL_DATA;
}
|
2019-08-21 14:19:47 +00:00
|
|
|
|
2021-04-15 04:01:15 +00:00
|
|
|
/// Evaluates `status` exactly once and, if the resulting arrow::Status is not OK,
/// throws a DB Exception with code BAD_ARGUMENTS carrying the Arrow error text.
/// Wrapped in do { } while (false) so the macro behaves like a single statement.
#define THROW_ARROW_NOT_OK(status)                                                    \
    do                                                                                \
    {                                                                                 \
        const ::arrow::Status arrow_status_ = (status);                               \
        if (!arrow_status_.ok())                                                      \
            throw Exception(arrow_status_.ToString(), ErrorCodes::BAD_ARGUMENTS);     \
    } while (false)
|
|
|
|
|
2020-05-02 19:40:50 +00:00
|
|
|
/// Forwards the header and the input buffer to IInputFormat and does nothing else.
/// The Arrow ORC reader is not opened here; generate() creates it on its first call.
ORCBlockInputFormat::ORCBlockInputFormat(ReadBuffer & in_, Block header_)
    : IInputFormat(std::move(header_), in_)
{
}
|
2019-08-21 14:19:47 +00:00
|
|
|
|
2020-02-25 18:20:08 +00:00
|
|
|
/// Reads one ORC stripe per call and converts it into a Chunk.
/// Returns a default-constructed Chunk once every stripe has been consumed.
/// Throws ParsingException (CANNOT_READ_ALL_DATA) if Arrow fails to read the stripe
/// or to build a table from it.
Chunk ORCBlockInputFormat::generate()
{
    Chunk chunk;

    /// The reader is created on demand: on the first call and after resetParser().
    if (!file_reader)
        prepareReader();

    const bool all_stripes_consumed = stripe_current >= stripe_total;
    if (all_stripes_consumed)
        return chunk;

    std::shared_ptr<arrow::RecordBatch> stripe_batch;
    const arrow::Status read_status = file_reader->ReadStripe(stripe_current, include_indices, &stripe_batch);
    if (!read_status.ok())
        throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA,
            "Error while reading batch of ORC data: {}", read_status.ToString());

    auto stripe_table = arrow::Table::FromRecordBatches({stripe_batch});
    if (!stripe_table.ok())
        throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA,
            "Error while reading batch of ORC data: {}", stripe_table.status().ToString());

    /// The stripe counter moves forward only after the stripe was read successfully.
    ++stripe_current;

    const Block & header = getPort().getHeader();
    ArrowColumnToCHColumn::arrowTableToCHChunk(chunk, *stripe_table, header, "ORC");
    return chunk;
}
|
|
|
|
|
|
|
|
void ORCBlockInputFormat::resetParser()
|
|
|
|
{
|
|
|
|
IInputFormat::resetParser();
|
|
|
|
|
|
|
|
file_reader.reset();
|
2021-05-14 10:48:31 +00:00
|
|
|
include_indices.clear();
|
2021-04-18 01:51:53 +00:00
|
|
|
stripe_current = 0;
|
2021-04-15 04:01:15 +00:00
|
|
|
}
|
|
|
|
|
2021-05-20 13:47:12 +00:00
|
|
|
static size_t countIndicesForType(std::shared_ptr<arrow::DataType> type)
|
2021-05-14 10:48:31 +00:00
|
|
|
{
|
|
|
|
if (type->id() == arrow::Type::LIST)
|
|
|
|
return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type()) + 1;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2021-04-15 04:01:15 +00:00
|
|
|
void ORCBlockInputFormat::prepareReader()
|
|
|
|
{
|
|
|
|
THROW_ARROW_NOT_OK(arrow::adapters::orc::ORCFileReader::Open(asArrowFile(in), arrow::default_memory_pool(), &file_reader));
|
|
|
|
stripe_total = file_reader->NumberOfStripes();
|
|
|
|
stripe_current = 0;
|
2021-05-14 10:48:31 +00:00
|
|
|
|
|
|
|
std::shared_ptr<arrow::Schema> schema;
|
|
|
|
THROW_ARROW_NOT_OK(file_reader->ReadSchema(&schema));
|
|
|
|
|
|
|
|
int index = 0;
|
|
|
|
for (int i = 0; i < schema->num_fields(); ++i)
|
|
|
|
{
|
|
|
|
if (getPort().getHeader().has(schema->field(i)->name()))
|
|
|
|
{
|
|
|
|
/// LIST type require 2 indices, so we should recursively
|
|
|
|
/// count the number of indices we need for this type.
|
|
|
|
int indexes_count = countIndicesForType(schema->field(i)->type());
|
|
|
|
for (int j = 0; j != indexes_count; ++j)
|
|
|
|
include_indices.push_back(index++);
|
|
|
|
}
|
|
|
|
}
|
2020-02-25 18:20:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Registers the "ORC" input format in the factory and marks it as column-oriented.
/// The creator ignores the row-input params and the format settings: an
/// ORCBlockInputFormat only needs the input buffer and the sample block.
void registerInputFormatProcessorORC(FormatFactory &factory)
{
    auto create_orc_input = [](ReadBuffer & input,
                               const Block & header,
                               const RowInputFormatParams & /* params */,
                               const FormatSettings & /* settings */)
    {
        return std::make_shared<ORCBlockInputFormat>(input, header);
    };

    factory.registerInputFormatProcessor("ORC", std::move(create_orc_input));
    factory.markFormatAsColumnOriented("ORC");
}
|
2019-08-21 14:19:47 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
|
|
|
|
namespace DB
{

class FormatFactory;

/// Stub used when the file is compiled without USE_ORC: the registration hook
/// still exists, but it registers nothing.
void registerInputFormatProcessorORC(FormatFactory & /* factory */)
{
}

}
|
|
|
|
|
|
|
|
#endif
|