2019-08-21 14:19:47 +00:00
|
|
|
#include "ORCBlockInputFormat.h"
|
|
|
|
#if USE_ORC
|
|
|
|
|
|
|
|
#include <Formats/FormatFactory.h>
|
|
|
|
#include <IO/BufferBase.h>
|
|
|
|
#include <IO/ReadBufferFromMemory.h>
|
|
|
|
#include <IO/WriteBufferFromString.h>
|
|
|
|
#include <IO/WriteHelpers.h>
|
|
|
|
#include <IO/copyData.h>
|
|
|
|
#include <arrow/io/memory.h>
|
|
|
|
#include "ArrowColumnToCHColumn.h"
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2020-02-25 18:20:08 +00:00
|
|
|
|
2020-02-25 18:10:48 +00:00
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int CANNOT_READ_ALL_DATA;
|
|
|
|
}
|
2019-08-21 14:19:47 +00:00
|
|
|
|
|
|
|
|
2020-05-02 19:40:50 +00:00
|
|
|
/// Constructs the ORC input format.
/// The header block is moved into the IInputFormat base; the read buffer is passed to it by reference.
ORCBlockInputFormat::ORCBlockInputFormat(ReadBuffer & in_, Block header_)
    : IInputFormat(std::move(header_), in_)
{
}
|
2019-08-21 14:19:47 +00:00
|
|
|
|
2020-02-25 18:20:08 +00:00
|
|
|
/// Produces the next Chunk of data from the ORC input.
///
/// While `in` still has data, the whole remaining input is copied into `file_data`,
/// an Arrow ORC reader is opened over that memory, the complete table is read and
/// converted into a Chunk laid out according to the output port header.
/// When `in` is already at EOF, or the opened file reports zero rows, an empty Chunk is returned.
///
/// Throws Exception with code CANNOT_READ_ALL_DATA when:
///  - new input is available but the previous input was not marked as fully consumed;
///  - the ORC reader cannot be opened over the input (previously this was silently
///    reported as an empty result, which hid malformed input);
///  - reading the table from the ORC reader fails.
Chunk ORCBlockInputFormat::generate()
{
    Chunk res;

    const auto & header = getPort().getHeader();

    /// Nothing left in the input buffer: signal the end of data with an empty chunk.
    if (in.eof())
        return res;

    if (row_group_current < row_group_total)
        throw Exception{"Got new data, but data from previous chunks was not read " +
                        std::to_string(row_group_current) + "/" + std::to_string(row_group_total),
                        ErrorCodes::CANNOT_READ_ALL_DATA};

    /// Copy the entire remaining input into memory. The inner scope destroys the write buffer
    /// before file_data is used below.
    file_data.clear();
    {
        WriteBufferFromString file_buffer(file_data);
        copyData(in, file_buffer);
    }

    /// The Arrow buffer is constructed over file_data, which is a member of this object.
    /// NOTE(review): presumably file_data is kept as a member so the bytes stay valid
    /// for as long as file_reader is alive - confirm against the Arrow BufferReader contract.
    std::unique_ptr<arrow::Buffer> local_buffer = std::make_unique<arrow::Buffer>(file_data);
    std::shared_ptr<arrow::io::RandomAccessFile> in_stream = std::make_shared<arrow::io::BufferReader>(*local_buffer);

    /// Do not swallow an open failure: report it instead of returning an empty chunk.
    arrow::Status open_status = arrow::adapters::orc::ORCFileReader::Open(
        in_stream, arrow::default_memory_pool(), &file_reader);
    if (!open_status.ok())
        throw Exception{"Error while opening ORC data: " + open_status.ToString(),
                        ErrorCodes::CANNOT_READ_ALL_DATA};

    /// NOTE(review): row_group_total is filled from NumberOfRows(), while row_group_current is
    /// incremented only once per whole-table read below, so the two counters only agree
    /// for single-row files. Confirm whether stripe-based accounting was intended.
    row_group_total = file_reader->NumberOfRows();
    row_group_current = 0;

    if (row_group_current >= row_group_total)
        return res;

    std::shared_ptr<arrow::Table> table;
    arrow::Status read_status = file_reader->Read(&table);
    if (!read_status.ok())
        throw Exception{"Error while reading ORC data: " + read_status.ToString(),
                        ErrorCodes::CANNOT_READ_ALL_DATA};

    ++row_group_current;

    ArrowColumnToCHColumn::arrowTableToCHChunk(res, table, header, "ORC");

    return res;
}
|
|
|
|
|
|
|
|
void ORCBlockInputFormat::resetParser()
|
|
|
|
{
|
|
|
|
IInputFormat::resetParser();
|
|
|
|
|
|
|
|
file_reader.reset();
|
|
|
|
file_data.clear();
|
|
|
|
row_group_total = 0;
|
|
|
|
row_group_current = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Registers the "ORC" input format processor in the given factory.
void registerInputFormatProcessorORC(FormatFactory & factory)
{
    /// The row-input parameters and the format settings are accepted but not used:
    /// the ORC format only needs the read buffer and the sample block.
    auto make_orc_input = [](ReadBuffer & read_buffer,
                             const Block & header_sample,
                             const RowInputFormatParams & /* params */,
                             const FormatSettings & /* settings */)
    {
        return std::make_shared<ORCBlockInputFormat>(read_buffer, header_sample);
    };

    factory.registerInputFormatProcessor("ORC", make_orc_input);
}
|
2019-08-21 14:19:47 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
class FormatFactory;
|
|
|
|
/// Variant compiled in the #else branch of USE_ORC: registers nothing.
void registerInputFormatProcessorORC(FormatFactory & /* factory */)
{
    /// Intentionally empty.
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|