ClickHouse/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp

112 lines
3.2 KiB
C++
Raw Normal View History

#include "ParquetBlockOutputFormat.h"
2019-06-25 17:19:32 +00:00
#if USE_PARQUET
// TODO: clean includes
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <Common/assert_cast.h>
#include <Core/callOnTypeIndex.h>
#include <DataStreams/SquashingBlockOutputStream.h>
#include <Formats/FormatFactory.h>
#include <IO/WriteHelpers.h>
#include <arrow/api.h>
#include <arrow/util/memory.h>
#include <parquet/arrow/writer.h>
2020-05-03 18:12:14 +00:00
#include "ArrowBufferedStreams.h"
#include "CHColumnToArrowColumn.h"
namespace DB
{
namespace ErrorCodes
{
extern const int UNKNOWN_EXCEPTION;
}
2019-08-03 11:02:40 +00:00
ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_)
2021-06-07 15:15:58 +00:00
: IOutputFormat(header_, out_), format_settings{format_settings_}
{
}
void ParquetBlockOutputFormat::consume(Chunk chunk)
{
const Block & header = getPort(PortKind::Main).getHeader();
const size_t columns_num = chunk.getNumColumns();
std::shared_ptr<arrow::Table> arrow_table;
2021-06-07 15:15:58 +00:00
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num);
if (!file_writer)
{
2020-05-03 02:46:36 +00:00
auto sink = std::make_shared<ArrowBufferedOutputStream>(out);
parquet::WriterProperties::Builder builder;
#if USE_SNAPPY
builder.compression(parquet::Compression::SNAPPY);
#endif
auto props = builder.build();
auto status = parquet::arrow::FileWriter::Open(
*arrow_table->schema(),
arrow::default_memory_pool(),
sink,
props, /*parquet::default_writer_properties(),*/
&file_writer);
if (!status.ok())
throw Exception{"Error while opening a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
2021-06-07 15:15:58 +00:00
ch_column_to_arrow_column = std::make_unique<CHColumnToArrowColumn>(header, "Parquet");
}
// TODO: calculate row_group_size depending on a number of rows and table size
auto status = file_writer->WriteTable(*arrow_table, format_settings.parquet.row_group_size);
if (!status.ok())
throw Exception{"Error while writing a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
}
void ParquetBlockOutputFormat::finalize()
{
if (!file_writer)
{
const Block & header = getPort(PortKind::Main).getHeader();
2020-08-07 07:40:05 +00:00
consume(Chunk(header.getColumns(), 0));
}
auto status = file_writer->Close();
if (!status.ok())
throw Exception{"Error while closing a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
}
void registerOutputFormatProcessorParquet(FormatFactory & factory)
{
factory.registerOutputFormatProcessor(
"Parquet",
[](WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams &,
const FormatSettings & format_settings)
{
auto impl = std::make_shared<ParquetBlockOutputFormat>(buf, sample, format_settings);
/// TODO
// auto res = std::make_shared<SquashingBlockOutputStream>(impl, impl->getHeader(), format_settings.parquet.row_group_size, 0);
// res->disableFlush();
return impl;
});
}
}
#else
namespace DB
{
class FormatFactory;
void registerOutputFormatProcessorParquet(FormatFactory &)
{
}
}
#endif