// ClickHouse/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp

#include "ParquetBlockOutputFormat.h"
#if USE_PARQUET
#include <Formats/FormatFactory.h>
#include <parquet/arrow/writer.h>
#include "ArrowBufferedStreams.h"
#include "CHColumnToArrowColumn.h"
namespace DB
{
/// Error codes referenced in this translation unit; only declared here.
namespace ErrorCodes
{
    extern const int UNKNOWN_EXCEPTION;
}
/// Constructs the Parquet output format.
///
/// `header_` and `out_` are forwarded to the IOutputFormat base class;
/// `format_settings_` is stored in the `format_settings` member.
ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_)
    : IOutputFormat(header_, out_), format_settings{format_settings_}
{
}
/// Converts `chunk` into an Arrow table and writes it through the Parquet file writer.
///
/// Both the ClickHouse-to-Arrow column converter and the Parquet file writer are
/// created lazily on the first call: the writer needs the Arrow schema, which is
/// only known after the first chunk has been converted.
///
/// Throws Exception with ErrorCodes::UNKNOWN_EXCEPTION if the writer cannot be
/// opened or the table cannot be written.
void ParquetBlockOutputFormat::consume(Chunk chunk)
{
    const size_t columns_num = chunk.getNumColumns();
    std::shared_ptr<arrow::Table> arrow_table;

    if (!ch_column_to_arrow_column)
    {
        const Block & header = getPort(PortKind::Main).getHeader();
        ch_column_to_arrow_column = std::make_unique<CHColumnToArrowColumn>(header, "Parquet", false);
    }

    ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num);

    if (!file_writer)
    {
        auto sink = std::make_shared<ArrowBufferedOutputStream>(out);

        /// Snappy compression is requested only when the build provides it;
        /// otherwise the builder's defaults are used.
        parquet::WriterProperties::Builder builder;
#if USE_SNAPPY
        builder.compression(parquet::Compression::SNAPPY);
#endif
        auto props = builder.build();

        auto status = parquet::arrow::FileWriter::Open(
            *arrow_table->schema(),
            arrow::default_memory_pool(),
            sink,
            props,
            &file_writer);
        if (!status.ok())
            throw Exception{"Error while opening a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
    }

    // TODO: calculate row_group_size depending on a number of rows and table size
    auto status = file_writer->WriteTable(*arrow_table, format_settings.parquet.row_group_size);
    if (!status.ok())
        throw Exception{"Error while writing a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
}
void ParquetBlockOutputFormat::finalize()
{
if (!file_writer)
{
const Block & header = getPort(PortKind::Main).getHeader();
2020-08-07 07:40:05 +00:00
consume(Chunk(header.getColumns(), 0));
}
auto status = file_writer->Close();
if (!status.ok())
throw Exception{"Error while closing a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
}
/// Registers the "Parquet" output format in `factory`.
///
/// The registered creator builds a ParquetBlockOutputFormat from the write buffer,
/// the sample block and the format settings; the RowOutputFormatParams argument is
/// not used.
void registerOutputFormatParquet(FormatFactory & factory)
{
    factory.registerOutputFormat(
        "Parquet",
        [](WriteBuffer & buf,
           const Block & sample,
           const RowOutputFormatParams &,
           const FormatSettings & format_settings)
        {
            return std::make_shared<ParquetBlockOutputFormat>(buf, sample, format_settings);
        });
}
}
#else
namespace DB
{

class FormatFactory;

/// Stub used when Parquet support is compiled out (USE_PARQUET is not set):
/// nothing is registered, but the symbol still exists for callers.
void registerOutputFormatParquet(FormatFactory &)
{
}

}
#endif