ClickHouse/src/Storages/Kafka/WriteBufferToKafkaProducer.cpp
Azat Khuzhin 5947e54c1b Fix incorrect assertion during writing to StorageKafka
The problem did not always trigger in CI because the buffers had not been
destroyed by that time.

Fixes: #26547
2021-08-19 22:07:20 +03:00


#include "WriteBufferToKafkaProducer.h"
#include "Core/Block.h"
#include "Columns/ColumnString.h"
#include "Columns/ColumnsNumber.h"
namespace DB
{

WriteBufferToKafkaProducer::WriteBufferToKafkaProducer(
    ProducerPtr producer_,
    const std::string & topic_,
    std::optional<char> delimiter,
    size_t rows_per_message,
    size_t chunk_size_,
    std::chrono::milliseconds poll_timeout,
    const Block & header)
    : WriteBuffer(nullptr, 0)
    , producer(producer_)
    , topic(topic_)
    , delim(delimiter)
    , max_rows(rows_per_message)
    , chunk_size(chunk_size_)
    , timeout(poll_timeout)
{
    if (header.has("_key"))
    {
        auto column_index = header.getPositionByName("_key");
        auto column_info = header.getByPosition(column_index);
        if (isString(column_info.type))
        {
            key_column_index = column_index;
        }
        /// else? (not sure this is a good place to report something to the user)
    }

    if (header.has("_timestamp"))
    {
        auto column_index = header.getPositionByName("_timestamp");
        auto column_info = header.getByPosition(column_index);
        if (isDateTime(column_info.type))
        {
            timestamp_column_index = column_index;
        }
    }

    reinitializeChunks();
}
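
/// By destruction time every accumulated row must already have been turned into a
/// Kafka message; a non-zero count here means data was written into the buffer but
/// never produced (#26547 tracked a case where this assertion fired spuriously).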
WriteBufferToKafkaProducer::~WriteBufferToKafkaProducer()
{
    assert(rows == 0);
}
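
/// Called for every serialized row. Once `max_rows` rows have accumulated, glues
/// the chunks together into a single payload, attaches the key/timestamp taken
/// from the current row, and produces one Kafka message.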
void WriteBufferToKafkaProducer::countRow(const Columns & columns, size_t current_row)
{
    if (++rows % max_rows == 0)
    {
        const std::string & last_chunk = chunks.back();
        size_t last_chunk_size = offset();

        /// If the last character of the last chunk is the delimiter, drop it.
        if (delim && last_chunk[last_chunk_size - 1] == delim)
            --last_chunk_size;

        std::string payload;
        payload.reserve((chunks.size() - 1) * chunk_size + last_chunk_size);

        /// Concatenate all chunks except the last one, ...
        for (auto i = chunks.begin(), e = --chunks.end(); i != e; ++i)
            payload.append(*i);

        /// ... then add the last (partially filled) one.
        payload.append(last_chunk, 0, last_chunk_size);

        cppkafka::MessageBuilder builder(topic);
        builder.payload(payload);

        /// Note: if there are several rows per message, the key and timestamp are
        /// taken from the last row that went into the message.
        if (key_column_index)
        {
            const auto & key_column = assert_cast<const ColumnString &>(*columns[key_column_index.value()]);
            const auto key_data = key_column.getDataAt(current_row);
            builder.key(cppkafka::Buffer(key_data.data, key_data.size));
        }

        if (timestamp_column_index)
        {
            const auto & timestamp_column = assert_cast<const ColumnUInt32 &>(*columns[timestamp_column_index.value()]);
            const auto timestamp = std::chrono::seconds{timestamp_column.getElement(current_row)};
            builder.timestamp(timestamp);
        }

        /// Keep retrying while librdkafka's local queue is full; polling gives it a
        /// chance to drain the queue.
        while (true)
        {
            try
            {
                producer->produce(builder);
            }
            catch (cppkafka::HandleException & e)
            {
                if (e.get_error() == RD_KAFKA_RESP_ERR__QUEUE_FULL)
                {
                    producer->poll(timeout);
                    continue;
                }
                throw;
            }

            break;
        }

        reinitializeChunks();
    }
}
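
/// Waits until librdkafka has delivered everything that was produced so far,
/// retrying on internal timeouts.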
void WriteBufferToKafkaProducer::flush()
{
    /// For an unknown reason we may hit some internal timeout when inserting for the first time.
    while (true)
    {
        try
        {
            producer->flush(timeout);
        }
        catch (cppkafka::HandleException & e)
        {
            if (e.get_error() == RD_KAFKA_RESP_ERR__TIMED_OUT)
                continue;

            throw;
        }

        break;
    }
}
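
/// WriteBuffer calls nextImpl() when the current working buffer is exhausted;
/// respond by allocating a fresh chunk and pointing the buffer at it.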
void WriteBufferToKafkaProducer::nextImpl()
{
    addChunk();
}
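
/// Appends a new chunk of chunk_size bytes and makes it the active working buffer.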
void WriteBufferToKafkaProducer::addChunk()
{
    chunks.push_back(std::string());
    chunks.back().resize(chunk_size);
    set(chunks.back().data(), chunk_size);
}

void WriteBufferToKafkaProducer::reinitializeChunks()
{
    rows = 0;
    chunks.clear();
    /// We cannot leave the buffer in an undefined state (i.e. without any
    /// underlying buffer), since in that case WriteBuffer::next() will
    /// not call our nextImpl() (because available() == 0).
    addChunk();
}

}
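
/// A minimal usage sketch, not part of this translation unit. The `producer`,
/// `header`, `block`, and the serialization step are placeholders; in ClickHouse
/// the buffer is normally driven by the Kafka output stream, which formats rows
/// into it and calls countRow() after each row:
///
///     WriteBufferToKafkaProducer buf(
///         producer, "some_topic", /* delimiter = */ '\n', /* rows_per_message = */ 1,
///         /* chunk_size = */ 1024, std::chrono::milliseconds(100), header);
///     /* format one row into buf via the chosen output format, then: */
///     buf.countRow(block.getColumns(), row);
///     buf.flush(); /* deliver everything; the destructor asserts rows == 0 */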