#include <Storages/Kafka/KafkaSource.h>

#include <Formats/FormatFactory.h>
#include <IO/EmptyReadBuffer.h>
#include <Storages/Kafka/KafkaConsumer.h>
#include <Processors/Executors/StreamingFormatExecutor.h>
#include <Common/logger_useful.h>
#include <Interpreters/Context.h>

#include <Common/ProfileEvents.h>

namespace ProfileEvents
{
    extern const Event KafkaMessagesRead;
    extern const Event KafkaMessagesFailed;
    extern const Event KafkaRowsRead;
    extern const Event KafkaRowsRejected;
}

namespace DB
{
namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
}

// with default poll timeout (500ms) it will give about 5 sec delay for doing 10 retries
// when selecting from empty topic
const auto MAX_FAILED_POLL_ATTEMPTS = 10;

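// KafkaSource reads messages from a single Kafka consumer and turns them into chunks.
// It is effectively a one-shot source: generateImpl() produces at most one chunk and then
// reports itself finished (see is_finished below).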
KafkaSource::KafkaSource(
    StorageKafka & storage_,
    const StorageSnapshotPtr & storage_snapshot_,
    const ContextPtr & context_,
    const Names & columns,
    Poco::Logger * log_,
    size_t max_block_size_,
    bool commit_in_suffix_)
    : ISource(storage_snapshot_->getSampleBlockForColumns(columns))
    , storage(storage_)
    , storage_snapshot(storage_snapshot_)
    , context(context_)
    , column_names(columns)
    , log(log_)
    , max_block_size(max_block_size_)
    , commit_in_suffix(commit_in_suffix_)
    , non_virtual_header(storage_snapshot->metadata->getSampleBlockNonMaterialized())
    , virtual_header(storage_snapshot->getSampleBlockForColumns(storage.getVirtualColumnNames()))
    , handle_error_mode(storage.getHandleKafkaErrorMode())
{
}

KafkaSource::~KafkaSource()
{
    if (!consumer)
        return;

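    // 'broken' is set when a read starts and is cleared only by a successful commit();
    // if it is still set here, drop the subscription so the uncommitted messages
    // are re-delivered after the next rebalance.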
    if (broken)
        consumer->unsubscribe();

    storage.pushConsumer(consumer);
}

bool KafkaSource::checkTimeLimit() const
{
    if (max_execution_time != 0)
    {
        auto elapsed_ns = total_stopwatch.elapsed();

        if (elapsed_ns > static_cast<UInt64>(max_execution_time.totalMicroseconds()) * 1000)
            return false;
    }

    return true;
}

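// Reads from the consumer until either max_block_size rows are accumulated, the time limit expires,
// or polling fails MAX_FAILED_POLL_ATTEMPTS times in a row, and returns the parsed rows together
// with the Kafka virtual columns as a single chunk.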
Chunk KafkaSource::generateImpl()
{
    if (!consumer)
    {
        auto timeout = std::chrono::milliseconds(context->getSettingsRef().kafka_max_wait_ms.totalMilliseconds());
        consumer = storage.popConsumer(timeout);

        if (!consumer)
            return {};

        consumer->subscribe();

        broken = true;
    }

    if (is_finished)
        return {};

    is_finished = true;
    // Now it's a one-time-use source:
    // one block of the needed size (or with the desired flush timeout) is formed in one internal iteration,
    // otherwise the external iteration would reuse it and the logic would become even more fuzzy.
    MutableColumns virtual_columns = virtual_header.cloneEmptyColumns();

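    // In HandleKafkaErrorMode::STREAM mode parsing errors do not abort the query: failed messages
    // produce a single row of defaults, and the raw message plus the error text are recorded in
    // the extra virtual columns filled below.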
    auto put_error_to_stream = handle_error_mode == HandleKafkaErrorMode::STREAM;

    EmptyReadBuffer empty_buf;
    auto input_format = FormatFactory::instance().getInput(
        storage.getFormatName(), empty_buf, non_virtual_header, context, max_block_size, std::nullopt, 1);

    std::optional<std::string> exception_message;
    size_t total_rows = 0;
    size_t failed_poll_attempts = 0;

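    // on_error is called by the streaming format executor when parsing a message throws.
    // In STREAM mode the partially parsed row is rolled back and replaced with a row of defaults
    // (so the error can still be reported through the virtual columns); otherwise the exception
    // is rethrown with the message coordinates attached.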
    auto on_error = [&](const MutableColumns & result_columns, Exception & e)
    {
        ProfileEvents::increment(ProfileEvents::KafkaMessagesFailed);

        if (put_error_to_stream)
        {
            exception_message = e.message();
            for (const auto & column : result_columns)
            {
                // Parsing could have already pushed some rows to result_columns
                // before the exception was thrown; roll them back.
                auto cur_rows = column->size();
                if (cur_rows > total_rows)
                    column->popBack(cur_rows - total_rows);

                // All data columns get a default value in case of error.
                column->insertDefault();
            }

            return 1;
        }
        else
        {
            e.addMessage("while parsing Kafka message (topic: {}, partition: {}, offset: {})",
                consumer->currentTopic(), consumer->currentPartition(), consumer->currentOffset());
            throw std::move(e);
        }
    };

    StreamingFormatExecutor executor(non_virtual_header, input_format, std::move(on_error));

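    // Poll loop: each iteration consumes at most one Kafka message, feeds it to the format executor
    // and, for every row parsed from it, appends the per-message metadata to the virtual columns.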
    while (true)
    {
        size_t new_rows = 0;
        exception_message.reset();
        if (auto buf = consumer->consume())
        {
            ProfileEvents::increment(ProfileEvents::KafkaMessagesRead);
            new_rows = executor.execute(*buf);
        }

        if (new_rows)
        {
            // While consuming the message, KafkaConsumer::nextImpl()
            // may have been called, which can invalidate some state, e.g. clear
            // KafkaConsumer::messages, which is accessed from
            // KafkaConsumer::currentTopic() (and other helpers).
            if (consumer->isStalled())
                throw Exception(ErrorCodes::LOGICAL_ERROR, "Polled messages became unusable");

            ProfileEvents::increment(ProfileEvents::KafkaRowsRead, new_rows);

            consumer->storeLastReadMessageOffset();

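            // Per-message metadata; the same values are repeated for every row parsed from this message.
            // The indices used below follow the order of storage.getVirtualColumnNames():
            // topic, key, offset, partition, timestamp (s), timestamp (ms), header names/values,
            // plus raw message and error text in STREAM error mode.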
            auto topic = consumer->currentTopic();
            auto key = consumer->currentKey();
            auto offset = consumer->currentOffset();
            auto partition = consumer->currentPartition();
            auto timestamp_raw = consumer->currentTimestamp();
            auto header_list = consumer->currentHeaderList();

            Array headers_names;
            Array headers_values;

            if (!header_list.empty())
            {
                headers_names.reserve(header_list.size());
                headers_values.reserve(header_list.size());
                for (const auto & header : header_list)
                {
                    headers_names.emplace_back(header.get_name());
                    headers_values.emplace_back(static_cast<std::string>(header.get_value()));
                }
            }

            for (size_t i = 0; i < new_rows; ++i)
            {
                virtual_columns[0]->insert(topic);
                virtual_columns[1]->insert(key);
                virtual_columns[2]->insert(offset);
                virtual_columns[3]->insert(partition);
                if (timestamp_raw)
                {
                    auto ts = timestamp_raw->get_timestamp();
                    virtual_columns[4]->insert(std::chrono::duration_cast<std::chrono::seconds>(ts).count());
                    virtual_columns[5]->insert(DecimalField<Decimal64>(std::chrono::duration_cast<std::chrono::milliseconds>(ts).count(), 3));
                }
                else
                {
                    virtual_columns[4]->insertDefault();
                    virtual_columns[5]->insertDefault();
                }
                virtual_columns[6]->insert(headers_names);
                virtual_columns[7]->insert(headers_values);
                if (put_error_to_stream)
                {
                    if (exception_message)
                    {
                        auto payload = consumer->currentPayload();
                        virtual_columns[8]->insert(payload);
                        virtual_columns[9]->insert(*exception_message);
                    }
                    else
                    {
                        virtual_columns[8]->insertDefault();
                        virtual_columns[9]->insertDefault();
                    }
                }
            }

            total_rows = total_rows + new_rows;
        }
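        // No rows were produced: either the polled data became unusable (e.g. after a rebalance),
        // the poll returned nothing (stalled consumer), or the message was a tombstone / parsed to zero rows.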
        else if (consumer->polledDataUnusable())
        {
            break;
        }
        else if (consumer->isStalled())
        {
            ++failed_poll_attempts;
        }
        else
        {
            // We get here for tombstone (or sometimes zero-length) messages, which is not abnormal.
            // TODO: it seems like in case of put_error_to_stream=true we may need to process those differently;
            // currently we just skip them with a note in the logs.
            consumer->storeLastReadMessageOffset();
            LOG_DEBUG(log, "Parsing of message (topic: {}, partition: {}, offset: {}) returned no rows.", consumer->currentTopic(), consumer->currentPartition(), consumer->currentOffset());
        }

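        // Stop once nothing is left in the current poll batch and either the block is full,
        // the time limit is exceeded, or too many consecutive polls came back empty.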
        if (!consumer->hasMorePolledMessages()
            && (total_rows >= max_block_size || !checkTimeLimit() || failed_poll_attempts >= MAX_FAILED_POLL_ATTEMPTS))
        {
            break;
        }
    }

    if (total_rows == 0)
    {
        return {};
    }
    else if (consumer->polledDataUnusable())
    {
        // the rows were counted already before by KafkaRowsRead,
        // so let's count the rows we ignore separately
        // (they will be retried after the rebalance)
        ProfileEvents::increment(ProfileEvents::KafkaRowsRejected, total_rows);
        return {};
    }

    /// MATERIALIZED columns can be added here, but I think
    /// they are not needed here:
    /// it is misleading to use them here,
    /// as columns 'materialized' that way stay 'ephemeral',
    /// i.e. they will not be stored anywhere.
    /// If any extra columns are needed, they can be added at the MV level using DEFAULT.

    auto result_block = non_virtual_header.cloneWithColumns(executor.getResultColumns());
    auto virtual_block = virtual_header.cloneWithColumns(std::move(virtual_columns));

    for (const auto & column : virtual_block.getColumnsWithTypeAndName())
        result_block.insert(column);

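    // Convert the assembled block to the header expected by this source's output port
    // (columns are matched by name).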
    auto converting_dag = ActionsDAG::makeConvertingActions(
        result_block.cloneEmpty().getColumnsWithTypeAndName(),
        getPort().getHeader().getColumnsWithTypeAndName(),
        ActionsDAG::MatchColumnsMode::Name);

    auto converting_actions = std::make_shared<ExpressionActions>(std::move(converting_dag));
    converting_actions->execute(result_block);

    return Chunk(result_block.getColumns(), result_block.rows());
}

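// generate() wraps generateImpl() and, when commit_in_suffix is set, commits the offsets
// once the (single) chunk has been produced.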
Chunk KafkaSource::generate()
{
    auto chunk = generateImpl();
    if (!chunk && commit_in_suffix)
        commit();

    return chunk;
}

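// commit() flushes the stored offsets through the consumer and clears 'broken',
// so the destructor will return the consumer to the pool without unsubscribing.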
void KafkaSource::commit()
{
    if (!consumer)
        return;

    consumer->commit();

    broken = false;
}

}