#include <Storages/Kafka/StorageKafka.h>

#include <DataStreams/IBlockInputStream.h>
#include <DataStreams/LimitBlockInputStream.h>
#include <DataStreams/UnionBlockInputStream.h>
#include <DataStreams/copyData.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <Interpreters/InterpreterInsertQuery.h>
#include <Interpreters/evaluateConstantExpression.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTInsertQuery.h>
#include <Parsers/ASTLiteral.h>
#include <Storages/Kafka/KafkaSettings.h>
#include <Storages/Kafka/KafkaBlockInputStream.h>
#include <Storages/Kafka/KafkaBlockOutputStream.h>
#include <Storages/Kafka/WriteBufferToKafkaProducer.h>
#include <Storages/StorageFactory.h>
#include <Storages/StorageMaterializedView.h>
#include <boost/algorithm/string/replace.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <Poco/Util/AbstractConfiguration.h>
#include <Common/Exception.h>
#include <Common/Macros.h>
#include <Common/config_version.h>
#include <Common/setThreadName.h>
#include <Common/typeid_cast.h>
#include <common/logger_useful.h>
#include <Common/quoteString.h>
#include <Processors/Sources/SourceFromInputStream.h>
#include <librdkafka/rdkafka.h> // for rd_kafka_wait_destroyed()


namespace DB
{

namespace ErrorCodes
{
    extern const int NOT_IMPLEMENTED;
    extern const int LOGICAL_ERROR;
    extern const int BAD_ARGUMENTS;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}

namespace
{
    const auto RESCHEDULE_MS = 500;
    const auto CLEANUP_TIMEOUT_MS = 3000;

    /// Configuration prefix
    const String CONFIG_PREFIX = "kafka";
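
    /// Copies Kafka-related settings from the server configuration into the librdkafka
    /// configuration object. Keys in the config section use '_' where librdkafka uses '.',
    /// e.g. <auto_offset_reset> maps to the librdkafka setting "auto.offset.reset".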
    void loadFromConfig(cppkafka::Configuration & conf, const Poco::Util::AbstractConfiguration & config, const std::string & path)
    {
        Poco::Util::AbstractConfiguration::Keys keys;
        std::vector<char> errstr(512);

        config.keys(path, keys);

        for (const auto & key : keys)
        {
            const String key_path = path + "." + key;
            const String key_name = boost::replace_all_copy(key, "_", ".");
            conf.set(key_name, config.getString(key_path));
        }
    }
}

StorageKafka::StorageKafka(
    const StorageID & table_id_,
    Context & context_,
    const ColumnsDescription & columns_,
    const String & brokers_,
    const String & group_,
    const Names & topics_,
    const String & format_name_,
    char row_delimiter_,
    const String & schema_name_,
    size_t num_consumers_,
    UInt64 max_block_size_,
    size_t skip_broken_,
    bool intermediate_commit_)
    : IStorage(table_id_,
        ColumnsDescription({{"_topic", std::make_shared<DataTypeString>()},
                            {"_key", std::make_shared<DataTypeString>()},
                            {"_offset", std::make_shared<DataTypeUInt64>()},
                            {"_partition", std::make_shared<DataTypeUInt64>()},
                            {"_timestamp", std::make_shared<DataTypeNullable>(std::make_shared<DataTypeDateTime>())}}, true))
    , global_context(context_.getGlobalContext())
    , kafka_context(Context(global_context))
    , topics(global_context.getMacros()->expand(topics_))
    , brokers(global_context.getMacros()->expand(brokers_))
    , group(global_context.getMacros()->expand(group_))
    , format_name(global_context.getMacros()->expand(format_name_))
    , row_delimiter(row_delimiter_)
    , schema_name(global_context.getMacros()->expand(schema_name_))
    , num_consumers(num_consumers_)
    , max_block_size(max_block_size_)
    , log(&Logger::get("StorageKafka (" + table_id_.table_name + ")"))
    , semaphore(0, num_consumers_)
    , skip_broken(skip_broken_)
    , intermediate_commit(intermediate_commit_)
{
    kafka_context.makeQueryContext();

    setColumns(columns_);
    task = global_context.getSchedulePool().createTask(log->name(), [this]{ threadFunc(); });
    task->deactivate();
}

Pipes StorageKafka::read(
    const Names & column_names,
    const SelectQueryInfo & /* query_info */,
    const Context & context,
    QueryProcessingStage::Enum /* processed_stage */,
    size_t /* max_block_size */,
    unsigned /* num_streams */)
{
    if (num_created_consumers == 0)
        return {};

    /// Always use all consumers at once, otherwise SELECT may not read messages from all partitions.
    Pipes pipes;
    pipes.reserve(num_created_consumers);

    // Claim as many consumers as requested, but don't block
    for (size_t i = 0; i < num_created_consumers; ++i)
    {
        /// Use block size of 1, otherwise LIMIT won't work properly as it will buffer excess messages in the last block
        /// TODO: probably that leads to awful performance.
        /// FIXME: seems that doesn't help with extra reading and committing unprocessed messages.
        /// TODO: rewrite KafkaBlockInputStream to KafkaSource. Now it is used in another place.
        pipes.emplace_back(std::make_shared<SourceFromInputStream>(std::make_shared<KafkaBlockInputStream>(*this, context, column_names, 1)));
    }

    LOG_DEBUG(log, "Starting reading " << pipes.size() << " streams");
    return pipes;
}

BlockOutputStreamPtr StorageKafka::write(const ASTPtr &, const Context & context)
{
    if (topics.size() > 1)
        throw Exception("Can't write to Kafka table with multiple topics!", ErrorCodes::NOT_IMPLEMENTED);
    return std::make_shared<KafkaBlockOutputStream>(*this, context);
}

void StorageKafka::startup()
{
    for (size_t i = 0; i < num_consumers; ++i)
    {
        try
        {
            pushReadBuffer(createReadBuffer());
            ++num_created_consumers;
        }
        catch (const cppkafka::Exception &)
        {
            tryLogCurrentException(log);
        }
    }

    // Start the reader thread
    task->activateAndSchedule();
}

void StorageKafka::shutdown()
{
    // Interrupt streaming thread
    stream_cancelled = true;

    // Close all consumers
    for (size_t i = 0; i < num_created_consumers; ++i)
    {
        auto buffer = popReadBuffer();
        // FIXME: not sure if we really close consumers here, and if we really need to close them here.
    }

    LOG_TRACE(log, "Waiting for cleanup");
    rd_kafka_wait_destroyed(CLEANUP_TIMEOUT_MS);

    task->deactivate();
}

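
/// Consumer read buffers are kept in a small pool guarded by `mutex`: pushReadBuffer() returns a
/// buffer to the pool, popReadBuffer() takes one out. `semaphore` counts the free buffers, so a
/// caller can either block until a buffer is available or give up after a timeout.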
void StorageKafka::pushReadBuffer(ConsumerBufferPtr buffer)
{
    std::lock_guard lock(mutex);
    buffers.push_back(buffer);
    semaphore.set();
}


ConsumerBufferPtr StorageKafka::popReadBuffer()
{
    return popReadBuffer(std::chrono::milliseconds::zero());
}


ConsumerBufferPtr StorageKafka::popReadBuffer(std::chrono::milliseconds timeout)
{
    // Wait for the first free buffer
    if (timeout == std::chrono::milliseconds::zero())
        semaphore.wait();
    else
    {
        if (!semaphore.tryWait(timeout.count()))
            return nullptr;
    }

    // Take the first available buffer from the list
    std::lock_guard lock(mutex);
    auto buffer = buffers.back();
    buffers.pop_back();
    return buffer;
}

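
/// Producer side (INSERTs into the Kafka table): builds a cppkafka::Producer with the same
/// broker/group/client settings as the consumers and wraps it into a WriteBufferToKafkaProducer.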
ProducerBufferPtr StorageKafka::createWriteBuffer(const Block & header)
{
    cppkafka::Configuration conf;
    conf.set("metadata.broker.list", brokers);
    conf.set("group.id", group);
    conf.set("client.id", VERSION_FULL);
    // TODO: fill required settings
    updateConfiguration(conf);

    auto producer = std::make_shared<cppkafka::Producer>(conf);
    const Settings & settings = global_context.getSettingsRef();
    size_t poll_timeout = settings.stream_poll_timeout_ms.totalMilliseconds();

    return std::make_shared<WriteBufferToKafkaProducer>(
        producer, topics[0], row_delimiter ? std::optional<char>{row_delimiter} : std::nullopt, 1, 1024, std::chrono::milliseconds(poll_timeout), header);
}

ConsumerBufferPtr StorageKafka::createReadBuffer()
{
    cppkafka::Configuration conf;
    conf.set("metadata.broker.list", brokers);
    conf.set("group.id", group);
    conf.set("client.id", VERSION_FULL);
    conf.set("auto.offset.reset", "smallest");     // If no offset stored for this group, read all messages from the start
    conf.set("enable.auto.commit", "false");       // We manually commit offsets after a stream successfully finished
    conf.set("enable.auto.offset.store", "false"); // Offsets are stored manually, so that they can all be committed at once
    conf.set("enable.partition.eof", "false");     // Ignore EOF messages
    updateConfiguration(conf);

    // Create a consumer and subscribe to topics
    auto consumer = std::make_shared<cppkafka::Consumer>(conf);

    // Limit the number of batched messages to allow early cancellations
    const Settings & settings = global_context.getSettingsRef();
    size_t batch_size = max_block_size;
    if (!batch_size)
        batch_size = settings.max_block_size.value;
    size_t poll_timeout = settings.stream_poll_timeout_ms.totalMilliseconds();

    /// NOTE: we pass |stream_cancelled| by reference here, so the buffers should not outlive the storage.
    return std::make_shared<ReadBufferFromKafkaConsumer>(consumer, log, batch_size, poll_timeout, intermediate_commit, stream_cancelled, getTopics());
}

void StorageKafka::updateConfiguration(cppkafka::Configuration & conf)
{
    // Update consumer configuration from the configuration
    const auto & config = global_context.getConfigRef();
    if (config.has(CONFIG_PREFIX))
        loadFromConfig(conf, config, CONFIG_PREFIX);

    // Update consumer topic-specific configuration
    for (const auto & topic : topics)
    {
        const auto topic_config_key = CONFIG_PREFIX + "_" + topic;
        if (config.has(topic_config_key))
            loadFromConfig(conf, config, topic_config_key);
    }
}

bool StorageKafka::checkDependencies(const StorageID & table_id)
{
    // Check if all dependencies are attached
    auto dependencies = DatabaseCatalog::instance().getDependencies(table_id);
    if (dependencies.empty())
        return true;

    // Check whether the dependencies are ready
    for (const auto & db_tab : dependencies)
    {
        auto table = DatabaseCatalog::instance().tryGetTable(db_tab);
        if (!table)
            return false;

        // If it is a materialized view, check its target table
        auto * materialized_view = dynamic_cast<StorageMaterializedView *>(table.get());
        if (materialized_view && !materialized_view->tryGetTargetTable())
            return false;

        // Check all its dependencies
        if (!checkDependencies(db_tab))
            return false;
    }

    return true;
}

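
/// Background streaming: threadFunc() runs on the global schedule pool. While the table has attached
/// views (dependencies) and streaming is not cancelled, it repeatedly calls streamToViews(); afterwards
/// it reschedules itself after RESCHEDULE_MS to wait for views to be attached again.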
void StorageKafka::threadFunc()
{
    try
    {
        auto table_id = getStorageID();
        // Check if at least one direct dependency is attached
        size_t dependencies_count = DatabaseCatalog::instance().getDependencies(table_id).size();
        if (dependencies_count)
        {
            // Keep streaming as long as there are attached views and streaming is not cancelled
            while (!stream_cancelled && num_created_consumers > 0)
            {
                if (!checkDependencies(table_id))
                    break;

                LOG_DEBUG(log, "Started streaming to " << dependencies_count << " attached views");

                // Reschedule if not limited
                if (!streamToViews())
                    break;
            }
        }
    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__);
    }

    // Wait for attached views
    if (!stream_cancelled)
        task->scheduleAfter(RESCHEDULE_MS);
}

bool StorageKafka::streamToViews()
{
    auto table_id = getStorageID();
    auto table = DatabaseCatalog::instance().getTable(table_id);
    if (!table)
        throw Exception("Engine table " + table_id.getNameForLogs() + " doesn't exist.", ErrorCodes::LOGICAL_ERROR);

    // Create an INSERT query for streaming data
    auto insert = std::make_shared<ASTInsertQuery>();
    insert->table_id = table_id;

    const Settings & settings = global_context.getSettingsRef();
    size_t block_size = max_block_size;
    if (block_size == 0)
        block_size = settings.max_block_size;

    // Only insert into dependent views and expect that input blocks contain virtual columns
    InterpreterInsertQuery interpreter(insert, kafka_context, false, true, true);
    auto block_io = interpreter.execute();

    // Create a stream for each consumer and join them in a union stream
    BlockInputStreams streams;
    streams.reserve(num_created_consumers);
    for (size_t i = 0; i < num_created_consumers; ++i)
    {
        auto stream
            = std::make_shared<KafkaBlockInputStream>(*this, kafka_context, block_io.out->getHeader().getNames(), block_size, false);
        streams.emplace_back(stream);

        // Limit read batch to maximum block size to allow DDL
        IBlockInputStream::LocalLimits limits;
        limits.speed_limits.max_execution_time = settings.stream_flush_interval_ms;
        limits.timeout_overflow_mode = OverflowMode::BREAK;
        stream->setLimits(limits);
    }

    // Join multiple streams if necessary
    BlockInputStreamPtr in;
    if (streams.size() > 1)
        in = std::make_shared<UnionBlockInputStream>(streams, nullptr, streams.size());
    else
        in = streams[0];

    std::atomic<bool> stub = {false};
    copyData(*in, *block_io.out, &stub);
    for (auto & stream : streams)
        stream->as<KafkaBlockInputStream>()->commit();

    // Check whether the limits were applied during query execution
    bool limits_applied = false;
    const BlockStreamProfileInfo & info = in->getProfileInfo();
    limits_applied = info.hasAppliedLimit();

    return limits_applied;
}

void registerStorageKafka(StorageFactory & factory)
{
    auto creator_fn = [](const StorageFactory::Arguments & args)
    {
        ASTs & engine_args = args.engine_args;
        size_t args_count = engine_args.size();
        bool has_settings = args.storage_def->settings;

        KafkaSettings kafka_settings;
        if (has_settings)
        {
            kafka_settings.loadFromQuery(*args.storage_def);
        }

        /** The engine arguments are the following:
          * - Kafka broker list
          * - List of topics
          * - Group ID (may be a constant expression with a string result)
          * - Message format (string)
          * - Row delimiter
          * - Schema (optional, if the format supports it)
          * - Number of consumers
          * - Max block size for background consumption
          * - Number of unreadable messages to skip (at least)
          * - Do intermediate commits when the batch is consumed and handled
          */
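
        /// An illustrative DDL that maps onto these parameters (using the SETTINGS form checked below):
        ///
        ///     CREATE TABLE queue (timestamp DateTime, message String)
        ///     ENGINE = Kafka
        ///     SETTINGS kafka_broker_list = 'localhost:9092',
        ///              kafka_topic_list = 'topic1,topic2',
        ///              kafka_group_name = 'group1',
        ///              kafka_format = 'JSONEachRow',
        ///              kafka_num_consumers = 1;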

        // Check arguments and settings
        #define CHECK_KAFKA_STORAGE_ARGUMENT(ARG_NUM, PAR_NAME) \
            /* One of the four required arguments is not specified */ \
            if (args_count < (ARG_NUM) && (ARG_NUM) <= 4 && \
                !kafka_settings.PAR_NAME.changed) \
            { \
                throw Exception( \
                    "Required parameter '" #PAR_NAME "' " \
                    "for storage Kafka not specified", \
                    ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); \
            } \
            /* The same argument is given in two places */ \
            if (has_settings && \
                kafka_settings.PAR_NAME.changed && \
                args_count >= (ARG_NUM)) \
            { \
                throw Exception( \
                    "The argument №" #ARG_NUM " of storage Kafka " \
                    "and the parameter '" #PAR_NAME "' " \
                    "in SETTINGS cannot be specified at the same time", \
                    ErrorCodes::BAD_ARGUMENTS); \
            }

        CHECK_KAFKA_STORAGE_ARGUMENT(1, kafka_broker_list)
        CHECK_KAFKA_STORAGE_ARGUMENT(2, kafka_topic_list)
        CHECK_KAFKA_STORAGE_ARGUMENT(3, kafka_group_name)
        CHECK_KAFKA_STORAGE_ARGUMENT(4, kafka_format)
        CHECK_KAFKA_STORAGE_ARGUMENT(5, kafka_row_delimiter)
        CHECK_KAFKA_STORAGE_ARGUMENT(6, kafka_schema)
        CHECK_KAFKA_STORAGE_ARGUMENT(7, kafka_num_consumers)
        CHECK_KAFKA_STORAGE_ARGUMENT(8, kafka_max_block_size)
        CHECK_KAFKA_STORAGE_ARGUMENT(9, kafka_skip_broken_messages)
        CHECK_KAFKA_STORAGE_ARGUMENT(10, kafka_commit_every_batch)

        #undef CHECK_KAFKA_STORAGE_ARGUMENT

        // Get and check broker list
        String brokers = kafka_settings.kafka_broker_list;
        if (args_count >= 1)
        {
            const auto * ast = engine_args[0]->as<ASTLiteral>();
            if (ast && ast->value.getType() == Field::Types::String)
            {
                brokers = safeGet<String>(ast->value);
            }
            else
            {
                throw Exception(String("Kafka broker list must be a string"), ErrorCodes::BAD_ARGUMENTS);
            }
        }

        // Get and check topic list
        String topic_list = kafka_settings.kafka_topic_list.value;
        if (args_count >= 2)
        {
            engine_args[1] = evaluateConstantExpressionAsLiteral(engine_args[1], args.local_context);
            topic_list = engine_args[1]->as<ASTLiteral &>().value.safeGet<String>();
        }

        Names topics;
        boost::split(topics, topic_list, [](char c){ return c == ','; });
        for (String & topic : topics)
        {
            boost::trim(topic);
        }

        // Get and check group name
        String group = kafka_settings.kafka_group_name.value;
        if (args_count >= 3)
        {
            engine_args[2] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[2], args.local_context);
            group = engine_args[2]->as<ASTLiteral &>().value.safeGet<String>();
        }

        // Get and check message format name
        String format = kafka_settings.kafka_format.value;
        if (args_count >= 4)
        {
            engine_args[3] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[3], args.local_context);

            const auto * ast = engine_args[3]->as<ASTLiteral>();
            if (ast && ast->value.getType() == Field::Types::String)
            {
                format = safeGet<String>(ast->value);
            }
            else
            {
                throw Exception("Format must be a string", ErrorCodes::BAD_ARGUMENTS);
            }
        }

        // Parse row delimiter (optional)
        char row_delimiter = kafka_settings.kafka_row_delimiter;
        if (args_count >= 5)
        {
            engine_args[4] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[4], args.local_context);

            const auto * ast = engine_args[4]->as<ASTLiteral>();
            String arg;
            if (ast && ast->value.getType() == Field::Types::String)
            {
                arg = safeGet<String>(ast->value);
            }
            else
            {
                throw Exception("Row delimiter must be a char", ErrorCodes::BAD_ARGUMENTS);
            }
            if (arg.size() > 1)
            {
                throw Exception("Row delimiter must be a char", ErrorCodes::BAD_ARGUMENTS);
            }
            else if (arg.empty())
            {
                row_delimiter = '\0';
            }
            else
            {
                row_delimiter = arg[0];
            }
        }

        // Parse format schema if supported (optional)
        String schema = kafka_settings.kafka_schema.value;
        if (args_count >= 6)
        {
            engine_args[5] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[5], args.local_context);

            const auto * ast = engine_args[5]->as<ASTLiteral>();
            if (ast && ast->value.getType() == Field::Types::String)
            {
                schema = safeGet<String>(ast->value);
            }
            else
            {
                throw Exception("Format schema must be a string", ErrorCodes::BAD_ARGUMENTS);
            }
        }

        // Parse number of consumers (optional)
        UInt64 num_consumers = kafka_settings.kafka_num_consumers;
        if (args_count >= 7)
        {
            const auto * ast = engine_args[6]->as<ASTLiteral>();
            if (ast && ast->value.getType() == Field::Types::UInt64)
            {
                num_consumers = safeGet<UInt64>(ast->value);
            }
            else
            {
                throw Exception("Number of consumers must be a positive integer", ErrorCodes::BAD_ARGUMENTS);
            }
        }

        // Parse max block size (optional)
        UInt64 max_block_size = static_cast<size_t>(kafka_settings.kafka_max_block_size);
        if (args_count >= 8)
        {
            const auto * ast = engine_args[7]->as<ASTLiteral>();
            if (ast && ast->value.getType() == Field::Types::UInt64)
            {
                max_block_size = static_cast<size_t>(safeGet<UInt64>(ast->value));
            }
            else
            {
                // TODO: no check if the integer is really positive
                throw Exception("Maximum block size must be a positive integer", ErrorCodes::BAD_ARGUMENTS);
            }
        }

        size_t skip_broken = static_cast<size_t>(kafka_settings.kafka_skip_broken_messages);
        if (args_count >= 9)
        {
            const auto * ast = engine_args[8]->as<ASTLiteral>();
            if (ast && ast->value.getType() == Field::Types::UInt64)
            {
                skip_broken = static_cast<size_t>(safeGet<UInt64>(ast->value));
            }
            else
            {
                throw Exception("Number of broken messages to skip must be a non-negative integer", ErrorCodes::BAD_ARGUMENTS);
            }
        }

        bool intermediate_commit = static_cast<bool>(kafka_settings.kafka_commit_every_batch);
        if (args_count >= 10)
        {
            const auto * ast = engine_args[9]->as<ASTLiteral>();
            if (ast && ast->value.getType() == Field::Types::UInt64)
            {
                intermediate_commit = static_cast<bool>(safeGet<UInt64>(ast->value));
            }
            else
            {
                throw Exception("Flag for committing every batch must be 0 or 1", ErrorCodes::BAD_ARGUMENTS);
            }
        }

        return StorageKafka::create(
            args.table_id, args.context, args.columns,
            brokers, group, topics, format, row_delimiter, schema, num_consumers, max_block_size, skip_broken, intermediate_commit);
    };

    factory.registerStorage("Kafka", creator_fn, StorageFactory::StorageFeatures{ .supports_settings = true, });
}

}