ClickHouse/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.h

180 lines
7.0 KiB
C++
Raw Normal View History

#pragma once
#include <Core/PostgreSQL/Connection.h>
#include <Core/PostgreSQL/insertPostgreSQLValue.h>
2021-02-06 12:28:42 +00:00
#include <Core/BackgroundSchedulePool.h>
2021-02-19 10:40:59 +00:00
#include <Core/Names.h>
2021-10-02 07:13:14 +00:00
#include <base/logger_useful.h>
2021-01-31 19:03:03 +00:00
#include <Storages/IStorage.h>
2021-02-13 20:46:52 +00:00
#include <Parsers/ASTExpressionList.h>
2021-02-06 12:28:42 +00:00
namespace DB
{
2021-08-28 13:56:39 +00:00
struct SettingChange;
2021-06-27 19:09:17 +00:00
class MaterializedPostgreSQLConsumer
{
public:
using Storages = std::unordered_map<String, StoragePtr>;
2021-06-27 19:09:17 +00:00
MaterializedPostgreSQLConsumer(
2021-04-11 07:44:40 +00:00
ContextPtr context_,
2021-05-08 14:55:53 +00:00
std::shared_ptr<postgres::Connection> connection_,
const String & replication_slot_name_,
const String & publication_name_,
const String & start_lsn,
2021-01-31 19:03:03 +00:00
const size_t max_block_size_,
2021-09-12 12:33:54 +00:00
bool schema_as_a_part_of_table_name_,
2021-05-03 21:42:06 +00:00
bool allow_automatic_update_,
2021-08-27 06:30:21 +00:00
Storages storages_,
const String & name_for_logger);
2021-02-21 22:41:18 +00:00
bool consume(std::vector<std::pair<Int32, String>> & skipped_tables);
2021-05-03 21:42:06 +00:00
/// Called from reloadFromSnapshot by replication handler. This method is needed to move a table back into synchronization
/// process if it was skipped due to schema changes.
void updateNested(const String & table_name, StoragePtr nested_storage, Int32 table_id, const String & table_start_lsn);
2021-08-27 06:30:21 +00:00
void addNested(const String & postgres_table_name, StoragePtr nested_storage, const String & table_start_lsn);
2021-08-28 13:42:36 +00:00
void removeNested(const String & postgres_table_name);
2021-08-28 13:56:39 +00:00
void setSetting(const SettingChange & setting);
private:
2021-05-03 21:42:06 +00:00
/// Read approximately up to max_block_size changes from WAL.
bool readFromReplicationSlot();
2021-02-08 23:23:51 +00:00
2021-06-29 23:11:46 +00:00
void syncTables();
2021-02-08 23:23:51 +00:00
String advanceLSN(std::shared_ptr<pqxx::nontransaction> ntx);
2021-02-08 23:23:51 +00:00
void processReplicationMessage(const char * replication_message, size_t size);
2021-02-03 16:13:18 +00:00
2021-08-27 06:30:21 +00:00
bool isSyncAllowed(Int32 relation_id, const String & relation_name);
2021-02-21 22:41:18 +00:00
struct Buffer
{
ExternalResultDescription description;
MutableColumns columns;
2021-05-13 07:36:40 +00:00
/// Needed to pass to insert query columns list in syncTables().
2021-02-13 20:46:52 +00:00
std::shared_ptr<ASTExpressionList> columnsAST;
2021-05-13 07:36:40 +00:00
/// Needed for insertPostgreSQLValue() method to parse array
std::unordered_map<size_t, PostgreSQLArrayInfo> array_info;
2021-03-20 12:53:12 +00:00
Buffer(StoragePtr storage) { createEmptyBuffer(storage); }
void createEmptyBuffer(StoragePtr storage);
2021-12-26 17:49:33 +00:00
size_t getColumnsNum() const
{
const auto & sample_block = description.sample_block;
return sample_block.columns();
}
};
2021-02-21 22:41:18 +00:00
using Buffers = std::unordered_map<String, Buffer>;
2021-02-21 22:41:18 +00:00
static void insertDefaultValue(Buffer & buffer, size_t column_idx);
2021-12-26 17:49:33 +00:00
void insertValue(Buffer & buffer, const std::string & value, size_t column_idx);
2021-01-31 19:03:03 +00:00
2021-02-08 23:23:51 +00:00
/// Kind of row-level change being processed; passed as the `type` argument of readTupleData().
enum class PostgreSQLQuery
{
INSERT,
UPDATE,
DELETE
};
2021-02-21 22:41:18 +00:00
void readTupleData(Buffer & buffer, const char * message, size_t & pos, size_t size, PostgreSQLQuery type, bool old_value = false);
2021-02-08 23:23:51 +00:00
2021-03-20 15:12:46 +00:00
template<typename T>
static T unhexN(const char * message, size_t pos, size_t n);
2021-02-18 18:20:52 +00:00
/// Helpers that decode a single field from `message` (of length `size`) starting at `pos`.
/// readString() stores its result in `result`; the readIntN() variants return the decoded value.
/// NOTE(review): `pos` is taken by mutable reference, so it is presumably advanced past the
/// consumed bytes — confirm against the definitions in the .cpp file.
static void readString(const char * message, size_t & pos, size_t size, String & result);
static Int64 readInt64(const char * message, size_t & pos, size_t size);
static Int32 readInt32(const char * message, size_t & pos, size_t size);
static Int16 readInt16(const char * message, size_t & pos, size_t size);
static Int8 readInt8(const char * message, size_t & pos, size_t size);
2021-01-27 15:29:28 +00:00
2021-02-19 10:40:59 +00:00
void markTableAsSkipped(Int32 relation_id, const String & relation_name);
2021-02-21 22:41:18 +00:00
/// Converts a textual LSN (log sequence number, i.e. a 64-bit WAL offset) written as two
/// hexadecimal halves separated by a slash ("upper/lower") into a single 64-bit integer:
/// the upper half occupies the high 32 bits, the lower half the low 32 bits.
/// Any half that cannot be parsed is treated as 0, so a malformed string yields a
/// well-defined value instead of reading uninitialised memory.
Int64 getLSNValue(const std::string & lsn)
{
    /// Zero-initialised: sscanf leaves unmatched arguments untouched.
    UInt32 upper_half = 0;
    UInt32 lower_half = 0;
    std::sscanf(lsn.data(), "%X/%X", &upper_half, &lower_half);

    /// Combine in unsigned arithmetic: shifting a signed value into the sign bit would be undefined.
    const UInt64 combined = (static_cast<UInt64>(upper_half) << 32) | lower_half;
    return static_cast<Int64>(combined);
}
Poco::Logger * log;
2021-04-11 07:44:40 +00:00
ContextPtr context;
2021-02-08 23:23:51 +00:00
const std::string replication_slot_name, publication_name;
2021-05-08 14:55:53 +00:00
std::shared_ptr<postgres::Connection> connection;
2021-02-06 12:28:42 +00:00
std::string current_lsn, final_lsn;
/// current_lsn converted from String to a 64-bit integer via getLSNValue() (which returns Int64; stored here as UInt64).
UInt64 lsn_value;
2021-08-28 13:56:39 +00:00
size_t max_block_size;
2021-10-01 15:54:01 +00:00
2021-09-12 12:33:54 +00:00
bool schema_as_a_part_of_table_name;
2021-10-01 15:54:01 +00:00
2021-05-03 21:42:06 +00:00
bool allow_automatic_update;
2021-02-08 23:23:51 +00:00
String table_to_insert;
2021-02-21 22:41:18 +00:00
/// List of tables which need to be synced after last replication stream.
2021-08-27 06:30:21 +00:00
/// Holds `postgres_table_name` set.
2021-02-08 23:23:51 +00:00
std::unordered_set<std::string> tables_to_sync;
2021-08-27 06:30:21 +00:00
/// `postgres_table_name` -> ReplacingMergeTree table.
Storages storages;
2021-08-27 06:30:21 +00:00
/// `postgres_table_name` -> In-memory buffer.
Buffers buffers;
std::unordered_map<Int32, String> relation_id_to_name;
/// Schema information remembered per relation (see `schema_data`).
struct SchemaData
{
    /// Initialised in-class so that a default-initialised object never holds an indeterminate value.
    /// 0 matches what value-initialisation (e.g. `SchemaData()`) already produced.
    Int16 number_of_columns = 0;

    /// One (data_type_id, type_modifier) pair per entry.
    std::vector<std::pair<Int32, Int32>> column_identifiers;

    /// Intentionally left implicit to stay compatible with existing callers.
    SchemaData(Int16 number_of_columns_) : number_of_columns(number_of_columns_) {}

    SchemaData() = default;
};
2021-02-21 22:41:18 +00:00
/// Cache of table schema data, used to detect schema changes: DDL is not replicated by the
/// PostgreSQL logical replication protocol, but some table schema info is received
/// the first time a DML message arrives for a given relation in the current session, or
/// when the relation definition has changed since the last relation definition message.
std::unordered_map<Int32, SchemaData> schema_data;
2021-02-21 22:41:18 +00:00
2021-08-27 06:30:21 +00:00
/// `postgres_relation_id` -> `start_lsn`
2021-02-21 23:13:58 +00:00
/// skip_list contains relation ids for tables on which ddl was performed, which can break synchronization.
2021-02-21 22:41:18 +00:00
/// These breaking changes are detected in the replication stream from the corresponding replication message, and the table is added to the skip list.
/// After it is finished, a temporary replication slot is created with the 'export snapshot' option, and start_lsn is returned.
/// Skipped tables are reloaded from the snapshot (nested tables are also updated). Afterwards, if a replication message is
/// related to a table in skip_list, we compare the current lsn with start_lsn, which was returned with the corresponding snapshot.
/// If current_lsn >= table_start_lsn, we can safely remove the table from the skip list and continue its synchronization.
2021-02-22 12:35:53 +00:00
/// No needed message related to the reloaded table will be missed, because messages are not consumed in the meantime,
/// i.e. we will not miss the first start_lsn position for the reloaded table.
2021-02-21 22:41:18 +00:00
std::unordered_map<Int32, String> skip_list;
2021-08-27 06:30:21 +00:00
/// `postgres_table_name` -> `start_lsn`
/// For dynamically added tables. A new table is loaded via snapshot and we get a start lsn position.
/// Once consumer reaches this position, it starts applying replication messages to this table.
/// Inside replication handler we have to ensure that replication consumer does not read data from wal
/// while the process of adding a table to replication is not finished,
/// because we might go beyond this start lsn position before consumer knows that a new table was added.
std::unordered_map<String, String> waiting_list;
2021-08-28 13:42:36 +00:00
/// Since replication may be some time behind, we need to ensure that replication messages for deleted tables are ignored.
std::unordered_set<String> deleted_tables;
};
}