ClickHouse/src/Storages/StorageHudi.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

70 lines
1.7 KiB
C++
Raw Normal View History

2022-08-08 14:10:50 +00:00
#pragma once
2022-11-04 16:03:12 +00:00
#include "config.h"
2022-08-08 14:10:50 +00:00
2022-09-07 07:16:32 +00:00
#if USE_AWS_S3
2022-09-28 11:21:32 +00:00
# include <Storages/IStorage.h>
# include <Storages/StorageS3.h>
2022-08-22 09:37:20 +00:00
2022-08-26 18:17:32 +00:00
// Forward declaration only: this header refers to Poco::Logger solely through
// a pointer member of StorageHudi, so the full Poco header is not needed here.
namespace Poco
{
class Logger;
2022-08-22 09:37:20 +00:00
}
// Forward declaration of the S3 client class in the Aws::S3 namespace.
// NOTE(review): S3Client is not referenced by name anywhere else in the visible
// part of this header - confirm whether this declaration is still required.
namespace Aws::S3
{
2022-08-26 18:17:32 +00:00
class S3Client;
2022-08-22 09:37:20 +00:00
}
2022-08-08 14:10:50 +00:00
namespace DB
{
2022-08-26 18:17:32 +00:00
// Table engine for Apache Hudi tables, declared as an IStorage implementation.
// It holds a StorageS3 instance (`s3engine`) next to an S3 configuration and
// the table path; per the constructor comment below, reads are handled by that
// underlying StorageS3 engine. The whole declaration sits inside the
// `#if USE_AWS_S3` guard of this header.
class StorageHudi : public IStorage
{
2022-08-08 14:10:50 +00:00
public:
2022-11-04 20:49:24 +00:00
// Construction performs three steps:
// 1. Parses the internal file structure of the table
// 2. Finds the parts with the latest version
// 3. Creates a URL for the underlying StorageS3 engine to handle reads
//
// `columns_` and `format_settings_` are taken by value; the remaining
// descriptors are taken by const reference.
// NOTE(review): how each argument is consumed is defined in the .cpp, which
// is not visible from this header.
2022-08-08 14:10:50 +00:00
StorageHudi(
2022-11-03 18:14:10 +00:00
const StorageS3Configuration & configuration_,
2022-08-08 14:10:50 +00:00
const StorageID & table_id_,
2022-08-26 18:17:32 +00:00
ColumnsDescription columns_,
2022-08-24 15:07:37 +00:00
const ConstraintsDescription & constraints_,
2022-08-22 09:37:20 +00:00
const String & comment,
2022-11-03 18:14:10 +00:00
ContextPtr context_,
std::optional<FormatSettings> format_settings_);
2022-08-08 14:10:50 +00:00
// Engine name reported by this storage: always "Hudi".
String getName() const override { return "Hudi"; }
2022-08-22 09:37:20 +00:00
2022-11-04 20:49:24 +00:00
// Reads the latest version of the Apache Hudi table.
// Declared `override`, so it replaces the corresponding virtual from the base class.
// NOTE(review): presumably forwards to `s3engine` - confirm in the implementation.
2022-08-24 15:07:37 +00:00
Pipe read(
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
SelectQueryInfo & query_info,
ContextPtr context,
QueryProcessingStage::Enum processed_stage,
size_t max_block_size,
2022-11-04 16:03:12 +00:00
size_t num_streams) override;
2022-08-24 15:07:37 +00:00
private:
// Returns a list of keys as strings. Non-const, non-static member.
// NOTE(review): presumably lists the object keys found under `table_path`
// in S3 - the implementation is not visible here, confirm in the .cpp.
std::vector<std::string> getKeysFromS3();
2022-11-04 20:49:24 +00:00
// Apache Hudi stores parts of data in different files.
// Every part file has a timestamp in its name.
// Every partition (directory) in Apache Hudi holds different versions of a part.
// To find the needed parts we have to pick the latest part file for every partition.
// The part format is usually parquet, but can differ.
//
// Static helper: takes the key list by rvalue reference and the format name
// by value, and returns the generated query as a string.
static std::string generateQueryFromKeys(std::vector<std::string> && keys, String format);
2022-08-24 15:07:37 +00:00
2022-08-25 08:48:49 +00:00
// S3 configuration kept by this storage.
// NOTE(review): presumably built from the constructor's `configuration_` - confirm.
StorageS3::S3Configuration base_configuration;
// Underlying StorageS3 engine, held through a shared_ptr.
std::shared_ptr<StorageS3> s3engine;
2022-08-22 09:37:20 +00:00
// Logger pointer; Poco::Logger is only forward-declared in this header.
Poco::Logger * log;
2022-08-30 17:38:02 +00:00
// Path of the table.
// NOTE(review): exact form (bucket-relative key prefix vs. full URL) is not
// visible from this header - confirm in the .cpp.
String table_path;
2022-08-08 14:10:50 +00:00
};
}
2022-09-07 07:16:32 +00:00
#endif