ClickHouse/src/Storages/StorageAzureBlob.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

352 lines
12 KiB
C++
Raw Normal View History

2023-06-02 14:51:09 +00:00
#pragma once
#include "config.h"
#if USE_AZURE_BLOB_STORAGE
2024-01-07 22:28:08 +00:00
#include <Common/re2.h>
2023-06-02 14:51:09 +00:00
#include <Storages/IStorage.h>
#include <Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h>
#include <Storages/Cache/SchemaCache.h>
#include <Storages/StorageConfiguration.h>
#include <Processors/Executors/PullingPipelineExecutor.h>
#include <Processors/Formats/IInputFormat.h>
2023-06-06 16:48:20 +00:00
#include <Storages/NamedCollectionsHelpers.h>
#include <Storages/prepareReadingFromFormat.h>
2023-08-02 17:47:42 +00:00
#include <Storages/SelectQueryInfo.h>
2023-06-02 14:51:09 +00:00
namespace DB
{
2023-06-06 19:58:54 +00:00
class StorageAzureBlob : public IStorage
2023-06-02 14:51:09 +00:00
{
public:
2023-06-06 16:48:20 +00:00
2023-06-02 14:51:09 +00:00
using AzureClient = Azure::Storage::Blobs::BlobContainerClient;
using AzureClientPtr = std::unique_ptr<Azure::Storage::Blobs::BlobContainerClient>;
struct Configuration : public StatelessTableEngineConfiguration
{
Configuration() = default;
String getPath() const { return blob_path; }
bool update(const ContextPtr & context);
2023-06-02 14:51:09 +00:00
void connect(const ContextPtr & context);
2023-06-02 14:51:09 +00:00
bool withGlobs() const { return blob_path.find_first_of("*?{") != std::string::npos; }
bool withWildcard() const
{
static const String PARTITION_ID_WILDCARD = "{_partition_id}";
return blobs_paths.back().find(PARTITION_ID_WILDCARD) != String::npos;
}
Poco::URI getConnectionURL() const;
2023-06-02 14:51:09 +00:00
std::string connection_url;
bool is_connection_string;
std::optional<std::string> account_name;
std::optional<std::string> account_key;
std::string container;
std::string blob_path;
std::vector<String> blobs_paths;
};
2023-06-06 19:58:54 +00:00
StorageAzureBlob(
2023-06-02 14:51:09 +00:00
const Configuration & configuration_,
std::unique_ptr<AzureObjectStorage> && object_storage_,
const ContextPtr & context_,
2023-06-02 14:51:09 +00:00
const StorageID & table_id_,
const ColumnsDescription & columns_,
const ConstraintsDescription & constraints_,
const String & comment,
std::optional<FormatSettings> format_settings_,
bool distributed_processing_,
2023-06-02 14:51:09 +00:00
ASTPtr partition_by_);
static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, const ContextPtr & local_context);
static AzureClientPtr createClient(StorageAzureBlob::Configuration configuration, bool is_read_only);
2023-06-06 16:48:20 +00:00
static AzureObjectStorage::SettingsPtr createSettings(const ContextPtr & local_context);
2023-06-06 16:48:20 +00:00
2023-06-06 19:58:54 +00:00
static void processNamedCollectionResult(StorageAzureBlob::Configuration & configuration, const NamedCollection & collection);
2023-06-02 14:51:09 +00:00
/// Returns a copy of the `name` member.
String getName() const override { return name; }
2024-01-02 13:45:41 +00:00
void read(
QueryPlan & query_plan,
2023-06-03 21:11:03 +00:00
const Names &,
const StorageSnapshotPtr &,
SelectQueryInfo &,
ContextPtr,
QueryProcessingStage::Enum,
size_t,
2023-06-05 23:28:04 +00:00
size_t) override;
2023-06-02 14:51:09 +00:00
2023-06-08 15:06:15 +00:00
SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /* metadata_snapshot */, ContextPtr context, bool /*async_insert*/) override;
2023-06-02 14:51:09 +00:00
void truncate(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, TableExclusiveLockHolder &) override;
NamesAndTypesList getVirtuals() const override;
static Names getVirtualColumnNames();
2023-06-02 14:51:09 +00:00
bool supportsPartitionBy() const override;
bool supportsSubcolumns() const override { return true; }
2023-06-05 12:46:52 +00:00
bool supportsSubsetOfColumns(const ContextPtr & context) const;
2023-06-05 12:46:52 +00:00
bool supportsTrivialCountOptimization() const override { return true; }
2023-06-05 12:46:52 +00:00
bool prefersLargeBlocks() const override;
bool parallelizeOutputAfterReading(ContextPtr context) const override;
2023-06-02 14:51:09 +00:00
static SchemaCache & getSchemaCache(const ContextPtr & ctx);
2023-06-06 16:48:20 +00:00
static ColumnsDescription getTableStructureFromData(
AzureObjectStorage * object_storage,
const Configuration & configuration,
const std::optional<FormatSettings> & format_settings,
const ContextPtr & ctx);
static std::pair<ColumnsDescription, String> getTableStructureAndFormatFromData(
AzureObjectStorage * object_storage,
const Configuration & configuration,
const std::optional<FormatSettings> & format_settings,
const ContextPtr & ctx);
2023-06-06 16:48:20 +00:00
private:
static std::pair<ColumnsDescription, String> getTableStructureAndFormatFromDataImpl(
std::optional<String> format,
AzureObjectStorage * object_storage,
const Configuration & configuration,
const std::optional<FormatSettings> & format_settings,
const ContextPtr & ctx);
2024-01-02 13:45:41 +00:00
friend class ReadFromAzureBlob;
std::string name;
Configuration configuration;
std::unique_ptr<AzureObjectStorage> object_storage;
NamesAndTypesList virtual_columns;
const bool distributed_processing;
std::optional<FormatSettings> format_settings;
ASTPtr partition_by;
};
2023-06-06 19:58:54 +00:00
class StorageAzureBlobSource : public ISource, WithContext
{
public:
/// Abstract interface of the blob iterators: next() is pure virtual and returns a RelativePathWithMetadata.
class IIterator : public WithContext
{
public:
    /// `explicit`: a single-argument constructor should not act as an implicit conversion
    /// from a context pointer.
    explicit IIterator(const ContextPtr & context_) : WithContext(context_) {}
    virtual ~IIterator() = default;

    virtual RelativePathWithMetadata next() = 0;

    /// Call operator; simply forwards to next().
    RelativePathWithMetadata operator ()() { return next(); }
};
class GlobIterator : public IIterator
2023-06-05 23:28:04 +00:00
{
public:
GlobIterator(
2023-06-05 23:28:04 +00:00
AzureObjectStorage * object_storage_,
const std::string & container_,
String blob_path_with_globs_,
2024-01-02 13:45:41 +00:00
const ActionsDAG::Node * predicate,
const NamesAndTypesList & virtual_columns_,
const ContextPtr & context_,
RelativePathsWithMetadata * outer_blobs_,
std::function<void(FileProgress)> file_progress_callback_ = {});
2023-06-05 23:28:04 +00:00
RelativePathWithMetadata next() override;
~GlobIterator() override = default;
2023-06-06 14:57:51 +00:00
private:
2023-06-05 23:28:04 +00:00
AzureObjectStorage * object_storage;
std::string container;
String blob_path_with_globs;
2024-01-02 13:45:41 +00:00
ActionsDAGPtr filter_dag;
NamesAndTypesList virtual_columns;
2023-06-05 23:28:04 +00:00
size_t index = 0;
2023-06-05 23:28:04 +00:00
RelativePathsWithMetadata blobs_with_metadata;
2023-06-06 14:57:51 +00:00
RelativePathsWithMetadata * outer_blobs;
2023-06-05 23:28:04 +00:00
ObjectStorageIteratorPtr object_storage_iterator;
bool recursive{false};
std::unique_ptr<re2::RE2> matcher;
void createFilterAST(const String & any_key);
bool is_finished = false;
2023-06-08 14:11:27 +00:00
std::mutex next_mutex;
std::function<void(FileProgress)> file_progress_callback;
2023-06-05 23:28:04 +00:00
};
/// IIterator implementation whose next() obtains each path from a stored ReadTaskCallback.
class ReadIterator : public IIterator
{
public:
    explicit ReadIterator(const ContextPtr & context_, const ReadTaskCallback & callback_)
        : IIterator(context_)
        , callback(callback_)
    {
    }

    /// Invokes the callback once and returns its result together with empty-initialised metadata.
    RelativePathWithMetadata next() override { return {callback(), {}}; }

private:
    /// Copy of the callback passed to the constructor.
    ReadTaskCallback callback;
};
class KeysIterator : public IIterator
{
public:
KeysIterator(
AzureObjectStorage * object_storage_,
const std::string & container_,
2023-06-29 11:59:09 +00:00
const Strings & keys_,
2024-01-02 13:45:41 +00:00
const ActionsDAG::Node * predicate,
const NamesAndTypesList & virtual_columns_,
const ContextPtr & context_,
2023-06-29 11:59:09 +00:00
RelativePathsWithMetadata * outer_blobs,
std::function<void(FileProgress)> file_progress_callback = {});
RelativePathWithMetadata next() override;
~KeysIterator() override = default;
private:
AzureObjectStorage * object_storage;
std::string container;
RelativePathsWithMetadata keys;
2024-01-02 13:45:41 +00:00
ActionsDAGPtr filter_dag;
NamesAndTypesList virtual_columns;
std::atomic<size_t> index = 0;
};
2023-06-06 19:58:54 +00:00
StorageAzureBlobSource(
const ReadFromFormatInfo & info,
2023-06-05 23:28:04 +00:00
const String & format_,
String name_,
const ContextPtr & context_,
2023-06-05 23:28:04 +00:00
std::optional<FormatSettings> format_settings_,
UInt64 max_block_size_,
String compression_hint_,
AzureObjectStorage * object_storage_,
const String & container_,
const String & connection_url_,
std::shared_ptr<IIterator> file_iterator_,
2024-01-02 13:45:41 +00:00
bool need_only_count_);
2023-06-06 19:58:54 +00:00
~StorageAzureBlobSource() override;
Chunk generate() override;
2023-06-05 23:28:04 +00:00
String getName() const override;
private:
void addNumRowsToCache(const String & path, size_t num_rows);
std::optional<size_t> tryGetNumRowsFromCache(const RelativePathWithMetadata & path_with_metadata);
NamesAndTypesList requested_columns;
NamesAndTypesList requested_virtual_columns;
String format;
String name;
Block sample_block;
std::optional<FormatSettings> format_settings;
2023-06-05 21:15:13 +00:00
ColumnsDescription columns_desc;
2023-06-05 23:28:04 +00:00
UInt64 max_block_size;
String compression_hint;
AzureObjectStorage * object_storage;
String container;
String connection_url;
std::shared_ptr<IIterator> file_iterator;
bool need_only_count;
size_t total_rows_in_file = 0;
2023-06-05 23:28:04 +00:00
struct ReaderHolder
{
public:
ReaderHolder(
RelativePathWithMetadata relative_path_with_metadata_,
2023-06-05 23:28:04 +00:00
std::unique_ptr<ReadBuffer> read_buf_,
std::shared_ptr<ISource> source_,
2023-06-05 23:28:04 +00:00
std::unique_ptr<QueryPipeline> pipeline_,
std::unique_ptr<PullingPipelineExecutor> reader_)
: relative_path_with_metadata(std::move(relative_path_with_metadata_))
2023-06-05 23:28:04 +00:00
, read_buf(std::move(read_buf_))
, source(std::move(source_))
2023-06-05 23:28:04 +00:00
, pipeline(std::move(pipeline_))
, reader(std::move(reader_))
{
}
ReaderHolder() = default;
ReaderHolder(const ReaderHolder & other) = delete;
ReaderHolder & operator=(const ReaderHolder & other) = delete;
ReaderHolder(ReaderHolder && other) noexcept
{
*this = std::move(other);
}
ReaderHolder & operator=(ReaderHolder && other) noexcept
{
/// The order of destruction is important.
/// reader uses pipeline, pipeline uses read_buf.
reader = std::move(other.reader);
pipeline = std::move(other.pipeline);
source = std::move(other.source);
2023-06-05 23:28:04 +00:00
read_buf = std::move(other.read_buf);
relative_path_with_metadata = std::move(other.relative_path_with_metadata);
2023-06-05 23:28:04 +00:00
return *this;
}
explicit operator bool() const { return reader != nullptr; }
PullingPipelineExecutor * operator->() { return reader.get(); }
const PullingPipelineExecutor * operator->() const { return reader.get(); }
const String & getRelativePath() const { return relative_path_with_metadata.relative_path; }
const RelativePathWithMetadata & getRelativePathWithMetadata() const { return relative_path_with_metadata; }
const IInputFormat * getInputFormat() const { return dynamic_cast<const IInputFormat *>(source.get()); }
2023-06-05 23:28:04 +00:00
private:
RelativePathWithMetadata relative_path_with_metadata;
2023-06-05 23:28:04 +00:00
std::unique_ptr<ReadBuffer> read_buf;
std::shared_ptr<ISource> source;
2023-06-05 23:28:04 +00:00
std::unique_ptr<QueryPipeline> pipeline;
std::unique_ptr<PullingPipelineExecutor> reader;
};
ReaderHolder reader;
2024-01-23 17:04:50 +00:00
LoggerPtr log = getLogger("StorageAzureBlobSource");
2023-06-05 23:28:04 +00:00
ThreadPool create_reader_pool;
ThreadPoolCallbackRunner<ReaderHolder> create_reader_scheduler;
std::future<ReaderHolder> reader_future;
/// Recreate ReadBuffer and Pipeline for each file.
ReaderHolder createReader();
std::future<ReaderHolder> createReaderAsync();
2023-06-05 23:28:04 +00:00
std::unique_ptr<ReadBuffer> createAzureReadBuffer(const String & key, size_t object_size);
2023-06-06 14:16:49 +00:00
std::unique_ptr<ReadBuffer> createAsyncAzureReadBuffer(
const String & key, const ReadSettings & read_settings, size_t object_size);
2023-06-02 14:51:09 +00:00
};
}
#endif