ClickHouse/src/Storages/StorageURL.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

336 lines
11 KiB
C++
Raw Normal View History

2018-06-11 12:13:00 +00:00
#pragma once
#include <Formats/FormatSettings.h>
2020-04-28 00:56:44 +00:00
#include <IO/CompressionMethod.h>
#include <IO/HTTPHeaderEntries.h>
#include <IO/ReadWriteBufferFromHTTP.h>
#include <Processors/ISource.h>
#include <Processors/Sinks/SinkToStorage.h>
#include <Storages/Cache/SchemaCache.h>
#include <Storages/IStorage.h>
2023-04-21 12:11:18 +00:00
#include <Storages/StorageConfiguration.h>
#include <Storages/StorageFactory.h>
#include <Storages/prepareReadingFromFormat.h>
#include <Poco/URI.h>
2018-06-11 12:13:00 +00:00
2019-08-24 21:20:20 +00:00
2018-06-11 12:13:00 +00:00
namespace DB
{
2021-10-11 16:11:50 +00:00
/// Forward declarations. In this header these types appear only behind
/// references or smart pointers, so their full definitions are not needed here.
class IOutputFormat;
/// Shared-ownership handle to an output format (used by StorageURLSink).
using OutputFormatPtr = std::shared_ptr<IOutputFormat>;
class IInputFormat;
struct ConnectionTimeouts;
class NamedCollection;
class PullingPipelineExecutor;
2018-06-11 12:13:00 +00:00
/**
 * This class represents table engine for external urls.
 * It sends HTTP GET to server when select is called and
 * HTTP POST when insert is called. In POST request the data is sent
 * using Chunked transfer encoding, so server has to support it.
 */
class IStorageURLBase : public IStorage
2018-06-11 12:13:00 +00:00
{
public:
2020-08-03 13:54:14 +00:00
Pipe read(
2019-08-24 21:20:20 +00:00
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
SelectQueryInfo & query_info,
ContextPtr context,
2018-06-25 12:21:54 +00:00
QueryProcessingStage::Enum processed_stage,
size_t max_block_size,
size_t num_streams) override;
2018-06-11 12:13:00 +00:00
SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override;
2018-06-11 12:13:00 +00:00
2021-10-26 09:31:01 +00:00
bool supportsPartitionBy() const override { return true; }
NamesAndTypesList getVirtuals() const override;
static ColumnsDescription getTableStructureFromData(
const String & format,
const String & uri,
CompressionMethod compression_method,
const HTTPHeaderEntries & headers,
const std::optional<FormatSettings> & format_settings,
ContextPtr context);
static SchemaCache & getSchemaCache(const ContextPtr & context);
2023-08-23 18:43:08 +00:00
static std::optional<time_t> tryGetLastModificationTime(
const String & url,
const HTTPHeaderEntries & headers,
const Poco::Net::HTTPBasicCredentials & credentials,
const ContextPtr & context);
2018-06-11 12:13:00 +00:00
protected:
2019-08-24 21:20:20 +00:00
IStorageURLBase(
2021-10-26 09:31:01 +00:00
const String & uri_,
ContextPtr context_,
2019-12-04 16:06:55 +00:00
const StorageID & id_,
2018-06-11 12:13:00 +00:00
const String & format_name_,
const std::optional<FormatSettings> & format_settings_,
2019-08-24 21:20:20 +00:00
const ColumnsDescription & columns_,
const ConstraintsDescription & constraints_,
2021-04-23 12:18:23 +00:00
const String & comment,
2021-09-07 11:17:25 +00:00
const String & compression_method_,
const HTTPHeaderEntries & headers_ = {},
2021-10-26 12:22:13 +00:00
const String & method_ = "",
ASTPtr partition_by = nullptr,
bool distributed_processing_ = false);
2018-06-11 12:13:00 +00:00
2021-10-26 09:31:01 +00:00
String uri;
CompressionMethod compression_method;
2018-06-11 12:13:00 +00:00
String format_name;
// For URL engine, we use format settings from server context + `SETTINGS`
// clause of the `CREATE` query. In this case, format_settings is set.
// For `url` table function, we use settings from current query context.
// In this case, format_settings is not set.
std::optional<FormatSettings> format_settings;
HTTPHeaderEntries headers;
2021-10-28 12:44:12 +00:00
String http_method; /// For insert can choose Put instead of default Post.
2021-10-26 12:22:13 +00:00
ASTPtr partition_by;
bool distributed_processing;
2018-06-11 12:13:00 +00:00
NamesAndTypesList virtual_columns;
virtual std::string getReadMethod() const;
2019-08-24 21:20:20 +00:00
virtual std::vector<std::pair<std::string, std::string>> getReadURIParams(
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
const SelectQueryInfo & query_info,
ContextPtr context,
QueryProcessingStage::Enum & processed_stage,
size_t max_block_size) const;
2019-08-24 21:20:20 +00:00
virtual std::function<void(std::ostream &)> getReadPOSTDataCallback(
const Names & column_names,
2022-02-28 13:29:05 +00:00
const ColumnsDescription & columns_description,
const SelectQueryInfo & query_info,
ContextPtr context,
QueryProcessingStage::Enum & processed_stage,
size_t max_block_size) const;
2018-09-22 14:58:03 +00:00
virtual bool supportsSubsetOfColumns(const ContextPtr & context) const;
2022-02-28 13:29:05 +00:00
bool prefersLargeBlocks() const override;
bool parallelizeOutputAfterReading(ContextPtr context) const override;
bool supportsTrivialCountOptimization() const override { return true; }
2021-04-21 14:36:04 +00:00
private:
virtual Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const = 0;
static std::optional<ColumnsDescription> tryGetColumnsFromCache(
const Strings & urls,
const HTTPHeaderEntries & headers,
2022-06-28 16:13:42 +00:00
const Poco::Net::HTTPBasicCredentials & credentials,
const String & format_name,
const std::optional<FormatSettings> & format_settings,
const ContextPtr & context);
static void addColumnsToCache(
const Strings & urls,
const ColumnsDescription & columns,
const String & format_name,
const std::optional<FormatSettings> & format_settings,
const ContextPtr & context);
};
class StorageURLSource : public ISource, WithContext
{
using URIParams = std::vector<std::pair<String, String>>;
public:
class DisclosedGlobIterator
{
public:
DisclosedGlobIterator(const String & uri_, size_t max_addresses, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context);
String next();
size_t size();
private:
class Impl;
/// shared_ptr to have copy constructor
std::shared_ptr<Impl> pimpl;
};
using FailoverOptions = std::vector<String>;
using IteratorWrapper = std::function<FailoverOptions()>;
StorageURLSource(
const ReadFromFormatInfo & info,
std::shared_ptr<IteratorWrapper> uri_iterator_,
const std::string & http_method,
std::function<void(std::ostream &)> callback,
const String & format,
const std::optional<FormatSettings> & format_settings,
String name_,
ContextPtr context,
UInt64 max_block_size,
const ConnectionTimeouts & timeouts,
CompressionMethod compression_method,
size_t max_parsing_threads,
2023-08-02 17:47:42 +00:00
const SelectQueryInfo & query_info,
const HTTPHeaderEntries & headers_ = {},
const URIParams & params = {},
bool glob_url = false,
bool need_only_count_ = false);
String getName() const override { return name; }
Chunk generate() override;
static void setCredentials(Poco::Net::HTTPBasicCredentials & credentials, const Poco::URI & request_uri);
2023-06-13 14:43:50 +00:00
static std::pair<Poco::URI, std::unique_ptr<ReadWriteBufferFromHTTP>> getFirstAvailableURIAndReadBuffer(
std::vector<String>::const_iterator & option,
const std::vector<String>::const_iterator & end,
ContextPtr context,
const URIParams & params,
const String & http_method,
std::function<void(std::ostream &)> callback,
const ConnectionTimeouts & timeouts,
Poco::Net::HTTPBasicCredentials & credentials,
const HTTPHeaderEntries & headers,
bool glob_url,
bool delay_initialization);
private:
void addNumRowsToCache(const String & uri, size_t num_rows);
std::optional<size_t> tryGetNumRowsFromCache(const String & uri, std::optional<time_t> last_mod_time);
using InitializeFunc = std::function<bool()>;
InitializeFunc initialize;
String name;
ColumnsDescription columns_description;
NamesAndTypesList requested_columns;
NamesAndTypesList requested_virtual_columns;
Block block_for_format;
std::shared_ptr<IteratorWrapper> uri_iterator;
Poco::URI curr_uri;
String format;
const std::optional<FormatSettings> & format_settings;
HTTPHeaderEntries headers;
bool need_only_count;
size_t total_rows_in_file = 0;
std::unique_ptr<ReadBuffer> read_buf;
std::shared_ptr<IInputFormat> input_format;
std::unique_ptr<QueryPipeline> pipeline;
std::unique_ptr<PullingPipelineExecutor> reader;
Poco::Net::HTTPBasicCredentials credentials;
};
2021-07-23 14:25:35 +00:00
class StorageURLSink : public SinkToStorage
2020-04-28 00:56:44 +00:00
{
public:
2021-07-23 14:25:35 +00:00
StorageURLSink(
2021-10-26 09:31:01 +00:00
const String & uri,
const String & format,
const std::optional<FormatSettings> & format_settings,
2021-07-23 14:25:35 +00:00
const Block & sample_block,
ContextPtr context,
const ConnectionTimeouts & timeouts,
2021-10-26 09:31:01 +00:00
CompressionMethod compression_method,
2023-04-21 12:11:18 +00:00
const HTTPHeaderEntries & headers = {},
2021-10-26 09:31:01 +00:00
const String & method = Poco::Net::HTTPRequest::HTTP_POST);
2020-04-28 00:56:44 +00:00
2021-07-23 19:33:59 +00:00
std::string getName() const override { return "StorageURLSink"; }
2021-07-23 14:25:35 +00:00
void consume(Chunk chunk) override;
void onCancel() override;
2023-06-22 09:09:26 +00:00
void onException(std::exception_ptr exception) override;
2021-07-23 14:25:35 +00:00
void onFinish() override;
2020-04-28 00:56:44 +00:00
private:
void finalize();
void release();
2020-04-28 00:56:44 +00:00
std::unique_ptr<WriteBuffer> write_buf;
2021-10-11 16:11:50 +00:00
OutputFormatPtr writer;
std::mutex cancel_mutex;
bool cancelled = false;
2020-04-28 00:56:44 +00:00
};
2019-08-24 21:20:20 +00:00
class StorageURL : public IStorageURLBase
{
public:
2021-04-23 12:18:23 +00:00
StorageURL(
2021-10-26 09:31:01 +00:00
const String & uri_,
2021-04-23 12:18:23 +00:00
const StorageID & table_id_,
const String & format_name_,
const std::optional<FormatSettings> & format_settings_,
const ColumnsDescription & columns_,
const ConstraintsDescription & constraints_,
const String & comment,
ContextPtr context_,
2021-09-07 11:17:25 +00:00
const String & compression_method_,
const HTTPHeaderEntries & headers_ = {},
2021-10-26 12:22:13 +00:00
const String & method_ = "",
ASTPtr partition_by_ = nullptr,
bool distributed_processing_ = false);
String getName() const override
{
return "URL";
}
2018-09-22 14:58:03 +00:00
Block getHeaderBlock(const Names & /*column_names*/, const StorageSnapshotPtr & storage_snapshot) const override
2018-09-22 14:58:03 +00:00
{
return storage_snapshot->metadata->getSampleBlock();
2018-09-22 14:58:03 +00:00
}
2021-04-21 12:32:57 +00:00
2023-07-06 17:47:01 +00:00
bool supportsSubcolumns() const override { return true; }
2021-04-21 12:32:57 +00:00
static FormatSettings getFormatSettingsFromArgs(const StorageFactory::Arguments & args);
2021-09-07 11:17:25 +00:00
2023-04-21 12:11:18 +00:00
struct Configuration : public StatelessTableEngineConfiguration
{
std::string url;
std::string http_method;
HTTPHeaderEntries headers;
2023-04-21 12:11:18 +00:00
std::string addresses_expr;
};
static Configuration getConfiguration(ASTs & args, ContextPtr context);
static ASTs::iterator collectHeaders(ASTs & url_function_args, HTTPHeaderEntries & header_entries, ContextPtr context);
2022-06-17 12:53:16 +00:00
static void processNamedCollectionResult(Configuration & configuration, const NamedCollection & collection);
2018-06-11 12:13:00 +00:00
};
2021-04-21 14:36:04 +00:00
2023-04-21 12:11:18 +00:00
2021-04-21 14:36:04 +00:00
/// StorageURLWithFailover is allowed only for URL table function, not as a separate storage.
class StorageURLWithFailover final : public StorageURL
{
public:
StorageURLWithFailover(
2023-04-21 12:11:18 +00:00
const std::vector<String> & uri_options_,
const StorageID & table_id_,
const String & format_name_,
const std::optional<FormatSettings> & format_settings_,
const ColumnsDescription & columns_,
const ConstraintsDescription & constraints_,
ContextPtr context_,
const String & compression_method_);
2021-04-21 14:36:04 +00:00
Pipe read(
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
2021-04-21 14:36:04 +00:00
SelectQueryInfo & query_info,
ContextPtr context,
QueryProcessingStage::Enum processed_stage,
size_t max_block_size,
size_t num_streams) override;
2021-04-21 14:36:04 +00:00
private:
2021-10-26 09:31:01 +00:00
std::vector<String> uri_options;
2018-06-11 12:13:00 +00:00
};
}