2018-06-11 12:13:00 +00:00
|
|
|
#include <Storages/StorageURL.h>
|
2022-06-23 20:04:06 +00:00
|
|
|
#include <Storages/PartitionedSink.h>
|
|
|
|
#include <Storages/checkAndGetLiteralArgument.h>
|
2022-12-16 22:57:09 +00:00
|
|
|
#include <Storages/NamedCollectionsHelpers.h>
|
2023-08-21 12:30:52 +00:00
|
|
|
#include <Storages/VirtualColumnUtils.h>
|
2018-06-11 12:13:00 +00:00
|
|
|
|
|
|
|
#include <Interpreters/evaluateConstantExpression.h>
|
2022-03-30 10:49:37 +00:00
|
|
|
#include <Interpreters/threadPoolCallbackRunner.h>
|
2020-11-05 11:28:20 +00:00
|
|
|
#include <Parsers/ASTCreateQuery.h>
|
2021-10-26 09:31:01 +00:00
|
|
|
#include <Parsers/ASTInsertQuery.h>
|
2022-03-14 09:27:09 +00:00
|
|
|
#include <Parsers/ASTLiteral.h>
|
2022-06-17 12:53:16 +00:00
|
|
|
#include <Parsers/ASTFunction.h>
|
|
|
|
#include <Parsers/ASTIdentifier.h>
|
2018-06-11 12:13:00 +00:00
|
|
|
|
2020-12-10 22:05:02 +00:00
|
|
|
#include <IO/ConnectionTimeouts.h>
|
2022-03-14 09:27:09 +00:00
|
|
|
#include <IO/WriteBufferFromHTTP.h>
|
|
|
|
#include <IO/WriteHelpers.h>
|
2018-06-11 12:13:00 +00:00
|
|
|
|
2018-06-13 07:36:47 +00:00
|
|
|
#include <Formats/FormatFactory.h>
|
2021-12-15 11:30:57 +00:00
|
|
|
#include <Formats/ReadSchemaUtils.h>
|
2021-10-13 18:22:02 +00:00
|
|
|
#include <Processors/Formats/IInputFormat.h>
|
2021-12-15 11:30:57 +00:00
|
|
|
#include <Processors/Formats/IOutputFormat.h>
|
2022-06-23 20:04:06 +00:00
|
|
|
#include <Processors/Executors/PullingPipelineExecutor.h>
|
|
|
|
#include <Processors/ISource.h>
|
2023-08-24 13:15:57 +00:00
|
|
|
#include <Processors/Sources/NullSource.h>
|
2023-07-04 16:50:31 +00:00
|
|
|
#include <Processors/Transforms/AddingDefaultsTransform.h>
|
|
|
|
#include <Processors/Transforms/ExtractColumnsTransform.h>
|
2023-08-22 11:59:59 +00:00
|
|
|
#include <Processors/Sources/ConstChunkGenerator.h>
|
2024-01-02 17:50:06 +00:00
|
|
|
#include <Processors/QueryPlan/QueryPlan.h>
|
|
|
|
#include <Processors/QueryPlan/SourceStepWithFilter.h>
|
2018-06-13 07:36:47 +00:00
|
|
|
|
2022-06-23 20:04:06 +00:00
|
|
|
#include <Common/ThreadStatus.h>
|
2021-10-26 09:31:01 +00:00
|
|
|
#include <Common/parseRemoteDescription.h>
|
2022-12-16 22:57:09 +00:00
|
|
|
#include <Common/NamedCollections/NamedCollections.h>
|
2023-08-24 13:07:26 +00:00
|
|
|
#include <Common/ProxyConfigurationResolverProvider.h>
|
2023-08-17 16:54:43 +00:00
|
|
|
#include <Common/ProfileEvents.h>
|
2023-12-23 13:46:21 +00:00
|
|
|
#include <Common/thread_local_rng.h>
|
|
|
|
#include <Common/logger_useful.h>
|
2024-01-07 22:28:08 +00:00
|
|
|
#include <Common/re2.h>
|
2022-06-23 20:04:06 +00:00
|
|
|
#include <IO/ReadWriteBufferFromHTTP.h>
|
2023-04-21 12:11:18 +00:00
|
|
|
#include <IO/HTTPHeaderEntries.h>
|
2018-06-11 12:13:00 +00:00
|
|
|
|
2022-03-14 09:27:09 +00:00
|
|
|
#include <algorithm>
|
2021-10-16 14:03:50 +00:00
|
|
|
#include <QueryPipeline/QueryPipelineBuilder.h>
|
2022-03-14 09:27:09 +00:00
|
|
|
#include <Poco/Net/HTTPRequest.h>
|
2023-05-02 08:54:14 +00:00
|
|
|
#include <DataTypes/DataTypeString.h>
|
2023-06-16 19:38:50 +00:00
|
|
|
#include <DataTypes/DataTypeLowCardinality.h>
|
2018-06-11 12:13:00 +00:00
|
|
|
|
2023-08-17 16:54:43 +00:00
|
|
|
namespace ProfileEvents
|
|
|
|
{
|
|
|
|
extern const Event EngineFileLikeReadFiles;
|
|
|
|
}
|
2018-06-11 12:13:00 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
/// Error codes referenced by this file; only declared here.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
    extern const int NETWORK_ERROR;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
|
2018-06-11 12:13:00 +00:00
|
|
|
|
2022-06-17 12:53:16 +00:00
|
|
|
/// Human-readable description of the arguments accepted by the URL storage engine.
static constexpr auto bad_arguments_error_message
    = "Storage URL requires 1-4 arguments: url, name of used format (taken from file extension by default), "
      "optional compression method, optional headers (specified as `headers('name'='value', 'name2'='value2')`)";
|
2021-09-10 11:11:52 +00:00
|
|
|
|
2022-12-16 22:57:09 +00:00
|
|
|
/// Configuration keys listed as mandatory for a URL configuration.
static const std::unordered_set<std::string_view> required_configuration_keys = {"url"};
|
|
|
|
|
|
|
|
/// Configuration keys that may be present but are not mandatory.
/// Both spellings are listed for the compression and HTTP-method keys.
static const std::unordered_set<std::string_view> optional_configuration_keys = {
    "format", "structure", "filename", "description",
    "compression", "compression_method",
    "method", "http_method",
    "headers.header.name", "headers.header.value",
};
|
|
|
|
|
|
|
|
/// Headers in config file will have structure "headers.header.name" and "headers.header.value".
|
|
|
|
/// But Poco::AbstractConfiguration converts them into "header", "header[1]", "header[2]".
|
2024-01-07 22:28:08 +00:00
|
|
|
static const std::vector<std::shared_ptr<re2::RE2>> optional_regex_keys = {
|
|
|
|
std::make_shared<re2::RE2>(R"(headers.header\[[0-9]*\].name)"),
|
|
|
|
std::make_shared<re2::RE2>(R"(headers.header\[[0-9]*\].value)"),
|
2022-12-16 22:57:09 +00:00
|
|
|
};
|
|
|
|
|
2022-02-07 19:40:47 +00:00
|
|
|
/// Returns true when `uri` contains a '|' character, or contains both '{' and '}'
/// (anywhere in the string, in any order).
static bool urlWithGlobs(const String & uri)
{
    constexpr auto npos = std::string::npos;

    if (uri.find('|') != npos)
        return true;

    const bool has_opening_brace = uri.find('{') != npos;
    const bool has_closing_brace = uri.find('}') != npos;
    return has_opening_brace && has_closing_brace;
}
|
|
|
|
|
2023-04-21 12:11:18 +00:00
|
|
|
/// Builds HTTP connection timeouts from the query settings of `context`
/// and the server-level keep-alive timeout.
static ConnectionTimeouts getHTTPTimeouts(ContextPtr context)
{
    const auto & query_settings = context->getSettingsRef();
    return ConnectionTimeouts::getHTTPTimeouts(query_settings, context->getServerSettings().keep_alive_timeout);
}
|
2022-02-07 19:40:47 +00:00
|
|
|
|
2019-08-24 21:20:20 +00:00
|
|
|
/// Common constructor of URL-based storages.
///
/// Stores the connection parameters, validates the format name and fills the in-memory metadata.
/// If `columns_` is empty the table structure is obtained via getTableStructureFromData();
/// otherwise the given columns are used, and anything but ordinary columns is rejected
/// with BAD_ARGUMENTS. Finally the virtual columns are computed from the resulting sample block.
IStorageURLBase::IStorageURLBase(
    const String & uri_,
    ContextPtr context_,
    const StorageID & table_id_,
    const String & format_name_,
    const std::optional<FormatSettings> & format_settings_,
    const ColumnsDescription & columns_,
    const ConstraintsDescription & constraints_,
    const String & comment,
    const String & compression_method_,
    const HTTPHeaderEntries & headers_,
    const String & http_method_,
    ASTPtr partition_by_,
    bool distributed_processing_)
    : IStorage(table_id_)
    , uri(uri_)
    /// The compression method is chosen from the path part of the URL and the explicitly requested method.
    , compression_method(chooseCompressionMethod(Poco::URI(uri_).getPath(), compression_method_))
    , format_name(format_name_)
    , format_settings(format_settings_)
    , headers(headers_)
    , http_method(http_method_)
    , partition_by(partition_by_)
    , distributed_processing(distributed_processing_)
{
    FormatFactory::instance().checkFormatName(format_name);

    StorageInMemoryMetadata metadata;

    if (!columns_.empty())
    {
        /// We don't allow special columns in URL storage.
        const bool only_ordinary_columns = columns_.hasOnlyOrdinary();
        if (!only_ordinary_columns)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine URL doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL");

        metadata.setColumns(columns_);
    }
    else
    {
        /// No explicit structure was given: take it from the data.
        auto detected_columns = getTableStructureFromData(format_name, uri, compression_method, headers, format_settings, context_);
        metadata.setColumns(detected_columns);
    }

    metadata.setConstraints(constraints_);
    metadata.setComment(comment);
    setInMemoryMetadata(metadata);

    virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(metadata.getSampleBlock().getNamesAndTypesList());
}
|
|
|
|
|
2023-01-19 02:19:04 +00:00
|
|
|
|
2023-04-21 12:11:18 +00:00
|
|
|
namespace
|
|
|
|
{
|
|
|
|
HTTPHeaderEntries getHeaders(const HTTPHeaderEntries & headers_)
|
2021-10-03 13:53:24 +00:00
|
|
|
{
|
2023-04-21 12:11:18 +00:00
|
|
|
HTTPHeaderEntries headers(headers_.begin(), headers_.end());
|
2022-03-16 08:45:17 +00:00
|
|
|
|
2023-04-21 12:11:18 +00:00
|
|
|
// Propagate OpenTelemetry trace context, if any, downstream.
|
2023-04-21 17:24:37 +00:00
|
|
|
const auto & current_trace_context = OpenTelemetry::CurrentContext();
|
2023-04-21 12:11:18 +00:00
|
|
|
if (current_trace_context.isTraceEnabled())
|
2021-10-03 13:53:24 +00:00
|
|
|
{
|
2023-04-21 12:11:18 +00:00
|
|
|
headers.emplace_back("traceparent", current_trace_context.composeTraceparentHeader());
|
2021-10-03 13:53:24 +00:00
|
|
|
|
2023-04-21 12:11:18 +00:00
|
|
|
if (!current_trace_context.tracestate.empty())
|
|
|
|
{
|
|
|
|
headers.emplace_back("tracestate", current_trace_context.tracestate);
|
|
|
|
}
|
|
|
|
}
|
2021-10-26 09:31:01 +00:00
|
|
|
|
2023-04-21 12:11:18 +00:00
|
|
|
return headers;
|
|
|
|
}
|
2021-10-03 13:53:24 +00:00
|
|
|
|
2023-04-21 17:24:37 +00:00
|
|
|
StorageURLSource::FailoverOptions getFailoverOptions(const String & uri, size_t max_addresses)
|
|
|
|
{
|
2023-04-21 17:28:14 +00:00
|
|
|
return parseRemoteDescription(uri, 0, uri.size(), '|', max_addresses);
|
2023-04-21 17:24:37 +00:00
|
|
|
}
|
2023-08-24 13:07:26 +00:00
|
|
|
|
|
|
|
auto getProxyConfiguration(const std::string & protocol_string)
|
|
|
|
{
|
|
|
|
auto protocol = protocol_string == "https" ? ProxyConfigurationResolver::Protocol::HTTPS
|
|
|
|
: ProxyConfigurationResolver::Protocol::HTTP;
|
2023-11-04 17:47:52 +00:00
|
|
|
return ProxyConfigurationResolverProvider::get(protocol, Context::getGlobalContextInstance()->getConfigRef())->resolve();
|
2023-08-24 13:07:26 +00:00
|
|
|
}
|
2023-04-21 17:24:37 +00:00
|
|
|
}
|
2021-12-17 11:03:37 +00:00
|
|
|
|
2023-04-21 17:24:37 +00:00
|
|
|
class StorageURLSource::DisclosedGlobIterator::Impl
|
|
|
|
{
|
|
|
|
public:
|
2024-01-02 15:18:13 +00:00
|
|
|
Impl(const String & uri_, size_t max_addresses, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context)
|
2023-01-19 02:19:04 +00:00
|
|
|
{
|
2023-08-17 16:54:43 +00:00
|
|
|
uris = parseRemoteDescription(uri_, 0, uri_.size(), ',', max_addresses);
|
|
|
|
|
2024-01-02 15:18:13 +00:00
|
|
|
ActionsDAGPtr filter_dag;
|
2023-08-18 17:49:40 +00:00
|
|
|
if (!uris.empty())
|
2024-01-02 15:18:13 +00:00
|
|
|
filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns);
|
2023-08-17 16:54:43 +00:00
|
|
|
|
2024-01-02 15:18:13 +00:00
|
|
|
if (filter_dag)
|
2023-08-17 16:54:43 +00:00
|
|
|
{
|
|
|
|
std::vector<String> paths;
|
|
|
|
paths.reserve(uris.size());
|
|
|
|
for (const auto & uri : uris)
|
|
|
|
paths.push_back(Poco::URI(uri).getPath());
|
|
|
|
|
2024-01-02 15:18:13 +00:00
|
|
|
VirtualColumnUtils::filterByPathOrFile(uris, paths, filter_dag, virtual_columns, context);
|
2023-08-17 16:54:43 +00:00
|
|
|
}
|
2023-04-21 17:24:37 +00:00
|
|
|
}
|
2021-12-27 19:42:56 +00:00
|
|
|
|
2023-04-21 17:24:37 +00:00
|
|
|
String next()
|
|
|
|
{
|
|
|
|
size_t current_index = index.fetch_add(1, std::memory_order_relaxed);
|
|
|
|
if (current_index >= uris.size())
|
|
|
|
return {};
|
|
|
|
|
|
|
|
return uris[current_index];
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t size()
|
|
|
|
{
|
|
|
|
return uris.size();
|
|
|
|
}
|
2022-02-08 11:57:35 +00:00
|
|
|
|
2023-04-21 17:24:37 +00:00
|
|
|
private:
|
|
|
|
Strings uris;
|
|
|
|
std::atomic_size_t index = 0;
|
|
|
|
};
|
|
|
|
|
2024-01-02 15:18:13 +00:00
|
|
|
/// Creates the shared implementation object; all arguments are forwarded to Impl unchanged.
StorageURLSource::DisclosedGlobIterator::DisclosedGlobIterator(
    const String & uri,
    size_t max_addresses,
    const ActionsDAG::Node * predicate,
    const NamesAndTypesList & virtual_columns,
    const ContextPtr & context)
    : pimpl(std::make_shared<Impl>(uri, max_addresses, predicate, virtual_columns, context))
{
}
|
2023-04-21 17:24:37 +00:00
|
|
|
|
|
|
|
/// Delegates to Impl::next(): the next URL, or an empty string when none are left.
String StorageURLSource::DisclosedGlobIterator::next()
{
    auto & impl = *pimpl;
    return impl.next();
}
|
|
|
|
|
|
|
|
/// Delegates to Impl::size(): the number of URLs held by the iterator.
size_t StorageURLSource::DisclosedGlobIterator::size()
{
    auto & impl = *pimpl;
    return impl.size();
}
|
|
|
|
|
|
|
|
/// Copies "user:password" from the user-info part of `request_uri` into `credentials`.
/// Nothing is changed when the user-info is empty or contains no ':' separator.
void StorageURLSource::setCredentials(Poco::Net::HTTPBasicCredentials & credentials, const Poco::URI & request_uri)
{
    const auto & user_info = request_uri.getUserInfo();

    /// An empty string has no ':' either, so one check covers both "nothing to do" cases.
    const std::size_t separator_pos = user_info.find(':');
    if (separator_pos == std::string::npos)
        return;

    credentials.setUsername(user_info.substr(0, separator_pos));
    credentials.setPassword(user_info.substr(separator_pos + 1));
}
|
2021-12-17 11:03:37 +00:00
|
|
|
|
2023-04-21 17:24:37 +00:00
|
|
|
/// Source that reads data from the URLs produced by `uri_iterator_`.
///
/// The constructor only stores its arguments and prepares the `initialize` callback; no request
/// is made here. `initialize` opens the next available URL, builds the reading pipeline for it
/// and returns false when the iterator has no more URLs.
StorageURLSource::StorageURLSource(
    const ReadFromFormatInfo & info,
    std::shared_ptr<IteratorWrapper> uri_iterator_,
    const std::string & http_method,
    std::function<void(std::ostream &)> callback,
    const String & format_,
    const std::optional<FormatSettings> & format_settings_,
    String name_,
    ContextPtr context_,
    UInt64 max_block_size,
    const ConnectionTimeouts & timeouts,
    CompressionMethod compression_method,
    size_t max_parsing_threads,
    const HTTPHeaderEntries & headers_,
    const URIParams & params,
    bool glob_url,
    bool need_only_count_)
    : SourceWithKeyCondition(info.source_header, false)
    , WithContext(context_)
    , name(std::move(name_))
    , columns_description(info.columns_description)
    , requested_columns(info.requested_columns)
    , requested_virtual_columns(info.requested_virtual_columns)
    , block_for_format(info.format_header)
    , uri_iterator(uri_iterator_)
    , format(format_)
    , format_settings(format_settings_)
    , headers(getHeaders(headers_))
    , need_only_count(need_only_count_)
{
    /// Lazy initialization. We should not perform requests in constructor, because we need to do it in query pipeline.
    initialize = [=, this]()
    {
        std::vector<String> failover_uris;
        std::pair<Poco::URI, std::unique_ptr<ReadWriteBufferFromHTTP>> opened;

        /// Take URLs from the iterator until one yields a buffer we are going to read.
        do
        {
            failover_uris = (*uri_iterator)();
            if (failover_uris.empty())
                return false;

            auto option_it = failover_uris.cbegin();
            opened = getFirstAvailableURIAndReadBuffer(
                option_it,
                failover_uris.end(),
                getContext(),
                params,
                http_method,
                callback,
                timeouts,
                credentials,
                headers,
                glob_url,
                failover_uris.size() == 1);

            /// If file is empty and engine_url_skip_empty_files=1, skip it and go to the next file.
        } while (getContext()->getSettingsRef().engine_url_skip_empty_files && opened.second->eof());

        curr_uri = opened.first;
        /// Must be read before the buffer is moved out of `opened`.
        auto last_mod_time = opened.second->tryGetLastModificationTime();
        read_buf = std::move(opened.second);
        current_file_size = tryGetFileSizeFromReadBuffer(*read_buf);

        if (auto file_progress_callback = getContext()->getFileProgressCallback())
            file_progress_callback(FileProgress(0, current_file_size.value_or(0)));

        QueryPipelineBuilder builder;

        std::optional<size_t> cached_num_rows;
        if (need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files)
            cached_num_rows = tryGetNumRowsFromCache(curr_uri.toString(), last_mod_time);

        if (!cached_num_rows)
        {
            // TODO: Pass max_parsing_threads and max_download_threads adjusted for num_streams.
            input_format = FormatFactory::instance().getInput(
                format,
                *read_buf,
                block_for_format,
                getContext(),
                max_block_size,
                format_settings,
                max_parsing_threads,
                /*max_download_threads*/ std::nullopt,
                /* is_remote_ fs */ true,
                compression_method,
                need_only_count);

            if (key_condition)
                input_format->setKeyCondition(key_condition);

            if (need_only_count)
                input_format->needOnlyCount();

            builder.init(Pipe(input_format));

            if (columns_description.hasDefaults())
            {
                builder.addSimpleTransform([this](const Block & cur_header)
                {
                    return std::make_shared<AddingDefaultsTransform>(cur_header, columns_description, *input_format, getContext());
                });
            }
        }
        else
        {
            /// We should not return single chunk with all number of rows,
            /// because there is a chance that this chunk will be materialized later
            /// (it can cause memory problems even with default values in columns or when virtual columns are requested).
            /// Instead, we use special ConstChunkGenerator that will generate chunks
            /// with max_block_size rows until total number of rows is reached.
            auto count_source = std::make_shared<ConstChunkGenerator>(block_for_format, *cached_num_rows, max_block_size);
            builder.init(Pipe(count_source));
        }

        /// Add ExtractColumnsTransform to extract requested columns/subcolumns
        /// from chunk read by IInputFormat.
        builder.addSimpleTransform([this](const Block & header)
        {
            return std::make_shared<ExtractColumnsTransform>(header, requested_columns);
        });

        pipeline = std::make_unique<QueryPipeline>(QueryPipelineBuilder::getPipeline(std::move(builder)));
        reader = std::make_unique<PullingPipelineExecutor>(*pipeline);

        ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles);
        return true;
    };
}
|
2023-01-19 05:18:07 +00:00
|
|
|
|
2023-04-21 17:24:37 +00:00
|
|
|
/// Produces the next chunk.
///
/// Reads from the current URL until it is exhausted, then releases the per-URL state and moves
/// on to the next URL via `initialize`. Returns an empty chunk when the source is cancelled or
/// when `initialize` reports that there are no more URLs.
Chunk StorageURLSource::generate()
{
    for (;;)
    {
        if (isCancelled())
        {
            if (reader)
                reader->cancel();
            return {};
        }

        /// No active reader means the next URL has to be opened first.
        if (!reader)
        {
            const bool opened_next = initialize();
            if (!opened_next)
                return {};
        }

        Chunk chunk;
        if (!reader->pull(chunk))
        {
            /// The current URL is exhausted. A row count is cached only when the data
            /// was read through an input format.
            if (input_format && getContext()->getSettingsRef().use_cache_for_count_from_files)
                addNumRowsToCache(curr_uri.toString(), total_rows_in_file);

            pipeline->reset();
            reader.reset();
            input_format.reset();
            read_buf.reset();
            total_rows_in_file = 0;
            continue;
        }

        const UInt64 num_rows = chunk.getNumRows();
        total_rows_in_file += num_rows;

        /// Use the format's estimate of bytes read when it is available and non-zero,
        /// otherwise fall back to the in-memory size of the chunk.
        size_t approx_bytes = 0;
        if (input_format)
            approx_bytes = input_format->getApproxBytesReadForChunk();
        progress(num_rows, approx_bytes ? approx_bytes : chunk.bytes());

        VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk(chunk, requested_virtual_columns, curr_uri.getPath(), current_file_size);
        return chunk;
    }
}
|
|
|
|
|
2023-06-13 14:43:50 +00:00
|
|
|
std::pair<Poco::URI, std::unique_ptr<ReadWriteBufferFromHTTP>> StorageURLSource::getFirstAvailableURIAndReadBuffer(
|
2023-04-21 17:24:37 +00:00
|
|
|
std::vector<String>::const_iterator & option,
|
|
|
|
const std::vector<String>::const_iterator & end,
|
2023-08-22 11:59:59 +00:00
|
|
|
ContextPtr context_,
|
2023-04-21 17:24:37 +00:00
|
|
|
const URIParams & params,
|
|
|
|
const String & http_method,
|
|
|
|
std::function<void(std::ostream &)> callback,
|
|
|
|
const ConnectionTimeouts & timeouts,
|
|
|
|
Poco::Net::HTTPBasicCredentials & credentials,
|
|
|
|
const HTTPHeaderEntries & headers,
|
|
|
|
bool glob_url,
|
|
|
|
bool delay_initialization)
|
|
|
|
{
|
|
|
|
String first_exception_message;
|
2023-08-22 11:59:59 +00:00
|
|
|
ReadSettings read_settings = context_->getReadSettings();
|
2023-04-21 17:24:37 +00:00
|
|
|
|
|
|
|
size_t options = std::distance(option, end);
|
2023-06-15 12:59:46 +00:00
|
|
|
std::pair<Poco::URI, std::unique_ptr<ReadWriteBufferFromHTTP>> last_skipped_empty_res;
|
2023-04-21 17:24:37 +00:00
|
|
|
for (; option != end; ++option)
|
|
|
|
{
|
|
|
|
bool skip_url_not_found_error = glob_url && read_settings.http_skip_not_found_url_for_globs && option == std::prev(end);
|
2023-08-22 11:59:59 +00:00
|
|
|
auto request_uri = Poco::URI(*option, context_->getSettingsRef().enable_url_encoding);
|
2021-10-03 04:28:28 +00:00
|
|
|
|
2023-04-21 17:24:37 +00:00
|
|
|
for (const auto & [param, value] : params)
|
|
|
|
request_uri.addQueryParameter(param, value);
|
2023-01-19 02:19:04 +00:00
|
|
|
|
2023-04-21 17:24:37 +00:00
|
|
|
setCredentials(credentials, request_uri);
|
2022-02-08 09:59:20 +00:00
|
|
|
|
2023-08-22 11:59:59 +00:00
|
|
|
const auto settings = context_->getSettings();
|
2023-01-19 02:19:04 +00:00
|
|
|
|
2023-08-24 13:07:26 +00:00
|
|
|
auto proxy_config = getProxyConfiguration(http_method);
|
|
|
|
|
2023-05-05 03:11:51 +00:00
|
|
|
try
|
2023-04-21 17:24:37 +00:00
|
|
|
{
|
2023-05-05 03:11:51 +00:00
|
|
|
auto res = std::make_unique<ReadWriteBufferFromHTTP>(
|
|
|
|
request_uri,
|
|
|
|
http_method,
|
|
|
|
callback,
|
|
|
|
timeouts,
|
|
|
|
credentials,
|
|
|
|
settings.max_http_get_redirects,
|
|
|
|
settings.max_read_buffer_size,
|
|
|
|
read_settings,
|
|
|
|
headers,
|
2023-08-22 11:59:59 +00:00
|
|
|
&context_->getRemoteHostFilter(),
|
2023-05-05 03:11:51 +00:00
|
|
|
delay_initialization,
|
|
|
|
/* use_external_buffer */ false,
|
2023-08-24 13:07:26 +00:00
|
|
|
/* skip_url_not_found_error */ skip_url_not_found_error,
|
|
|
|
/* file_info */ std::nullopt,
|
|
|
|
proxy_config);
|
2023-05-05 03:11:51 +00:00
|
|
|
|
2023-08-22 11:59:59 +00:00
|
|
|
if (context_->getSettingsRef().engine_url_skip_empty_files && res->eof() && option != std::prev(end))
|
2023-06-15 12:59:46 +00:00
|
|
|
{
|
|
|
|
last_skipped_empty_res = {request_uri, std::move(res)};
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2023-05-05 03:11:51 +00:00
|
|
|
return std::make_tuple(request_uri, std::move(res));
|
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
|
|
|
if (options == 1)
|
|
|
|
throw;
|
2023-04-21 17:24:37 +00:00
|
|
|
|
2023-05-05 03:11:51 +00:00
|
|
|
if (first_exception_message.empty())
|
|
|
|
first_exception_message = getCurrentExceptionMessage(false);
|
2023-04-21 17:24:37 +00:00
|
|
|
|
2023-05-05 03:11:51 +00:00
|
|
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
2023-04-21 17:24:37 +00:00
|
|
|
|
2023-05-05 03:11:51 +00:00
|
|
|
continue;
|
|
|
|
}
|
2023-04-21 17:24:37 +00:00
|
|
|
}
|
|
|
|
|
2023-06-15 12:59:46 +00:00
|
|
|
/// If all options are unreachable except empty ones that we skipped,
|
|
|
|
/// return last empty result. It will be skipped later.
|
|
|
|
if (last_skipped_empty_res.second)
|
|
|
|
return last_skipped_empty_res;
|
|
|
|
|
2023-04-21 17:24:37 +00:00
|
|
|
throw Exception(ErrorCodes::NETWORK_ERROR, "All uri ({}) options are unreachable: {}", options, first_exception_message);
|
2018-06-11 12:13:00 +00:00
|
|
|
}
|
2018-06-16 05:54:06 +00:00
|
|
|
|
2024-01-13 02:48:04 +00:00
|
|
|
void StorageURLSource::addNumRowsToCache(const String & uri, size_t num_rows)
|
2023-08-22 11:59:59 +00:00
|
|
|
{
|
|
|
|
auto cache_key = getKeyForSchemaCache(uri, format, format_settings, getContext());
|
|
|
|
StorageURL::getSchemaCache(getContext()).addNumRows(cache_key, num_rows);
|
|
|
|
}
|
|
|
|
|
2024-01-13 02:48:04 +00:00
|
|
|
std::optional<size_t> StorageURLSource::tryGetNumRowsFromCache(const String & uri, std::optional<time_t> last_mod_time)
|
2023-08-22 11:59:59 +00:00
|
|
|
{
|
|
|
|
auto cache_key = getKeyForSchemaCache(uri, format, format_settings, getContext());
|
|
|
|
auto get_last_mod_time = [&]() -> std::optional<time_t>
|
|
|
|
{
|
|
|
|
/// Some URLs could not have Last-Modified header, in this case we cannot be sure that
|
|
|
|
/// data wasn't changed after adding it's schema to cache. Use schema from cache only if
|
|
|
|
/// special setting for this case is enabled.
|
|
|
|
if (!last_mod_time && !getContext()->getSettingsRef().schema_inference_cache_require_modification_time_for_url)
|
|
|
|
return 0;
|
|
|
|
return last_mod_time;
|
|
|
|
};
|
|
|
|
|
|
|
|
return StorageURL::getSchemaCache(getContext()).tryGetNumRows(cache_key, get_last_mod_time);
|
|
|
|
}
|
|
|
|
|
2021-07-23 14:25:35 +00:00
|
|
|
/// Sink that sends blocks with structure `sample_block` to `uri` over HTTP.
///
/// Builds the chain: output format -> optional compression -> HTTP write buffer.
/// The Content-Type comes from the format, the Content-Encoding from `compression_method`;
/// compression level and zstd window log are taken from the context settings.
StorageURLSink::StorageURLSink(
    const String & uri,
    const String & format,
    const std::optional<FormatSettings> & format_settings,
    const Block & sample_block,
    ContextPtr context,
    const ConnectionTimeouts & timeouts,
    const CompressionMethod compression_method,
    const HTTPHeaderEntries & headers,
    const String & http_method)
    : SinkToStorage(sample_block)
{
    std::string content_type = FormatFactory::instance().getContentType(format, context, format_settings);
    std::string content_encoding = toContentEncodingName(compression_method);

    /// The proxy is selected by the scheme of the target URL ("http" / "https"),
    /// not by the HTTP verb of the request.
    const Poco::URI request_uri(uri);
    auto proxy_config = getProxyConfiguration(request_uri.getScheme());

    auto write_buffer = std::make_unique<WriteBufferFromHTTP>(
        request_uri, http_method, content_type, content_encoding, headers, timeouts, DBMS_DEFAULT_BUFFER_SIZE, proxy_config
    );

    const auto & settings = context->getSettingsRef();
    write_buf = wrapWriteBufferWithCompressionMethod(
        std::move(write_buffer),
        compression_method,
        static_cast<int>(settings.output_format_compression_level),
        static_cast<int>(settings.output_format_compression_zstd_window_log));
    writer = FormatFactory::instance().getOutputFormat(format, *write_buf, sample_block, context, format_settings);
}
|
2018-06-16 05:54:06 +00:00
|
|
|
|
2023-04-21 12:11:18 +00:00
|
|
|
|
2021-07-23 14:25:35 +00:00
|
|
|
/// Writes one chunk through the output format. Does nothing once the sink has been cancelled.
/// Holds `cancel_mutex` for the duration of the call.
void StorageURLSink::consume(Chunk chunk)
{
    std::lock_guard lock(cancel_mutex);

    if (!cancelled)
        writer->write(getHeader().cloneWithColumns(chunk.detachColumns()));
}
|
|
|
|
|
2022-07-21 12:18:37 +00:00
|
|
|
void StorageURLSink::onCancel()
|
|
|
|
{
|
|
|
|
std::lock_guard lock(cancel_mutex);
|
|
|
|
finalize();
|
|
|
|
cancelled = true;
|
|
|
|
}
|
|
|
|
|
2023-06-22 12:33:25 +00:00
|
|
|
void StorageURLSink::onException(std::exception_ptr exception)
|
2022-05-06 17:30:18 +00:00
|
|
|
{
|
2022-07-21 12:18:37 +00:00
|
|
|
std::lock_guard lock(cancel_mutex);
|
2023-06-22 12:33:25 +00:00
|
|
|
try
|
|
|
|
{
|
|
|
|
std::rethrow_exception(exception);
|
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
|
|
|
/// An exception context is needed to proper delete write buffers without finalization
|
|
|
|
release();
|
|
|
|
}
|
2022-05-06 17:30:18 +00:00
|
|
|
}
|
|
|
|
|
2021-07-23 14:25:35 +00:00
|
|
|
void StorageURLSink::onFinish()
|
2020-07-09 01:00:16 +00:00
|
|
|
{
|
2022-07-21 12:18:37 +00:00
|
|
|
std::lock_guard lock(cancel_mutex);
|
|
|
|
finalize();
|
|
|
|
}
|
|
|
|
|
|
|
|
void StorageURLSink::finalize()
|
|
|
|
{
|
|
|
|
if (!writer)
|
|
|
|
return;
|
|
|
|
|
Fix possible "Cannot write to finalized buffer"
It is still possible to get this error since onException does not
finalize format correctly.
Here is an example of such error, that was found by CI [1]:
<details>
[ 2686 ] {fa01bf02-73f6-4f7f-b14f-e725de6d7f9b} <Fatal> : Logical error: 'Cannot write to finalized buffer'.
[ 34577 ] {} <Fatal> BaseDaemon: ########################################
[ 34577 ] {} <Fatal> BaseDaemon: (version 22.6.1.1, build id: AB8040A6769E01A0) (from thread 2686) (query_id: fa01bf02-73f6-4f7f-b14f-e725de6d7f9b) (query: insert into test_02302 select number from numbers(10) settings s3_truncate_on_insert=1;) Received signal Aborted (6)
[ 34577 ] {} <Fatal> BaseDaemon:
[ 34577 ] {} <Fatal> BaseDaemon: Stack trace: 0x7fcbaa5a703b 0x7fcbaa586859 0xfad9bab 0xfad9e05 0xfaf6a3b 0x24a48c7f 0x258fb9b9 0x258f2004 0x258b88f4 0x258b863b 0x2581773d 0x258177ce 0x24bb5e98 0xfad01d6 0xfad0105 0x2419b11d 0xfad01d6 0xfad0105 0x2215afbb 0x2215aa48 0xfad01d6 0xfad0105 0xfcc265d 0x225cc546 0x249a1c40 0x249bc1b6 0x2685902c 0x26859505 0x269d7767 0x269d504c 0x7fcbaa75e609 0x7fcbaa683163
[ 34577 ] {} <Fatal> BaseDaemon: 3. raise @ 0x7fcbaa5a703b in ?
[ 34577 ] {} <Fatal> BaseDaemon: 4. abort @ 0x7fcbaa586859 in ?
[ 34577 ] {} <Fatal> BaseDaemon: 5. ./build_docker/../src/Common/Exception.cpp:47: DB::abortOnFailedAssertion(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) @ 0xfad9bab in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 6. ./build_docker/../src/Common/Exception.cpp:70: DB::Exception::Exception(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int, bool) @ 0xfad9e05 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 7. ./build_docker/../src/IO/WriteBuffer.h:0: DB::WriteBuffer::write(char const*, unsigned long) @ 0xfaf6a3b in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 8. ./build_docker/../src/Processors/Formats/Impl/ArrowBufferedStreams.cpp:47: DB::ArrowBufferedOutputStream::Write(void const*, long) @ 0x24a48c7f in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 9. long parquet::ThriftSerializer::Serialize<parquet::format::FileMetaData>(parquet::format::FileMetaData const*, arrow::io::OutputStream*, std::__1::shared_ptr<parquet::Encryptor> const&) @ 0x258fb9b9 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 10. parquet::FileMetaData::FileMetaDataImpl::WriteTo(arrow::io::OutputStream*, std::__1::shared_ptr<parquet::Encryptor> const&) const @ 0x258f2004 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 11. parquet::WriteFileMetaData(parquet::FileMetaData const&, arrow::io::OutputStream*) @ 0x258b88f4 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 12. parquet::ParquetFileWriter::~ParquetFileWriter() @ 0x258b863b in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 13. parquet::arrow::FileWriterImpl::~FileWriterImpl() @ 0x2581773d in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 14. parquet::arrow::FileWriterImpl::~FileWriterImpl() @ 0x258177ce in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 15. ./build_docker/../src/Processors/Formats/Impl/ParquetBlockOutputFormat.h:27: DB::ParquetBlockOutputFormat::~ParquetBlockOutputFormat() @ 0x24bb5e98 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 16. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:173: std::__1::__shared_count::__release_shared() @ 0xfad01d6 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 17. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:216: std::__1::__shared_weak_count::__release_shared() @ 0xfad0105 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 18.1. inlined from ./build_docker/../contrib/libcxx/include/__memory/unique_ptr.h:312: std::__1::unique_ptr<DB::WriteBuffer, std::__1::default_delete<DB::WriteBuffer> >::reset(DB::WriteBuffer*)
[ 34577 ] {} <Fatal> BaseDaemon: 18.2. inlined from ../contrib/libcxx/include/__memory/unique_ptr.h:269: ~unique_ptr
[ 34577 ] {} <Fatal> BaseDaemon: 18. ../src/Storages/StorageS3.cpp:566: DB::StorageS3Sink::~StorageS3Sink() @ 0x2419b11d in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 19. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:173: std::__1::__shared_count::__release_shared() @ 0xfad01d6 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 20. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:216: std::__1::__shared_weak_count::__release_shared() @ 0xfad0105 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 21. ./build_docker/../contrib/abseil-cpp/absl/container/internal/raw_hash_set.h:1662: absl::lts_20211102::container_internal::raw_hash_set<absl::lts_20211102::container_internal::FlatHashMapPolicy<StringRef, std::__1::shared_ptr<DB::SinkToStorage> >, absl::lts_20211102::hash_internal::Hash<StringRef>, std::__1::equal_to<StringRef>, std::__1::allocator<std::__1::pair<StringRef const, std::__1::shared_ptr<DB::SinkToStorage> > > >::destroy_slots() @ 0x2215afbb in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 22.1. inlined from ./build_docker/../contrib/libcxx/include/string:1445: std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__is_long() const
[ 34577 ] {} <Fatal> BaseDaemon: 22.2. inlined from ../contrib/libcxx/include/string:2231: ~basic_string
[ 34577 ] {} <Fatal> BaseDaemon: 22. ../src/Storages/PartitionedSink.h:14: DB::PartitionedSink::~PartitionedSink() @ 0x2215aa48 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 23. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:173: std::__1::__shared_count::__release_shared() @ 0xfad01d6 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 24. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:216: std::__1::__shared_weak_count::__release_shared() @ 0xfad0105 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 25. ./build_docker/../contrib/libcxx/include/vector:802: std::__1::vector<std::__1::shared_ptr<DB::IProcessor>, std::__1::allocator<std::__1::shared_ptr<DB::IProcessor> > >::__base_destruct_at_end(std::__1::shared_ptr<DB::IProcessor>*) @ 0xfcc265d in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 26.1. inlined from ./build_docker/../contrib/libcxx/include/vector:402: ~vector
[ 34577 ] {} <Fatal> BaseDaemon: 26.2. inlined from ../src/QueryPipeline/QueryPipeline.cpp:29: ~QueryPipeline
[ 34577 ] {} <Fatal> BaseDaemon: 26. ../src/QueryPipeline/QueryPipeline.cpp:535: DB::QueryPipeline::reset() @ 0x225cc546 in /usr/bin/clickhouse
[ 614 ] {} <Fatal> Application: Child process was terminated by signal 6.
</details>
[1]: https://s3.amazonaws.com/clickhouse-test-reports/37542/8a224239c1d922158b4dc9f5d6609dca836dfd06/stress_test__undefined__actions_.html
Follow-up for: #36979
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2022-05-30 10:13:47 +00:00
|
|
|
try
|
|
|
|
{
|
|
|
|
writer->finalize();
|
|
|
|
writer->flush();
|
|
|
|
write_buf->finalize();
|
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
|
|
|
/// Stop ParallelFormattingOutputFormat correctly.
|
2023-06-22 12:33:25 +00:00
|
|
|
release();
|
Fix possible "Cannot write to finalized buffer"
It is still possible to get this error since onException does not
finalize format correctly.
Here is an example of such error, that was found by CI [1]:
<details>
[ 2686 ] {fa01bf02-73f6-4f7f-b14f-e725de6d7f9b} <Fatal> : Logical error: 'Cannot write to finalized buffer'.
[ 34577 ] {} <Fatal> BaseDaemon: ########################################
[ 34577 ] {} <Fatal> BaseDaemon: (version 22.6.1.1, build id: AB8040A6769E01A0) (from thread 2686) (query_id: fa01bf02-73f6-4f7f-b14f-e725de6d7f9b) (query: insert into test_02302 select number from numbers(10) settings s3_truncate_on_insert=1;) Received signal Aborted (6)
[ 34577 ] {} <Fatal> BaseDaemon:
[ 34577 ] {} <Fatal> BaseDaemon: Stack trace: 0x7fcbaa5a703b 0x7fcbaa586859 0xfad9bab 0xfad9e05 0xfaf6a3b 0x24a48c7f 0x258fb9b9 0x258f2004 0x258b88f4 0x258b863b 0x2581773d 0x258177ce 0x24bb5e98 0xfad01d6 0xfad0105 0x2419b11d 0xfad01d6 0xfad0105 0x2215afbb 0x2215aa48 0xfad01d6 0xfad0105 0xfcc265d 0x225cc546 0x249a1c40 0x249bc1b6 0x2685902c 0x26859505 0x269d7767 0x269d504c 0x7fcbaa75e609 0x7fcbaa683163
[ 34577 ] {} <Fatal> BaseDaemon: 3. raise @ 0x7fcbaa5a703b in ?
[ 34577 ] {} <Fatal> BaseDaemon: 4. abort @ 0x7fcbaa586859 in ?
[ 34577 ] {} <Fatal> BaseDaemon: 5. ./build_docker/../src/Common/Exception.cpp:47: DB::abortOnFailedAssertion(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) @ 0xfad9bab in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 6. ./build_docker/../src/Common/Exception.cpp:70: DB::Exception::Exception(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int, bool) @ 0xfad9e05 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 7. ./build_docker/../src/IO/WriteBuffer.h:0: DB::WriteBuffer::write(char const*, unsigned long) @ 0xfaf6a3b in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 8. ./build_docker/../src/Processors/Formats/Impl/ArrowBufferedStreams.cpp:47: DB::ArrowBufferedOutputStream::Write(void const*, long) @ 0x24a48c7f in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 9. long parquet::ThriftSerializer::Serialize<parquet::format::FileMetaData>(parquet::format::FileMetaData const*, arrow::io::OutputStream*, std::__1::shared_ptr<parquet::Encryptor> const&) @ 0x258fb9b9 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 10. parquet::FileMetaData::FileMetaDataImpl::WriteTo(arrow::io::OutputStream*, std::__1::shared_ptr<parquet::Encryptor> const&) const @ 0x258f2004 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 11. parquet::WriteFileMetaData(parquet::FileMetaData const&, arrow::io::OutputStream*) @ 0x258b88f4 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 12. parquet::ParquetFileWriter::~ParquetFileWriter() @ 0x258b863b in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 13. parquet::arrow::FileWriterImpl::~FileWriterImpl() @ 0x2581773d in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 14. parquet::arrow::FileWriterImpl::~FileWriterImpl() @ 0x258177ce in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 15. ./build_docker/../src/Processors/Formats/Impl/ParquetBlockOutputFormat.h:27: DB::ParquetBlockOutputFormat::~ParquetBlockOutputFormat() @ 0x24bb5e98 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 16. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:173: std::__1::__shared_count::__release_shared() @ 0xfad01d6 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 17. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:216: std::__1::__shared_weak_count::__release_shared() @ 0xfad0105 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 18.1. inlined from ./build_docker/../contrib/libcxx/include/__memory/unique_ptr.h:312: std::__1::unique_ptr<DB::WriteBuffer, std::__1::default_delete<DB::WriteBuffer> >::reset(DB::WriteBuffer*)
[ 34577 ] {} <Fatal> BaseDaemon: 18.2. inlined from ../contrib/libcxx/include/__memory/unique_ptr.h:269: ~unique_ptr
[ 34577 ] {} <Fatal> BaseDaemon: 18. ../src/Storages/StorageS3.cpp:566: DB::StorageS3Sink::~StorageS3Sink() @ 0x2419b11d in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 19. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:173: std::__1::__shared_count::__release_shared() @ 0xfad01d6 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 20. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:216: std::__1::__shared_weak_count::__release_shared() @ 0xfad0105 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 21. ./build_docker/../contrib/abseil-cpp/absl/container/internal/raw_hash_set.h:1662: absl::lts_20211102::container_internal::raw_hash_set<absl::lts_20211102::container_internal::FlatHashMapPolicy<StringRef, std::__1::shared_ptr<DB::SinkToStorage> >, absl::lts_20211102::hash_internal::Hash<StringRef>, std::__1::equal_to<StringRef>, std::__1::allocator<std::__1::pair<StringRef const, std::__1::shared_ptr<DB::SinkToStorage> > > >::destroy_slots() @ 0x2215afbb in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 22.1. inlined from ./build_docker/../contrib/libcxx/include/string:1445: std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__is_long() const
[ 34577 ] {} <Fatal> BaseDaemon: 22.2. inlined from ../contrib/libcxx/include/string:2231: ~basic_string
[ 34577 ] {} <Fatal> BaseDaemon: 22. ../src/Storages/PartitionedSink.h:14: DB::PartitionedSink::~PartitionedSink() @ 0x2215aa48 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 23. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:173: std::__1::__shared_count::__release_shared() @ 0xfad01d6 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 24. ./build_docker/../contrib/libcxx/include/__memory/shared_ptr.h:216: std::__1::__shared_weak_count::__release_shared() @ 0xfad0105 in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 25. ./build_docker/../contrib/libcxx/include/vector:802: std::__1::vector<std::__1::shared_ptr<DB::IProcessor>, std::__1::allocator<std::__1::shared_ptr<DB::IProcessor> > >::__base_destruct_at_end(std::__1::shared_ptr<DB::IProcessor>*) @ 0xfcc265d in /usr/bin/clickhouse
[ 34577 ] {} <Fatal> BaseDaemon: 26.1. inlined from ./build_docker/../contrib/libcxx/include/vector:402: ~vector
[ 34577 ] {} <Fatal> BaseDaemon: 26.2. inlined from ../src/QueryPipeline/QueryPipeline.cpp:29: ~QueryPipeline
[ 34577 ] {} <Fatal> BaseDaemon: 26. ../src/QueryPipeline/QueryPipeline.cpp:535: DB::QueryPipeline::reset() @ 0x225cc546 in /usr/bin/clickhouse
[ 614 ] {} <Fatal> Application: Child process was terminated by signal 6.
</details>
[1]: https://s3.amazonaws.com/clickhouse-test-reports/37542/8a224239c1d922158b4dc9f5d6609dca836dfd06/stress_test__undefined__actions_.html
Follow-up for: #36979
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2022-05-30 10:13:47 +00:00
|
|
|
throw;
|
|
|
|
}
|
2020-07-09 01:00:16 +00:00
|
|
|
}
|
|
|
|
|
2023-06-22 12:33:25 +00:00
|
|
|
void StorageURLSink::release()
|
|
|
|
{
|
|
|
|
writer.reset();
|
|
|
|
write_buf->finalize();
|
|
|
|
}
|
|
|
|
|
2021-10-26 09:31:01 +00:00
|
|
|
class PartitionedStorageURLSink : public PartitionedSink
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
PartitionedStorageURLSink(
|
|
|
|
const ASTPtr & partition_by,
|
|
|
|
const String & uri_,
|
|
|
|
const String & format_,
|
|
|
|
const std::optional<FormatSettings> & format_settings_,
|
|
|
|
const Block & sample_block_,
|
|
|
|
ContextPtr context_,
|
|
|
|
const ConnectionTimeouts & timeouts_,
|
|
|
|
const CompressionMethod compression_method_,
|
2023-04-21 12:11:18 +00:00
|
|
|
const HTTPHeaderEntries & headers_,
|
2021-10-28 12:44:12 +00:00
|
|
|
const String & http_method_)
|
2022-03-14 09:27:09 +00:00
|
|
|
: PartitionedSink(partition_by, context_, sample_block_)
|
|
|
|
, uri(uri_)
|
|
|
|
, format(format_)
|
|
|
|
, format_settings(format_settings_)
|
|
|
|
, sample_block(sample_block_)
|
|
|
|
, context(context_)
|
|
|
|
, timeouts(timeouts_)
|
|
|
|
, compression_method(compression_method_)
|
2023-04-21 12:11:18 +00:00
|
|
|
, headers(headers_)
|
2022-03-14 09:27:09 +00:00
|
|
|
, http_method(http_method_)
|
2021-10-26 09:31:01 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
SinkPtr createSinkForPartition(const String & partition_id) override
|
|
|
|
{
|
|
|
|
auto partition_path = PartitionedSink::replaceWildcards(uri, partition_id);
|
|
|
|
context->getRemoteHostFilter().checkURL(Poco::URI(partition_path));
|
2022-03-14 09:27:09 +00:00
|
|
|
return std::make_shared<StorageURLSink>(
|
2023-04-21 12:11:18 +00:00
|
|
|
partition_path, format, format_settings, sample_block, context, timeouts, compression_method, headers, http_method);
|
2021-10-26 09:31:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
const String uri;
|
|
|
|
const String format;
|
|
|
|
const std::optional<FormatSettings> format_settings;
|
|
|
|
const Block sample_block;
|
|
|
|
ContextPtr context;
|
|
|
|
const ConnectionTimeouts timeouts;
|
|
|
|
|
|
|
|
const CompressionMethod compression_method;
|
2023-04-21 12:11:18 +00:00
|
|
|
const HTTPHeaderEntries headers;
|
2021-10-28 12:44:12 +00:00
|
|
|
const String http_method;
|
2021-10-26 09:31:01 +00:00
|
|
|
};
|
2020-07-09 01:00:16 +00:00
|
|
|
|
2018-08-09 18:49:05 +00:00
|
|
|
std::string IStorageURLBase::getReadMethod() const
|
|
|
|
{
|
|
|
|
return Poco::Net::HTTPRequest::HTTP_GET;
|
|
|
|
}
|
|
|
|
|
2020-06-17 16:39:58 +00:00
|
|
|
std::vector<std::pair<std::string, std::string>> IStorageURLBase::getReadURIParams(
|
|
|
|
const Names & /*column_names*/,
|
2021-07-09 03:15:41 +00:00
|
|
|
const StorageSnapshotPtr & /*storage_snapshot*/,
|
2018-06-11 12:13:00 +00:00
|
|
|
const SelectQueryInfo & /*query_info*/,
|
2021-04-10 23:33:54 +00:00
|
|
|
ContextPtr /*context*/,
|
2018-06-11 12:13:00 +00:00
|
|
|
QueryProcessingStage::Enum & /*processed_stage*/,
|
2018-08-09 18:49:05 +00:00
|
|
|
size_t /*max_block_size*/) const
|
|
|
|
{
|
|
|
|
return {};
|
|
|
|
}
|
|
|
|
|
2020-06-17 16:39:58 +00:00
|
|
|
std::function<void(std::ostream &)> IStorageURLBase::getReadPOSTDataCallback(
|
|
|
|
const Names & /*column_names*/,
|
2022-02-28 13:29:05 +00:00
|
|
|
const ColumnsDescription & /* columns_description */,
|
2018-06-11 12:13:00 +00:00
|
|
|
const SelectQueryInfo & /*query_info*/,
|
2021-04-10 23:33:54 +00:00
|
|
|
ContextPtr /*context*/,
|
2018-08-09 18:49:05 +00:00
|
|
|
QueryProcessingStage::Enum & /*processed_stage*/,
|
|
|
|
size_t /*max_block_size*/) const
|
|
|
|
{
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2023-08-22 11:59:59 +00:00
|
|
|
namespace
|
|
|
|
{
|
|
|
|
class ReadBufferIterator : public IReadBufferIterator, WithContext
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
ReadBufferIterator(
|
|
|
|
const std::vector<String> & urls_to_check_,
|
|
|
|
const String & format_,
|
|
|
|
const CompressionMethod & compression_method_,
|
|
|
|
const HTTPHeaderEntries & headers_,
|
|
|
|
const std::optional<FormatSettings> & format_settings_,
|
|
|
|
const ContextPtr & context_)
|
2023-10-20 20:46:41 +00:00
|
|
|
: WithContext(context_), format(format_), compression_method(compression_method_), headers(headers_), format_settings(format_settings_)
|
2023-08-22 11:59:59 +00:00
|
|
|
{
|
2023-10-20 20:46:41 +00:00
|
|
|
url_options_to_check.reserve(urls_to_check_.size());
|
|
|
|
for (const auto & url : urls_to_check_)
|
|
|
|
url_options_to_check.push_back(getFailoverOptions(url, getContext()->getSettingsRef().glob_expansion_max_elements));
|
2023-08-22 11:59:59 +00:00
|
|
|
}
|
|
|
|
|
2023-10-20 20:46:41 +00:00
|
|
|
std::pair<std::unique_ptr<ReadBuffer>, std::optional<ColumnsDescription>> next() override
|
2023-08-22 11:59:59 +00:00
|
|
|
{
|
2023-10-20 20:46:41 +00:00
|
|
|
bool is_first = (current_index == 0);
|
|
|
|
/// For default mode check cached columns for all urls on first iteration.
|
|
|
|
if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT)
|
|
|
|
{
|
|
|
|
for (const auto & options : url_options_to_check)
|
|
|
|
{
|
|
|
|
if (auto cached_columns = tryGetColumnsFromCache(options))
|
|
|
|
return {nullptr, cached_columns};
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-08-22 11:59:59 +00:00
|
|
|
std::pair<Poco::URI, std::unique_ptr<ReadWriteBufferFromHTTP>> uri_and_buf;
|
|
|
|
do
|
|
|
|
{
|
2023-10-20 20:46:41 +00:00
|
|
|
if (current_index == url_options_to_check.size())
|
2023-08-22 11:59:59 +00:00
|
|
|
{
|
2023-10-20 20:46:41 +00:00
|
|
|
if (is_first)
|
2023-08-22 11:59:59 +00:00
|
|
|
throw Exception(
|
|
|
|
ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
|
|
|
|
"Cannot extract table structure from {} format file, because all files are empty. "
|
|
|
|
"You must specify table structure manually",
|
|
|
|
format);
|
2023-10-20 20:46:41 +00:00
|
|
|
return {nullptr, std::nullopt};
|
|
|
|
}
|
|
|
|
|
|
|
|
if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION)
|
|
|
|
{
|
|
|
|
if (auto cached_columns = tryGetColumnsFromCache(url_options_to_check[current_index]))
|
|
|
|
{
|
|
|
|
++current_index;
|
|
|
|
return {nullptr, cached_columns};
|
|
|
|
}
|
2023-08-22 11:59:59 +00:00
|
|
|
}
|
|
|
|
|
2023-10-20 20:46:41 +00:00
|
|
|
auto first_option = url_options_to_check[current_index].cbegin();
|
2023-08-22 11:59:59 +00:00
|
|
|
uri_and_buf = StorageURLSource::getFirstAvailableURIAndReadBuffer(
|
2023-10-20 20:46:41 +00:00
|
|
|
first_option,
|
|
|
|
url_options_to_check[current_index].cend(),
|
2023-08-22 11:59:59 +00:00
|
|
|
getContext(),
|
|
|
|
{},
|
|
|
|
Poco::Net::HTTPRequest::HTTP_GET,
|
|
|
|
{},
|
|
|
|
getHTTPTimeouts(getContext()),
|
|
|
|
credentials,
|
|
|
|
headers,
|
|
|
|
false,
|
|
|
|
false);
|
|
|
|
|
2023-10-20 20:46:41 +00:00
|
|
|
++current_index;
|
2023-08-22 11:59:59 +00:00
|
|
|
} while (getContext()->getSettingsRef().engine_url_skip_empty_files && uri_and_buf.second->eof());
|
|
|
|
|
2023-10-20 20:46:41 +00:00
|
|
|
current_url_option = uri_and_buf.first.toString();
|
|
|
|
return {wrapReadBufferWithCompressionMethod(
|
2023-08-22 11:59:59 +00:00
|
|
|
std::move(uri_and_buf.second),
|
|
|
|
compression_method,
|
2023-10-20 20:46:41 +00:00
|
|
|
static_cast<int>(getContext()->getSettingsRef().zstd_window_log_max)), std::nullopt};
|
2023-08-22 11:59:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void setNumRowsToLastFile(size_t num_rows) override
|
|
|
|
{
|
2023-10-20 20:46:41 +00:00
|
|
|
if (!getContext()->getSettingsRef().schema_inference_use_cache_for_url)
|
2023-08-22 11:59:59 +00:00
|
|
|
return;
|
|
|
|
|
2023-10-20 20:46:41 +00:00
|
|
|
auto key = getKeyForSchemaCache(current_url_option, format, format_settings, getContext());
|
2023-08-22 11:59:59 +00:00
|
|
|
StorageURL::getSchemaCache(getContext()).addNumRows(key, num_rows);
|
|
|
|
}
|
|
|
|
|
2023-10-20 20:46:41 +00:00
|
|
|
void setSchemaToLastFile(const ColumnsDescription & columns) override
|
|
|
|
{
|
|
|
|
if (!getContext()->getSettingsRef().schema_inference_use_cache_for_url
|
|
|
|
|| getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::UNION)
|
|
|
|
return;
|
|
|
|
|
|
|
|
auto key = getKeyForSchemaCache(current_url_option, format, format_settings, getContext());
|
|
|
|
StorageURL::getSchemaCache(getContext()).addColumns(key, columns);
|
|
|
|
}
|
|
|
|
|
|
|
|
void setResultingSchema(const ColumnsDescription & columns) override
|
|
|
|
{
|
|
|
|
if (!getContext()->getSettingsRef().schema_inference_use_cache_for_url
|
|
|
|
|| getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::DEFAULT)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (const auto & options : url_options_to_check)
|
|
|
|
{
|
|
|
|
auto keys = getKeysForSchemaCache(options, format, format_settings, getContext());
|
|
|
|
StorageURL::getSchemaCache(getContext()).addManyColumns(keys, columns);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
String getLastFileName() const override { return current_url_option; }
|
|
|
|
|
2023-08-22 11:59:59 +00:00
|
|
|
private:
|
2023-10-20 20:46:41 +00:00
|
|
|
std::optional<ColumnsDescription> tryGetColumnsFromCache(const Strings & urls)
|
|
|
|
{
|
|
|
|
if (!getContext()->getSettingsRef().schema_inference_use_cache_for_url)
|
|
|
|
return std::nullopt;
|
|
|
|
|
|
|
|
auto & schema_cache = StorageURL::getSchemaCache(getContext());
|
|
|
|
for (const auto & url : urls)
|
|
|
|
{
|
|
|
|
auto get_last_mod_time = [&]() -> std::optional<time_t>
|
|
|
|
{
|
|
|
|
auto last_mod_time = StorageURL::tryGetLastModificationTime(url, headers, credentials, getContext());
|
|
|
|
/// Some URLs could not have Last-Modified header, in this case we cannot be sure that
|
|
|
|
/// data wasn't changed after adding it's schema to cache. Use schema from cache only if
|
|
|
|
/// special setting for this case is enabled.
|
|
|
|
if (!last_mod_time && !getContext()->getSettingsRef().schema_inference_cache_require_modification_time_for_url)
|
|
|
|
return 0;
|
|
|
|
return last_mod_time;
|
|
|
|
};
|
|
|
|
|
|
|
|
auto cache_key = getKeyForSchemaCache(url, format, format_settings, getContext());
|
|
|
|
auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time);
|
|
|
|
if (columns)
|
|
|
|
return columns;
|
|
|
|
}
|
|
|
|
|
|
|
|
return std::nullopt;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::vector<String>> url_options_to_check;
|
|
|
|
size_t current_index = 0;
|
|
|
|
String current_url_option;
|
2023-08-22 11:59:59 +00:00
|
|
|
const String & format;
|
|
|
|
const CompressionMethod & compression_method;
|
|
|
|
const HTTPHeaderEntries & headers;
|
|
|
|
Poco::Net::HTTPBasicCredentials credentials;
|
|
|
|
const std::optional<FormatSettings> & format_settings;
|
|
|
|
};
|
|
|
|
}
|
2018-08-09 18:49:05 +00:00
|
|
|
|
2022-02-07 19:40:47 +00:00
|
|
|
/// Infers the table structure by reading data from the given URL(s).
///
/// The URI is first checked against the remote host filter. If it contains globs, it is
/// expanded (limited by the glob_expansion_max_elements setting) and all resulting URLs
/// take part in schema inference; otherwise only the URI itself is used.
ColumnsDescription IStorageURLBase::getTableStructureFromData(
    const String & format,
    const String & uri,
    CompressionMethod compression_method,
    const HTTPHeaderEntries & headers,
    const std::optional<FormatSettings> & format_settings,
    ContextPtr context)
{
    context->getRemoteHostFilter().checkURL(Poco::URI(uri));

    std::vector<String> urls_to_check;
    if (urlWithGlobs(uri))
        urls_to_check = parseRemoteDescription(uri, 0, uri.size(), ',', context->getSettingsRef().glob_expansion_max_elements, "url");
    else
        urls_to_check = {uri};

    /// The iterator keeps references to format, compression_method, headers and format_settings;
    /// all of them outlive it because it does not escape this function.
    ReadBufferIterator read_buffer_iterator(urls_to_check, format, compression_method, headers, format_settings, context);

    /// The flag tells the reader whether more than one URL takes part in the inference.
    return readSchemaFromFormat(format, format_settings, read_buffer_iterator, urls_to_check.size() > 1, context);
}
|
|
|
|
|
2023-09-11 14:55:37 +00:00
|
|
|
bool IStorageURLBase::supportsSubsetOfColumns(const ContextPtr & context) const
|
2022-02-23 19:31:16 +00:00
|
|
|
{
|
2023-09-11 14:55:37 +00:00
|
|
|
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context, format_settings);
|
2022-02-23 19:31:16 +00:00
|
|
|
}
|
|
|
|
|
2023-04-29 02:29:51 +00:00
|
|
|
bool IStorageURLBase::prefersLargeBlocks() const
|
|
|
|
{
|
|
|
|
return FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(format_name);
|
|
|
|
}
|
|
|
|
|
2023-05-05 04:18:46 +00:00
|
|
|
/// Forwards to FormatFactory::checkParallelizeOutputAfterReading for this storage's format name.
bool IStorageURLBase::parallelizeOutputAfterReading(ContextPtr context) const
{
    auto & format_factory = FormatFactory::instance();
    return format_factory.checkParallelizeOutputAfterReading(format_name, context);
}
|
|
|
|
|
2024-01-02 15:18:13 +00:00
|
|
|
class ReadFromURL : public SourceStepWithFilter
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
std::string getName() const override { return "ReadFromURL"; }
|
|
|
|
void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override;
|
|
|
|
void applyFilters() override;
|
|
|
|
|
|
|
|
ReadFromURL(
|
|
|
|
Block sample_block,
|
2024-01-02 17:14:16 +00:00
|
|
|
std::shared_ptr<IStorageURLBase> storage_,
|
2024-01-02 15:18:13 +00:00
|
|
|
std::vector<String> * uri_options_,
|
|
|
|
ReadFromFormatInfo info_,
|
|
|
|
const bool need_only_count_,
|
|
|
|
std::vector<std::pair<std::string, std::string>> read_uri_params_,
|
|
|
|
std::function<void(std::ostream &)> read_post_data_callback_,
|
|
|
|
ContextPtr context_,
|
|
|
|
size_t max_block_size_,
|
|
|
|
size_t num_streams_)
|
|
|
|
: SourceStepWithFilter(DataStream{.header = std::move(sample_block)})
|
|
|
|
, storage(std::move(storage_))
|
|
|
|
, uri_options(uri_options_)
|
|
|
|
, info(std::move(info_))
|
|
|
|
, need_only_count(need_only_count_)
|
|
|
|
, read_uri_params(std::move(read_uri_params_))
|
|
|
|
, read_post_data_callback(std::move(read_post_data_callback_))
|
|
|
|
, context(std::move(context_))
|
|
|
|
, max_block_size(max_block_size_)
|
|
|
|
, num_streams(num_streams_)
|
2024-02-10 00:27:15 +00:00
|
|
|
, max_num_streams(num_streams_)
|
2024-01-02 15:18:13 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2024-01-02 17:14:16 +00:00
|
|
|
std::shared_ptr<IStorageURLBase> storage;
|
2024-01-02 15:18:13 +00:00
|
|
|
std::vector<String> * uri_options;
|
|
|
|
|
|
|
|
ReadFromFormatInfo info;
|
|
|
|
const bool need_only_count;
|
|
|
|
std::vector<std::pair<std::string, std::string>> read_uri_params;
|
|
|
|
std::function<void(std::ostream &)> read_post_data_callback;
|
|
|
|
|
|
|
|
ContextPtr context;
|
|
|
|
|
|
|
|
size_t max_block_size;
|
|
|
|
size_t num_streams;
|
2024-02-10 00:27:15 +00:00
|
|
|
const size_t max_num_streams;
|
2024-01-02 15:18:13 +00:00
|
|
|
|
|
|
|
std::shared_ptr<StorageURLSource::IteratorWrapper> iterator_wrapper;
|
|
|
|
bool is_url_with_globs = false;
|
|
|
|
bool is_empty_glob = false;
|
|
|
|
|
|
|
|
void createIterator(const ActionsDAG::Node * predicate);
|
|
|
|
};
|
|
|
|
|
|
|
|
void ReadFromURL::applyFilters()
|
|
|
|
{
|
2024-01-25 15:22:49 +00:00
|
|
|
auto filter_actions_dag = ActionsDAG::buildFilterActionsDAG(filter_nodes.nodes);
|
2024-01-02 15:18:13 +00:00
|
|
|
const ActionsDAG::Node * predicate = nullptr;
|
|
|
|
if (filter_actions_dag)
|
|
|
|
predicate = filter_actions_dag->getOutputs().at(0);
|
|
|
|
|
|
|
|
createIterator(predicate);
|
|
|
|
}
|
|
|
|
|
|
|
|
void IStorageURLBase::read(
|
|
|
|
QueryPlan & query_plan,
|
2020-06-15 19:08:58 +00:00
|
|
|
const Names & column_names,
|
2021-07-09 03:15:41 +00:00
|
|
|
const StorageSnapshotPtr & storage_snapshot,
|
2020-09-20 17:52:17 +00:00
|
|
|
SelectQueryInfo & query_info,
|
2021-04-10 23:33:54 +00:00
|
|
|
ContextPtr local_context,
|
2018-06-25 12:21:54 +00:00
|
|
|
QueryProcessingStage::Enum processed_stage,
|
2019-02-18 23:38:44 +00:00
|
|
|
size_t max_block_size,
|
2022-10-07 10:46:45 +00:00
|
|
|
size_t num_streams)
|
2018-06-11 12:13:00 +00:00
|
|
|
{
|
2021-07-09 03:15:41 +00:00
|
|
|
auto params = getReadURIParams(column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size);
|
2023-09-11 14:55:37 +00:00
|
|
|
auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(local_context), getVirtuals());
|
2023-08-24 13:15:57 +00:00
|
|
|
|
2024-01-02 15:18:13 +00:00
|
|
|
bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty())
|
|
|
|
&& local_context->getSettingsRef().optimize_count_from_files;
|
|
|
|
|
|
|
|
auto read_post_data_callback = getReadPOSTDataCallback(
|
|
|
|
read_from_format_info.columns_description.getNamesOfPhysical(),
|
|
|
|
read_from_format_info.columns_description,
|
|
|
|
query_info,
|
|
|
|
local_context,
|
|
|
|
processed_stage,
|
|
|
|
max_block_size);
|
|
|
|
|
2024-01-02 17:14:16 +00:00
|
|
|
auto this_ptr = std::static_pointer_cast<IStorageURLBase>(shared_from_this());
|
2024-01-02 15:18:13 +00:00
|
|
|
|
|
|
|
auto reading = std::make_unique<ReadFromURL>(
|
|
|
|
read_from_format_info.source_header,
|
|
|
|
std::move(this_ptr),
|
|
|
|
nullptr,
|
|
|
|
std::move(read_from_format_info),
|
|
|
|
need_only_count,
|
|
|
|
std::move(params),
|
|
|
|
std::move(read_post_data_callback),
|
|
|
|
local_context,
|
|
|
|
max_block_size,
|
|
|
|
num_streams);
|
|
|
|
|
|
|
|
query_plan.addStep(std::move(reading));
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Lazily creates `iterator_wrapper`, the callable that hands out the next set of
/// failover options to read from. Does nothing if the iterator already exists or if
/// a previous call found that the glob expands to no paths.
/// `predicate` is forwarded to the glob iterator (used only for URLs with globs).
void ReadFromURL::createIterator(const ActionsDAG::Node * predicate)
{
    if (iterator_wrapper || is_empty_glob)
        return;

    /// Explicit failover options: hand out the whole list exactly once.
    if (uri_options)
    {
        iterator_wrapper = std::make_shared<StorageURLSource::IteratorWrapper>([this, exhausted = false]() mutable
        {
            if (exhausted)
                return StorageURLSource::FailoverOptions{};
            exhausted = true;
            return *uri_options;
        });
        return;
    }

    size_t max_addresses = context->getSettingsRef().glob_expansion_max_elements;
    is_url_with_globs = urlWithGlobs(storage->uri);

    /// Distributed processing: URIs are obtained one by one from the read task callback.
    if (storage->distributed_processing)
    {
        iterator_wrapper = std::make_shared<StorageURLSource::IteratorWrapper>(
            [task_callback = context->getReadTaskCallback(), max_addresses]()
            {
                String task_uri = task_callback();
                if (task_uri.empty())
                    return StorageURLSource::FailoverOptions{};
                return getFailoverOptions(task_uri, max_addresses);
            });
        return;
    }

    /// Single URL without globs: hand it out exactly once and read it in one stream.
    if (!is_url_with_globs)
    {
        iterator_wrapper = std::make_shared<StorageURLSource::IteratorWrapper>(
            [max_addresses, exhausted = false, &single_uri = storage->uri]() mutable
            {
                if (exhausted)
                    return StorageURLSource::FailoverOptions{};
                exhausted = true;
                return getFailoverOptions(single_uri, max_addresses);
            });
        num_streams = 1;
        return;
    }

    /// Iterate through disclosed globs and make a source for each file
    auto disclosed_globs = std::make_shared<StorageURLSource::DisclosedGlobIterator>(
        storage->uri, max_addresses, predicate, storage->virtual_columns, context);

    /// Check if we filtered out all the paths
    if (disclosed_globs->size() == 0)
    {
        is_empty_glob = true;
        return;
    }

    iterator_wrapper = std::make_shared<StorageURLSource::IteratorWrapper>([disclosed_globs, max_addresses]()
    {
        String expanded_uri = disclosed_globs->next();
        if (expanded_uri.empty())
            return StorageURLSource::FailoverOptions{};
        return getFailoverOptions(expanded_uri, max_addresses);
    });

    /// No point in having more streams than there are paths to read.
    if (num_streams > disclosed_globs->size())
        num_streams = disclosed_globs->size();
}
|
2023-04-21 17:24:37 +00:00
|
|
|
|
2024-01-02 15:18:13 +00:00
|
|
|
/// Builds the reading pipeline for this step.
/// Creates `num_streams` StorageURLSource processors that all share `iterator_wrapper`,
/// unites them into one pipe, optionally resizes it to `max_num_streams` outputs and
/// initializes `pipeline` with it. If the glob was found empty, or no source was created,
/// the pipeline is initialized with a NullSource that has the expected header.
void ReadFromURL::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &)
{
    /// No-op if the iterator has already been created.
    createIterator(nullptr);

    if (is_empty_glob)
    {
        pipeline.init(Pipe(std::make_shared<NullSource>(info.source_header)));
        return;
    }

    Pipes pipes;
    pipes.reserve(num_streams);

    const size_t max_threads = context->getSettingsRef().max_threads;
    /// Split the thread budget between the streams. num_streams may be 0 (the pipe.empty()
    /// fallback below covers that case), so clamp the divisor to avoid division by zero.
    const size_t max_parsing_threads = num_streams >= max_threads ? 1 : (max_threads / std::max<size_t>(num_streams, 1));

    for (size_t i = 0; i < num_streams; ++i)
    {
        auto source = std::make_shared<StorageURLSource>(
            info,
            iterator_wrapper,
            storage->getReadMethod(),
            read_post_data_callback,
            storage->format_name,
            storage->format_settings,
            storage->getName(),
            context,
            max_block_size,
            getHTTPTimeouts(context),
            storage->compression_method,
            max_parsing_threads,
            storage->headers,
            read_uri_params,
            is_url_with_globs,
            need_only_count);

        source->setKeyCondition(filter_nodes.nodes, context);
        pipes.emplace_back(std::move(source));
    }

    /// Randomize the order of the failover options.
    if (uri_options)
        std::shuffle(uri_options->begin(), uri_options->end(), thread_local_rng);

    auto pipe = Pipe::unitePipes(std::move(pipes));
    size_t output_ports = pipe.numOutputPorts();
    const bool parallelize_output = context->getSettingsRef().parallelize_output_from_storages;
    if (parallelize_output && storage->parallelizeOutputAfterReading(context) && output_ports > 0 && output_ports < max_num_streams)
        pipe.resize(max_num_streams);

    /// No sources were created: produce an empty result with the right header.
    if (pipe.empty())
        pipe = Pipe(std::make_shared<NullSource>(info.source_header));

    /// Register the processors with this step.
    for (const auto & processor : pipe.getProcessors())
        processors.emplace_back(processor);

    pipeline.init(std::move(pipe));
}
|
|
|
|
|
2021-04-21 14:36:04 +00:00
|
|
|
|
2024-01-02 15:18:13 +00:00
|
|
|
void StorageURLWithFailover::read(
|
|
|
|
QueryPlan & query_plan,
|
2021-04-21 14:36:04 +00:00
|
|
|
const Names & column_names,
|
2021-11-09 12:36:25 +00:00
|
|
|
const StorageSnapshotPtr & storage_snapshot,
|
2021-04-21 14:36:04 +00:00
|
|
|
SelectQueryInfo & query_info,
|
|
|
|
ContextPtr local_context,
|
|
|
|
QueryProcessingStage::Enum processed_stage,
|
|
|
|
size_t max_block_size,
|
2023-08-22 01:21:03 +00:00
|
|
|
size_t num_streams)
|
2021-04-21 14:36:04 +00:00
|
|
|
{
|
2021-11-09 12:36:25 +00:00
|
|
|
auto params = getReadURIParams(column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size);
|
2023-09-11 14:55:37 +00:00
|
|
|
auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(local_context), getVirtuals());
|
2023-07-04 16:50:31 +00:00
|
|
|
|
2024-01-02 15:18:13 +00:00
|
|
|
bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty())
|
|
|
|
&& local_context->getSettingsRef().optimize_count_from_files;
|
2023-08-22 01:21:03 +00:00
|
|
|
|
2024-01-02 15:18:13 +00:00
|
|
|
auto read_post_data_callback = getReadPOSTDataCallback(
|
|
|
|
read_from_format_info.columns_description.getNamesOfPhysical(),
|
|
|
|
read_from_format_info.columns_description,
|
|
|
|
query_info,
|
|
|
|
local_context,
|
|
|
|
processed_stage,
|
|
|
|
max_block_size);
|
|
|
|
|
|
|
|
auto this_ptr = std::static_pointer_cast<StorageURL>(shared_from_this());
|
|
|
|
|
|
|
|
auto reading = std::make_unique<ReadFromURL>(
|
|
|
|
read_from_format_info.source_header,
|
|
|
|
std::move(this_ptr),
|
|
|
|
&uri_options,
|
|
|
|
std::move(read_from_format_info),
|
|
|
|
need_only_count,
|
|
|
|
std::move(params),
|
|
|
|
std::move(read_post_data_callback),
|
2021-10-03 13:53:24 +00:00
|
|
|
local_context,
|
|
|
|
max_block_size,
|
2024-01-02 15:18:13 +00:00
|
|
|
num_streams);
|
|
|
|
|
|
|
|
query_plan.addStep(std::move(reading));
|
2021-04-21 14:36:04 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2023-06-07 18:33:08 +00:00
|
|
|
/// Creates a sink for INSERT into the URL storage.
/// A partitioned sink is used when a PARTITION BY expression is available (from the INSERT
/// query, falling back to the one stored in the table) and the URI contains the partition
/// id wildcard; otherwise a plain sink is used.
/// Side effect: an empty `http_method` is replaced with POST.
SinkToStoragePtr IStorageURLBase::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool /*async_insert*/)
{
    if (http_method.empty())
        http_method = Poco::Net::HTTPRequest::HTTP_POST;

    const bool uri_has_wildcard = uri.find(PartitionedSink::PARTITION_ID_WILDCARD) != String::npos;

    /// The expression from the INSERT query takes precedence over the table-level one.
    /// If `query` is not an INSERT query, no partitioning is applied at all.
    ASTPtr partition_by_ast;
    if (const auto * insert_query = dynamic_cast<const ASTInsertQuery *>(query.get()))
        partition_by_ast = insert_query->partition_by ? insert_query->partition_by : partition_by;

    if (partition_by_ast && uri_has_wildcard)
    {
        return std::make_shared<PartitionedStorageURLSink>(
            partition_by_ast,
            uri,
            format_name,
            format_settings,
            metadata_snapshot->getSampleBlock(),
            context,
            getHTTPTimeouts(context),
            compression_method,
            headers,
            http_method);
    }

    return std::make_shared<StorageURLSink>(
        uri,
        format_name,
        format_settings,
        metadata_snapshot->getSampleBlock(),
        context,
        getHTTPTimeouts(context),
        compression_method,
        headers,
        http_method);
}
|
2018-06-16 05:54:06 +00:00
|
|
|
|
2023-05-01 13:40:14 +00:00
|
|
|
/// Returns a copy of the virtual columns stored in this storage.
NamesAndTypesList IStorageURLBase::getVirtuals() const
{
    NamesAndTypesList virtuals = virtual_columns;
    return virtuals;
}
|
|
|
|
|
2023-11-28 18:15:07 +00:00
|
|
|
/// Returns the names of the path/file/size virtual columns, computed for an empty
/// set of storage columns.
Names IStorageURLBase::getVirtualColumnNames()
{
    const auto virtuals = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage({});
    return virtuals.getNames();
}
|
|
|
|
|
2022-08-05 16:20:15 +00:00
|
|
|
/// Returns the schema inference cache shared by all URL storages.
/// The cache is a function-local static: it is constructed on the first call, with the
/// capacity taken from the `schema_inference_cache_max_elements_for_url` config key of the
/// context passed to that first call (DEFAULT_SCHEMA_CACHE_ELEMENTS if the key is absent).
SchemaCache & IStorageURLBase::getSchemaCache(const ContextPtr & context)
{
    static SchemaCache url_schema_cache(
        context->getConfigRef().getUInt("schema_inference_cache_max_elements_for_url", DEFAULT_SCHEMA_CACHE_ELEMENTS));

    return url_schema_cache;
}
|
|
|
|
|
2023-08-23 18:43:08 +00:00
|
|
|
std::optional<time_t> IStorageURLBase::tryGetLastModificationTime(
|
2022-06-28 16:13:42 +00:00
|
|
|
const String & url,
|
2022-12-16 22:57:09 +00:00
|
|
|
const HTTPHeaderEntries & headers,
|
2022-06-28 16:13:42 +00:00
|
|
|
const Poco::Net::HTTPBasicCredentials & credentials,
|
|
|
|
const ContextPtr & context)
|
2022-06-21 17:18:14 +00:00
|
|
|
{
|
2022-11-03 11:10:49 +00:00
|
|
|
auto settings = context->getSettingsRef();
|
|
|
|
|
2023-08-24 15:21:18 +00:00
|
|
|
auto uri = Poco::URI(url);
|
2023-08-24 13:07:26 +00:00
|
|
|
|
2023-08-24 15:21:18 +00:00
|
|
|
auto proxy_config = getProxyConfiguration(uri.getScheme());
|
2023-08-24 13:07:26 +00:00
|
|
|
|
2023-08-23 18:43:08 +00:00
|
|
|
ReadWriteBufferFromHTTP buf(
|
2023-08-24 15:21:18 +00:00
|
|
|
uri,
|
2023-08-23 18:43:08 +00:00
|
|
|
Poco::Net::HTTPRequest::HTTP_GET,
|
|
|
|
{},
|
|
|
|
getHTTPTimeouts(context),
|
|
|
|
credentials,
|
|
|
|
settings.max_http_get_redirects,
|
|
|
|
settings.max_read_buffer_size,
|
|
|
|
context->getReadSettings(),
|
|
|
|
headers,
|
|
|
|
&context->getRemoteHostFilter(),
|
|
|
|
true,
|
|
|
|
false,
|
2023-08-24 15:21:18 +00:00
|
|
|
false,
|
|
|
|
std::nullopt,
|
|
|
|
proxy_config);
|
2022-06-21 17:18:14 +00:00
|
|
|
|
2023-08-23 18:43:08 +00:00
|
|
|
return buf.tryGetLastModificationTime();
|
2022-06-21 17:18:14 +00:00
|
|
|
}
|
|
|
|
|
2021-04-23 12:18:23 +00:00
|
|
|
/// Constructs a URL storage: forwards all arguments to IStorageURLBase and then validates
/// the URI against the remote host filter and the headers against the HTTP header filter
/// of the given context.
StorageURL::StorageURL(
    const String & uri_,
    const StorageID & table_id_,
    const String & format_name_,
    const std::optional<FormatSettings> & format_settings_,
    const ColumnsDescription & columns_,
    const ConstraintsDescription & constraints_,
    const String & comment,
    ContextPtr context_,
    const String & compression_method_,
    const HTTPHeaderEntries & headers_,
    const String & http_method_,
    ASTPtr partition_by_,
    bool distributed_processing_)
    : IStorageURLBase(
        uri_,
        context_,
        table_id_,
        format_name_,
        format_settings_,
        columns_,
        constraints_,
        comment,
        compression_method_,
        headers_,
        http_method_,
        partition_by_,
        distributed_processing_)
{
    /// Both checks use the members initialized by the base class (`uri`, `headers`).
    const Poco::URI parsed_uri(uri);
    context_->getRemoteHostFilter().checkURL(parsed_uri);
    context_->getHTTPHeaderFilter().checkHeaders(headers);
}
|
|
|
|
|
2021-04-21 12:32:57 +00:00
|
|
|
|
2021-04-21 14:36:04 +00:00
|
|
|
/// Constructs a URL storage with several alternative URIs.
/// The base StorageURL is created with an empty URI and an empty comment; every option
/// from `uri_options_` is checked against the remote host filter, logged and stored in
/// `uri_options`.
StorageURLWithFailover::StorageURLWithFailover(
    const std::vector<String> & uri_options_,
    const StorageID & table_id_,
    const String & format_name_,
    const std::optional<FormatSettings> & format_settings_,
    const ColumnsDescription & columns_,
    const ConstraintsDescription & constraints_,
    ContextPtr context_,
    const String & compression_method_)
    : StorageURL("", table_id_, format_name_, format_settings_, columns_, constraints_, String{}, context_, compression_method_)
{
    for (const auto & candidate_uri : uri_options_)
    {
        /// The check comes first: an option that fails it is neither logged nor stored.
        const Poco::URI parsed_candidate(candidate_uri);
        context_->getRemoteHostFilter().checkURL(parsed_candidate);

        LOG_DEBUG(getLogger("StorageURLDistributed"), "Adding URL option: {}", candidate_uri);
        uri_options.emplace_back(candidate_uri);
    }
}
|
|
|
|
|
|
|
|
|
2021-04-21 12:32:57 +00:00
|
|
|
/// Builds format settings for a table created with the given factory arguments.
/// Use format settings from global server context + settings from
/// the SETTINGS clause of the create query. Settings from current
/// session and user are ignored.
FormatSettings StorageURL::getFormatSettingsFromArgs(const StorageFactory::Arguments & args)
{
    /// Without a SETTINGS clause the context alone defines the format settings.
    if (!args.storage_def->settings)
        return getFormatSettings(args.getContext());

    FormatFactorySettings user_format_settings;

    /// Apply changed settings from global context, but ignore the
    /// unknown ones, because we only have the format settings here.
    const auto & context_changes = args.getContext()->getSettingsRef().changes();
    for (const auto & context_change : context_changes)
    {
        if (!user_format_settings.has(context_change.name))
            continue;

        user_format_settings.set(context_change.name, context_change.value);
    }

    /// Apply changes from SETTINGS clause, with validation.
    user_format_settings.applyChanges(args.storage_def->settings->changes);

    return getFormatSettings(args.getContext(), user_format_settings);
}
|
|
|
|
|
2024-01-18 01:56:21 +00:00
|
|
|
size_t StorageURL::evalArgsAndCollectHeaders(
|
2022-12-16 22:57:09 +00:00
|
|
|
ASTs & url_function_args, HTTPHeaderEntries & header_entries, ContextPtr context)
|
2022-06-17 12:53:16 +00:00
|
|
|
{
|
2022-06-22 14:51:18 +00:00
|
|
|
ASTs::iterator headers_it = url_function_args.end();
|
2022-06-17 12:53:16 +00:00
|
|
|
|
2022-10-18 09:40:12 +00:00
|
|
|
for (auto * arg_it = url_function_args.begin(); arg_it != url_function_args.end(); ++arg_it)
|
2022-06-17 12:53:16 +00:00
|
|
|
{
|
2022-06-22 14:51:18 +00:00
|
|
|
const auto * headers_ast_function = (*arg_it)->as<ASTFunction>();
|
2022-12-19 11:16:50 +00:00
|
|
|
if (headers_ast_function && headers_ast_function->name == "headers")
|
2022-06-17 12:53:16 +00:00
|
|
|
{
|
2022-06-22 14:51:18 +00:00
|
|
|
if (headers_it != url_function_args.end())
|
2022-06-17 12:53:16 +00:00
|
|
|
throw Exception(
|
|
|
|
ErrorCodes::BAD_ARGUMENTS,
|
|
|
|
"URL table function can have only one key-value argument: headers=(). {}",
|
|
|
|
bad_arguments_error_message);
|
|
|
|
|
|
|
|
const auto * headers_function_args_expr = assert_cast<const ASTExpressionList *>(headers_ast_function->arguments.get());
|
|
|
|
auto headers_function_args = headers_function_args_expr->children;
|
|
|
|
|
|
|
|
for (auto & header_arg : headers_function_args)
|
|
|
|
{
|
|
|
|
const auto * header_ast = header_arg->as<ASTFunction>();
|
2022-06-22 14:51:18 +00:00
|
|
|
if (!header_ast || header_ast->name != "equals")
|
2022-06-17 12:53:16 +00:00
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Headers argument is incorrect. {}", bad_arguments_error_message);
|
|
|
|
|
|
|
|
const auto * header_args_expr = assert_cast<const ASTExpressionList *>(header_ast->arguments.get());
|
|
|
|
auto header_args = header_args_expr->children;
|
|
|
|
if (header_args.size() != 2)
|
2022-06-22 14:51:18 +00:00
|
|
|
throw Exception(
|
|
|
|
ErrorCodes::BAD_ARGUMENTS,
|
|
|
|
"Headers argument is incorrect: expected 2 arguments, got {}",
|
|
|
|
header_args.size());
|
2022-06-17 12:53:16 +00:00
|
|
|
|
|
|
|
auto ast_literal = evaluateConstantExpressionOrIdentifierAsLiteral(header_args[0], context);
|
2022-06-22 14:51:18 +00:00
|
|
|
auto arg_name_value = ast_literal->as<ASTLiteral>()->value;
|
|
|
|
if (arg_name_value.getType() != Field::Types::Which::String)
|
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected string as header name");
|
|
|
|
auto arg_name = arg_name_value.safeGet<String>();
|
|
|
|
|
2022-06-17 12:53:16 +00:00
|
|
|
ast_literal = evaluateConstantExpressionOrIdentifierAsLiteral(header_args[1], context);
|
|
|
|
auto arg_value = ast_literal->as<ASTLiteral>()->value;
|
2022-06-22 14:51:18 +00:00
|
|
|
if (arg_value.getType() != Field::Types::Which::String)
|
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected string as header value");
|
|
|
|
|
2022-12-16 22:57:09 +00:00
|
|
|
header_entries.emplace_back(arg_name, arg_value.safeGet<String>());
|
2022-06-17 12:53:16 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
headers_it = arg_it;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2022-12-19 11:16:50 +00:00
|
|
|
if (headers_ast_function && headers_ast_function->name == "equals")
|
|
|
|
continue;
|
|
|
|
|
2022-06-17 12:53:16 +00:00
|
|
|
(*arg_it) = evaluateConstantExpressionOrIdentifierAsLiteral((*arg_it), context);
|
|
|
|
}
|
|
|
|
|
2024-01-18 01:56:21 +00:00
|
|
|
if (headers_it == url_function_args.end())
|
|
|
|
return url_function_args.size();
|
|
|
|
|
|
|
|
std::rotate(headers_it, std::next(headers_it), url_function_args.end());
|
|
|
|
return url_function_args.size() - 1;
|
2022-06-17 12:53:16 +00:00
|
|
|
}
|
|
|
|
|
2022-12-16 22:57:09 +00:00
|
|
|
/// Fills `configuration` from a named collection.
/// The collection is validated against the allowed keys first. `url` is mandatory;
/// `http_method` (alias `method`), `format`, `compression_method` (alias `compression`)
/// and `structure` are optional. Throws BAD_ARGUMENTS if an HTTP method other than
/// POST or PUT is specified.
void StorageURL::processNamedCollectionResult(Configuration & configuration, const NamedCollection & collection)
{
    validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys, optional_regex_keys);

    configuration.url = collection.get<String>("url");
    configuration.headers = getHeadersFromNamedCollection(collection);

    /// `http_method` takes precedence over its alias `method`.
    const auto method_alias = collection.getOrDefault<String>("method", "");
    configuration.http_method = collection.getOrDefault<String>("http_method", method_alias);

    const bool method_is_acceptable = configuration.http_method.empty()
        || configuration.http_method == Poco::Net::HTTPRequest::HTTP_POST
        || configuration.http_method == Poco::Net::HTTPRequest::HTTP_PUT;
    if (!method_is_acceptable)
        throw Exception(
            ErrorCodes::BAD_ARGUMENTS,
            "Http method can be POST or PUT (current: {}). For insert default is POST, for select GET",
            configuration.http_method);

    configuration.format = collection.getOrDefault<String>("format", "auto");

    /// `compression_method` takes precedence over its alias `compression`.
    const auto compression_alias = collection.getOrDefault<String>("compression", "auto");
    configuration.compression_method = collection.getOrDefault<String>("compression_method", compression_alias);

    configuration.structure = collection.getOrDefault<String>("structure", "auto");
}
|
2018-06-11 12:13:00 +00:00
|
|
|
|
2022-12-16 22:57:09 +00:00
|
|
|
/// Builds the storage configuration from engine / table function arguments.
/// If the arguments refer to a named collection, the configuration is taken from it;
/// otherwise 1 to 3 positional arguments are expected: url[, format[, compression_method]].
/// In both cases a `headers(...)` argument is collected into `configuration.headers`.
/// The format "auto" is resolved from the path component of the URL.
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH on a wrong number of positional arguments and
/// BAD_ARGUMENTS if a `Range` header is supplied.
StorageURL::Configuration StorageURL::getConfiguration(ASTs & args, ContextPtr local_context)
{
    StorageURL::Configuration configuration;

    if (auto named_collection = tryGetNamedCollectionWithOverrides(args, local_context))
    {
        StorageURL::processNamedCollectionResult(configuration, *named_collection);
        /// Only the headers are of interest here; the argument count is not checked.
        evalArgsAndCollectHeaders(args, configuration.headers, local_context);
    }
    else
    {
        const size_t positional_count = evalArgsAndCollectHeaders(args, configuration.headers, local_context);

        const bool count_is_valid = positional_count >= 1 && positional_count <= 3;
        if (!count_is_valid)
            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, bad_arguments_error_message);

        configuration.url = checkAndGetLiteralArgument<String>(args[0], "url");
        if (positional_count >= 2)
            configuration.format = checkAndGetLiteralArgument<String>(args[1], "format");
        if (positional_count >= 3)
            configuration.compression_method = checkAndGetLiteralArgument<String>(args[2], "compression_method");
    }

    if (configuration.format == "auto")
    {
        const auto url_path = Poco::URI(configuration.url).getPath();
        configuration.format = FormatFactory::instance().getFormatFromFileName(url_path, true);
    }

    /// NOTE(review): this comparison is case-sensitive, while HTTP field names are
    /// case-insensitive - confirm whether e.g. "range" should be rejected as well.
    for (const auto & [header, value] : configuration.headers)
    {
        if (header == "Range")
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Range headers are not allowed");
    }

    return configuration;
}
|
2018-06-11 12:13:00 +00:00
|
|
|
|
|
|
|
|
2021-09-07 11:17:25 +00:00
|
|
|
/// Registers the `URL` table engine in the storage factory.
/// The engine supports the SETTINGS clause and schema inference, and requires the
/// URL source access type.
void registerStorageURL(StorageFactory & factory)
{
    /// Creates a StorageURL from the arguments of a CREATE query.
    auto create_url_storage = [](const StorageFactory::Arguments & args)
    {
        auto configuration = StorageURL::getConfiguration(args.engine_args, args.getLocalContext());
        auto format_settings = StorageURL::getFormatSettingsFromArgs(args);

        /// The storage gets its own copy of the PARTITION BY expression, if there is one.
        ASTPtr partition_by = args.storage_def->partition_by ? args.storage_def->partition_by->clone() : nullptr;

        return std::make_shared<StorageURL>(
            configuration.url,
            args.table_id,
            configuration.format,
            format_settings,
            args.columns,
            args.constraints,
            args.comment,
            args.getContext(),
            configuration.compression_method,
            configuration.headers,
            configuration.http_method,
            partition_by);
    };

    factory.registerStorage(
        "URL",
        create_url_storage,
        {
            .supports_settings = true,
            .supports_schema_inference = true,
            .source_access_type = AccessType::URL,
        });
}
|
2023-04-21 17:24:37 +00:00
|
|
|
}
|