2018-11-19 08:17:09 +00:00
|
|
|
#pragma once
|
2021-06-11 11:26:33 +00:00
|
|
|
|
2022-09-28 08:45:15 +00:00
|
|
|
#include "config.h"
|
2021-06-11 11:26:33 +00:00
|
|
|
|
2018-12-05 13:24:45 +00:00
|
|
|
#if USE_HDFS
|
2018-11-19 08:17:09 +00:00
|
|
|
|
2022-05-20 19:49:31 +00:00
|
|
|
#include <Processors/ISource.h>
|
2018-11-19 08:17:09 +00:00
|
|
|
#include <Storages/IStorage.h>
|
2022-06-21 13:02:48 +00:00
|
|
|
#include <Storages/Cache/SchemaCache.h>
|
2023-07-04 16:50:31 +00:00
|
|
|
#include <Storages/prepareReadingFromFormat.h>
|
2023-08-02 17:47:42 +00:00
|
|
|
#include <Storages/SelectQueryInfo.h>
|
2018-11-19 08:17:09 +00:00
|
|
|
#include <Poco/URI.h>
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2023-06-16 15:51:18 +00:00
|
|
|
|
|
|
|
/// Forward declaration: this header only refers to IInputFormat through
/// std::shared_ptr<IInputFormat> (see HDFSSource::input_format).
class IInputFormat;
|
|
|
|
|
2018-11-19 08:17:09 +00:00
|
|
|
/**
|
|
|
|
* This class represents table engine for external hdfs files.
|
|
|
|
* Read method is supported for now.
|
|
|
|
*/
|
2022-05-03 06:43:28 +00:00
|
|
|
class StorageHDFS final : public IStorage, WithContext
|
2018-11-19 08:17:09 +00:00
|
|
|
{
|
|
|
|
public:
|
2023-05-30 19:32:24 +00:00
|
|
|
struct PathInfo
|
|
|
|
{
|
|
|
|
time_t last_mod_time;
|
|
|
|
size_t size;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct PathWithInfo
|
|
|
|
{
|
2023-07-26 15:15:28 +00:00
|
|
|
PathWithInfo() = default;
|
|
|
|
PathWithInfo(const String & path_, const std::optional<PathInfo> & info_) : path(path_), info(info_) {}
|
2023-05-30 19:32:24 +00:00
|
|
|
String path;
|
|
|
|
std::optional<PathInfo> info;
|
|
|
|
};
|
|
|
|
|
2022-04-19 20:47:29 +00:00
|
|
|
StorageHDFS(
|
|
|
|
const String & uri_,
|
|
|
|
const StorageID & table_id_,
|
|
|
|
const String & format_name_,
|
|
|
|
const ColumnsDescription & columns_,
|
|
|
|
const ConstraintsDescription & constraints_,
|
|
|
|
const String & comment,
|
|
|
|
ContextPtr context_,
|
|
|
|
const String & compression_method_ = "",
|
|
|
|
bool distributed_processing_ = false,
|
|
|
|
ASTPtr partition_by = nullptr);
|
|
|
|
|
2019-07-09 15:40:21 +00:00
|
|
|
String getName() const override { return "HDFS"; }
|
2018-11-19 08:17:09 +00:00
|
|
|
|
2020-08-03 13:54:14 +00:00
|
|
|
Pipe read(
|
2020-06-15 19:08:58 +00:00
|
|
|
const Names & column_names,
|
2021-07-09 03:15:41 +00:00
|
|
|
const StorageSnapshotPtr & storage_snapshot,
|
2020-09-20 17:52:17 +00:00
|
|
|
SelectQueryInfo & query_info,
|
2021-04-10 23:33:54 +00:00
|
|
|
ContextPtr context,
|
2018-11-19 08:17:09 +00:00
|
|
|
QueryProcessingStage::Enum processed_stage,
|
2019-02-18 23:38:44 +00:00
|
|
|
size_t max_block_size,
|
2022-10-07 10:46:45 +00:00
|
|
|
size_t num_streams) override;
|
2018-11-19 08:17:09 +00:00
|
|
|
|
2023-06-07 18:33:08 +00:00
|
|
|
SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool async_insert) override;
|
2018-11-19 08:17:09 +00:00
|
|
|
|
2021-12-25 03:10:59 +00:00
|
|
|
void truncate(
|
|
|
|
const ASTPtr & query,
|
|
|
|
const StorageMetadataPtr & metadata_snapshot,
|
|
|
|
ContextPtr local_context,
|
|
|
|
TableExclusiveLockHolder &) override;
|
2021-06-21 13:50:09 +00:00
|
|
|
|
2020-04-28 10:38:57 +00:00
|
|
|
NamesAndTypesList getVirtuals() const override;
|
2020-04-27 13:55:30 +00:00
|
|
|
|
2021-10-25 16:23:44 +00:00
|
|
|
bool supportsPartitionBy() const override { return true; }
|
|
|
|
|
2021-12-06 03:54:45 +00:00
|
|
|
/// Check if the format is column-oriented.
|
|
|
|
/// Is is useful because column oriented formats could effectively skip unknown columns
|
|
|
|
/// So we can create a header of only required columns in read method and ask
|
|
|
|
/// format to read only them. Note: this hack cannot be done with ordinary formats like TSV.
|
2023-09-11 14:55:37 +00:00
|
|
|
bool supportsSubsetOfColumns(const ContextPtr & context_) const;
|
2021-12-06 03:54:45 +00:00
|
|
|
|
2023-07-04 16:50:31 +00:00
|
|
|
bool supportsSubcolumns() const override { return true; }
|
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
static ColumnsDescription getTableStructureFromData(
|
|
|
|
const String & format,
|
|
|
|
const String & uri,
|
|
|
|
const String & compression_method,
|
|
|
|
ContextPtr ctx);
|
|
|
|
|
2022-08-05 16:20:15 +00:00
|
|
|
static SchemaCache & getSchemaCache(const ContextPtr & ctx);
|
|
|
|
|
2023-08-22 11:59:59 +00:00
|
|
|
bool supportsTrivialCountOptimization() const override { return true; }
|
|
|
|
|
2018-11-19 08:17:09 +00:00
|
|
|
protected:
|
2021-12-06 03:54:45 +00:00
|
|
|
friend class HDFSSource;
|
2018-11-19 08:17:09 +00:00
|
|
|
|
|
|
|
private:
|
2022-06-27 12:43:24 +00:00
|
|
|
static std::optional<ColumnsDescription> tryGetColumnsFromCache(
|
2023-05-30 19:32:24 +00:00
|
|
|
const std::vector<StorageHDFS::PathWithInfo> & paths_with_info,
|
2022-06-30 12:41:56 +00:00
|
|
|
const String & uri_without_path,
|
2022-06-27 12:43:24 +00:00
|
|
|
const String & format_name,
|
|
|
|
const ContextPtr & ctx);
|
|
|
|
|
|
|
|
static void addColumnsToCache(
|
2023-05-30 19:32:24 +00:00
|
|
|
const std::vector<StorageHDFS::PathWithInfo> & paths,
|
2022-06-30 12:41:56 +00:00
|
|
|
const String & uri_without_path,
|
2022-06-27 12:43:24 +00:00
|
|
|
const ColumnsDescription & columns,
|
|
|
|
const String & format_name,
|
|
|
|
const ContextPtr & ctx);
|
2022-06-21 13:02:48 +00:00
|
|
|
|
2022-09-10 07:58:49 +00:00
|
|
|
std::vector<String> uris;
|
2018-11-19 08:17:09 +00:00
|
|
|
String format_name;
|
2019-11-19 12:46:07 +00:00
|
|
|
String compression_method;
|
2021-12-03 05:25:14 +00:00
|
|
|
const bool distributed_processing;
|
2021-10-26 12:22:13 +00:00
|
|
|
ASTPtr partition_by;
|
2021-12-29 18:03:15 +00:00
|
|
|
bool is_path_with_globs;
|
2022-03-28 19:18:20 +00:00
|
|
|
NamesAndTypesList virtual_columns;
|
2018-11-19 08:17:09 +00:00
|
|
|
|
2020-05-30 21:57:37 +00:00
|
|
|
Poco::Logger * log = &Poco::Logger::get("StorageHDFS");
|
2018-11-19 08:17:09 +00:00
|
|
|
};
|
2021-12-03 05:25:14 +00:00
|
|
|
|
|
|
|
/// Forward declaration: this header only refers to PullingPipelineExecutor through
/// std::unique_ptr<PullingPipelineExecutor> (see HDFSSource::reader).
class PullingPipelineExecutor;
|
|
|
|
|
2022-05-20 19:49:31 +00:00
|
|
|
class HDFSSource : public ISource, WithContext
|
2021-12-03 05:25:14 +00:00
|
|
|
{
|
|
|
|
public:
|
|
|
|
class DisclosedGlobIterator
|
|
|
|
{
|
|
|
|
public:
|
2023-08-17 16:54:43 +00:00
|
|
|
DisclosedGlobIterator(const String & uri_, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context);
|
2023-05-30 19:32:24 +00:00
|
|
|
StorageHDFS::PathWithInfo next();
|
2021-12-03 05:25:14 +00:00
|
|
|
private:
|
|
|
|
class Impl;
|
|
|
|
/// shared_ptr to have copy constructor
|
|
|
|
std::shared_ptr<Impl> pimpl;
|
|
|
|
};
|
|
|
|
|
2021-12-29 18:03:15 +00:00
|
|
|
class URISIterator
|
|
|
|
{
|
|
|
|
public:
|
2023-08-17 16:54:43 +00:00
|
|
|
URISIterator(const std::vector<String> & uris_, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context);
|
2023-05-30 19:32:24 +00:00
|
|
|
StorageHDFS::PathWithInfo next();
|
2021-12-29 18:03:15 +00:00
|
|
|
private:
|
|
|
|
class Impl;
|
|
|
|
/// shared_ptr to have copy constructor
|
|
|
|
std::shared_ptr<Impl> pimpl;
|
|
|
|
};
|
|
|
|
|
2023-05-30 19:32:24 +00:00
|
|
|
using IteratorWrapper = std::function<StorageHDFS::PathWithInfo()>;
|
2021-12-03 05:25:14 +00:00
|
|
|
using StorageHDFSPtr = std::shared_ptr<StorageHDFS>;
|
|
|
|
|
|
|
|
HDFSSource(
|
2023-07-04 16:50:31 +00:00
|
|
|
const ReadFromFormatInfo & info,
|
2021-12-03 05:25:14 +00:00
|
|
|
StorageHDFSPtr storage_,
|
|
|
|
ContextPtr context_,
|
|
|
|
UInt64 max_block_size_,
|
2023-08-22 11:59:59 +00:00
|
|
|
std::shared_ptr<IteratorWrapper> file_iterator_,
|
2023-08-22 12:42:22 +00:00
|
|
|
bool need_only_count_,
|
2023-08-02 17:47:42 +00:00
|
|
|
const SelectQueryInfo & query_info_);
|
2021-12-03 05:25:14 +00:00
|
|
|
|
|
|
|
String getName() const override;
|
|
|
|
|
|
|
|
Chunk generate() override;
|
|
|
|
|
|
|
|
private:
|
2023-08-22 11:59:59 +00:00
|
|
|
void addNumRowsToCache(const String & path, size_t num_rows);
|
|
|
|
std::optional<size_t> tryGetNumRowsFromCache(const StorageHDFS::PathWithInfo & path_with_info);
|
|
|
|
|
2021-12-03 05:25:14 +00:00
|
|
|
StorageHDFSPtr storage;
|
2022-03-28 19:18:20 +00:00
|
|
|
Block block_for_format;
|
2023-07-04 16:50:31 +00:00
|
|
|
NamesAndTypesList requested_columns;
|
|
|
|
NamesAndTypesList requested_virtual_columns;
|
2021-12-03 05:25:14 +00:00
|
|
|
UInt64 max_block_size;
|
|
|
|
std::shared_ptr<IteratorWrapper> file_iterator;
|
|
|
|
ColumnsDescription columns_description;
|
2023-08-22 11:59:59 +00:00
|
|
|
bool need_only_count;
|
|
|
|
size_t total_rows_in_file = 0;
|
2023-08-02 17:47:42 +00:00
|
|
|
SelectQueryInfo query_info;
|
2021-12-03 05:25:14 +00:00
|
|
|
|
|
|
|
std::unique_ptr<ReadBuffer> read_buf;
|
2023-06-16 15:51:18 +00:00
|
|
|
std::shared_ptr<IInputFormat> input_format;
|
2021-12-03 05:25:14 +00:00
|
|
|
std::unique_ptr<QueryPipeline> pipeline;
|
|
|
|
std::unique_ptr<PullingPipelineExecutor> reader;
|
|
|
|
String current_path;
|
2023-11-22 18:12:36 +00:00
|
|
|
std::optional<size_t> current_file_size;
|
2021-12-03 05:25:14 +00:00
|
|
|
|
|
|
|
/// Recreate ReadBuffer and PullingPipelineExecutor for each file.
|
|
|
|
bool initialize();
|
|
|
|
};
|
2018-11-19 08:17:09 +00:00
|
|
|
}
|
2018-12-05 13:24:45 +00:00
|
|
|
|
|
|
|
#endif
|