#pragma once
#include <Common/config.h>
#if USE_HDFS
#include <Processors/Sources/SourceWithProgress.h>
#include <Storages/IStorage.h>
#include <Poco/URI.h>
#include <base/logger_useful.h>
#include <base/shared_ptr_helper.h>
namespace DB
{
/**
 * This class implements the table engine for external HDFS files.
 * Reading and writing are supported; see the read(), write() and truncate() methods below.
 */
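/**
 * A minimal usage sketch (the table name, columns, URI and format below are illustrative, not taken from this header):
 *
 *     CREATE TABLE hdfs_table (name String, value UInt32)
 *         ENGINE = HDFS('hdfs://namenode:9000/some_dir/data.tsv', 'TSV');
 */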
class StorageHDFS final : public shared_ptr_helper<StorageHDFS>, public IStorage, WithContext
{
friend struct shared_ptr_helper<StorageHDFS>;
public:
String getName() const override { return "HDFS"; }
Pipe read(
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
SelectQueryInfo & query_info,
ContextPtr context,
QueryProcessingStage::Enum processed_stage,
size_t max_block_size,
unsigned num_streams) override;
SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) override;
void truncate(
const ASTPtr & query,
const StorageMetadataPtr & metadata_snapshot,
ContextPtr local_context,
TableExclusiveLockHolder &) override;
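/// The engine exposes the virtual columns _path and _file for the rows it reads.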
NamesAndTypesList getVirtuals() const override;
bool supportsPartitionBy() const override { return true; }

/// Check if the format is column-oriented.
/// It is useful because column-oriented formats (e.g. Parquet, ORC, Arrow) can efficiently skip unknown columns,
/// so in the read method we can create a header of only the required columns and ask the
/// format to read just those. Note: this cannot be done with row-oriented formats like TSV.
bool isColumnOriented() const override;
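
/// Infer the table structure (column names and types) from the data itself, e.g. when the CREATE
/// query gives no column list. A hypothetical call (the URI is illustrative):
///     auto columns = StorageHDFS::getTableStructureFromData("TSV", "hdfs://namenode:9000/data.tsv", "auto", ctx);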
static ColumnsDescription getTableStructureFromData(
const String & format,
const String & uri,
const String & compression_method,
ContextPtr ctx);
protected:
friend class HDFSSource;
StorageHDFS(
const String & uri_,
const StorageID & table_id_,
const String & format_name_,
const ColumnsDescription & columns_,
const ConstraintsDescription & constraints_,
const String & comment,
ContextPtr context_,
const String & compression_method_ = "",
bool distributed_processing_ = false,
ASTPtr partition_by = nullptr);
private:
std::vector<String> uris;
String format_name;
String compression_method;
const bool distributed_processing;
ASTPtr partition_by;
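/// Whether the URI contains glob patterns ('*', '?', '{...}') and therefore expands to several paths.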
bool is_path_with_globs;
NamesAndTypesList virtual_columns;
Poco::Logger * log = &Poco::Logger::get("StorageHDFS");
};
class PullingPipelineExecutor;
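
/// A source that streams blocks from HDFS files one after another. StorageHDFS::read typically
/// creates several of these, each pulling the next path to process from a shared file iterator.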
class HDFSSource : public SourceWithProgress, WithContext
{
public:
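/// Iterates over all HDFS paths matched by a glob pattern in the URI.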
class DisclosedGlobIterator
{
public:
DisclosedGlobIterator(ContextPtr context_, const String & uri_);
String next();
private:
class Impl;
/// shared_ptr to have copy constructor
std::shared_ptr<Impl> pimpl;
};
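
/// Iterates over an explicit list of URIs.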
class URISIterator
{
public:
URISIterator(const std::vector<String> & uris_, ContextPtr context);
String next();
private:
class Impl;
/// shared_ptr to have copy constructor
std::shared_ptr<Impl> pimpl;
};
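/// Type-erased callback that yields the next path/URI to read.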
using IteratorWrapper = std::function<String()>;
using StorageHDFSPtr = std::shared_ptr<StorageHDFS>;
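/// Header of the source: the sample block extended with the requested virtual columns.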
static Block getHeader(Block sample_block, const std::vector<NameAndTypePair> & requested_virtual_columns);
HDFSSource(
StorageHDFSPtr storage_,
const Block & block_for_format_,
const std::vector<NameAndTypePair> & requested_virtual_columns_,
ContextPtr context_,
UInt64 max_block_size_,
std::shared_ptr<IteratorWrapper> file_iterator_,
ColumnsDescription columns_description_);
String getName() const override;
Chunk generate() override;
void onCancel() override;
private:
StorageHDFSPtr storage;
Block block_for_format;
std::vector<NameAndTypePair> requested_virtual_columns;
UInt64 max_block_size;
bool need_path_column;
bool need_file_column;
std::shared_ptr<IteratorWrapper> file_iterator;
ColumnsDescription columns_description;
std::unique_ptr<ReadBuffer> read_buf;
std::unique_ptr<QueryPipeline> pipeline;
std::unique_ptr<PullingPipelineExecutor> reader;
/// onCancel and generate can be called concurrently.
std::mutex reader_mutex;
String current_path;
/// Recreate ReadBuffer and PullingPipelineExecutor for each file.
bool initialize();
};
}
#endif