ClickHouse/src/Storages/HDFS/ReadBufferFromHDFS.cpp

195 lines
5.4 KiB
C++
Raw Normal View History

#include "ReadBufferFromHDFS.h"
2019-01-17 11:26:29 +00:00
#if USE_HDFS
#include <Storages/HDFS/HDFSCommon.h>
#include <hdfs/hdfs.h>
2020-10-06 13:29:08 +00:00
#include <mutex>
2019-01-17 11:26:29 +00:00
2019-02-10 17:40:52 +00:00
2019-01-17 11:26:29 +00:00
namespace DB
{
2021-04-20 07:53:55 +00:00
2019-01-17 11:26:29 +00:00
/// Error codes thrown from this file. They are only declared here; the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int NETWORK_ERROR;               /// hdfsRead failure
    extern const int CANNOT_OPEN_FILE;            /// hdfsOpenFile failure
    extern const int CANNOT_SEEK_THROUGH_FILE;    /// hdfsSeek failure or unsupported `whence`
    extern const int SEEK_POSITION_OUT_OF_BOUND;  /// negative seek target
    extern const int LOGICAL_ERROR;               /// internal invariant violated
}
2020-03-09 01:03:43 +00:00
/// Destructor is defaulted out of line, in this translation unit.
/// NOTE(review): presumably needed because `impl` is a smart pointer to ReadBufferFromHDFSImpl,
/// which is only defined in this file - confirm against the header.
ReadBufferFromHDFS::~ReadBufferFromHDFS() = default;
2021-06-03 11:34:47 +00:00
/// Low-level read buffer over a single HDFS file.
/// Opens the file in the constructor, closes it in the destructor and reads it chunk by chunk
/// into its own memory. `offset` tracks the file position right after the last chunk read.
struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory<SeekableReadBuffer>
{
    /// HDFS create/open functions are not thread safe
    static std::mutex hdfs_init_mutex;

    String hdfs_uri;
    String hdfs_file_path;

    hdfsFile fin;
    HDFSBuilderWrapper builder;
    HDFSFSPtr fs;

    /// File position of the end of the most recently read chunk (or the last seek target).
    off_t offset = 0;
    /// Exclusive right bound for reads; 0 means "no bound".
    off_t read_until_position = 0;

    /// Connects to the file system described by `hdfs_uri_` / `config_` and opens `hdfs_file_path_` read-only.
    /// `buf_size_` is the size of the owned buffer, `read_until_position_` the optional right bound (0 = none).
    /// Throws CANNOT_OPEN_FILE if the file cannot be opened.
    explicit ReadBufferFromHDFSImpl(
        const std::string & hdfs_uri_,
        const std::string & hdfs_file_path_,
        const Poco::Util::AbstractConfiguration & config_,
        size_t buf_size_, size_t read_until_position_)
        : BufferWithOwnMemory<SeekableReadBuffer>(buf_size_)
        , hdfs_uri(hdfs_uri_)
        , hdfs_file_path(hdfs_file_path_)
        , builder(createHDFSBuilder(hdfs_uri_, config_))
        , read_until_position(read_until_position_)
    {
        /// Both connecting and opening happen under the global init mutex.
        std::lock_guard lock(hdfs_init_mutex);

        fs = createHDFSFS(builder.get());
        fin = hdfsOpenFile(fs.get(), hdfs_file_path.c_str(), O_RDONLY, 0, 0, 0);

        if (fin == nullptr)
            throw Exception(ErrorCodes::CANNOT_OPEN_FILE,
                "Unable to open HDFS file: {}. Error: {}",
                hdfs_uri + hdfs_file_path, std::string(hdfsGetLastError()));
    }

    /// Closes the file handle under the same mutex that guarded opening it.
    ~ReadBufferFromHDFSImpl() override
    {
        std::lock_guard lock(hdfs_init_mutex);
        hdfsCloseFile(fs.get(), fin);
    }

    /// Returns the size of the file as reported by HDFS, or nullopt if the path info is unavailable.
    std::optional<size_t> getTotalSize() const
    {
        auto * file_info = hdfsGetPathInfo(fs.get(), hdfs_file_path.c_str());
        if (!file_info)
            return std::nullopt;

        const auto total_size = static_cast<size_t>(file_info->mSize);

        /// The info structure is allocated by libhdfs and must be released by the caller.
        hdfsFreeFileInfo(file_info, 1);
        return total_size;
    }

    /// Reads the next chunk into the owned buffer.
    /// Returns false when the right bound is reached or hdfsRead returns no bytes.
    /// Throws LOGICAL_ERROR if the position is already past the right bound,
    /// NETWORK_ERROR if hdfsRead fails.
    bool nextImpl() override
    {
        /// Never ask for more than the buffer can hold.
        size_t num_bytes_to_read = internal_buffer.size();

        if (read_until_position)
        {
            if (read_until_position == offset)
                return false;

            if (read_until_position < offset)
                throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1);

            /// Do not read past the right bound either.
            const size_t bytes_until_bound = static_cast<size_t>(read_until_position - offset);
            if (bytes_until_bound < num_bytes_to_read)
                num_bytes_to_read = bytes_until_bound;
        }

        int bytes_read = hdfsRead(fs.get(), fin, internal_buffer.begin(), num_bytes_to_read);
        if (bytes_read < 0)
            throw Exception(ErrorCodes::NETWORK_ERROR,
                "Fail to read from HDFS: {}, file path: {}. Error: {}",
                hdfs_uri, hdfs_file_path, std::string(hdfsGetLastError()));

        if (bytes_read)
        {
            working_buffer = internal_buffer;
            working_buffer.resize(bytes_read);
            offset += bytes_read;
            return true;
        }

        return false;
    }

    /// Moves the underlying file position to the absolute position `offset_`.
    /// Only SEEK_SET is supported (LOGICAL_ERROR otherwise).
    /// Throws CANNOT_SEEK_THROUGH_FILE if hdfsSeek fails; the tracked offset is left untouched in that case.
    off_t seek(off_t offset_, int whence) override
    {
        if (whence != SEEK_SET)
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Only SEEK_SET is supported");

        int seek_status = hdfsSeek(fs.get(), fin, offset_);
        if (seek_status != 0)
            throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Fail to seek HDFS file: {}, error: {}", hdfs_uri + hdfs_file_path, std::string(hdfsGetLastError()));

        /// Commit the new position only once the seek has actually succeeded.
        offset = offset_;
        return offset;
    }

    /// Returns the tracked file position (see `offset`).
    off_t getPosition() override
    {
        return offset;
    }
};
2020-10-06 13:29:08 +00:00
/// Definition of the static mutex declared in ReadBufferFromHDFSImpl.
/// The constructor and destructor of ReadBufferFromHDFSImpl lock it around the HDFS open/close calls.
std::mutex ReadBufferFromHDFS::ReadBufferFromHDFSImpl::hdfs_init_mutex;
2021-04-20 07:53:55 +00:00
/// Builds the public buffer as a thin wrapper: it starts with no memory of its own (nullptr, 0)
/// and forwards every argument to a freshly created ReadBufferFromHDFSImpl.
ReadBufferFromHDFS::ReadBufferFromHDFS(
    const String & hdfs_uri_,
    const String & hdfs_file_path_,
    const Poco::Util::AbstractConfiguration & config_,
    size_t buf_size_,
    size_t read_until_position_)
    : SeekableReadBufferWithSize(nullptr, 0)
    , impl(std::make_unique<ReadBufferFromHDFSImpl>(
          hdfs_uri_, hdfs_file_path_, config_, buf_size_, read_until_position_))
{
}
2021-10-31 19:53:24 +00:00
/// Size of the underlying file, delegated to `impl`; nullopt when `impl` cannot determine it.
std::optional<size_t> ReadBufferFromHDFS::getTotalSize()
{
    std::optional<size_t> total_size = impl->getTotalSize();
    return total_size;
}
2019-01-17 11:26:29 +00:00
bool ReadBufferFromHDFS::nextImpl()
{
2021-09-22 20:48:25 +00:00
impl->position() = impl->buffer().begin() + offset();
2021-06-03 11:34:47 +00:00
auto result = impl->next();
2019-01-17 11:26:29 +00:00
2021-06-03 11:34:47 +00:00
if (result)
2021-09-22 20:48:25 +00:00
BufferBase::set(impl->buffer().begin(), impl->buffer().size(), impl->offset); /// use the buffer returned by `impl`
return result;
2019-01-17 11:26:29 +00:00
}
2021-04-04 09:08:09 +00:00
2021-10-31 19:53:24 +00:00
off_t ReadBufferFromHDFS::seek(off_t offset_, int whence)
2021-04-04 09:08:09 +00:00
{
2021-10-31 19:53:24 +00:00
if (whence != SEEK_SET)
throw Exception("Only SEEK_SET mode is allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
if (offset_ < 0)
throw Exception("Seek position is out of bounds. Offset: " + std::to_string(offset_), ErrorCodes::SEEK_POSITION_OUT_OF_BOUND);
if (!working_buffer.empty()
&& size_t(offset_) >= impl->getPosition() - working_buffer.size()
&& offset_ < impl->getPosition())
{
pos = working_buffer.end() - (impl->getPosition() - offset_);
assert(pos >= working_buffer.begin());
assert(pos <= working_buffer.end());
return getPosition();
}
resetWorkingBuffer();
2021-10-31 19:53:24 +00:00
impl->seek(offset_, whence);
return impl->getPosition();
2021-04-04 09:08:09 +00:00
}
off_t ReadBufferFromHDFS::getPosition()
{
2021-06-03 11:34:47 +00:00
return impl->getPosition() - available();
2021-04-04 09:08:09 +00:00
}
2022-02-16 10:27:23 +00:00
/// File offset reported by `impl`, returned as the offset of the end of the buffer.
size_t ReadBufferFromHDFS::getFileOffsetOfBufferEnd() const
{
    const off_t impl_position = impl->getPosition();
    return impl_position;
}
2019-01-17 11:26:29 +00:00
}
#endif