/// ClickHouse — dbms/src/Storages/MergeTree/MergeTreeReader.h

#pragma once

#include <Storages/MarkCache.h>
#include <Storages/MergeTree/MarkRange.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/MergeTreeRangeReader.h>
#include <IO/CompressedReadBufferFromFile.h>
#include <Core/NamesAndTypes.h>
#include <port/clock.h>


class CachedCompressedReadBuffer;

namespace DB
{

class IDataType;

/// Reads the data between pairs of marks in the same part. When reading consecutive ranges, avoids unnecessary seeks.
/// When ranges are almost consecutive, seeks are fast because they are performed inside the buffer.
/// Avoids loading the marks file if it is not needed (e.g. when reading the whole part).
class MergeTreeReader : private boost::noncopyable
2013-11-26 11:55:11 +00:00
{
public:
using ValueSizeMap = std::map<std::string, double>;
using DeserializeBinaryBulkStateMap = std::map<std::string, IDataType::DeserializeBinaryBulkStatePtr>;
MergeTreeReader(const String & path, /// Path to the directory containing the part
const MergeTreeData::DataPartPtr & data_part, const NamesAndTypesList & columns,
UncompressedCache * uncompressed_cache,
MarkCache * mark_cache,
bool save_marks_in_cache,
2018-10-17 03:13:00 +00:00
const MergeTreeData & storage, const MarkRanges & all_mark_ranges,
size_t aio_threshold, size_t max_read_buffer_size,
const ValueSizeMap & avg_value_size_hints = ValueSizeMap{},
const ReadBufferFromFileBase::ProfileCallback & profile_callback = ReadBufferFromFileBase::ProfileCallback{},
clockid_t clock_type = CLOCK_MONOTONIC_COARSE);
2014-07-23 15:24:45 +00:00
~MergeTreeReader();
2013-11-26 11:55:11 +00:00
const ValueSizeMap & getAvgValueSizeHints() const;
2015-09-16 17:49:08 +00:00
/// Add columns from ordered_names that are not present in the block.
/// Missing columns are added in the order specified by ordered_names.
/// If at least one column was added, reorders all columns in the block according to ordered_names.
/// num_rows is needed in case block is empty.
void fillMissingColumns(Block & res, bool & should_reorder, bool & should_evaluate_missing_defaults, size_t num_rows);
/// Sort columns to ensure consistent order among all blocks.
2018-04-23 19:05:46 +00:00
/// If filter_name is not nullptr and block has filter column, move it to the end of block.
2018-04-16 12:21:36 +00:00
void reorderColumns(Block & res, const Names & ordered_names, const String * filter_name);
/// Evaluate defaulted columns if necessary.
void evaluateMissingDefaults(Block & res);
2014-12-04 15:50:48 +00:00
2018-02-13 19:34:15 +00:00
const NamesAndTypesList & getColumns() const { return columns; }
2018-11-28 15:05:28 +00:00
/// Return the number of rows has been read or zero if there is no columns to read.
/// If continue_reading is true, continue reading from last state, otherwise seek to from_mark
size_t readRows(size_t from_mark, bool continue_reading, size_t max_rows_to_read, Block & res);
2016-07-19 10:57:57 +00:00
private:
class Stream
{
public:
Stream(
const String & path_prefix_, const String & extension_, size_t marks_count_,
const MarkRanges & all_mark_ranges,
MarkCache * mark_cache, bool save_marks_in_cache,
UncompressedCache * uncompressed_cache,
size_t aio_threshold, size_t max_read_buffer_size,
const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type);
2013-11-26 11:55:11 +00:00
void seekToMark(size_t index);
void seekToStart();
2013-11-26 11:55:11 +00:00
ReadBuffer * data_buffer;
2016-05-04 18:04:36 +00:00
private:
Stream() = default;
/// NOTE: lazily loads marks from the marks cache.
const MarkInCompressedFile & getMark(size_t index);
void loadMarks();
std::string path_prefix;
std::string extension;
size_t marks_count;
MarkCache * mark_cache;
bool save_marks_in_cache;
MarkCache::MappedPtr marks;
2018-12-20 17:37:02 +00:00
std::unique_ptr<CachedCompressedReadBuffer> cached_buffer;
std::unique_ptr<CompressedReadBufferFromFile> non_cached_buffer;
};
2013-11-26 11:55:11 +00:00
using FileStreams = std::map<std::string, std::unique_ptr<Stream>>;
2013-11-26 11:55:11 +00:00
/// avg_value_size_hints are used to reduce the number of reallocations when creating columns of variable size.
ValueSizeMap avg_value_size_hints;
/// Stores states for IDataType::deserializeBinaryBulk
DeserializeBinaryBulkStateMap deserialize_binary_bulk_state_map;
/// Path to the directory containing the part
String path;
MergeTreeData::DataPartPtr data_part;
FileStreams streams;
/// Columns that are read.
NamesAndTypesList columns;
UncompressedCache * uncompressed_cache;
MarkCache * mark_cache;
/// If save_marks_in_cache is false, then, if marks are not in cache, we will load them but won't save in the cache, to avoid evicting other data.
bool save_marks_in_cache;
2015-04-16 06:12:35 +00:00
2018-10-17 03:13:00 +00:00
const MergeTreeData & storage;
MarkRanges all_mark_ranges;
size_t aio_threshold;
size_t max_read_buffer_size;
size_t index_granularity;
2013-11-26 11:55:11 +00:00
2018-12-20 17:37:02 +00:00
void addStreams(const String & name, const IDataType & type,
2018-10-11 02:57:48 +00:00
const MarkRanges & all_mark_ranges, const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type);
void readData(
const String & name, const IDataType & type, IColumn & column,
size_t from_mark, bool continue_reading, size_t max_rows_to_read,
bool read_offsets = true);
2015-04-02 03:08:43 +00:00
2017-06-14 10:50:22 +00:00
friend class MergeTreeRangeReader::DelayedStream;
2013-11-26 11:55:11 +00:00
};
}