2023-03-07 05:09:13 +00:00
|
|
|
#include <Compression/CompressedReadBufferFromFile.h>
|
2020-02-03 12:46:25 +00:00
|
|
|
#include <IO/ReadBufferFromFile.h>
|
2023-01-19 15:55:15 +00:00
|
|
|
#include <Interpreters/threadPoolCallbackRunner.h>
|
2023-03-07 05:09:13 +00:00
|
|
|
#include <Storages/MergeTree/MergeTreeData.h>
|
|
|
|
#include <Storages/MergeTree/MergeTreeMarksLoader.h>
|
2022-08-30 17:47:34 +00:00
|
|
|
#include <Common/CurrentMetrics.h>
|
2023-03-07 05:09:13 +00:00
|
|
|
#include <Common/MemoryTrackerBlockerInThread.h>
|
2022-08-30 17:47:34 +00:00
|
|
|
#include <Common/ThreadPool.h>
|
2023-03-07 05:09:13 +00:00
|
|
|
#include <Common/scope_guard_safe.h>
|
|
|
|
#include <Common/setThreadName.h>
|
2019-11-20 13:33:41 +00:00
|
|
|
|
2020-02-27 16:47:40 +00:00
|
|
|
#include <utility>
|
|
|
|
|
2022-08-30 17:47:34 +00:00
|
|
|
namespace ProfileEvents
|
|
|
|
{
|
|
|
|
extern const Event WaitMarksLoadMicroseconds;
|
2022-09-04 17:10:46 +00:00
|
|
|
extern const Event BackgroundLoadingMarksTasks;
|
2023-03-07 05:09:13 +00:00
|
|
|
extern const Event LoadedMarksCount;
|
|
|
|
extern const Event LoadedMarksMemoryBytes;
|
2022-08-30 17:47:34 +00:00
|
|
|
}
|
2022-08-28 02:19:14 +00:00
|
|
|
|
2019-11-20 13:33:41 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2020-02-03 12:46:25 +00:00
|
|
|
/// Error codes thrown from this file; they are only declared here.
namespace ErrorCodes
{
    /// Thrown when the marks file size does not match what the part metadata implies.
    extern const int CORRUPTED_DATA;

    /// Thrown when the number of marks actually read differs from the expected one.
    extern const int CANNOT_READ_ALL_DATA;

    /// Thrown on internal invariant violations (bad column index, null marks after loading).
    extern const int LOGICAL_ERROR;
}
|
|
|
|
|
2019-11-20 13:33:41 +00:00
|
|
|
/// Stores the loader configuration. If a thread pool is supplied, loading of the
/// marks is scheduled right away and the resulting future is kept in `future`.
MergeTreeMarksLoader::MergeTreeMarksLoader(
    MergeTreeDataPartInfoForReaderPtr data_part_reader_,
    MarkCache * mark_cache_,
    const String & mrk_path_,
    size_t marks_count_,
    const MergeTreeIndexGranularityInfo & index_granularity_info_,
    bool save_marks_in_cache_,
    const ReadSettings & read_settings_,
    ThreadPool * load_marks_threadpool_,
    size_t columns_in_mark_)
    : data_part_reader(data_part_reader_)
    , mark_cache(mark_cache_)
    , mrk_path(mrk_path_)
    , marks_count(marks_count_)
    , index_granularity_info(index_granularity_info_)
    , save_marks_in_cache(save_marks_in_cache_)
    , columns_in_mark(columns_in_mark_)
    , read_settings(read_settings_)
    , load_marks_threadpool(load_marks_threadpool_)
{
    /// Without a pool nothing is scheduled here; getMark() loads the marks synchronously on first use.
    if (load_marks_threadpool != nullptr)
        future = loadMarksAsync();
}
|
2019-11-20 13:33:41 +00:00
|
|
|
|
2022-08-31 13:39:53 +00:00
|
|
|
/// Waits for a still-pending background load before the object is torn down.
MergeTreeMarksLoader::~MergeTreeMarksLoader()
{
    /// Nothing was scheduled, or the result has already been consumed by getMark().
    if (!future.valid())
        return;

    /// The scheduled task captures `this` (see loadMarksAsync), so it must finish first.
    future.wait();
}
|
2019-11-20 13:33:41 +00:00
|
|
|
|
|
|
|
|
2023-03-07 05:09:13 +00:00
|
|
|
/// Returns the mark stored at (row_index, column_index).
///
/// On the first call the marks are obtained either from the background load
/// started in the constructor or, if there is none, by loading them synchronously.
/// The time spent obtaining them is added to WaitMarksLoadMicroseconds.
MarkInCompressedFile MergeTreeMarksLoader::getMark(size_t row_index, size_t column_index)
{
    if (!marks)
    {
        Stopwatch wait_timer(CLOCK_MONOTONIC);

        if (!future.valid())
        {
            /// No background task was scheduled: load in the calling thread.
            marks = loadMarks();
        }
        else
        {
            /// Take the result of the background task and drop the future.
            marks = future.get();
            future = {};
        }

        wait_timer.stop();
        const auto waited_microseconds = wait_timer.elapsedMicroseconds();
        ProfileEvents::increment(ProfileEvents::WaitMarksLoadMicroseconds, waited_microseconds);
    }

#ifndef NDEBUG
    /// Debug-only sanity check of the column index.
    if (column_index >= columns_in_mark)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Column index: {} is out of range [0, {})", column_index, columns_in_mark);
#endif

    /// Marks are stored row by row, `columns_in_mark` entries per row.
    const size_t flat_index = row_index * columns_in_mark + column_index;
    return marks->get(flat_index);
}
|
|
|
|
|
2022-08-28 02:19:14 +00:00
|
|
|
|
2020-02-03 12:46:25 +00:00
|
|
|
/// Reads the marks file `mrk_path` of the data part and builds the in-memory marks object.
///
/// The file is read through a plain or a decompressing buffer depending on
/// `index_granularity_info.mark_type.compressed`. Non-adaptive marks are read in one
/// piece; adaptive marks are read record by record, each record being the marks of all
/// columns followed by a granularity value that is read and discarded.
///
/// Throws:
///  - CORRUPTED_DATA if the file is empty while marks are expected, or if an
///    uncompressed file does not have exactly the expected size;
///  - CANNOT_READ_ALL_DATA if the file contains fewer or more marks than `marks_count`.
///
/// Also updates the LoadedMarksCount and LoadedMarksMemoryBytes profile events.
MarkCache::MappedPtr MergeTreeMarksLoader::loadMarksImpl()
{
    /// Memory for marks must not be accounted as memory usage for query, because they are stored in shared cache.
    MemoryTrackerBlockerInThread temporarily_disable_memory_tracker;

    auto data_part_storage = data_part_reader->getDataPartStorage();

    size_t file_size = data_part_storage->getFileSize(mrk_path);
    size_t mark_size = index_granularity_info.getMarkSizeInBytes(columns_in_mark);
    size_t expected_uncompressed_size = mark_size * marks_count;

    /// We first read the marks into a temporary simple array, then compress them into a more compact
    /// representation.
    PODArray<MarkInCompressedFile> plain_marks(marks_count * columns_in_mark); // temporary

    if (file_size == 0 && marks_count != 0)
    {
        throw Exception(
            ErrorCodes::CORRUPTED_DATA,
            "Empty marks file '{}': {}, must be: {}",
            std::string(fs::path(data_part_storage->getFullPath()) / mrk_path),
            file_size, expected_uncompressed_size);
    }

    /// The on-disk size can only be validated up front for uncompressed marks.
    if (!index_granularity_info.mark_type.compressed && expected_uncompressed_size != file_size)
        throw Exception(
            ErrorCodes::CORRUPTED_DATA,
            "Bad size of marks file '{}': {}, must be: {}",
            std::string(fs::path(data_part_storage->getFullPath()) / mrk_path),
            file_size,
            expected_uncompressed_size);

    auto buffer = data_part_storage->readFile(mrk_path, read_settings.adjustBufferSize(file_size), file_size, std::nullopt);
    std::unique_ptr<ReadBuffer> reader;
    if (!index_granularity_info.mark_type.compressed)
        reader = std::move(buffer);
    else
        reader = std::make_unique<CompressedReadBufferFromFile>(std::move(buffer));

    if (!index_granularity_info.mark_type.adaptive)
    {
        /// Read directly to marks.
        reader->readStrict(reinterpret_cast<char *>(plain_marks.data()), expected_uncompressed_size);

        if (!reader->eof())
            throw Exception(
                ErrorCodes::CANNOT_READ_ALL_DATA,
                "Cannot read all marks from file {}, is eof: {}, buffer size: {}, file size: {}",
                mrk_path,
                reader->eof(),
                reader->buffer().size(),
                file_size);
    }
    else
    {
        size_t i = 0;
        size_t granularity;
        while (!reader->eof())
        {
            /// `plain_marks` has room for exactly `marks_count` records. The size of a compressed
            /// file is not validated above, so a file with extra records must be rejected here,
            /// before the next read would write past the end of the array.
            if (i >= marks_count)
            {
                throw Exception(
                    ErrorCodes::CANNOT_READ_ALL_DATA,
                    "Too many marks in file {}, marks expected {} (bytes size {}), bytes read so far {}",
                    mrk_path, marks_count, expected_uncompressed_size, reader->count());
            }

            reader->readStrict(
                reinterpret_cast<char *>(plain_marks.data() + i * columns_in_mark), columns_in_mark * sizeof(MarkInCompressedFile));
            /// The granularity stored after each record is not needed here; it is only skipped over.
            readIntBinary(granularity, *reader);
            ++i;
        }

        /// Fewer records than expected.
        if (i * mark_size != expected_uncompressed_size)
        {
            throw Exception(
                ErrorCodes::CANNOT_READ_ALL_DATA,
                "Cannot read all marks from file {}, marks expected {} (bytes size {}), marks read {} (bytes size {})",
                mrk_path, marks_count, expected_uncompressed_size, i, reader->count());
        }
    }

    auto res = std::make_shared<MarksInCompressedFile>(plain_marks);

    ProfileEvents::increment(ProfileEvents::LoadedMarksCount, marks_count * columns_in_mark);
    ProfileEvents::increment(ProfileEvents::LoadedMarksMemoryBytes, res->approximateMemoryUsage());

    return res;
}
|
|
|
|
|
2022-08-30 17:47:34 +00:00
|
|
|
/// Obtains the marks, consulting the mark cache when one is configured.
///
///  - no cache: the marks are always read from the file;
///  - cache and `save_marks_in_cache`: the marks are fetched from the cache, being loaded and
///    stored there on a miss;
///  - cache without `save_marks_in_cache`: the cache is only looked up; on a miss the marks
///    are read from the file and not inserted.
///
/// Throws LOGICAL_ERROR if no marks object was obtained.
MarkCache::MappedPtr MergeTreeMarksLoader::loadMarks()
{
    auto data_part_storage = data_part_reader->getDataPartStorage();

    /// Built lazily: only needed for the cache key and for the error message.
    const auto marks_file_path = [&] { return fs::path(data_part_storage->getFullPath()) / mrk_path; };

    MarkCache::MappedPtr loaded_marks;

    if (!mark_cache)
    {
        loaded_marks = loadMarksImpl();
    }
    else
    {
        auto cache_key = mark_cache->hash(marks_file_path());

        if (!save_marks_in_cache)
        {
            /// Read-only use of the cache: a miss falls back to the file without populating it.
            loaded_marks = mark_cache->get(cache_key);
            if (!loaded_marks)
                loaded_marks = loadMarksImpl();
        }
        else
        {
            auto load_from_file = [this] { return loadMarksImpl(); };
            loaded_marks = mark_cache->getOrSet(cache_key, load_from_file);
        }
    }

    if (loaded_marks)
        return loaded_marks;

    throw Exception(
        ErrorCodes::LOGICAL_ERROR, "Failed to load marks: {}", (fs::path(data_part_storage->getFullPath()) / mrk_path).string());
}
|
|
|
|
|
|
|
|
std::future<MarkCache::MappedPtr> MergeTreeMarksLoader::loadMarksAsync()
|
|
|
|
{
|
2023-03-07 05:09:13 +00:00
|
|
|
return scheduleFromThreadPool<MarkCache::MappedPtr>(
|
|
|
|
[this]() -> MarkCache::MappedPtr
|
|
|
|
{
|
|
|
|
ProfileEvents::increment(ProfileEvents::BackgroundLoadingMarksTasks);
|
|
|
|
return loadMarks();
|
|
|
|
},
|
|
|
|
*load_marks_threadpool,
|
|
|
|
"LoadMarksThread");
|
2019-11-20 13:33:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|