Merge pull request #61250 from ClickHouse/less-contentaion-in-cache-part4

Less contention in cache, part 4
This commit is contained in:
Kseniia Sumarokova 2024-05-02 15:18:16 +00:00 committed by GitHub
commit 33a0e8035f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 226 additions and 5 deletions

View File

@ -489,6 +489,8 @@ The server successfully detected this situation and will download merged part fr
M(FilesystemCacheFailToReserveSpaceBecauseOfLockContention, "Number of times space reservation was skipped due to a high contention on the cache lock") \
M(FilesystemCacheHoldFileSegments, "Filesystem cache file segments count, which were hold") \
M(FilesystemCacheUnusedHoldFileSegments, "Filesystem cache file segments count, which were hold, but not used (because of seek or LIMIT n, etc)") \
M(FilesystemCacheFreeSpaceKeepingThreadRun, "Number of times background thread executed free space keeping job") \
M(FilesystemCacheFreeSpaceKeepingThreadWorkMilliseconds, "Time for which background thread executed free space keeping job") \
\
M(RemoteFSSeeks, "Total number of seeks for async buffer") \
M(RemoteFSPrefetches, "Number of prefetches made with asynchronous reading from remote filesystem") \

View File

@ -28,6 +28,8 @@ namespace ProfileEvents
extern const Event FilesystemCacheGetOrSetMicroseconds;
extern const Event FilesystemCacheGetMicroseconds;
extern const Event FilesystemCacheFailToReserveSpaceBecauseOfLockContention;
extern const Event FilesystemCacheFreeSpaceKeepingThreadRun;
extern const Event FilesystemCacheFreeSpaceKeepingThreadWorkMilliseconds;
}
namespace DB
@ -86,6 +88,9 @@ FileCache::FileCache(const std::string & cache_name, const FileCacheSettings & s
, boundary_alignment(settings.boundary_alignment)
, load_metadata_threads(settings.load_metadata_threads)
, write_cache_per_user_directory(settings.write_cache_per_user_id_directory)
, keep_current_size_to_max_ratio(1 - settings.keep_free_space_size_ratio)
, keep_current_elements_to_max_ratio(1 - settings.keep_free_space_elements_ratio)
, keep_up_free_space_remove_batch(settings.keep_free_space_remove_batch)
, log(getLogger("FileCache(" + cache_name + ")"))
, metadata(settings.base_path, settings.background_download_queue_size_limit, settings.background_download_threads, write_cache_per_user_directory)
{
@ -192,6 +197,12 @@ void FileCache::initialize()
metadata.startup();
if (keep_current_size_to_max_ratio != 1 || keep_current_elements_to_max_ratio != 1)
{
keep_up_free_space_ratio_task = Context::getGlobalContextInstance()->getSchedulePool().createTask(log->name(), [this] { freeSpaceRatioKeepingThreadFunc(); });
keep_up_free_space_ratio_task->schedule();
}
is_initialized = true;
}
@ -946,6 +957,121 @@ bool FileCache::tryReserve(
return true;
}
void FileCache::freeSpaceRatioKeepingThreadFunc()
{
static constexpr auto lock_failed_reschedule_ms = 1000;
static constexpr auto space_ratio_satisfied_reschedule_ms = 5000;
static constexpr auto general_reschedule_ms = 5000;
if (shutdown)
return;
Stopwatch watch;
auto lock = tryLockCache();
/// To avoid deteriorating contention on cache,
/// proceed only if cache is not heavily used.
if (!lock)
{
keep_up_free_space_ratio_task->scheduleAfter(lock_failed_reschedule_ms);
return;
}
const size_t size_limit = main_priority->getSizeLimit(lock);
const size_t elements_limit = main_priority->getElementsLimit(lock);
const size_t desired_size = std::lround(keep_current_size_to_max_ratio * size_limit);
const size_t desired_elements_num = std::lround(keep_current_elements_to_max_ratio * elements_limit);
if ((size_limit == 0 || main_priority->getSize(lock) <= desired_size)
&& (elements_limit == 0 || main_priority->getElementsCount(lock) <= desired_elements_num))
{
/// Nothing to free - all limits are satisfied.
keep_up_free_space_ratio_task->scheduleAfter(space_ratio_satisfied_reschedule_ms);
return;
}
ProfileEvents::increment(ProfileEvents::FilesystemCacheFreeSpaceKeepingThreadRun);
FileCacheReserveStat stat;
EvictionCandidates eviction_candidates;
bool limits_satisfied = true;
try
{
/// Collect at most `keep_up_free_space_remove_batch` elements to evict,
/// (we use batches to make sure we do not block cache for too long,
/// by default the batch size is quite small).
limits_satisfied = main_priority->collectCandidatesForEviction(
desired_size, desired_elements_num, keep_up_free_space_remove_batch, stat, eviction_candidates, lock);
#ifdef ABORT_ON_LOGICAL_ERROR
/// Let's make sure that we correctly processed the limits.
if (limits_satisfied && eviction_candidates.size() < keep_up_free_space_remove_batch)
{
const auto current_size = main_priority->getSize(lock);
chassert(current_size >= stat.total_stat.releasable_size);
chassert(!size_limit
|| current_size - stat.total_stat.releasable_size <= desired_size);
const auto current_elements_count = main_priority->getElementsCount(lock);
chassert(current_elements_count >= stat.total_stat.releasable_count);
chassert(!elements_limit
|| current_elements_count - stat.total_stat.releasable_count <= desired_elements_num);
}
#endif
if (shutdown)
return;
if (eviction_candidates.size() > 0)
{
LOG_TRACE(log, "Current usage {}/{} in size, {}/{} in elements count "
"(trying to keep size ration at {} and elements ratio at {}). "
"Collected {} eviction candidates, "
"skipped {} candidates while iterating",
main_priority->getSize(lock), size_limit,
main_priority->getElementsCount(lock), elements_limit,
desired_size, desired_elements_num,
eviction_candidates.size(), stat.total_stat.non_releasable_count);
lock.unlock();
/// Remove files from filesystem.
eviction_candidates.evict();
/// Take lock again to finalize eviction,
/// e.g. to update the in-memory state.
lock.lock();
eviction_candidates.finalize(nullptr, lock);
}
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
if (eviction_candidates.size() > 0)
eviction_candidates.finalize(nullptr, lockCache());
/// Let's catch such cases in ci,
/// in general there should not be exceptions.
chassert(false);
}
watch.stop();
ProfileEvents::increment(ProfileEvents::FilesystemCacheFreeSpaceKeepingThreadWorkMilliseconds, watch.elapsedMilliseconds());
LOG_TRACE(log, "Free space ratio keeping thread finished in {} ms", watch.elapsedMilliseconds());
[[maybe_unused]] bool scheduled = false;
if (limits_satisfied)
scheduled = keep_up_free_space_ratio_task->scheduleAfter(general_reschedule_ms);
else
scheduled = keep_up_free_space_ratio_task->schedule();
chassert(scheduled);
}
void FileCache::iterate(IterateFunc && func, const UserID & user_id)
{
return metadata.iterate([&](const LockedKey & locked_key)
@ -1275,6 +1401,8 @@ void FileCache::deactivateBackgroundOperations()
{
shutdown.store(true);
metadata.shutdown();
if (keep_up_free_space_ratio_task)
keep_up_free_space_ratio_task->deactivate();
}
std::vector<FileSegment::Info> FileCache::getFileSegmentInfos(const UserID & user_id)

View File

@ -189,6 +189,8 @@ public:
void applySettingsIfPossible(const FileCacheSettings & new_settings, FileCacheSettings & actual_settings);
void freeSpaceRatioKeepingThreadFunc();
private:
using KeyAndOffset = FileCacheKeyAndOffset;
@ -198,6 +200,11 @@ private:
size_t load_metadata_threads;
const bool write_cache_per_user_directory;
BackgroundSchedulePool::TaskHolder keep_up_free_space_ratio_task;
const double keep_current_size_to_max_ratio;
const double keep_current_elements_to_max_ratio;
const size_t keep_up_free_space_remove_batch;
LoggerPtr log;
std::exception_ptr init_exception;

View File

@ -79,6 +79,15 @@ void FileCacheSettings::loadImpl(FuncHas has, FuncGetUInt get_uint, FuncGetStrin
if (has("write_cache_per_user_id_directory"))
slru_size_ratio = get_uint("write_cache_per_user_id_directory");
if (has("keep_free_space_size_ratio"))
keep_free_space_size_ratio = get_double("keep_free_space_size_ratio");
if (has("keep_free_space_elements_ratio"))
keep_free_space_elements_ratio = get_double("keep_free_space_elements_ratio");
if (has("keep_free_space_remove_batch"))
keep_free_space_elements_ratio = get_uint("keep_free_space_remove_batch");
}
void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)

View File

@ -38,6 +38,10 @@ struct FileCacheSettings
std::string cache_policy = "LRU";
double slru_size_ratio = 0.5;
double keep_free_space_size_ratio = FILECACHE_DEFAULT_FREE_SPACE_SIZE_RATIO;
double keep_free_space_elements_ratio = FILECACHE_DEFAULT_FREE_SPACE_ELEMENTS_RATIO;
size_t keep_free_space_remove_batch = FILECACHE_DEFAULT_FREE_SPACE_REMOVE_BATCH;
void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
void loadFromCollection(const NamedCollection & collection);

View File

@ -12,6 +12,9 @@ static constexpr int FILECACHE_DEFAULT_LOAD_METADATA_THREADS = 16;
static constexpr int FILECACHE_DEFAULT_MAX_ELEMENTS = 10000000;
static constexpr int FILECACHE_DEFAULT_HITS_THRESHOLD = 0;
static constexpr size_t FILECACHE_BYPASS_THRESHOLD = 256 * 1024 * 1024;
static constexpr double FILECACHE_DEFAULT_FREE_SPACE_SIZE_RATIO = 0; /// Disabled.
static constexpr double FILECACHE_DEFAULT_FREE_SPACE_ELEMENTS_RATIO = 0; /// Disabled.
static constexpr int FILECACHE_DEFAULT_FREE_SPACE_REMOVE_BATCH = 10;
class FileCache;
using FileCachePtr = std::shared_ptr<FileCache>;

View File

@ -137,6 +137,8 @@ public:
virtual PriorityDumpPtr dump(const CachePriorityGuard::Lock &) = 0;
/// Collect eviction candidates sufficient to free `size` bytes
/// and `elements` elements from cache.
virtual bool collectCandidatesForEviction(
size_t size,
size_t elements,
@ -146,7 +148,10 @@ public:
const UserID & user_id,
const CachePriorityGuard::Lock &) = 0;
/// Collect eviction `candidates_num` candidates for eviction.
/// Collect eviction candidates sufficient to have `desired_size`
/// and `desired_elements_num` as current cache state.
/// Collect no more than `max_candidates_to_evict` elements.
/// Return `true` if the first condition is satisfied.
virtual bool collectCandidatesForEviction(
size_t desired_size,
size_t desired_elements_count,

View File

@ -284,6 +284,7 @@ bool LRUFileCachePriority::collectCandidatesForEviction(
};
iterateForEviction(res, stat, can_fit, lock);
if (can_fit())
{
/// `res` contains eviction candidates. Do we have any?
@ -330,14 +331,17 @@ bool LRUFileCachePriority::collectCandidatesForEviction(
EvictionCandidates & res,
const CachePriorityGuard::Lock & lock)
{
auto stop_condition = [&, this]()
auto desired_limits_satisfied = [&]()
{
return canFit(0, 0, stat.total_stat.releasable_size, stat.total_stat.releasable_count,
lock, &desired_size, &desired_elements_count)
|| (max_candidates_to_evict && res.size() >= max_candidates_to_evict);
lock, &desired_size, &desired_elements_count);
};
auto stop_condition = [&]()
{
return desired_limits_satisfied() || (max_candidates_to_evict && res.size() >= max_candidates_to_evict);
};
iterateForEviction(res, stat, stop_condition, lock);
return stop_condition();
return desired_limits_satisfied();
}
void LRUFileCachePriority::iterateForEviction(

View File

@ -24,6 +24,8 @@
<delayed_cleanup_interval_ms>100</delayed_cleanup_interval_ms>
<cache_policy>LRU</cache_policy>
<slru_size_ratio>0.3</slru_size_ratio>
<keep_free_space_size_ratio>0.15</keep_free_space_size_ratio>
<keep_free_space_elements_ratio>0.15</keep_free_space_elements_ratio>
</s3_cache>
<s3_cache_02933>
<type>cache</type>

View File

@ -501,3 +501,60 @@ INSERT INTO test SELECT 1, 'test';
node.query("SELECT * FROM test FORMAT Null")
assert key not in node.query("SYSTEM SYNC FILESYSTEM CACHE")
def test_keep_up_size_ratio(cluster):
node = cluster.instances["node"]
max_elements = 20
elements_ratio = 0.5
cache_name = "keep_up_size_ratio"
node.query(
f"""
DROP TABLE IF EXISTS test;
CREATE TABLE test (a String)
ENGINE = MergeTree() ORDER BY tuple()
SETTINGS disk = disk(type = cache,
name = {cache_name},
max_size = '100Ki',
max_elements = {max_elements},
max_file_segment_size = 10,
boundary_alignment = 10,
path = "test_keep_up_size_ratio",
keep_free_space_size_ratio = 0.5,
keep_free_space_elements_ratio = {elements_ratio},
disk = hdd_blob),
min_bytes_for_wide_part = 10485760;
INSERT INTO test SELECT randomString(200);
"""
)
query_id = "test_keep_up_size_ratio_1"
node.query(
"SELECT * FROM test FORMAT Null SETTINGS enable_filesystem_cache_log = 1",
query_id=query_id,
)
count = int(
node.query(
f"""
SYSTEM FLUSH LOGS;
SELECT uniqExact(concat(key, toString(offset)))
FROM system.filesystem_cache_log
WHERE read_type = 'READ_FROM_FS_AND_DOWNLOADED_TO_CACHE';
"""
)
)
assert count > max_elements
expected = 10
for _ in range(100):
elements = int(
node.query(
f"SELECT count() FROM system.filesystem_cache WHERE cache_name = '{cache_name}'"
)
)
if elements <= expected:
break
time.sleep(1)
assert elements <= expected