diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index dc955ed14bd..ad55e46eebf 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -85,6 +85,7 @@ CacheDictionary::CacheDictionary( , size{roundUpToPowerOfTwoOrZero(std::max(size_, size_t(max_collision_length)))} , size_overlap_mask{this->size - 1} , cells{this->size} + , default_keys{this->size} , rnd_engine(randomSeed()) , update_queue(max_update_queue_size_) , update_pool(max_threads_for_updates) @@ -309,14 +310,14 @@ std::string CacheDictionary::AttributeValuesForKey::dump() std::string CacheDictionary::UpdateUnit::dumpFoundIds() { - std::string ans; + std::ostringstream os; for (auto it : found_ids) { - ans += "Key: " + std::to_string(it.first) + "\n"; + os << "Key: " << std::to_string(it.first) << "\n"; if (it.second.found) - ans += it.second.dump() + "\n"; + os << it.second.dump() << "\n"; } - return ans; + return os.str(); }; /// returns cell_idx (always valid for replacing), 'cell is valid' flag, 'cell is outdated' flag @@ -387,12 +388,11 @@ void CacheDictionary::has(const PaddedPODArray & ids, PaddedPODArray for (const auto row : ext::range(0, rows)) { const auto id = ids[row]; - { - std::shared_lock shared_lock(default_cache_rw_lock); - /// Check if the key is stored in the cache of defaults. - if (default_keys.find(id) != default_keys.end()) - continue; - } + + /// Check if the key is stored in the cache of defaults. + if (default_keys.has(id)) + continue; + const auto find_result = findCellIdx(id, now); auto insert_to_answer_routine = [&] () @@ -461,7 +461,7 @@ void CacheDictionary::has(const PaddedPODArray & ids, PaddedPODArray } /// At this point we have two situations. - /// There may be both types of keys: cache_expired_ids and cache_not_found_ids. + /// There may be both types of keys: expired and not_found. /// We will update them all synchronously. std::vector required_ids; @@ -481,11 +481,8 @@ void CacheDictionary::has(const PaddedPODArray & ids, PaddedPODArray for (const auto row : cache_expired_or_not_found_ids[key]) out[row] = true; else - { - std::unique_lock unique_lock(default_cache_rw_lock); /// Cache this key as default. - default_keys.insert(key); - } + default_keys.add(key); } } diff --git a/src/Dictionaries/CacheDictionary.h b/src/Dictionaries/CacheDictionary.h index ec0cdb2acfb..d1742b19776 100644 --- a/src/Dictionaries/CacheDictionary.h +++ b/src/Dictionaries/CacheDictionary.h @@ -22,6 +22,8 @@ #include "IDictionary.h" #include "IDictionarySource.h" +#include + namespace CurrentMetrics { extern const Metric CacheDictionaryUpdateQueueBatches; @@ -346,6 +348,7 @@ private: std::map attribute_index_by_name; mutable std::vector attributes; mutable std::vector cells; + mutable LRUSet default_keys; Attribute * hierarchical_attribute = nullptr; std::unique_ptr string_arena; @@ -360,21 +363,11 @@ private: mutable std::atomic hit_count{0}; mutable std::atomic query_count{0}; - mutable std::unordered_set default_keys; - /* - * Disclaimer: this comment is written not for fun. - * * How the update goes: we basically have a method like get(keys)->values. Values are cached, so sometimes we - * can return them from the cache. For values not in cache, we query them from the dictionary, and add to the - * cache. The cache is lossy, so we can't expect it to store all the keys, and we store them separately. Normally, - * they would be passed as a return value of get(), but for Unknown Reasons the dictionaries use a baroque - * interface where get() accepts two callback, one that it calls for found values, and one for not found. - * - * Now we make it even uglier by doing this from multiple threads. The missing values are retrieved from the - * dictionary in a background thread, and this thread calls the provided callback. So if you provide the callbacks, - * you MUST wait until the background update finishes, or god knows what happens. Unfortunately, we have no - * way to check that you did this right, so good luck. + * can return them from the cache. For values not in cache, we query them from the source, and add to the + * cache. The cache is lossy, so we can't expect it to store all the keys, and we store them separately. + * So, there is a map of found keys to all its attributes. */ struct UpdateUnit { diff --git a/src/Dictionaries/CacheDictionary.inc.h b/src/Dictionaries/CacheDictionary.inc.h index ca59556d689..5b08b027b85 100644 --- a/src/Dictionaries/CacheDictionary.inc.h +++ b/src/Dictionaries/CacheDictionary.inc.h @@ -61,14 +61,10 @@ void CacheDictionary::getItemsNumberImpl( for (const auto row : ext::range(0, rows)) { const auto id = ids[row]; - - { - std::shared_lock shared_lock(default_cache_rw_lock); - /// First check if this key in the cache of default keys. - if (default_keys.find(id) != default_keys.end()) - continue; - } + /// First check if this key in the cache of default keys. + if (default_keys.has(id)) + continue; /** cell should be updated if either: * 1. ids do not match, @@ -174,9 +170,7 @@ void CacheDictionary::getItemsNumberImpl( out[row] = std::get(value.values[attribute_index]); } else { - /// Add key to the cache of default keys. - std::unique_lock unique_lock(default_cache_rw_lock); - default_keys.emplace(key); + default_keys.add(key); } } } @@ -207,7 +201,7 @@ void CacheDictionary::getItemsString( { std::shared_lock shared_lock(default_cache_rw_lock); /// Check if the key is stored in the cache of defaults. - if (default_keys.find(id) != default_keys.end()) + if (default_keys.has(id)) { const auto string_ref = get_default(row); out->insertData(string_ref.data, string_ref.size); @@ -258,14 +252,11 @@ void CacheDictionary::getItemsString( for (const auto row : ext::range(0, ids.size())) { const auto id = ids[row]; + /// Check if the key is stored in the cache of defaults. + if (default_keys.has(id)) { - std::shared_lock shared_lock(default_cache_rw_lock); - /// Check if the key is stored in the cache of defaults. - if (default_keys.find(id) != default_keys.end()) - { - const auto string_ref = get_default(row); - out->insertData(string_ref.data, string_ref.size); - } + const auto string_ref = get_default(row); + out->insertData(string_ref.data, string_ref.size); } const auto find_result = findCellIdx(id, now); @@ -383,8 +374,7 @@ void CacheDictionary::getItemsString( value = std::get(found_it->second.values[attribute_index]); else { - std::unique_lock unique_lock(default_cache_rw_lock); - default_keys.insert(id); + default_keys.add(id); value = get_default(row); } } diff --git a/tests/integration/test_dictionaries_all_layouts_separate_sources/configs/dictionaries/.gitkeep b/tests/integration/test_dictionaries_all_layouts_separate_sources/configs/dictionaries/.gitkeep deleted file mode 100644 index c693f138c81..00000000000 --- a/tests/integration/test_dictionaries_all_layouts_separate_sources/configs/dictionaries/.gitkeep +++ /dev/null @@ -1 +0,0 @@ -keep \ No newline at end of file diff --git a/tests/integration/test_dictionaries_update_and_reload/test.py b/tests/integration/test_dictionaries_update_and_reload/test.py index 95096249f48..ce71625c3ea 100644 --- a/tests/integration/test_dictionaries_update_and_reload/test.py +++ b/tests/integration/test_dictionaries_update_and_reload/test.py @@ -221,6 +221,7 @@ def test_reload_after_fail_in_cache_dictionary(started_cluster): # Can't get a value from the cache dictionary because the source (table `test.xypairs`) doesn't respond. expected_error = "Table test.xypairs doesn't exist" + update_error = "Could not update cache dictionary cache_xypairs now" assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(1))") assert get_status("cache_xypairs") == "LOADED" assert expected_error in get_last_exception("cache_xypairs") @@ -254,8 +255,9 @@ def test_reload_after_fail_in_cache_dictionary(started_cluster): query("SELECT dictGet('cache_xypairs', 'y', toUInt64(1))") == "56" query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0" error = query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(3))") - assert expected_error in error - assert expected_error in get_last_exception("cache_xypairs") + assert (expected_error in error) or (update_error in error) + last_exception = get_last_exception("cache_xypairs") + assert (expected_error in last_exception) or (update_error in last_exception) # Create table `test.xypairs` again with changed values. query(''' diff --git a/tests/integration/test_system_queries/test.py b/tests/integration/test_system_queries/test.py index 5fa1cab5b36..b159e8b4cf3 100644 --- a/tests/integration/test_system_queries/test.py +++ b/tests/integration/test_system_queries/test.py @@ -47,7 +47,7 @@ def test_SYSTEM_RELOAD_DICTIONARY(started_cluster): instance.query("INSERT INTO dictionary_source VALUES (1, 1)") assert TSV(instance.query( "SELECT dictGetUInt8('clickhouse_cache', 'value', toUInt64(0)), dictHas('clickhouse_cache', toUInt64(1))")) == TSV( - "0\t1\n") + "0\t0\n") instance.query("SYSTEM RELOAD DICTIONARY clickhouse_cache") assert TSV(instance.query(