Add ability to preallocate hashtables for hashed/sparsehashed dictionaries

Preallocation can be used only when we know the number of rows, and for this we need: - source clickhouse - no filtering (i.e. lack of <where>), since filtering can filter out too many rows and eventually it may allocate memory that will never be used. For sparse_hash the difference is quite significant, a preallocated sparse_hash hashtable allocates ~33% faster (7.5 seconds vs 5 seconds for insert, and the difference is more significant for a higher number of elements): $ ninja bench-sparse_hash-run [1/1] cd /src/ch/hashtable-bench/.cmake && ...ch/hashtable-bench/.cmake/bench-sparse_hash sparse_hash/insert: 7.574 <!-- sparse_hash/find : 2.14426 sparse_hash/maxrss: 174MiB sparse_hash/time: 9710.51 msec (user+sys) $ time ninja bench-sparse_hash-preallocate-run [1/1] cd /src/ch/hashtable-bench/.cmake && ...-bench/.cmake/bench-sparse_hash-preallocate sparse_hash/insert: 5.0522 <!-- sparse_hash/find : 2.14024 sparse_hash/maxrss: 174MiB sparse_hash/time: 7192.06 msec (user+sys) P.S. the difference for a sparse_hashed dictionary with 4e9 elements (uint64, uint16) is ~18% (4975.905 vs 4103.569 sec) v2: do not reallocate the dictionary from the progress callback, since this would access the hashtable in parallel. v3: drop PREALLOCATE() and do this only for source=clickhouse and empty <where>
2024-11-21 07:01:59 +00:00 · 2020-09-24 08:01:00 +03:00 · 2020-09-24 08:01:00 +03:00 · 064f901ea8
commit 064f901ea8
parent f4f79aa84a
6 changed files with 169 additions and 0 deletions
--- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
+++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
@ -91,6 +91,8 @@ LAYOUT(FLAT())

 The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.

+The hash table will be preallocated (this makes the dictionary load faster) if the approximate total number of rows is known. This is supported only if the source is `clickhouse` without any `<where>` (since with `<where>` too many rows may be filtered out, and the dictionary would allocate memory that is never used).
+
 All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.

 Configuration example:
@ -111,6 +113,8 @@ LAYOUT(HASHED())

 Similar to `hashed`, but uses less memory in favor of more CPU usage.

+It will also be preallocated in the same way as `hashed`; note that preallocation is even more significant for `sparse_hashed`.
+
 Configuration example:

 ``` xml
--- a/src/Dictionaries/ClickHouseDictionarySource.h
+++ b/src/Dictionaries/ClickHouseDictionarySource.h
@ -48,6 +48,10 @@ public:

    std::string toString() const override;

+    /// Tells whether a non-empty WHERE condition is set for this source.
+    /// Consulted when deciding if the hashtable should be preallocated:
+    /// with a WHERE, too many rows can be filtered out.
+    bool hasWhere() const
+    {
+        return !where.empty();
+    }
+
 private:
    std::string getUpdateFieldAndDate();

--- a/src/Dictionaries/HashedDictionary.cpp
+++ b/src/Dictionaries/HashedDictionary.cpp
@ -2,6 +2,7 @@
 #include <ext/size.h>
 #include "DictionaryBlockInputStream.h"
 #include "DictionaryFactory.h"
+#include "ClickHouseDictionarySource.h"
 #include <Core/Defines.h>


@ -406,18 +407,130 @@ void HashedDictionary::updateData()
    }

    if (saved_block)
+    {
+        resize(saved_block->rows());
        blockToAttributes(*saved_block.get());
+    }
+}
+
+/// Reserves room in the hashtable of a single attribute for `added_rows`
+/// more elements, on top of the elements it already holds.
+///
+/// Which collection is touched depends on `sparse`: the regular one is
+/// grown with reserve(), the sparse one with resize() (presumably the
+/// capacity hint of the sparse hashtable -- confirm against its API).
+template <typename T>
+void HashedDictionary::resize(Attribute & attribute, size_t added_rows)
+{
+    if (sparse)
+    {
+        const auto & sparse_collection = std::get<SparseCollectionPtrType<T>>(attribute.sparse_maps);
+        sparse_collection->resize(sparse_collection->size() + added_rows);
+        return;
+    }
+
+    const auto & collection = std::get<CollectionPtrType<T>>(attribute.maps);
+    collection->reserve(collection->size() + added_rows);
+}
+void HashedDictionary::resize(size_t added_rows)
+{
+    if (!added_rows) /// nothing is going to be added, so there is nothing to reserve
+        return;
+    /// Forward the request to the typed overload that matches each attribute's underlying type.
+    for (auto & attribute : attributes)
+    {
+        switch (attribute.type)
+        {
+            case AttributeUnderlyingType::utUInt8:
+                resize<UInt8>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utUInt16:
+                resize<UInt16>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utUInt32:
+                resize<UInt32>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utUInt64:
+                resize<UInt64>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utUInt128:
+                resize<UInt128>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utInt8:
+                resize<Int8>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utInt16:
+                resize<Int16>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utInt32:
+                resize<Int32>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utInt64:
+                resize<Int64>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utFloat32:
+                resize<Float32>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utFloat64:
+                resize<Float64>(attribute, added_rows);
+                break;
+            /// Decimal attributes
+            case AttributeUnderlyingType::utDecimal32:
+                resize<Decimal32>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utDecimal64:
+                resize<Decimal64>(attribute, added_rows);
+                break;
+            case AttributeUnderlyingType::utDecimal128:
+                resize<Decimal128>(attribute, added_rows);
+                break;
+            /// String attributes use the StringRef instantiation
+            case AttributeUnderlyingType::utString:
+                resize<StringRef>(attribute, added_rows);
+                break;
+        }
+    }
 }

 void HashedDictionary::loadData()
 {
    if (!source_ptr->hasUpdateField())
    {
+        /// atomic since the progress callback is called in parallel
+        std::atomic<uint64_t> new_size = 0;
        auto stream = source_ptr->loadAll();
+
+        /// preallocation can be used only when we know the number of rows, for this we need:
+        /// - source clickhouse
+        /// - no filtering (i.e. lack of <where>), since filtering can filter out
+        ///   too many rows and eventually it may allocate memory that will
+        ///   never be used.
+        bool preallocate = false;
+        if (const auto & clickhouse_source = dynamic_cast<ClickHouseDictionarySource *>(source_ptr.get()))
+        {
+            if (!clickhouse_source->hasWhere())
+                preallocate = true;
+        }
+
+        if (preallocate)
+        {
+            stream->setProgressCallback([&new_size](const Progress & progress)
+            {
+                new_size += progress.total_rows_to_read;
+            });
+        }
+
        stream->readPrefix();

        while (const auto block = stream->read())
+        {
+            if (new_size)
+            {
+                size_t current_new_size = new_size.exchange(0);
+                if (current_new_size)
+                    resize(current_new_size);
+            }
+            else
+                resize(block.rows());
            blockToAttributes(block);
+        }

        stream->readSuffix();
    }
--- a/src/Dictionaries/HashedDictionary.h
+++ b/src/Dictionaries/HashedDictionary.h
@ -261,6 +261,14 @@ private:

    PaddedPODArray<Key> getIds() const;

+    /// Reserves room in the attribute hashtables for `added_rows` more
+    /// elements on top of those already stored (used while loading data).
+    /// The templated overload handles one attribute of underlying type T;
+    /// the other one applies it to every attribute of the dictionary.
+    template <typename T>
+    void resize(Attribute & attribute, size_t added_rows);
+    void resize(size_t added_rows);
+
    template <typename AttrType, typename ChildType, typename AncestorType>
    void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
    template <typename ChildType, typename AncestorType>
--- a/tests/queries/0_stateless/01509_dictionary_preallocate.reference
+++ b/tests/queries/0_stateless/01509_dictionary_preallocate.reference
@ -0,0 +1,4 @@
+CREATE DICTIONARY db_01509.dict\n(\n    `key` UInt64,\n    `value` String DEFAULT \'-\'\n)\nPRIMARY KEY key\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'data\' PASSWORD \'\' DB \'db_01509\'))\nLIFETIME(MIN 0 MAX 0)\nLAYOUT(SPARSE_HASHED())
+-
+0
+1000
--- a/tests/queries/0_stateless/01509_dictionary_preallocate.sql
+++ b/tests/queries/0_stateless/01509_dictionary_preallocate.sql
@ -0,0 +1,36 @@
+-- This test does not verify that preallocation itself works.
+-- It simply checks a SPARSE_HASHED dictionary with a bunch of dictGet() calls
+-- (since at the time of writing there was no such test).
+
+DROP DATABASE IF EXISTS db_01509;
+CREATE DATABASE db_01509;
+
+CREATE TABLE db_01509.data
+(
+  key   UInt64,
+  value String
+)
+ENGINE = MergeTree()
+ORDER BY key;
+INSERT INTO db_01509.data SELECT number key, toString(number) value FROM numbers(1000); -- keys 0..999
+
+DROP DICTIONARY IF EXISTS db_01509.dict;
+CREATE DICTIONARY db_01509.dict
+(
+  key   UInt64,
+  value String DEFAULT '-'
+)
+PRIMARY KEY key
+SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'data' PASSWORD '' DB 'db_01509'))
+LAYOUT(SPARSE_HASHED())
+LIFETIME(0);
+
+SHOW CREATE DICTIONARY db_01509.dict;
+
+SYSTEM RELOAD DICTIONARY db_01509.dict;
+
+SELECT dictGet('db_01509.dict', 'value', toUInt64(1e12)); -- key absent from the source table: expect the DEFAULT '-'
+SELECT dictGet('db_01509.dict', 'value', toUInt64(0)); -- key present: expect '0'
+SELECT count() FROM db_01509.dict; -- expect all 1000 rows to be loaded
+
+DROP DATABASE IF EXISTS db_01509;