Add ability to preallocate hashtables for hashed/sparsehashed dictionaries

preallocation can be used only when we know number of rows, and for this we need: - source clickhouse - no filtering (i.e. lack of <where>), since filtering can filter too much rows and eventually it may allocate memory that will never be used. For sparse_hash the difference is quite significant, preallocated sparse_hash hashtable allocates ~33% faster (7.5 seconds vs 5 seconds for insert, and the difference is more significant for higher number of elements): $ ninja bench-sparse_hash-run [1/1] cd /src/ch/hashtable-bench/.cmake && ...ch/hashtable-bench/.cmake/bench-sparse_hash sparse_hash/insert: 7.574 <!-- sparse_hash/find : 2.14426 sparse_hash/maxrss: 174MiB sparse_hash/time: 9710.51 msec (user+sys) $ time ninja bench-sparse_hash-preallocate-run [1/1] cd /src/ch/hashtable-bench/.cmake && ...-bench/.cmake/bench-sparse_hash-preallocate sparse_hash/insert: 5.0522 <!-- sparse_hash/find : 2.14024 sparse_hash/maxrss: 174MiB sparse_hash/time: 7192.06 msec (user+sys) P.S. the difference for sparse_hashed dictionary with 4e9 elements (uint64, uint16) is ~18% (4975.905 vs 4103.569 sec) v2: do not reallocate the dictionary from the progress callback Since this will access hashtable in parallel. v3: drop PREALLOCATE() and do this only for source=clickhouse and empty <where>
2024-11-24 00:22:29 +00:00 · 2020-09-24 08:01:00 +03:00 · 2020-09-24 08:01:00 +03:00 · 064f901ea8
commit 064f901ea8
parent f4f79aa84a
6 changed files with 169 additions and 0 deletions
--- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
+++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
@ -91,6 +91,8 @@ LAYOUT(FLAT())
 The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.
 The hash table will be preallocated (this makes the dictionary load faster) if the approximate total number of rows is known. This is supported only if the source is `clickhouse` without any `<where>` (since with `<where>` too many rows may be filtered out, and the dictionary would allocate memory that is never used).
 All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.
 Configuration example:
@ -111,6 +113,8 @@ LAYOUT(HASHED())
 Similar to `hashed`, but uses less memory at the cost of more CPU usage.
 It will also be preallocated in the same way as `hashed`; note that the effect is even more significant for `sparse_hashed`.
 Configuration example:
 ``` xml
--- a/src/Dictionaries/ClickHouseDictionarySource.h
+++ b/src/Dictionaries/ClickHouseDictionarySource.h
@ -48,6 +48,10 @@ public:
    std::string toString() const override;
    /// Tells whether this source was configured with a non-empty WHERE condition.
    /// Callers use it to decide whether the hashtable may be preallocated:
    /// a WHERE can filter out too many rows, which would make the row estimate useless.
    bool hasWhere() const
    {
        return !where.empty();
    }
 private:
    std::string getUpdateFieldAndDate();
--- a/src/Dictionaries/HashedDictionary.cpp
+++ b/src/Dictionaries/HashedDictionary.cpp
@ -2,6 +2,7 @@
 #include <ext/size.h>
 #include "DictionaryBlockInputStream.h"
 #include "DictionaryFactory.h"
 #include "ClickHouseDictionarySource.h"
 #include <Core/Defines.h>
@ -406,18 +407,130 @@ void HashedDictionary::updateData()
    }
    if (saved_block)
    {
        resize(saved_block->rows());
        blockToAttributes(*saved_block.get());
    }
 }
 /// Makes room in the hashtable of a single attribute for `added_rows` more
 /// elements on top of what it already holds.
 ///
 /// @param attribute  attribute whose map (dense or sparse, depending on `sparse`) is grown
 /// @param added_rows number of rows that are about to be inserted
 template <typename T>
 void HashedDictionary::resize(Attribute & attribute, size_t added_rows)
 {
    if (sparse)
    {
        /// The sparse container is asked to resize() to the target element count.
        const auto & sparse_map = std::get<SparseCollectionPtrType<T>>(attribute.sparse_maps);
        sparse_map->resize(sparse_map->size() + added_rows);
        return;
    }
    /// The regular container is asked to reserve() the target element count.
    const auto & dense_map = std::get<CollectionPtrType<T>>(attribute.maps);
    dense_map->reserve(dense_map->size() + added_rows);
 }
 void HashedDictionary::resize(size_t added_rows)
 {
    if (!added_rows)
        return;
    for (auto & attribute : attributes)
    {
        switch (attribute.type)
        {
            case AttributeUnderlyingType::utUInt8:
                resize<UInt8>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utUInt16:
                resize<UInt16>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utUInt32:
                resize<UInt32>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utUInt64:
                resize<UInt64>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utUInt128:
                resize<UInt128>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utInt8:
                resize<Int8>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utInt16:
                resize<Int16>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utInt32:
                resize<Int32>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utInt64:
                resize<Int64>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utFloat32:
                resize<Float32>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utFloat64:
                resize<Float64>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utDecimal32:
                resize<Decimal32>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utDecimal64:
                resize<Decimal64>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utDecimal128:
                resize<Decimal128>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utString:
                resize<StringRef>(attribute, added_rows);
                break;
        }
    }
 }
 void HashedDictionary::loadData()
 {
    if (!source_ptr->hasUpdateField())
    {
        /// atomic, since the progress callback is called in parallel
        std::atomic<uint64_t> new_size = 0;
        auto stream = source_ptr->loadAll();
        /// preallocation can be used only when we know number of rows, for this we need:
        /// - source clickhouse
        /// - no filtering (i.e. lack of <where>), since filtering can filter
        ///   too much rows and eventually it may allocate memory that will
        ///   never be used.
        bool preallocate = false;
        if (const auto & clickhouse_source = dynamic_cast<ClickHouseDictionarySource *>(source_ptr.get()))
        {
            if (!clickhouse_source->hasWhere())
                preallocate = true;
        }
        if (preallocate)
        {
            stream->setProgressCallback([&new_size](const Progress & progress)
            {
                new_size += progress.total_rows_to_read;
            });
        }
        stream->readPrefix();
        while (const auto block = stream->read())
        {
            if (new_size)
            {
                size_t current_new_size = new_size.exchange(0);
                if (current_new_size)
                    resize(current_new_size);
            }
            else
                resize(block.rows());
            blockToAttributes(block);
        }
        stream->readSuffix();
    }
--- a/src/Dictionaries/HashedDictionary.h
+++ b/src/Dictionaries/HashedDictionary.h
@ -261,6 +261,14 @@ private:
    PaddedPODArray<Key> getIds() const;
    /// Preallocates the hashtable for rows that are about to be inserted.
    /// loadData() sizes the request from the query progress when it is
    /// available and from the block size otherwise; updateData() sizes it
    /// from the saved block.
    ///
    /// This overload requests capacity for the current size plus `added_rows`
    /// in the map (dense or sparse) of a single attribute.
    ///
    /// @see loadData
    template <typename T>
    void resize(Attribute & attribute, size_t added_rows);
    /// Applies the per-attribute resize to every attribute;
    /// a no-op when `added_rows` is zero.
    void resize(size_t added_rows);
    template <typename AttrType, typename ChildType, typename AncestorType>
    void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
    template <typename ChildType, typename AncestorType>
--- a/tests/queries/0_stateless/01509_dictionary_preallocate.reference
+++ b/tests/queries/0_stateless/01509_dictionary_preallocate.reference
@ -0,0 +1,4 @@
 CREATE DICTIONARY db_01509.dict\n(\n    `key` UInt64,\n    `value` String DEFAULT \'-\'\n)\nPRIMARY KEY key\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'data\' PASSWORD \'\' DB \'db_01509\'))\nLIFETIME(MIN 0 MAX 0)\nLAYOUT(SPARSE_HASHED())
 -
 0
 1000
--- a/tests/queries/0_stateless/01509_dictionary_preallocate.sql
+++ b/tests/queries/0_stateless/01509_dictionary_preallocate.sql
@ -0,0 +1,36 @@
 -- This test does not verify that preallocation actually happens.
 -- It simply checks a SPARSE_HASHED dictionary with a bunch of dictGet() calls
 -- (since at the moment of writing there was no such test).
 DROP DATABASE IF EXISTS db_01509;
 CREATE DATABASE db_01509;
 -- Source table for the dictionary.
 CREATE TABLE db_01509.data
 (
  key   UInt64,
  value String
 )
 ENGINE = MergeTree()
 ORDER BY key;
 -- 1000 rows: keys 0..999, each value is the key rendered as a string.
 INSERT INTO db_01509.data SELECT number key, toString(number) value FROM numbers(1000);
 DROP DICTIONARY IF EXISTS db_01509.dict;
 -- ClickHouse source without a WHERE clause, i.e. the case in which
 -- the dictionary is allowed to preallocate its hashtable.
 CREATE DICTIONARY db_01509.dict
 (
  key   UInt64,
  value String DEFAULT '-'
 )
 PRIMARY KEY key
 SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'data' PASSWORD '' DB 'db_01509'))
 LAYOUT(SPARSE_HASHED())
 LIFETIME(0);
 SHOW CREATE DICTIONARY db_01509.dict;
 SYSTEM RELOAD DICTIONARY db_01509.dict;
 -- Key that is absent from the source table: the DEFAULT '-' is expected.
 SELECT dictGet('db_01509.dict', 'value', toUInt64(1e12));
 -- Key that is present in the source table.
 SELECT dictGet('db_01509.dict', 'value', toUInt64(0));
 -- All 1000 source rows must be visible through the dictionary.
 SELECT count() FROM db_01509.dict;
 DROP DATABASE IF EXISTS db_01509;
		`@ -0,0 +1,4 @@`
							CREATE DICTIONARY db_01509.dict\n(\n `key` UInt64,\n `value` String DEFAULT \'-\'\n)\nPRIMARY KEY key\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'data\' PASSWORD \'\' DB \'db_01509\'))\nLIFETIME(MIN 0 MAX 0)\nLAYOUT(SPARSE_HASHED())
							`-`
							`0`
							`1000`