diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index bbf45a577e0..ae2de0ba6f8 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -91,6 +91,8 @@ LAYOUT(FLAT()) The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers In practice, the number of keys can reach tens of millions of items. +The hash table will be preallocated (this will make the dictionary load faster) if the approximate number of total rows is known; this is supported only if the source is `clickhouse` without any `<where>` clause (since in case of `<where>` you can filter out too many rows and the dictionary would allocate too much memory that will never be used). + All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety. Configuration example: ``` xml @@ -111,6 +113,8 @@ LAYOUT(HASHED()) Similar to `hashed`, but uses less memory in favor more CPU usage. +It will also be preallocated, the same as `hashed`; note that this is even more significant for `sparse_hashed`. + Configuration example: ``` xml diff --git a/src/Common/HashTable/HashTable.h b/src/Common/HashTable/HashTable.h index baad5d40764..897c84fe951 100644 --- a/src/Common/HashTable/HashTable.h +++ b/src/Common/HashTable/HashTable.h @@ -850,6 +850,11 @@ protected: public: + void reserve(size_t num_elements) + { + resize(num_elements); + } + /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function.
std::pair ALWAYS_INLINE insert(const value_type & x) { diff --git a/src/Dictionaries/ClickHouseDictionarySource.h b/src/Dictionaries/ClickHouseDictionarySource.h index 276f6916ada..8017d458b7e 100644 --- a/src/Dictionaries/ClickHouseDictionarySource.h +++ b/src/Dictionaries/ClickHouseDictionarySource.h @@ -48,6 +48,10 @@ public: std::string toString() const override; + /// Used to detect whether the hashtable should be preallocated + /// (since if there is a WHERE clause it can filter out too many rows) + bool hasWhere() const { return !where.empty(); } + private: std::string getUpdateFieldAndDate(); diff --git a/src/Dictionaries/HashedDictionary.cpp b/src/Dictionaries/HashedDictionary.cpp index 85456e8c61c..1439036bf8d 100644 --- a/src/Dictionaries/HashedDictionary.cpp +++ b/src/Dictionaries/HashedDictionary.cpp @@ -2,6 +2,7 @@ #include #include "DictionaryBlockInputStream.h" #include "DictionaryFactory.h" +#include "ClickHouseDictionarySource.h" #include @@ -406,18 +407,130 @@ void HashedDictionary::updateData() } if (saved_block) + { + resize(saved_block->rows()); blockToAttributes(*saved_block.get()); + } +} + +template +void HashedDictionary::resize(Attribute & attribute, size_t added_rows) +{ + if (!sparse) + { + const auto & map_ref = std::get>(attribute.maps); + added_rows += map_ref->size(); + map_ref->reserve(added_rows); + } + else + { + const auto & map_ref = std::get>(attribute.sparse_maps); + added_rows += map_ref->size(); + map_ref->resize(added_rows); + } +} +void HashedDictionary::resize(size_t added_rows) +{ + if (!added_rows) + return; + + for (auto & attribute : attributes) + { + switch (attribute.type) + { + case AttributeUnderlyingType::utUInt8: + resize(attribute, added_rows); + break; + case AttributeUnderlyingType::utUInt16: + resize(attribute, added_rows); + break; + case AttributeUnderlyingType::utUInt32: + resize(attribute, added_rows); + break; + case AttributeUnderlyingType::utUInt64: + resize(attribute, added_rows); + break; + case
AttributeUnderlyingType::utUInt128: + resize(attribute, added_rows); + break; + case AttributeUnderlyingType::utInt8: + resize(attribute, added_rows); + break; + case AttributeUnderlyingType::utInt16: + resize(attribute, added_rows); + break; + case AttributeUnderlyingType::utInt32: + resize(attribute, added_rows); + break; + case AttributeUnderlyingType::utInt64: + resize(attribute, added_rows); + break; + case AttributeUnderlyingType::utFloat32: + resize(attribute, added_rows); + break; + case AttributeUnderlyingType::utFloat64: + resize(attribute, added_rows); + break; + + case AttributeUnderlyingType::utDecimal32: + resize(attribute, added_rows); + break; + case AttributeUnderlyingType::utDecimal64: + resize(attribute, added_rows); + break; + case AttributeUnderlyingType::utDecimal128: + resize(attribute, added_rows); + break; + + case AttributeUnderlyingType::utString: + resize(attribute, added_rows); + break; + } + } } void HashedDictionary::loadData() { if (!source_ptr->hasUpdateField()) { + /// atomic since the progress callback is called in parallel + std::atomic new_size = 0; auto stream = source_ptr->loadAll(); + + /// preallocation can be used only when we know the number of rows, for this we need: + /// - source clickhouse + /// - no filtering (i.e. lack of a WHERE clause), since filtering can filter + /// out too many rows and eventually it may allocate memory that will + /// never be used.
+ bool preallocate = false; + if (const auto & clickhouse_source = dynamic_cast(source_ptr.get())) + { + if (!clickhouse_source->hasWhere()) + preallocate = true; + } + + if (preallocate) + { + stream->setProgressCallback([&new_size](const Progress & progress) + { + new_size += progress.total_rows_to_read; + }); + } + stream->readPrefix(); while (const auto block = stream->read()) + { + if (new_size) + { + size_t current_new_size = new_size.exchange(0); + if (current_new_size) + resize(current_new_size); + } + else + resize(block.rows()); blockToAttributes(block); + } stream->readSuffix(); } diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index fd6b93b6d5c..b9c4ab138b8 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -261,6 +261,14 @@ private: PaddedPODArray getIds() const; + /// Preallocates the hashtable based on query progress + /// (Only while loading all data). + /// + /// @see preallocate + template + void resize(Attribute & attribute, size_t added_rows); + void resize(size_t added_rows); + template void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; template diff --git a/tests/queries/0_stateless/01509_dictionary_preallocate.reference b/tests/queries/0_stateless/01509_dictionary_preallocate.reference new file mode 100644 index 00000000000..9f0e494f583 --- /dev/null +++ b/tests/queries/0_stateless/01509_dictionary_preallocate.reference @@ -0,0 +1,4 @@ +CREATE DICTIONARY db_01509.dict\n(\n `key` UInt64,\n `value` String DEFAULT \'-\'\n)\nPRIMARY KEY key\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'data\' PASSWORD \'\' DB \'db_01509\'))\nLIFETIME(MIN 0 MAX 0)\nLAYOUT(SPARSE_HASHED()) +- +0 +1000 diff --git a/tests/queries/0_stateless/01509_dictionary_preallocate.sql b/tests/queries/0_stateless/01509_dictionary_preallocate.sql new file mode 100644 index 
00000000000..a3c407ddae5 --- /dev/null +++ b/tests/queries/0_stateless/01509_dictionary_preallocate.sql @@ -0,0 +1,36 @@ +-- The test itself does not check whether preallocation works +-- It simply checks a SPARSE_HASHED dictionary with a bunch of dictGet() +-- (since at the moment of writing there was no such test) + +DROP DATABASE IF EXISTS db_01509; +CREATE DATABASE db_01509; + +CREATE TABLE db_01509.data +( + key UInt64, + value String +) +ENGINE = MergeTree() +ORDER BY key; +INSERT INTO db_01509.data SELECT number key, toString(number) value FROM numbers(1000); + +DROP DICTIONARY IF EXISTS db_01509.dict; +CREATE DICTIONARY db_01509.dict +( + key UInt64, + value String DEFAULT '-' +) +PRIMARY KEY key +SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'data' PASSWORD '' DB 'db_01509')) +LAYOUT(SPARSE_HASHED()) +LIFETIME(0); + +SHOW CREATE DICTIONARY db_01509.dict; + +SYSTEM RELOAD DICTIONARY db_01509.dict; + +SELECT dictGet('db_01509.dict', 'value', toUInt64(1e12)); +SELECT dictGet('db_01509.dict', 'value', toUInt64(0)); +SELECT count() FROM db_01509.dict; + +DROP DATABASE IF EXISTS db_01509; diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt index ec8dbf7a6d4..58b353fc491 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -151,3 +151,4 @@ 00510_materizlized_view_and_deduplication_zookeeper 00738_lock_for_inner_table 01515_force_data_skipping_indices +01509_dictionary_preallocate