Merge pull request #15454 from azat/dict-prealloc

Speedup hashed/sparse_hashed dictionary loading by preallocating the hash table
This commit is contained in:
Nikolai Kochetov 2020-10-15 15:03:53 +03:00 committed by GitHub
commit 963ce315b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 175 additions and 0 deletions

View File

@ -91,6 +91,8 @@ LAYOUT(FLAT())
The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.
The hash table will be preallocated (this makes dictionary loading faster) if the approximate total number of rows is known. This is supported only if the source is `clickhouse` without any `<where>` (since a `<where>` clause can filter out too many rows, and the dictionary would then allocate memory that is never used).
All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.
Configuration example:
@ -111,6 +113,8 @@ LAYOUT(HASHED())
Similar to `hashed`, but uses less memory in favor of more CPU usage.
It will also be preallocated, the same as `hashed`; note that preallocation is even more significant for `sparse_hashed`.
Configuration example:
``` xml

View File

@ -850,6 +850,11 @@ protected:
public:
/// Reserve space for at least `num_elements` elements before bulk insertion,
/// to avoid repeated rehashing while loading.
/// Delegates to resize(); presumably resize() only grows the buffer when the
/// current capacity is insufficient — TODO confirm against resize() semantics.
void reserve(size_t num_elements)
{
resize(num_elements);
}
/// Insert a value. In the case of any more complex values, it is better to use the `emplace` function.
std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type & x)
{

View File

@ -48,6 +48,10 @@ public:
std::string toString() const override;
/// Returns true if the source query carries a <where> clause.
/// Used to decide whether the dictionary hash table should be preallocated:
/// with a WHERE filter the reported total row count may be much larger than
/// the number of rows actually loaded, so preallocation is skipped.
bool hasWhere() const { return !where.empty(); }
private:
std::string getUpdateFieldAndDate();

View File

@ -2,6 +2,7 @@
#include <ext/size.h>
#include "DictionaryBlockInputStream.h"
#include "DictionaryFactory.h"
#include "ClickHouseDictionarySource.h"
#include <Core/Defines.h>
@ -406,18 +407,130 @@ void HashedDictionary::updateData()
}
if (saved_block)
{
resize(saved_block->rows());
blockToAttributes(*saved_block.get());
}
}
/// Grow the per-attribute storage so it can hold `added_rows` more entries
/// on top of what is already stored. For the dense layout this reserves
/// capacity in the hash table; for the sparse layout it calls resize() on the
/// sparse map, which is that container's preallocation primitive.
template <typename T>
void HashedDictionary::resize(Attribute & attribute, size_t added_rows)
{
    if (sparse)
    {
        const auto & collection = std::get<SparseCollectionPtrType<T>>(attribute.sparse_maps);
        collection->resize(collection->size() + added_rows);
    }
    else
    {
        const auto & collection = std::get<CollectionPtrType<T>>(attribute.maps);
        collection->reserve(collection->size() + added_rows);
    }
}
/// Preallocate every attribute's storage for `added_rows` additional rows.
/// A zero estimate is a no-op so callers can pass an unknown/absent count.
void HashedDictionary::resize(size_t added_rows)
{
    if (!added_rows)
        return;

    for (auto & attribute : attributes)
    {
        /// Dispatch on the attribute's underlying type: each type has its own
        /// concrete map inside the Attribute, so the typed overload is needed.
        switch (attribute.type)
        {
            case AttributeUnderlyingType::utUInt8:      resize<UInt8>(attribute, added_rows); break;
            case AttributeUnderlyingType::utUInt16:     resize<UInt16>(attribute, added_rows); break;
            case AttributeUnderlyingType::utUInt32:     resize<UInt32>(attribute, added_rows); break;
            case AttributeUnderlyingType::utUInt64:     resize<UInt64>(attribute, added_rows); break;
            case AttributeUnderlyingType::utUInt128:    resize<UInt128>(attribute, added_rows); break;
            case AttributeUnderlyingType::utInt8:       resize<Int8>(attribute, added_rows); break;
            case AttributeUnderlyingType::utInt16:      resize<Int16>(attribute, added_rows); break;
            case AttributeUnderlyingType::utInt32:      resize<Int32>(attribute, added_rows); break;
            case AttributeUnderlyingType::utInt64:      resize<Int64>(attribute, added_rows); break;
            case AttributeUnderlyingType::utFloat32:    resize<Float32>(attribute, added_rows); break;
            case AttributeUnderlyingType::utFloat64:    resize<Float64>(attribute, added_rows); break;
            case AttributeUnderlyingType::utDecimal32:  resize<Decimal32>(attribute, added_rows); break;
            case AttributeUnderlyingType::utDecimal64:  resize<Decimal64>(attribute, added_rows); break;
            case AttributeUnderlyingType::utDecimal128: resize<Decimal128>(attribute, added_rows); break;
            case AttributeUnderlyingType::utString:     resize<StringRef>(attribute, added_rows); break;
        }
    }
}
/// Load the dictionary contents from the source, preallocating the hash
/// tables when the total row count can be learned from query progress.
void HashedDictionary::loadData()
{
if (!source_ptr->hasUpdateField())
{
/// Atomic since the progress callback may be called in parallel with the
/// reading loop below.
std::atomic<uint64_t> new_size = 0;
auto stream = source_ptr->loadAll();
/// Preallocation can be used only when we know the number of rows; for this we need:
/// - a clickhouse source,
/// - no filtering (i.e. lack of <where>), since filtering can filter out
///   too many rows, and then the preallocation would claim memory that is
///   never used.
bool preallocate = false;
if (const auto & clickhouse_source = dynamic_cast<ClickHouseDictionarySource *>(source_ptr.get()))
{
if (!clickhouse_source->hasWhere())
preallocate = true;
}
if (preallocate)
{
/// total_rows_to_read accumulates the source's estimate of the full result size.
stream->setProgressCallback([&new_size](const Progress & progress)
{
new_size += progress.total_rows_to_read;
});
}
stream->readPrefix();
while (const auto block = stream->read())
{
if (new_size)
{
/// Consume the accumulated estimate exactly once and preallocate for it;
/// exchange(0) ensures later iterations fall back to per-block growth.
size_t current_new_size = new_size.exchange(0);
if (current_new_size)
resize(current_new_size);
}
else
/// No estimate available — grow incrementally by the size of each block.
resize(block.rows());
blockToAttributes(block);
}
stream->readSuffix();
}

View File

@ -261,6 +261,14 @@ private:
PaddedPODArray<Key> getIds() const;
/// Preallocates the hashtable based on query progress
/// (Only while loading all data).
///
/// @see preallocate
template <typename T>
void resize(Attribute & attribute, size_t added_rows);
void resize(size_t added_rows);
template <typename AttrType, typename ChildType, typename AncestorType>
void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
template <typename ChildType, typename AncestorType>

View File

@ -0,0 +1,4 @@
CREATE DICTIONARY db_01509.dict\n(\n `key` UInt64,\n `value` String DEFAULT \'-\'\n)\nPRIMARY KEY key\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'data\' PASSWORD \'\' DB \'db_01509\'))\nLIFETIME(MIN 0 MAX 0)\nLAYOUT(SPARSE_HASHED())
-
0
1000

View File

@ -0,0 +1,36 @@
-- The test itself does not check whether preallocation works.
-- It simply exercises a SPARSE_HASHED dictionary with a bunch of dictGet() calls
-- (since at the moment of writing there was no such test).
DROP DATABASE IF EXISTS db_01509;
CREATE DATABASE db_01509;
CREATE TABLE db_01509.data
(
key UInt64,
value String
)
ENGINE = MergeTree()
ORDER BY key;
INSERT INTO db_01509.data SELECT number key, toString(number) value FROM numbers(1000);
DROP DICTIONARY IF EXISTS db_01509.dict;
CREATE DICTIONARY db_01509.dict
(
key UInt64,
value String DEFAULT '-'
)
PRIMARY KEY key
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'data' PASSWORD '' DB 'db_01509'))
LAYOUT(SPARSE_HASHED())
LIFETIME(0);
SHOW CREATE DICTIONARY db_01509.dict;
SYSTEM RELOAD DICTIONARY db_01509.dict;
-- Miss (key outside the loaded range) must return the '-' default.
SELECT dictGet('db_01509.dict', 'value', toUInt64(1e12));
-- Hit on an existing key.
SELECT dictGet('db_01509.dict', 'value', toUInt64(0));
SELECT count() FROM db_01509.dict;
DROP DATABASE IF EXISTS db_01509;

View File

@ -151,3 +151,4 @@
00510_materizlized_view_and_deduplication_zookeeper
00738_lock_for_inner_table
01515_force_data_skipping_indices
01509_dictionary_preallocate