Add ability to preallocate hashtables for hashed/sparsehashed dictionaries

Preallocation can be used only when we know the number of rows, and for
this we need:
- source clickhouse
- no filtering (i.e. lack of <where>), since filtering can filter out
  too many rows, and the dictionary may eventually allocate memory that
  will never be used.

For sparse_hash the difference is quite significant, preallocated
sparse_hash hashtable allocates ~33% faster (7.5 seconds vs 5 seconds
for insert, and the difference is more significant for higher number of
elements):

    $ ninja bench-sparse_hash-run
    [1/1] cd /src/ch/hashtable-bench/.cmake && ...ch/hashtable-bench/.cmake/bench-sparse_hash
    sparse_hash/insert: 7.574 <!--
    sparse_hash/find  : 2.14426
    sparse_hash/maxrss: 174MiB
    sparse_hash/time:   9710.51 msec (user+sys)

    $ time ninja bench-sparse_hash-preallocate-run
    [1/1] cd /src/ch/hashtable-bench/.cmake && ...-bench/.cmake/bench-sparse_hash-preallocate
    sparse_hash/insert: 5.0522 <!--
    sparse_hash/find  : 2.14024
    sparse_hash/maxrss: 174MiB
    sparse_hash/time:   7192.06 msec (user+sys)

P.S. the difference for sparse_hashed dictionary with 4e9 elements
(uint64, uint16) is ~18% (4975.905 vs 4103.569 sec)

v2: do not reallocate the dictionary from the progress callback
    Since this will access hashtable in parallel.
v3: drop PREALLOCATE() and do this only for source=clickhouse and empty
    <where>
This commit is contained in:
Azat Khuzhin 2020-09-24 08:01:00 +03:00
parent f4f79aa84a
commit 064f901ea8
6 changed files with 169 additions and 0 deletions

View File

@ -91,6 +91,8 @@ LAYOUT(FLAT())
The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.
The hash table will be preallocated (this makes the dictionary load faster) if the approximate total number of rows is known. This is supported only if the source is `clickhouse` without any `<where>` (since with `<where>` too many rows can be filtered out, and the dictionary would allocate memory that is never used).
All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.
Configuration example:
@ -111,6 +113,8 @@ LAYOUT(HASHED())
Similar to `hashed`, but uses less memory in favor of more CPU usage.
It will also be preallocated in the same way as `hashed`; note that preallocation is even more significant for `sparse_hashed`.
Configuration example:
``` xml

View File

@ -48,6 +48,10 @@ public:
std::string toString() const override;
/// Tells whether this source was configured with a non-empty WHERE condition.
/// Used to decide whether the hashtable should be preallocated: with a WHERE
/// the source can filter out too many rows, so preallocation is not wanted.
bool hasWhere() const
{
    return !where.empty();
}
private:
std::string getUpdateFieldAndDate();

View File

@ -2,6 +2,7 @@
#include <ext/size.h>
#include "DictionaryBlockInputStream.h"
#include "DictionaryFactory.h"
#include "ClickHouseDictionarySource.h"
#include <Core/Defines.h>
@ -406,18 +407,130 @@ void HashedDictionary::updateData()
}
if (saved_block)
{
resize(saved_block->rows());
blockToAttributes(*saved_block.get());
}
}
/// Makes room in the hashtable of a single attribute for `added_rows` more
/// elements on top of the elements it already holds.
///
/// @param attribute   attribute whose hashtable (regular or sparse, depending
///                    on the `sparse` flag of the dictionary) is grown
/// @param added_rows  number of rows that are about to be inserted
template <typename T>
void HashedDictionary::resize(Attribute & attribute, size_t added_rows)
{
    if (sparse)
    {
        /// NOTE(review): resize() on the sparse hashtable is presumably a size hint
        /// (not a truncation) -- verify against SparseCollectionType.
        const auto & sparse_map = std::get<SparseCollectionPtrType<T>>(attribute.sparse_maps);
        sparse_map->resize(sparse_map->size() + added_rows);
        return;
    }

    const auto & map = std::get<CollectionPtrType<T>>(attribute.maps);
    map->reserve(map->size() + added_rows);
}
void HashedDictionary::resize(size_t added_rows)
{
if (!added_rows)
return;
for (auto & attribute : attributes)
{
switch (attribute.type)
{
case AttributeUnderlyingType::utUInt8:
resize<UInt8>(attribute, added_rows);
break;
case AttributeUnderlyingType::utUInt16:
resize<UInt16>(attribute, added_rows);
break;
case AttributeUnderlyingType::utUInt32:
resize<UInt32>(attribute, added_rows);
break;
case AttributeUnderlyingType::utUInt64:
resize<UInt64>(attribute, added_rows);
break;
case AttributeUnderlyingType::utUInt128:
resize<UInt128>(attribute, added_rows);
break;
case AttributeUnderlyingType::utInt8:
resize<Int8>(attribute, added_rows);
break;
case AttributeUnderlyingType::utInt16:
resize<Int16>(attribute, added_rows);
break;
case AttributeUnderlyingType::utInt32:
resize<Int32>(attribute, added_rows);
break;
case AttributeUnderlyingType::utInt64:
resize<Int64>(attribute, added_rows);
break;
case AttributeUnderlyingType::utFloat32:
resize<Float32>(attribute, added_rows);
break;
case AttributeUnderlyingType::utFloat64:
resize<Float64>(attribute, added_rows);
break;
case AttributeUnderlyingType::utDecimal32:
resize<Decimal32>(attribute, added_rows);
break;
case AttributeUnderlyingType::utDecimal64:
resize<Decimal64>(attribute, added_rows);
break;
case AttributeUnderlyingType::utDecimal128:
resize<Decimal128>(attribute, added_rows);
break;
case AttributeUnderlyingType::utString:
resize<StringRef>(attribute, added_rows);
break;
}
}
}
void HashedDictionary::loadData()
{
if (!source_ptr->hasUpdateField())
{
/// atomic since the progress callback is called in parallel
std::atomic<uint64_t> new_size = 0;
auto stream = source_ptr->loadAll();
/// preallocation can be used only when we know number of rows, for this we need:
/// - source clickhouse
/// - no filtering (i.e. lack of <where>), since filtering can filter
/// too much rows and eventually it may allocate memory that will
/// never be used.
bool preallocate = false;
if (const auto & clickhouse_source = dynamic_cast<ClickHouseDictionarySource *>(source_ptr.get()))
{
if (!clickhouse_source->hasWhere())
preallocate = true;
}
if (preallocate)
{
stream->setProgressCallback([&new_size](const Progress & progress)
{
new_size += progress.total_rows_to_read;
});
}
stream->readPrefix();
while (const auto block = stream->read())
{
if (new_size)
{
size_t current_new_size = new_size.exchange(0);
if (current_new_size)
resize(current_new_size);
}
else
resize(block.rows());
blockToAttributes(block);
}
stream->readSuffix();
}

View File

@ -261,6 +261,14 @@ private:
PaddedPODArray<Key> getIds() const;
/// Preallocates the hashtable based on query progress
/// (Only while loading all data).
///
/// @see preallocate
template <typename T>
void resize(Attribute & attribute, size_t added_rows);
void resize(size_t added_rows);
template <typename AttrType, typename ChildType, typename AncestorType>
void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
template <typename ChildType, typename AncestorType>

View File

@ -0,0 +1,4 @@
CREATE DICTIONARY db_01509.dict\n(\n `key` UInt64,\n `value` String DEFAULT \'-\'\n)\nPRIMARY KEY key\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'data\' PASSWORD \'\' DB \'db_01509\'))\nLIFETIME(MIN 0 MAX 0)\nLAYOUT(SPARSE_HASHED())
-
0
1000

View File

@ -0,0 +1,36 @@
-- The test itself does not check whether preallocation works.
-- It simply checks a SPARSE_HASHED dictionary with a bunch of dictGet() calls
-- (since at the time of writing there was no such test).
-- Start from a clean database.
DROP DATABASE IF EXISTS db_01509;
CREATE DATABASE db_01509;
-- Source table for the dictionary.
CREATE TABLE db_01509.data
(
key UInt64,
value String
)
ENGINE = MergeTree()
ORDER BY key;
-- 1000 rows; each value is the string form of its key.
INSERT INTO db_01509.data SELECT number key, toString(number) value FROM numbers(1000);
-- Dictionary over a ClickHouse source without any WHERE condition.
DROP DICTIONARY IF EXISTS db_01509.dict;
CREATE DICTIONARY db_01509.dict
(
key UInt64,
value String DEFAULT '-'
)
PRIMARY KEY key
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'data' PASSWORD '' DB 'db_01509'))
LAYOUT(SPARSE_HASHED())
LIFETIME(0);
SHOW CREATE DICTIONARY db_01509.dict;
SYSTEM RELOAD DICTIONARY db_01509.dict;
-- Key that was never inserted: the reference output expects the default '-'.
SELECT dictGet('db_01509.dict', 'value', toUInt64(1e12));
-- Key that was inserted: the reference output expects '0'.
SELECT dictGet('db_01509.dict', 'value', toUInt64(0));
-- All inserted rows must be visible through the dictionary (reference: 1000).
SELECT count() FROM db_01509.dict;
DROP DATABASE IF EXISTS db_01509;