Add ability to preallocate hashtables for hashed/sparsehashed dictionaries
Preallocation can be used only when we know the number of rows, and for this we need:

- source `clickhouse`
- no filtering (i.e. lack of `<where>`), since filtering can filter out too many rows, and eventually the dictionary may allocate memory that will never be used.

For sparse_hash the difference is quite significant: a preallocated sparse_hash hashtable is populated ~33% faster (7.5 seconds vs 5 seconds for insert, and the difference grows with the number of elements):

    $ ninja bench-sparse_hash-run
    [1/1] cd /src/ch/hashtable-bench/.cmake && ...ch/hashtable-bench/.cmake/bench-sparse_hash
    sparse_hash/insert: 7.574   <--
    sparse_hash/find  : 2.14426
    sparse_hash/maxrss: 174MiB
    sparse_hash/time  : 9710.51 msec (user+sys)

    $ time ninja bench-sparse_hash-preallocate-run
    [1/1] cd /src/ch/hashtable-bench/.cmake && ...-bench/.cmake/bench-sparse_hash-preallocate
    sparse_hash/insert: 5.0522  <--
    sparse_hash/find  : 2.14024
    sparse_hash/maxrss: 174MiB
    sparse_hash/time  : 7192.06 msec (user+sys)

P.S. the difference for a sparse_hashed dictionary with 4e9 elements (uint64, uint16) is ~18% (4975.905 vs 4103.569 sec)

v2: do not reallocate the dictionary from the progress callback, since this would access the hashtable in parallel.
v3: drop PREALLOCATE() and do this only for source=clickhouse and empty <where>
This commit is contained in: parent f4f79aa84a, commit 064f901ea8
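For a sense of what the benchmark above measures, here is a minimal, self-contained sketch of the same comparison: inserting into a `google::sparse_hash_map` with and without a preceding `resize()`. This is not the actual bench-sparse_hash harness (only its path appears above); it assumes the Google sparsehash library is installed, and the key pattern and element count are arbitrary.

```cpp
// Sketch of the insert benchmark above; NOT the real bench-sparse_hash
// harness. Assumes Google sparsehash (https://github.com/sparsehash/sparsehash).
#include <sparsehash/sparse_hash_map>
#include <chrono>
#include <cstdint>
#include <cstdio>

static double insert_seconds(bool preallocate, uint64_t n)
{
    google::sparse_hash_map<uint64_t, uint16_t> map;
    if (preallocate)
        map.resize(n); // sparsehash's resize() pre-sizes the table, like reserve()

    const auto start = std::chrono::steady_clock::now();
    for (uint64_t i = 0; i < n; ++i)
        map[i * 2654435761ULL] = static_cast<uint16_t>(i); // unique, scattered keys
    return std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
}

int main()
{
    const uint64_t n = 10'000'000;
    std::printf("plain      : %.3f sec\n", insert_seconds(false, n));
    std::printf("preallocate: %.3f sec\n", insert_seconds(true, n));
}
```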
@@ -91,6 +91,8 @@ LAYOUT(FLAT())

The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.

The hash table will be preallocated (this makes the dictionary load faster) if the approximate number of total rows is known. This is supported only if the source is `clickhouse` without any `<where>` clause (since with `<where>` the query can filter out too many rows, and the dictionary would allocate memory that is never used).

All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.

Configuration example:
@@ -111,6 +113,8 @@ LAYOUT(HASHED())

Similar to `hashed`, but uses less memory in favor of more CPU usage.

The hash table will also be preallocated, as for `hashed`; note that the speedup is even more significant for `sparse_hashed`.

Configuration example:

``` xml
```
@@ -48,6 +48,10 @@ public:

    std::string toString() const override;

    /// Used to detect whether the hashtable should be preallocated
    /// (since if there is a WHERE clause it can filter out too much)
    bool hasWhere() const { return !where.empty(); }

private:
    std::string getUpdateFieldAndDate();
@@ -2,6 +2,7 @@
#include <ext/size.h>
#include "DictionaryBlockInputStream.h"
#include "DictionaryFactory.h"
#include "ClickHouseDictionarySource.h"
#include <Core/Defines.h>
@@ -406,18 +407,130 @@ void HashedDictionary::updateData()
    }

    if (saved_block)
    {
        resize(saved_block->rows());
        blockToAttributes(*saved_block.get());
    }
}

template <typename T>
void HashedDictionary::resize(Attribute & attribute, size_t added_rows)
{
    if (!sparse)
    {
        const auto & map_ref = std::get<CollectionPtrType<T>>(attribute.maps);
        added_rows += map_ref->size();
        map_ref->reserve(added_rows);
    }
    else
    {
        const auto & map_ref = std::get<SparseCollectionPtrType<T>>(attribute.sparse_maps);
        added_rows += map_ref->size();
        map_ref->resize(added_rows);
    }
}

void HashedDictionary::resize(size_t added_rows)
{
    if (!added_rows)
        return;

    for (auto & attribute : attributes)
    {
        switch (attribute.type)
        {
            case AttributeUnderlyingType::utUInt8:
                resize<UInt8>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utUInt16:
                resize<UInt16>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utUInt32:
                resize<UInt32>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utUInt64:
                resize<UInt64>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utUInt128:
                resize<UInt128>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utInt8:
                resize<Int8>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utInt16:
                resize<Int16>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utInt32:
                resize<Int32>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utInt64:
                resize<Int64>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utFloat32:
                resize<Float32>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utFloat64:
                resize<Float64>(attribute, added_rows);
                break;

            case AttributeUnderlyingType::utDecimal32:
                resize<Decimal32>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utDecimal64:
                resize<Decimal64>(attribute, added_rows);
                break;
            case AttributeUnderlyingType::utDecimal128:
                resize<Decimal128>(attribute, added_rows);
                break;

            case AttributeUnderlyingType::utString:
                resize<StringRef>(attribute, added_rows);
                break;
        }
    }
}

void HashedDictionary::loadData()
{
    if (!source_ptr->hasUpdateField())
    {
        /// Atomic, since the progress callback is called in parallel.
        std::atomic<uint64_t> new_size = 0;
        auto stream = source_ptr->loadAll();

        /// Preallocation can be used only when we know the number of rows; for this we need:
        /// - source clickhouse
        /// - no filtering (i.e. lack of <where>), since filtering can filter
        ///   out too many rows and eventually it may allocate memory that will
        ///   never be used.
        bool preallocate = false;
        if (const auto & clickhouse_source = dynamic_cast<ClickHouseDictionarySource *>(source_ptr.get()))
        {
            if (!clickhouse_source->hasWhere())
                preallocate = true;
        }

        if (preallocate)
        {
            stream->setProgressCallback([&new_size](const Progress & progress)
            {
                new_size += progress.total_rows_to_read;
            });
        }

        stream->readPrefix();

        while (const auto block = stream->read())
        {
            if (new_size)
            {
                size_t current_new_size = new_size.exchange(0);
                if (current_new_size)
                    resize(current_new_size);
            }
            else
                resize(block.rows());

            blockToAttributes(block);
        }

        stream->readSuffix();
    }
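The concurrency detail the v2 note in the commit message refers to is visible above: the progress callback may fire on the stream's reading threads, so `loadData()` only accumulates the announced row count into an `std::atomic` there, and `resize()` itself runs on the single thread draining the stream, which claims the pending count with `exchange(0)`. Below is a self-contained sketch of that handoff pattern, with `std::unordered_map` and `std::thread` standing in for the dictionary hash tables and the source's reading threads (all names and sizes are illustrative, not ClickHouse APIs):

```cpp
// Sketch of the pattern in loadData() above: producer threads announce
// expected row counts; only the consumer thread touches (and grows)
// the hash table.
#include <atomic>
#include <cstdint>
#include <thread>
#include <unordered_map>

int main()
{
    std::atomic<uint64_t> new_size{0};          // written by "progress callbacks"
    std::unordered_map<uint64_t, uint64_t> map; // touched only by the consumer

    // "Progress callback": may fire from another thread, so it only
    // accumulates the announced total and never touches the map.
    std::thread producer([&] { new_size += 1'000'000; });

    // Consumer loop (single thread): claim the announced size with
    // exchange(0) so each announcement is applied exactly once.
    for (int block = 0; block < 10; ++block)
    {
        if (uint64_t announced = new_size.exchange(0))
            map.reserve(map.size() + announced);

        for (uint64_t i = 0; i < 1000; ++i) // blockToAttributes() stand-in
            map.emplace(block * 1000 + i, i);
    }

    producer.join();
}
```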
@@ -261,6 +261,14 @@ private:

    PaddedPODArray<Key> getIds() const;

    /// Preallocates the hashtable based on query progress
    /// (only while loading all data).
    ///
    /// @see preallocate
    template <typename T>
    void resize(Attribute & attribute, size_t added_rows);
    void resize(size_t added_rows);

    template <typename AttrType, typename ChildType, typename AncestorType>
    void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
    template <typename ChildType, typename AncestorType>
@@ -0,0 +1,4 @@
CREATE DICTIONARY db_01509.dict\n(\n `key` UInt64,\n `value` String DEFAULT \'-\'\n)\nPRIMARY KEY key\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'data\' PASSWORD \'\' DB \'db_01509\'))\nLIFETIME(MIN 0 MAX 0)\nLAYOUT(SPARSE_HASHED())
-
0
1000
tests/queries/0_stateless/01509_dictionary_preallocate.sql (new file, 36 lines)
@@ -0,0 +1,36 @@
-- The test itself does not check whether preallocation works.
-- It simply checks a SPARSE_HASHED dictionary with a bunch of dictGet() calls
-- (since at the moment of writing there was no such test).

DROP DATABASE IF EXISTS db_01509;
CREATE DATABASE db_01509;

CREATE TABLE db_01509.data
(
    key UInt64,
    value String
)
ENGINE = MergeTree()
ORDER BY key;
INSERT INTO db_01509.data SELECT number key, toString(number) value FROM numbers(1000);

DROP DICTIONARY IF EXISTS db_01509.dict;
CREATE DICTIONARY db_01509.dict
(
    key UInt64,
    value String DEFAULT '-'
)
PRIMARY KEY key
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'data' PASSWORD '' DB 'db_01509'))
LAYOUT(SPARSE_HASHED())
LIFETIME(0);

SHOW CREATE DICTIONARY db_01509.dict;

SYSTEM RELOAD DICTIONARY db_01509.dict;

SELECT dictGet('db_01509.dict', 'value', toUInt64(1e12));
SELECT dictGet('db_01509.dict', 'value', toUInt64(0));
SELECT count() FROM db_01509.dict;

DROP DATABASE IF EXISTS db_01509;