Add new dictionary layout (sparse_hashed) that is more memory efficient

With this new layout, sparsehash will be used instead of the default HashMap.
sparsehash is more memory efficient, but it is also slower.

So in a nutshell:
- HashMap uses ~2x more memory than sparse_hash_map
- HashMap is ~2-2.5x faster than sparse_hash_map
(tested on lots of inputs; the closest to production was a
dictionary with 600KK hashes and UInt16 as the value)

TODO:
- fix allocated memory calculation
- getBufferSizeInBytes/getBufferSizeInCells interface
- benchmarks

v0: replace HashMap with google::sparse_hash_map
v2: use google::sparse_hash_map only when <sparse> is set to true
v3: replace attributes with different layout
v4: use ch hash over std::hash
This commit is contained in:
Azat Khuzhin 2019-09-21 02:22:40 +03:00
parent 1fe79ad43c
commit 420089c301
No known key found for this signature in database
GPG Key ID: B86086848EF8686D
8 changed files with 242 additions and 27 deletions

View File

@ -40,3 +40,5 @@ if(USE_POCO_MONGODB)
endif()
add_subdirectory(Embedded)
target_include_directories(clickhouse_dictionaries SYSTEM PRIVATE ${SPARSEHASH_INCLUDE_DIR})

View File

@ -3,6 +3,23 @@
#include "DictionaryBlockInputStream.h"
#include "DictionaryFactory.h"
namespace
{

/// Uniform key/value accessors over the two kinds of map cells used below.
/// Each overload is constrained through its trailing return type, so only the
/// one whose member expression is well-formed for the cell type is viable.

/// Cells with public `first` / `second` data members (google::sparse_hash_map).
template <typename Cell>
auto first(const Cell & cell) -> decltype(cell.first)
{
    return cell.first;
}

template <typename Cell>
auto second(const Cell & cell) -> decltype(cell.second)
{
    return cell.second;
}

/// Cells with `getFirst()` / `getSecond()` member functions (HashMap).
template <typename Cell>
auto first(const Cell & cell) -> decltype(cell.getFirst())
{
    return cell.getFirst();
}

template <typename Cell>
auto second(const Cell & cell) -> decltype(cell.getSecond())
{
    return cell.getSecond();
}

}
namespace DB
{
namespace ErrorCodes
@ -21,12 +38,14 @@ HashedDictionary::HashedDictionary(
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
bool require_nonempty_,
bool sparse_,
BlockPtr saved_block_)
: name{name_}
, dict_struct(dict_struct_)
, source_ptr{std::move(source_ptr_)}
, dict_lifetime(dict_lifetime_)
, require_nonempty(require_nonempty_)
, sparse(sparse_)
, saved_block{std::move(saved_block_)}
{
createAttributes();
@ -57,11 +76,10 @@ static inline HashedDictionary::Key getAt(const HashedDictionary::Key & value, c
return value;
}
template <typename ChildType, typename AncestorType>
void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
template <typename AttrType, typename ChildType, typename AncestorType>
void HashedDictionary::isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
const auto & attr = *std::get<CollectionPtrType<Key>>(hierarchical_attribute->maps);
const auto rows = out.size();
for (const auto row : ext::range(0, rows))
@ -73,7 +91,7 @@ void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType
{
auto it = attr.find(id);
if (it != std::end(attr))
id = it->getSecond();
id = second(*it);
else
break;
}
@ -83,6 +101,13 @@ void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType
query_count.fetch_add(rows, std::memory_order_relaxed);
}
/// Picks the hierarchical attribute's container for the configured layout
/// (sparse map or regular HashMap) and delegates the ancestry check to isInAttrImpl.
template <typename ChildType, typename AncestorType>
void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
    if (sparse)
    {
        const auto & parents = *std::get<SparseCollectionPtrType<Key>>(hierarchical_attribute->sparse_maps);
        isInAttrImpl(parents, child_ids, ancestor_ids, out);
    }
    else
    {
        const auto & parents = *std::get<CollectionPtrType<Key>>(hierarchical_attribute->maps);
        isInAttrImpl(parents, child_ids, ancestor_ids, out);
    }
}
void HashedDictionary::isInVectorVector(
const PaddedPODArray<Key> & child_ids, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const
@ -407,9 +432,22 @@ void HashedDictionary::loadData()
template <typename T>
void HashedDictionary::addAttributeSize(const Attribute & attribute)
{
if (!sparse)
{
const auto & map_ref = std::get<CollectionPtrType<T>>(attribute.maps);
bytes_allocated += sizeof(CollectionType<T>) + map_ref->getBufferSizeInBytes();
bucket_count = map_ref->getBufferSizeInCells();
}
else
{
const auto & map_ref = std::get<SparseCollectionPtrType<T>>(attribute.sparse_maps);
bucket_count = map_ref->bucket_count();
/** TODO: more accurate calculation */
bytes_allocated += sizeof(CollectionType<T>);
bytes_allocated += bucket_count;
bytes_allocated += map_ref->size() * sizeof(Key) * sizeof(T);
}
}
void HashedDictionary::calculateBytesAllocated()
@ -479,12 +517,15 @@ template <typename T>
void HashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value)
{
    attribute.null_values = T(null_value.get<NearestFieldType<T>>());

    /// Only the container that matches the configured layout is allocated.
    if (sparse)
        attribute.sparse_maps = std::make_unique<SparseCollectionType<T>>();
    else
        attribute.maps = std::make_unique<CollectionType<T>>();
}
HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
Attribute attr{type, {}, {}, {}};
Attribute attr{type, {}, {}, {}, {}};
switch (type)
{
@ -535,7 +576,10 @@ HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const Attr
case AttributeUnderlyingType::utString:
{
attr.null_values = null_value.get<String>();
if (!sparse)
attr.maps = std::make_unique<CollectionType<StringRef>>();
else
attr.sparse_maps = std::make_unique<SparseCollectionType<StringRef>>();
attr.string_arena = std::make_unique<Arena>();
break;
}
@ -545,28 +589,43 @@ HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const Attr
}
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void HashedDictionary::getItemsImpl(
const Attribute & attribute, const PaddedPODArray<Key> & ids, ValueSetter && set_value, DefaultGetter && get_default) const
template <typename OutputType, typename AttrType, typename ValueSetter, typename DefaultGetter>
void HashedDictionary::getItemsAttrImpl(
const AttrType & attr, const PaddedPODArray<Key> & ids, ValueSetter && set_value, DefaultGetter && get_default) const
{
const auto & attr = *std::get<CollectionPtrType<AttributeType>>(attribute.maps);
const auto rows = ext::size(ids);
for (const auto i : ext::range(0, rows))
{
const auto it = attr.find(ids[i]);
set_value(i, it != attr.end() ? static_cast<OutputType>(it->getSecond()) : get_default(i));
set_value(i, it != attr.end() ? static_cast<OutputType>(second(*it)) : get_default(i));
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
/// Selects the attribute's container for the configured layout and delegates the
/// per-id lookup to getItemsAttrImpl, which works on either map type.
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void HashedDictionary::getItemsImpl(
    const Attribute & attribute, const PaddedPODArray<Key> & ids, ValueSetter && set_value, DefaultGetter && get_default) const
{
    if (sparse)
    {
        const auto & sparse_map = *std::get<SparseCollectionPtrType<AttributeType>>(attribute.sparse_maps);
        getItemsAttrImpl<OutputType>(sparse_map, ids, set_value, get_default);
        return;
    }

    const auto & hash_map = *std::get<CollectionPtrType<AttributeType>>(attribute.maps);
    getItemsAttrImpl<OutputType>(hash_map, ids, set_value, get_default);
}
/// Inserts (id, value) into the attribute's container for the configured layout.
/// Returns the `second` member of the container's insert() result.
template <typename T>
bool HashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T value)
{
    if (sparse)
    {
        auto & sparse_map = *std::get<SparseCollectionPtrType<T>>(attribute.sparse_maps);
        return sparse_map.insert({id, value}).second;
    }

    auto & hash_map = *std::get<CollectionPtrType<T>>(attribute.maps);
    return hash_map.insert({id, value}).second;
}
bool HashedDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value)
@ -605,11 +664,19 @@ bool HashedDictionary::setAttributeValue(Attribute & attribute, const Key id, co
case AttributeUnderlyingType::utString:
{
auto & map = *std::get<CollectionPtrType<StringRef>>(attribute.maps);
const auto & string = value.get<String>();
const auto string_in_arena = attribute.string_arena->insert(string.data(), string.size());
if (!sparse)
{
auto & map = *std::get<CollectionPtrType<StringRef>>(attribute.maps);
return map.insert({id, StringRef{string_in_arena, string.size()}}).second;
}
else
{
auto & map = *std::get<SparseCollectionPtrType<StringRef>>(attribute.sparse_maps);
return map.insert({id, StringRef{string_in_arena, string.size()}}).second;
}
}
}
throw Exception{"Invalid attribute type", ErrorCodes::BAD_ARGUMENTS};
@ -636,18 +703,23 @@ void HashedDictionary::has(const Attribute & attribute, const PaddedPODArray<Key
query_count.fetch_add(rows, std::memory_order_relaxed);
}
template <typename T>
PaddedPODArray<HashedDictionary::Key> HashedDictionary::getIds(const Attribute & attribute) const
template <typename T, typename AttrType>
PaddedPODArray<HashedDictionary::Key> HashedDictionary::getIdsAttrImpl(const AttrType & attr) const
{
const HashMap<UInt64, T> & attr = *std::get<CollectionPtrType<T>>(attribute.maps);
PaddedPODArray<Key> ids;
ids.reserve(attr.size());
for (const auto & value : attr)
ids.push_back(value.getFirst());
ids.push_back(first(value));
return ids;
}
/// Collects all keys stored in the given attribute's container.
/// T is the attribute's value type; it selects which alternative of the
/// `maps` / `sparse_maps` variant is read.
template <typename T>
PaddedPODArray<HashedDictionary::Key> HashedDictionary::getIds(const Attribute & attribute) const
{
    /// The variant alternative is keyed by the value type T, not by Key:
    /// asking for <Key> throws std::bad_variant_access for any attribute that is not UInt64.
    if (!sparse)
        return getIdsAttrImpl<T>(*std::get<CollectionPtrType<T>>(attribute.maps));
    return getIdsAttrImpl<T>(*std::get<SparseCollectionPtrType<T>>(attribute.sparse_maps));
}
PaddedPODArray<HashedDictionary::Key> HashedDictionary::getIds() const
{
@ -714,9 +786,11 @@ void registerDictionaryHashed(DictionaryFactory & factory)
ErrorCodes::BAD_ARGUMENTS};
const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"};
const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false);
return std::make_unique<HashedDictionary>(name, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty);
const bool sparse = name == "sparse_hashed";
return std::make_unique<HashedDictionary>(name, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty, sparse);
};
factory.registerLayout("hashed", create_layout);
factory.registerLayout("sparse_hashed", create_layout);
}
}

View File

@ -7,6 +7,7 @@
#include <Columns/ColumnString.h>
#include <Core/Block.h>
#include <Common/HashTable/HashMap.h>
#include <sparsehash/sparse_hash_map>
#include <ext/range.h>
#include "DictionaryStructure.h"
#include "IDictionary.h"
@ -26,6 +27,7 @@ public:
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
bool require_nonempty_,
bool sparse_,
BlockPtr saved_block_ = nullptr);
std::string getName() const override { return name; }
@ -46,7 +48,7 @@ public:
std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<HashedDictionary>(name, dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block);
return std::make_shared<HashedDictionary>(name, dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, sparse, saved_block);
}
const IDictionarySource * getSource() const override { return source_ptr.get(); }
@ -149,6 +151,11 @@ private:
template <typename Value>
using CollectionPtrType = std::unique_ptr<CollectionType<Value>>;
/// Sparse counterpart of CollectionType: a google::sparse_hash_map keyed by UInt64
/// that uses DefaultHash<UInt64> as its hasher.
template <typename Value>
using SparseCollectionType = google::sparse_hash_map<UInt64, Value, DefaultHash<UInt64>>;
/// Owning pointer to a SparseCollectionType.
template <typename Value>
using SparseCollectionPtrType = std::unique_ptr<SparseCollectionType<Value>>;
struct Attribute final
{
AttributeUnderlyingType type;
@ -186,6 +193,23 @@ private:
CollectionPtrType<Float64>,
CollectionPtrType<StringRef>>
maps;
/// Storage used by the sparse layout, one alternative per supported attribute value type.
/// Allocated instead of `maps` when the dictionary is constructed with sparse = true.
std::variant<
SparseCollectionPtrType<UInt8>,
SparseCollectionPtrType<UInt16>,
SparseCollectionPtrType<UInt32>,
SparseCollectionPtrType<UInt64>,
SparseCollectionPtrType<UInt128>,
SparseCollectionPtrType<Int8>,
SparseCollectionPtrType<Int16>,
SparseCollectionPtrType<Int32>,
SparseCollectionPtrType<Int64>,
SparseCollectionPtrType<Decimal32>,
SparseCollectionPtrType<Decimal64>,
SparseCollectionPtrType<Decimal128>,
SparseCollectionPtrType<Float32>,
SparseCollectionPtrType<Float64>,
SparseCollectionPtrType<StringRef>>
sparse_maps;
std::unique_ptr<Arena> string_arena;
};
@ -207,6 +231,9 @@ private:
Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);
template <typename OutputType, typename AttrType, typename ValueSetter, typename DefaultGetter>
void getItemsAttrImpl(
const AttrType & attr, const PaddedPODArray<Key> & ids, ValueSetter && set_value, DefaultGetter && get_default) const;
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void getItemsImpl(
const Attribute & attribute, const PaddedPODArray<Key> & ids, ValueSetter && set_value, DefaultGetter && get_default) const;
@ -221,11 +248,15 @@ private:
template <typename T>
void has(const Attribute & attribute, const PaddedPODArray<Key> & ids, PaddedPODArray<UInt8> & out) const;
template <typename T, typename AttrType>
PaddedPODArray<Key> getIdsAttrImpl(const AttrType & attr) const;
template <typename T>
PaddedPODArray<Key> getIds(const Attribute & attribute) const;
PaddedPODArray<Key> getIds() const;
template <typename AttrType, typename ChildType, typename AncestorType>
void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
template <typename ChildType, typename AncestorType>
void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
@ -234,6 +265,7 @@ private:
const DictionarySourcePtr source_ptr;
const DictionaryLifetime dict_lifetime;
const bool require_nonempty;
const bool sparse;
std::map<std::string, size_t> attribute_index_by_name;
std::vector<Attribute> attributes;

View File

@ -33,7 +33,7 @@ if (OPENSSL_CRYPTO_LIBRARY)
endif()
target_include_directories(clickhouse_functions PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/include)
target_include_directories(clickhouse_functions SYSTEM PRIVATE ${DIVIDE_INCLUDE_DIR} ${METROHASH_INCLUDE_DIR})
target_include_directories(clickhouse_functions SYSTEM PRIVATE ${DIVIDE_INCLUDE_DIR} ${METROHASH_INCLUDE_DIR} ${SPARSEHASH_INCLUDE_DIR})
if (CONSISTENT_HASHING_INCLUDE_DIR)
target_include_directories (clickhouse_functions PRIVATE ${CONSISTENT_HASHING_INCLUDE_DIR})

View File

@ -125,6 +125,69 @@
</structure>
</dictionary>
<dictionary>
<name>hashed_sparse_ints</name>
<source>
<clickhouse>
<host>localhost</host>
<port>9000</port>
<user>default</user>
<password></password>
<db>test_00950</db>
<table>ints</table>
</clickhouse>
</source>
<lifetime>0</lifetime>
<layout>
<sparse_hashed/>
</layout>
<structure>
<id>
<name>key</name>
</id>
<attribute>
<name>i8</name>
<type>Int8</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>i16</name>
<type>Int16</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>i32</name>
<type>Int32</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>i64</name>
<type>Int64</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>u8</name>
<type>UInt8</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>u16</name>
<type>UInt16</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>u32</name>
<type>UInt32</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>u64</name>
<type>UInt64</type>
<null_value>0</null_value>
</attribute>
</structure>
</dictionary>
<dictionary>
<name>cache_ints</name>
<source>

View File

@ -4,6 +4,9 @@ dictGetOrDefault flat_ints 0 42 42 42 42 42 42 42 42
dictGet hashed_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault hashed_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault hashed_ints 0 42 42 42 42 42 42 42 42
dictGet hashed_sparse_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault hashed_sparse_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault hashed_sparse_ints 0 42 42 42 42 42 42 42 42
dictGet cache_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault cache_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault cache_ints 0 42 42 42 42 42 42 42 42

View File

@ -69,6 +69,34 @@ select 'dictGetOrDefault', 'hashed_ints' as dict_name, toUInt64(0) as k,
dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)),
dictGetOrDefault(dict_name, 'u64', k, toUInt64(42));
select 'dictGet', 'hashed_sparse_ints' as dict_name, toUInt64(1) as k,
dictGet(dict_name, 'i8', k),
dictGet(dict_name, 'i16', k),
dictGet(dict_name, 'i32', k),
dictGet(dict_name, 'i64', k),
dictGet(dict_name, 'u8', k),
dictGet(dict_name, 'u16', k),
dictGet(dict_name, 'u32', k),
dictGet(dict_name, 'u64', k);
select 'dictGetOrDefault', 'hashed_sparse_ints' as dict_name, toUInt64(1) as k,
dictGetOrDefault(dict_name, 'i8', k, toInt8(42)),
dictGetOrDefault(dict_name, 'i16', k, toInt16(42)),
dictGetOrDefault(dict_name, 'i32', k, toInt32(42)),
dictGetOrDefault(dict_name, 'i64', k, toInt64(42)),
dictGetOrDefault(dict_name, 'u8', k, toUInt8(42)),
dictGetOrDefault(dict_name, 'u16', k, toUInt16(42)),
dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)),
dictGetOrDefault(dict_name, 'u64', k, toUInt64(42));
select 'dictGetOrDefault', 'hashed_sparse_ints' as dict_name, toUInt64(0) as k,
dictGetOrDefault(dict_name, 'i8', k, toInt8(42)),
dictGetOrDefault(dict_name, 'i16', k, toInt16(42)),
dictGetOrDefault(dict_name, 'i32', k, toInt32(42)),
dictGetOrDefault(dict_name, 'i64', k, toInt64(42)),
dictGetOrDefault(dict_name, 'u8', k, toUInt8(42)),
dictGetOrDefault(dict_name, 'u16', k, toUInt16(42)),
dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)),
dictGetOrDefault(dict_name, 'u64', k, toUInt64(42));
select 'dictGet', 'cache_ints' as dict_name, toUInt64(1) as k,
dictGet(dict_name, 'i8', k),
dictGet(dict_name, 'i16', k),

View File

@ -39,6 +39,7 @@ The configuration looks like this:
- [flat](#flat)
- [hashed](#dicts-external_dicts_dict_layout-hashed)
- [sparse_hashed](#dicts-external_dicts_dict_layout-sparse_hashed)
- [cache](#cache)
- [range_hashed](#range-hashed)
- [complex_key_hashed](#complex-key-hashed)
@ -77,6 +78,18 @@ Configuration example:
</layout>
```
### sparse_hashed {#dicts-external_dicts_dict_layout-sparse_hashed}
Similar to `hashed`, but uses less memory at the cost of more CPU usage.
Configuration example:
```xml
<layout>
<sparse_hashed />
</layout>
```
### complex_key_hashed