Merge pull request #6894 from azat-archive/hashed-dict-memory-usage-v2

[RFC] Add sparsehash support for hashed dictionary (to reduce memory usage)
This commit is contained in:
alexey-milovidov 2019-09-22 15:55:36 +03:00 committed by GitHub
commit 6fa234cea4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 244 additions and 27 deletions

View File

@ -40,3 +40,5 @@ if(USE_POCO_MONGODB)
endif()
add_subdirectory(Embedded)
target_include_directories(clickhouse_dictionaries SYSTEM PRIVATE ${SPARSEHASH_INCLUDE_DIR})

View File

@ -3,6 +3,21 @@
#include "DictionaryBlockInputStream.h"
#include "DictionaryFactory.h"
namespace
{

/** Uniform access to the key and the mapped value of a hash table element,
  * regardless of which hash table implementation the element comes from.
  *
  * NOTE: the trailing return types are spelled out on purpose: they make the
  * overloads SFINAE-friendly, so that only the pair of overloads matching the
  * element's interface takes part in overload resolution.
  */

/// Elements with public `first` / `second` members (google::sparse_hash_map).
template <typename Cell>
auto first(const Cell & cell) -> decltype(cell.first)
{
    return cell.first;
}

template <typename Cell>
auto second(const Cell & cell) -> decltype(cell.second)
{
    return cell.second;
}

/// Elements with `getFirst()` / `getSecond()` accessors (HashMap).
template <typename Cell>
auto first(const Cell & cell) -> decltype(cell.getFirst())
{
    return cell.getFirst();
}

template <typename Cell>
auto second(const Cell & cell) -> decltype(cell.getSecond())
{
    return cell.getSecond();
}

}
namespace DB
{
namespace ErrorCodes
@ -21,12 +36,14 @@ HashedDictionary::HashedDictionary(
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
bool require_nonempty_,
bool sparse_,
BlockPtr saved_block_)
: name{name_}
, dict_struct(dict_struct_)
, source_ptr{std::move(source_ptr_)}
, dict_lifetime(dict_lifetime_)
, require_nonempty(require_nonempty_)
, sparse(sparse_)
, saved_block{std::move(saved_block_)}
{
createAttributes();
@ -57,11 +74,10 @@ static inline HashedDictionary::Key getAt(const HashedDictionary::Key & value, c
return value;
}
template <typename ChildType, typename AncestorType>
void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
template <typename AttrType, typename ChildType, typename AncestorType>
void HashedDictionary::isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
const auto & attr = *std::get<CollectionPtrType<Key>>(hierarchical_attribute->maps);
const auto rows = out.size();
for (const auto row : ext::range(0, rows))
@ -73,7 +89,7 @@ void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType
{
auto it = attr.find(id);
if (it != std::end(attr))
id = it->getSecond();
id = second(*it);
else
break;
}
@ -83,6 +99,13 @@ void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType
query_count.fetch_add(rows, std::memory_order_relaxed);
}
/// Runs the ancestor check on the container that actually backs the hierarchical attribute:
/// the google::sparse_hash_map kept in `sparse_maps` when the dictionary is sparse,
/// the HashMap kept in `maps` otherwise. The check itself lives in isInAttrImpl().
template <typename ChildType, typename AncestorType>
void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
    if (sparse)
    {
        const auto & parents = *std::get<SparseCollectionPtrType<Key>>(hierarchical_attribute->sparse_maps);
        isInAttrImpl(parents, child_ids, ancestor_ids, out);
    }
    else
    {
        const auto & parents = *std::get<CollectionPtrType<Key>>(hierarchical_attribute->maps);
        isInAttrImpl(parents, child_ids, ancestor_ids, out);
    }
}
void HashedDictionary::isInVectorVector(
const PaddedPODArray<Key> & child_ids, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const
@ -407,9 +430,22 @@ void HashedDictionary::loadData()
template <typename T>
void HashedDictionary::addAttributeSize(const Attribute & attribute)
{
const auto & map_ref = std::get<CollectionPtrType<T>>(attribute.maps);
bytes_allocated += sizeof(CollectionType<T>) + map_ref->getBufferSizeInBytes();
bucket_count = map_ref->getBufferSizeInCells();
if (!sparse)
{
    /// HashMap exposes the size of its buffer directly.
    const auto & map_ref = std::get<CollectionPtrType<T>>(attribute.maps);
    bytes_allocated += sizeof(CollectionType<T>) + map_ref->getBufferSizeInBytes();
    bucket_count = map_ref->getBufferSizeInCells();
}
else
{
    /// google::sparse_hash_map does not expose its buffer size here, so the footprint is estimated.
    const auto & map_ref = std::get<SparseCollectionPtrType<T>>(attribute.sparse_maps);
    bucket_count = map_ref->bucket_count();
    /** TODO: more accurate calculation */
    /// The container object itself: it is a SparseCollectionType, not a CollectionType.
    bytes_allocated += sizeof(SparseCollectionType<T>);
    /// Rough per-bucket overhead.
    bytes_allocated += bucket_count;
    /// Each stored element holds one key and one value, so the sizes add up (they do not multiply).
    bytes_allocated += map_ref->size() * (sizeof(Key) + sizeof(T));
}
}
void HashedDictionary::calculateBytesAllocated()
@ -479,12 +515,15 @@ template <typename T>
void HashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value)
{
attribute.null_values = T(null_value.get<NearestFieldType<T>>());
attribute.maps = std::make_unique<CollectionType<T>>();
if (!sparse)
attribute.maps = std::make_unique<CollectionType<T>>();
else
attribute.sparse_maps = std::make_unique<SparseCollectionType<T>>();
}
HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
Attribute attr{type, {}, {}, {}};
Attribute attr{type, {}, {}, {}, {}};
switch (type)
{
@ -535,7 +574,10 @@ HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const Attr
case AttributeUnderlyingType::utString:
{
attr.null_values = null_value.get<String>();
attr.maps = std::make_unique<CollectionType<StringRef>>();
if (!sparse)
attr.maps = std::make_unique<CollectionType<StringRef>>();
else
attr.sparse_maps = std::make_unique<SparseCollectionType<StringRef>>();
attr.string_arena = std::make_unique<Arena>();
break;
}
@ -545,28 +587,43 @@ HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const Attr
}
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void HashedDictionary::getItemsImpl(
const Attribute & attribute, const PaddedPODArray<Key> & ids, ValueSetter && set_value, DefaultGetter && get_default) const
template <typename OutputType, typename AttrType, typename ValueSetter, typename DefaultGetter>
void HashedDictionary::getItemsAttrImpl(
const AttrType & attr, const PaddedPODArray<Key> & ids, ValueSetter && set_value, DefaultGetter && get_default) const
{
const auto & attr = *std::get<CollectionPtrType<AttributeType>>(attribute.maps);
const auto rows = ext::size(ids);
for (const auto i : ext::range(0, rows))
{
const auto it = attr.find(ids[i]);
set_value(i, it != attr.end() ? static_cast<OutputType>(it->getSecond()) : get_default(i));
set_value(i, it != attr.end() ? static_cast<OutputType>(second(*it)) : get_default(i));
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
/// Selects the container that backs `attribute` for value type AttributeType
/// (google::sparse_hash_map from `sparse_maps` for a sparse dictionary, HashMap from `maps` otherwise)
/// and lets getItemsAttrImpl() perform the lookups for `ids`.
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void HashedDictionary::getItemsImpl(
    const Attribute & attribute, const PaddedPODArray<Key> & ids, ValueSetter && set_value, DefaultGetter && get_default) const
{
    if (sparse)
    {
        const auto & values = *std::get<SparseCollectionPtrType<AttributeType>>(attribute.sparse_maps);
        getItemsAttrImpl<OutputType>(values, ids, set_value, get_default);
    }
    else
    {
        const auto & values = *std::get<CollectionPtrType<AttributeType>>(attribute.maps);
        getItemsAttrImpl<OutputType>(values, ids, set_value, get_default);
    }
}
template <typename T>
bool HashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T value)
{
auto & map = *std::get<CollectionPtrType<T>>(attribute.maps);
return map.insert({id, value}).second;
if (!sparse)
{
auto & map = *std::get<CollectionPtrType<T>>(attribute.maps);
return map.insert({id, value}).second;
}
else
{
auto & map = *std::get<SparseCollectionPtrType<T>>(attribute.sparse_maps);
return map.insert({id, value}).second;
}
}
bool HashedDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value)
@ -605,10 +662,18 @@ bool HashedDictionary::setAttributeValue(Attribute & attribute, const Key id, co
case AttributeUnderlyingType::utString:
{
auto & map = *std::get<CollectionPtrType<StringRef>>(attribute.maps);
const auto & string = value.get<String>();
const auto string_in_arena = attribute.string_arena->insert(string.data(), string.size());
return map.insert({id, StringRef{string_in_arena, string.size()}}).second;
if (!sparse)
{
auto & map = *std::get<CollectionPtrType<StringRef>>(attribute.maps);
return map.insert({id, StringRef{string_in_arena, string.size()}}).second;
}
else
{
auto & map = *std::get<SparseCollectionPtrType<StringRef>>(attribute.sparse_maps);
return map.insert({id, StringRef{string_in_arena, string.size()}}).second;
}
}
}
@ -636,18 +701,23 @@ void HashedDictionary::has(const Attribute & attribute, const PaddedPODArray<Key
query_count.fetch_add(rows, std::memory_order_relaxed);
}
template <typename T>
PaddedPODArray<HashedDictionary::Key> HashedDictionary::getIds(const Attribute & attribute) const
template <typename T, typename AttrType>
PaddedPODArray<HashedDictionary::Key> HashedDictionary::getIdsAttrImpl(const AttrType & attr) const
{
const HashMap<UInt64, T> & attr = *std::get<CollectionPtrType<T>>(attribute.maps);
PaddedPODArray<Key> ids;
ids.reserve(attr.size());
for (const auto & value : attr)
ids.push_back(value.getFirst());
ids.push_back(first(value));
return ids;
}
/// Returns the ids (keys) stored in the container of `attribute`.
///
/// T is the value type of the attribute and selects which alternative of the
/// `maps` / `sparse_maps` variant is read. It must be T and not Key: asking std::get for the
/// Key alternative throws std::bad_variant_access whenever the variant holds a different one,
/// i.e. for every attribute whose value type differs from Key.
template <typename T>
PaddedPODArray<HashedDictionary::Key> HashedDictionary::getIds(const Attribute & attribute) const
{
    if (!sparse)
        return getIdsAttrImpl<T>(*std::get<CollectionPtrType<T>>(attribute.maps));
    return getIdsAttrImpl<T>(*std::get<SparseCollectionPtrType<T>>(attribute.sparse_maps));
}
PaddedPODArray<HashedDictionary::Key> HashedDictionary::getIds() const
{
@ -714,9 +784,11 @@ void registerDictionaryHashed(DictionaryFactory & factory)
ErrorCodes::BAD_ARGUMENTS};
const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"};
const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false);
return std::make_unique<HashedDictionary>(name, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty);
const bool sparse = name == "sparse_hashed";
return std::make_unique<HashedDictionary>(name, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty, sparse);
};
factory.registerLayout("hashed", create_layout);
factory.registerLayout("sparse_hashed", create_layout);
}
}

View File

@ -7,11 +7,16 @@
#include <Columns/ColumnString.h>
#include <Core/Block.h>
#include <Common/HashTable/HashMap.h>
#include <sparsehash/sparse_hash_map>
#include <ext/range.h>
#include "DictionaryStructure.h"
#include "IDictionary.h"
#include "IDictionarySource.h"
/** This dictionary stores all content in a hash table in memory
* (a separate Key -> Value map for each attribute)
* Two variants of hash table are supported: a fast HashMap and a memory-efficient sparse_hash_map.
*/
namespace DB
{
@ -26,6 +31,7 @@ public:
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
bool require_nonempty_,
bool sparse_,
BlockPtr saved_block_ = nullptr);
std::string getName() const override { return name; }
@ -46,7 +52,7 @@ public:
std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<HashedDictionary>(name, dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block);
return std::make_shared<HashedDictionary>(name, dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, sparse, saved_block);
}
const IDictionarySource * getSource() const override { return source_ptr.get(); }
@ -149,6 +155,11 @@ private:
template <typename Value>
using CollectionPtrType = std::unique_ptr<CollectionType<Value>>;
/// Sparse counterpart of CollectionType: a google::sparse_hash_map from UInt64 keys to Value,
/// hashed with DefaultHash<UInt64>.
template <typename Value>
using SparseCollectionType = google::sparse_hash_map<UInt64, Value, DefaultHash<UInt64>>;
/// Owning pointer to a SparseCollectionType; the alternatives of Attribute::sparse_maps are of this type.
template <typename Value>
using SparseCollectionPtrType = std::unique_ptr<SparseCollectionType<Value>>;
struct Attribute final
{
AttributeUnderlyingType type;
@ -186,6 +197,23 @@ private:
CollectionPtrType<Float64>,
CollectionPtrType<StringRef>>
maps;
std::variant<
SparseCollectionPtrType<UInt8>,
SparseCollectionPtrType<UInt16>,
SparseCollectionPtrType<UInt32>,
SparseCollectionPtrType<UInt64>,
SparseCollectionPtrType<UInt128>,
SparseCollectionPtrType<Int8>,
SparseCollectionPtrType<Int16>,
SparseCollectionPtrType<Int32>,
SparseCollectionPtrType<Int64>,
SparseCollectionPtrType<Decimal32>,
SparseCollectionPtrType<Decimal64>,
SparseCollectionPtrType<Decimal128>,
SparseCollectionPtrType<Float32>,
SparseCollectionPtrType<Float64>,
SparseCollectionPtrType<StringRef>>
sparse_maps;
std::unique_ptr<Arena> string_arena;
};
@ -207,6 +235,9 @@ private:
Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);
/// Container-agnostic part of getItemsImpl(): `attr` is the concrete map of one attribute
/// (HashMap or google::sparse_hash_map). For every index i of `ids` it calls set_value(i, v), where v is
/// the mapped value cast to OutputType if ids[i] is found in `attr`, and get_default(i) otherwise.
template <typename OutputType, typename AttrType, typename ValueSetter, typename DefaultGetter>
void getItemsAttrImpl(
const AttrType & attr, const PaddedPODArray<Key> & ids, ValueSetter && set_value, DefaultGetter && get_default) const;
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void getItemsImpl(
const Attribute & attribute, const PaddedPODArray<Key> & ids, ValueSetter && set_value, DefaultGetter && get_default) const;
@ -221,11 +252,15 @@ private:
template <typename T>
void has(const Attribute & attribute, const PaddedPODArray<Key> & ids, PaddedPODArray<UInt8> & out) const;
/// Container-agnostic part of getIds(const Attribute &): returns the keys of all elements of `attr`,
/// which is the concrete map of one attribute (HashMap or google::sparse_hash_map).
template <typename T, typename AttrType>
PaddedPODArray<Key> getIdsAttrImpl(const AttrType & attr) const;
template <typename T>
PaddedPODArray<Key> getIds(const Attribute & attribute) const;
PaddedPODArray<Key> getIds() const;
/// Container-agnostic part of isInImpl(): `attr` is the concrete map of the hierarchical attribute
/// (HashMap or google::sparse_hash_map). Ids are resolved to their mapped (parent) ids by lookups in `attr`;
/// the result for each row is written to `out`.
template <typename AttrType, typename ChildType, typename AncestorType>
void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
template <typename ChildType, typename AncestorType>
void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
@ -234,6 +269,7 @@ private:
const DictionarySourcePtr source_ptr;
const DictionaryLifetime dict_lifetime;
const bool require_nonempty;
const bool sparse;
std::map<std::string, size_t> attribute_index_by_name;
std::vector<Attribute> attributes;

View File

@ -33,7 +33,7 @@ if (OPENSSL_CRYPTO_LIBRARY)
endif()
target_include_directories(clickhouse_functions PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/include)
target_include_directories(clickhouse_functions SYSTEM PRIVATE ${DIVIDE_INCLUDE_DIR} ${METROHASH_INCLUDE_DIR})
target_include_directories(clickhouse_functions SYSTEM PRIVATE ${DIVIDE_INCLUDE_DIR} ${METROHASH_INCLUDE_DIR} ${SPARSEHASH_INCLUDE_DIR})
if (CONSISTENT_HASHING_INCLUDE_DIR)
target_include_directories (clickhouse_functions PRIVATE ${CONSISTENT_HASHING_INCLUDE_DIR})

View File

@ -125,6 +125,69 @@
</structure>
</dictionary>
<dictionary>
<name>hashed_sparse_ints</name>
<source>
<clickhouse>
<host>localhost</host>
<port>9000</port>
<user>default</user>
<password></password>
<db>test_00950</db>
<table>ints</table>
</clickhouse>
</source>
<lifetime>0</lifetime>
<layout>
<sparse_hashed/>
</layout>
<structure>
<id>
<name>key</name>
</id>
<attribute>
<name>i8</name>
<type>Int8</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>i16</name>
<type>Int16</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>i32</name>
<type>Int32</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>i64</name>
<type>Int64</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>u8</name>
<type>UInt8</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>u16</name>
<type>UInt16</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>u32</name>
<type>UInt32</type>
<null_value>0</null_value>
</attribute>
<attribute>
<name>u64</name>
<type>UInt64</type>
<null_value>0</null_value>
</attribute>
</structure>
</dictionary>
<dictionary>
<name>cache_ints</name>
<source>

View File

@ -4,6 +4,9 @@ dictGetOrDefault flat_ints 0 42 42 42 42 42 42 42 42
dictGet hashed_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault hashed_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault hashed_ints 0 42 42 42 42 42 42 42 42
dictGet hashed_sparse_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault hashed_sparse_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault hashed_sparse_ints 0 42 42 42 42 42 42 42 42
dictGet cache_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault cache_ints 1 1 1 1 1 1 1 1 1
dictGetOrDefault cache_ints 0 42 42 42 42 42 42 42 42

View File

@ -69,6 +69,34 @@ select 'dictGetOrDefault', 'hashed_ints' as dict_name, toUInt64(0) as k,
dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)),
dictGetOrDefault(dict_name, 'u64', k, toUInt64(42));
select 'dictGet', 'hashed_sparse_ints' as dict_name, toUInt64(1) as k,
dictGet(dict_name, 'i8', k),
dictGet(dict_name, 'i16', k),
dictGet(dict_name, 'i32', k),
dictGet(dict_name, 'i64', k),
dictGet(dict_name, 'u8', k),
dictGet(dict_name, 'u16', k),
dictGet(dict_name, 'u32', k),
dictGet(dict_name, 'u64', k);
select 'dictGetOrDefault', 'hashed_sparse_ints' as dict_name, toUInt64(1) as k,
dictGetOrDefault(dict_name, 'i8', k, toInt8(42)),
dictGetOrDefault(dict_name, 'i16', k, toInt16(42)),
dictGetOrDefault(dict_name, 'i32', k, toInt32(42)),
dictGetOrDefault(dict_name, 'i64', k, toInt64(42)),
dictGetOrDefault(dict_name, 'u8', k, toUInt8(42)),
dictGetOrDefault(dict_name, 'u16', k, toUInt16(42)),
dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)),
dictGetOrDefault(dict_name, 'u64', k, toUInt64(42));
select 'dictGetOrDefault', 'hashed_sparse_ints' as dict_name, toUInt64(0) as k,
dictGetOrDefault(dict_name, 'i8', k, toInt8(42)),
dictGetOrDefault(dict_name, 'i16', k, toInt16(42)),
dictGetOrDefault(dict_name, 'i32', k, toInt32(42)),
dictGetOrDefault(dict_name, 'i64', k, toInt64(42)),
dictGetOrDefault(dict_name, 'u8', k, toUInt8(42)),
dictGetOrDefault(dict_name, 'u16', k, toUInt16(42)),
dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)),
dictGetOrDefault(dict_name, 'u64', k, toUInt64(42));
select 'dictGet', 'cache_ints' as dict_name, toUInt64(1) as k,
dictGet(dict_name, 'i8', k),
dictGet(dict_name, 'i16', k),

View File

@ -39,6 +39,7 @@ The configuration looks like this:
- [flat](#flat)
- [hashed](#dicts-external_dicts_dict_layout-hashed)
- [sparse_hashed](#dicts-external_dicts_dict_layout-sparse_hashed)
- [cache](#cache)
- [range_hashed](#range-hashed)
- [complex_key_hashed](#complex-key-hashed)
@ -77,6 +78,18 @@ Configuration example:
</layout>
```
### sparse_hashed {#dicts-external_dicts_dict_layout-sparse_hashed}
Similar to `hashed`, but uses less memory at the cost of higher CPU usage.
Configuration example:
```xml
<layout>
<sparse_hashed />
</layout>
```
### complex_key_hashed