diff --git a/dbms/src/Dictionaries/CMakeLists.txt b/dbms/src/Dictionaries/CMakeLists.txt index 4d066d1f59b..d6f8fc57ff6 100644 --- a/dbms/src/Dictionaries/CMakeLists.txt +++ b/dbms/src/Dictionaries/CMakeLists.txt @@ -40,3 +40,5 @@ if(USE_POCO_MONGODB) endif() add_subdirectory(Embedded) + +target_include_directories(clickhouse_dictionaries SYSTEM PRIVATE ${SPARSEHASH_INCLUDE_DIR}) diff --git a/dbms/src/Dictionaries/HashedDictionary.cpp b/dbms/src/Dictionaries/HashedDictionary.cpp index 9b853ac2df5..4d195b5139a 100644 --- a/dbms/src/Dictionaries/HashedDictionary.cpp +++ b/dbms/src/Dictionaries/HashedDictionary.cpp @@ -3,6 +3,23 @@ #include "DictionaryBlockInputStream.h" #include "DictionaryFactory.h" +namespace +{ + +/// google::sparse_hash_map +template auto first(const T &lhs) -> decltype(lhs.first) +{ return lhs.first; } +template auto second(const T &lhs) -> decltype(lhs.second) +{ return lhs.second; } + +/// HashMap +template auto first(const T &lhs) -> decltype(lhs.getFirst()) +{ return lhs.getFirst(); } +template auto second(const T &lhs) -> decltype(lhs.getSecond()) +{ return lhs.getSecond(); } + +} + namespace DB { namespace ErrorCodes @@ -21,12 +38,14 @@ HashedDictionary::HashedDictionary( DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_, + bool sparse_, BlockPtr saved_block_) : name{name_} , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} , dict_lifetime(dict_lifetime_) , require_nonempty(require_nonempty_) + , sparse(sparse_) , saved_block{std::move(saved_block_)} { createAttributes(); @@ -57,11 +76,10 @@ static inline HashedDictionary::Key getAt(const HashedDictionary::Key & value, c return value; } -template -void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const +template +void HashedDictionary::isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const { 
const auto null_value = std::get(hierarchical_attribute->null_values); - const auto & attr = *std::get>(hierarchical_attribute->maps); const auto rows = out.size(); for (const auto row : ext::range(0, rows)) @@ -73,7 +91,7 @@ void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType { auto it = attr.find(id); if (it != std::end(attr)) - id = it->getSecond(); + id = second(*it); else break; } @@ -83,6 +101,13 @@ void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType query_count.fetch_add(rows, std::memory_order_relaxed); } +template +void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const +{ + if (!sparse) + return isInAttrImpl(*std::get>(hierarchical_attribute->maps), child_ids, ancestor_ids, out); + return isInAttrImpl(*std::get>(hierarchical_attribute->sparse_maps), child_ids, ancestor_ids, out); +} void HashedDictionary::isInVectorVector( const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const @@ -407,9 +432,22 @@ void HashedDictionary::loadData() template void HashedDictionary::addAttributeSize(const Attribute & attribute) { - const auto & map_ref = std::get>(attribute.maps); - bytes_allocated += sizeof(CollectionType) + map_ref->getBufferSizeInBytes(); - bucket_count = map_ref->getBufferSizeInCells(); + if (!sparse) + { + const auto & map_ref = std::get>(attribute.maps); + bytes_allocated += sizeof(CollectionType) + map_ref->getBufferSizeInBytes(); + bucket_count = map_ref->getBufferSizeInCells(); + } + else + { + const auto & map_ref = std::get>(attribute.sparse_maps); + bucket_count = map_ref->bucket_count(); + + /** TODO: more accurate calculation. Rough estimate: container object + one byte per bucket + key and value bytes for each stored element. */ + bytes_allocated += sizeof(CollectionType); + bytes_allocated += bucket_count; + bytes_allocated += map_ref->size() * (sizeof(Key) + sizeof(T)); + } } void HashedDictionary::calculateBytesAllocated() @@ -479,12 +517,15 @@ template void
HashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) { attribute.null_values = T(null_value.get>()); - attribute.maps = std::make_unique>(); + if (!sparse) + attribute.maps = std::make_unique>(); + else + attribute.sparse_maps = std::make_unique>(); } HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) { - Attribute attr{type, {}, {}, {}}; + Attribute attr{type, {}, {}, {}, {}}; switch (type) { @@ -535,7 +576,10 @@ HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const Attr case AttributeUnderlyingType::utString: { attr.null_values = null_value.get(); - attr.maps = std::make_unique>(); + if (!sparse) + attr.maps = std::make_unique>(); + else + attr.sparse_maps = std::make_unique>(); attr.string_arena = std::make_unique(); break; } @@ -545,28 +589,43 @@ HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const Attr } -template -void HashedDictionary::getItemsImpl( - const Attribute & attribute, const PaddedPODArray & ids, ValueSetter && set_value, DefaultGetter && get_default) const +template +void HashedDictionary::getItemsAttrImpl( + const AttrType & attr, const PaddedPODArray & ids, ValueSetter && set_value, DefaultGetter && get_default) const { - const auto & attr = *std::get>(attribute.maps); const auto rows = ext::size(ids); for (const auto i : ext::range(0, rows)) { const auto it = attr.find(ids[i]); - set_value(i, it != attr.end() ? static_cast(it->getSecond()) : get_default(i)); + set_value(i, it != attr.end() ? 
static_cast(second(*it)) : get_default(i)); } query_count.fetch_add(rows, std::memory_order_relaxed); } +template +void HashedDictionary::getItemsImpl( + const Attribute & attribute, const PaddedPODArray & ids, ValueSetter && set_value, DefaultGetter && get_default) const +{ + if (!sparse) + return getItemsAttrImpl(*std::get>(attribute.maps), ids, set_value, get_default); + return getItemsAttrImpl(*std::get>(attribute.sparse_maps), ids, set_value, get_default); +} template bool HashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T value) { - auto & map = *std::get>(attribute.maps); - return map.insert({id, value}).second; + if (!sparse) + { + auto & map = *std::get>(attribute.maps); + return map.insert({id, value}).second; + } + else + { + auto & map = *std::get>(attribute.sparse_maps); + return map.insert({id, value}).second; + } } bool HashedDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value) @@ -605,10 +664,18 @@ bool HashedDictionary::setAttributeValue(Attribute & attribute, const Key id, co case AttributeUnderlyingType::utString: { - auto & map = *std::get>(attribute.maps); const auto & string = value.get(); const auto string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - return map.insert({id, StringRef{string_in_arena, string.size()}}).second; + if (!sparse) + { + auto & map = *std::get>(attribute.maps); + return map.insert({id, StringRef{string_in_arena, string.size()}}).second; + } + else + { + auto & map = *std::get>(attribute.sparse_maps); + return map.insert({id, StringRef{string_in_arena, string.size()}}).second; + } } } @@ -636,18 +703,23 @@ void HashedDictionary::has(const Attribute & attribute, const PaddedPODArray -PaddedPODArray HashedDictionary::getIds(const Attribute & attribute) const +template +PaddedPODArray HashedDictionary::getIdsAttrImpl(const AttrType & attr) const { - const HashMap & attr = *std::get>(attribute.maps); - PaddedPODArray ids; 
ids.reserve(attr.size()); for (const auto & value : attr) - ids.push_back(value.getFirst()); + ids.push_back(first(value)); return ids; } +template +PaddedPODArray HashedDictionary::getIds(const Attribute & attribute) const +{ + if (!sparse) + return getIdsAttrImpl(*std::get>(attribute.maps)); + return getIdsAttrImpl(*std::get>(attribute.sparse_maps)); +} PaddedPODArray HashedDictionary::getIds() const { @@ -714,9 +786,11 @@ void registerDictionaryHashed(DictionaryFactory & factory) ErrorCodes::BAD_ARGUMENTS}; const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + const bool sparse = name == "sparse_hashed"; + return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty, sparse); }; factory.registerLayout("hashed", create_layout); + factory.registerLayout("sparse_hashed", create_layout); } } diff --git a/dbms/src/Dictionaries/HashedDictionary.h b/dbms/src/Dictionaries/HashedDictionary.h index d1aa5a38d97..5cd31ba3e80 100644 --- a/dbms/src/Dictionaries/HashedDictionary.h +++ b/dbms/src/Dictionaries/HashedDictionary.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "DictionaryStructure.h" #include "IDictionary.h" @@ -26,6 +27,7 @@ public: DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_, + bool sparse_, BlockPtr saved_block_ = nullptr); std::string getName() const override { return name; } @@ -46,7 +48,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(name, dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block); + return std::make_shared(name, dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, sparse, saved_block); } const IDictionarySource * getSource() const override { return 
source_ptr.get(); } @@ -149,6 +151,11 @@ private: template using CollectionPtrType = std::unique_ptr>; + template + using SparseCollectionType = google::sparse_hash_map>; + template + using SparseCollectionPtrType = std::unique_ptr>; + struct Attribute final { AttributeUnderlyingType type; @@ -186,6 +193,23 @@ private: CollectionPtrType, CollectionPtrType> maps; + std::variant< + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType> + sparse_maps; std::unique_ptr string_arena; }; @@ -207,6 +231,9 @@ private: Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value); + template + void getItemsAttrImpl( + const AttrType & attr, const PaddedPODArray & ids, ValueSetter && set_value, DefaultGetter && get_default) const; template void getItemsImpl( const Attribute & attribute, const PaddedPODArray & ids, ValueSetter && set_value, DefaultGetter && get_default) const; @@ -221,11 +248,15 @@ private: template void has(const Attribute & attribute, const PaddedPODArray & ids, PaddedPODArray & out) const; + template + PaddedPODArray getIdsAttrImpl(const AttrType & attr) const; template PaddedPODArray getIds(const Attribute & attribute) const; PaddedPODArray getIds() const; + template + void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; template void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; @@ -234,6 +265,7 @@ private: const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; const bool require_nonempty; + const bool sparse; std::map 
attribute_index_by_name; std::vector attributes; diff --git a/dbms/src/Functions/CMakeLists.txt b/dbms/src/Functions/CMakeLists.txt index f495d6d8665..d3d8eb34d36 100644 --- a/dbms/src/Functions/CMakeLists.txt +++ b/dbms/src/Functions/CMakeLists.txt @@ -33,7 +33,7 @@ if (OPENSSL_CRYPTO_LIBRARY) endif() target_include_directories(clickhouse_functions PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/include) -target_include_directories(clickhouse_functions SYSTEM PRIVATE ${DIVIDE_INCLUDE_DIR} ${METROHASH_INCLUDE_DIR}) +target_include_directories(clickhouse_functions SYSTEM PRIVATE ${DIVIDE_INCLUDE_DIR} ${METROHASH_INCLUDE_DIR} ${SPARSEHASH_INCLUDE_DIR}) if (CONSISTENT_HASHING_INCLUDE_DIR) target_include_directories (clickhouse_functions PRIVATE ${CONSISTENT_HASHING_INCLUDE_DIR}) diff --git a/dbms/tests/config/ints_dictionary.xml b/dbms/tests/config/ints_dictionary.xml index c70f820ce36..a406c8553c0 100644 --- a/dbms/tests/config/ints_dictionary.xml +++ b/dbms/tests/config/ints_dictionary.xml @@ -125,6 +125,69 @@ + + hashed_sparse_ints + + + localhost + 9000 + default + + test_00950 + ints
+
+ + 0 + + + + + + key + + + i8 + Int8 + 0 + + + i16 + Int16 + 0 + + + i32 + Int32 + 0 + + + i64 + Int64 + 0 + + + u8 + UInt8 + 0 + + + u16 + UInt16 + 0 + + + u32 + UInt32 + 0 + + + u64 + UInt64 + 0 + + +
+ cache_ints diff --git a/dbms/tests/queries/0_stateless/00950_dict_get.reference b/dbms/tests/queries/0_stateless/00950_dict_get.reference index fabfda8425f..3010a2989c0 100644 --- a/dbms/tests/queries/0_stateless/00950_dict_get.reference +++ b/dbms/tests/queries/0_stateless/00950_dict_get.reference @@ -4,6 +4,9 @@ dictGetOrDefault flat_ints 0 42 42 42 42 42 42 42 42 dictGet hashed_ints 1 1 1 1 1 1 1 1 1 dictGetOrDefault hashed_ints 1 1 1 1 1 1 1 1 1 dictGetOrDefault hashed_ints 0 42 42 42 42 42 42 42 42 +dictGet hashed_sparse_ints 1 1 1 1 1 1 1 1 1 +dictGetOrDefault hashed_sparse_ints 1 1 1 1 1 1 1 1 1 +dictGetOrDefault hashed_sparse_ints 0 42 42 42 42 42 42 42 42 dictGet cache_ints 1 1 1 1 1 1 1 1 1 dictGetOrDefault cache_ints 1 1 1 1 1 1 1 1 1 dictGetOrDefault cache_ints 0 42 42 42 42 42 42 42 42 diff --git a/dbms/tests/queries/0_stateless/00950_dict_get.sql b/dbms/tests/queries/0_stateless/00950_dict_get.sql index 20e5e376855..2483a21c0d3 100644 --- a/dbms/tests/queries/0_stateless/00950_dict_get.sql +++ b/dbms/tests/queries/0_stateless/00950_dict_get.sql @@ -69,6 +69,34 @@ select 'dictGetOrDefault', 'hashed_ints' as dict_name, toUInt64(0) as k, dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)), dictGetOrDefault(dict_name, 'u64', k, toUInt64(42)); +select 'dictGet', 'hashed_sparse_ints' as dict_name, toUInt64(1) as k, + dictGet(dict_name, 'i8', k), + dictGet(dict_name, 'i16', k), + dictGet(dict_name, 'i32', k), + dictGet(dict_name, 'i64', k), + dictGet(dict_name, 'u8', k), + dictGet(dict_name, 'u16', k), + dictGet(dict_name, 'u32', k), + dictGet(dict_name, 'u64', k); +select 'dictGetOrDefault', 'hashed_sparse_ints' as dict_name, toUInt64(1) as k, + dictGetOrDefault(dict_name, 'i8', k, toInt8(42)), + dictGetOrDefault(dict_name, 'i16', k, toInt16(42)), + dictGetOrDefault(dict_name, 'i32', k, toInt32(42)), + dictGetOrDefault(dict_name, 'i64', k, toInt64(42)), + dictGetOrDefault(dict_name, 'u8', k, toUInt8(42)), + dictGetOrDefault(dict_name, 'u16', k, 
toUInt16(42)), + dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)), + dictGetOrDefault(dict_name, 'u64', k, toUInt64(42)); +select 'dictGetOrDefault', 'hashed_sparse_ints' as dict_name, toUInt64(0) as k, + dictGetOrDefault(dict_name, 'i8', k, toInt8(42)), + dictGetOrDefault(dict_name, 'i16', k, toInt16(42)), + dictGetOrDefault(dict_name, 'i32', k, toInt32(42)), + dictGetOrDefault(dict_name, 'i64', k, toInt64(42)), + dictGetOrDefault(dict_name, 'u8', k, toUInt8(42)), + dictGetOrDefault(dict_name, 'u16', k, toUInt16(42)), + dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)), + dictGetOrDefault(dict_name, 'u64', k, toUInt64(42)); + select 'dictGet', 'cache_ints' as dict_name, toUInt64(1) as k, dictGet(dict_name, 'i8', k), dictGet(dict_name, 'i16', k), diff --git a/docs/en/query_language/dicts/external_dicts_dict_layout.md b/docs/en/query_language/dicts/external_dicts_dict_layout.md index c3096544d25..a9a80dbe761 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_layout.md +++ b/docs/en/query_language/dicts/external_dicts_dict_layout.md @@ -39,6 +39,7 @@ The configuration looks like this: - [flat](#flat) - [hashed](#dicts-external_dicts_dict_layout-hashed) +- [sparse_hashed](#dicts-external_dicts_dict_layout-sparse_hashed) - [cache](#cache) - [range_hashed](#range-hashed) - [complex_key_hashed](#complex-key-hashed) @@ -77,6 +78,18 @@ Configuration example: ``` +### sparse_hashed {#dicts-external_dicts_dict_layout-sparse_hashed} + +Similar to `hashed`, but uses less memory at the cost of higher CPU usage. + +Configuration example: + +```xml + + + +``` + ### complex_key_hashed