From 26ab5dd7a73df978104b4b2945571dec0439cdd1 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 28 Feb 2019 17:35:38 +0800 Subject: [PATCH] A Proper lookup table that uses HashTable's API This is the first step of allowing heterogeneous cells in hash tables. performance test results are ``` 1. HashMap>; 2. NewLookupMap ResolutionWidth 30000 1 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................223550276.46 ResolutionWidth 30000 2 
.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................248772721.24 Best: 2 - 24877272124 ResolutionWidth 100000 1 ..........................................................................................................................................................................................................................................................238498413.99 ResolutionWidth 100000 2 ..........................................................................................................................................................................................................................................................261808889.98 Best: 2 - 26180888998 ResolutionWidth 300000 1 ...................................................................................239307348.81 ResolutionWidth 300000 2 ...................................................................................257592761.30 Best: 2 - 25759276130 ResolutionWidth 1000000 1 .........................240144759.26 ResolutionWidth 1000000 2 .........................257093531.91 Best: 2 - 25709353191 ResolutionWidth 5000000 1 .....241573260.35 ResolutionWidth 5000000 2 .....259314162.79 Best: 2 - 25931416279 
ResolutionDepth 30000 1 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................217108119.84 ResolutionDepth 30000 2 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................249459504.41 Best: 2 - 24945950441 ResolutionDepth 100000 1 
..........................................................................................................................................................................................................................................................229065162.17 ResolutionDepth 100000 2 ..........................................................................................................................................................................................................................................................253769105.64 Best: 2 - 25376910564 ResolutionDepth 300000 1 ...................................................................................233079225.18 ResolutionDepth 300000 2 ...................................................................................256316273.78 Best: 2 - 25631627378 ResolutionDepth 1000000 1 .........................234184633.51 ResolutionDepth 1000000 2 .........................261100491.57 Best: 2 - 26110049157 ResolutionDepth 5000000 1 .....233118795.66 ResolutionDepth 5000000 2 .....252436160.41 Best: 2 - 25243616041 ``` --- dbms/programs/obfuscator/Obfuscator.cpp | 12 +- .../AggregateFunctionEntropy.h | 8 +- .../AggregateFunctionGroupUniqArray.h | 14 +- .../QuantileExactWeighted.h | 26 +- dbms/src/Columns/ColumnLowCardinality.cpp | 2 +- dbms/src/Columns/ReverseIndex.h | 4 +- dbms/src/Common/ColumnsHashing.h | 14 +- dbms/src/Common/ColumnsHashingImpl.h | 31 +- .../src/Common/CombinedCardinalityEstimator.h | 10 +- dbms/src/Common/HashTable/ClearableHashMap.h | 4 +- .../Common/HashTable/FixedClearableHashMap.h | 73 +++ .../Common/HashTable/FixedClearableHashSet.h | 45 ++ dbms/src/Common/HashTable/FixedHashMap.h | 72 +++ dbms/src/Common/HashTable/FixedHashSet.h | 25 ++ dbms/src/Common/HashTable/FixedHashTable.h | 420 ++++++++++++++++++ dbms/src/Common/HashTable/HashMap.h | 26 +- dbms/src/Common/HashTable/HashTable.h | 7 +- dbms/src/Common/HashTable/SmallTable.h | 12 +- 
dbms/src/Common/HashTable/TwoLevelHashMap.h | 4 +- dbms/src/Common/HashTable/TwoLevelHashTable.h | 10 +- .../HyperLogLogWithSmallSetOptimization.h | 4 +- dbms/src/Common/SpaceSaving.h | 6 +- dbms/src/Common/tests/auto_array.cpp | 4 +- dbms/src/Common/tests/hash_table.cpp | 6 +- .../src/Common/tests/parallel_aggregation.cpp | 30 +- .../Common/tests/parallel_aggregation2.cpp | 14 +- dbms/src/Common/tests/small_table.cpp | 8 +- dbms/src/Core/tests/string_pool.cpp | 4 +- dbms/src/DataTypes/DataTypeEnum.cpp | 2 +- dbms/src/DataTypes/DataTypeEnum.h | 2 +- .../ComplexKeyCacheDictionary.cpp | 2 +- .../Dictionaries/ComplexKeyCacheDictionary.h | 10 +- .../ComplexKeyHashedDictionary.cpp | 4 +- dbms/src/Dictionaries/HashedDictionary.cpp | 6 +- .../Dictionaries/RangeHashedDictionary.cpp | 12 +- .../src/Formats/JSONEachRowRowInputStream.cpp | 6 +- dbms/src/Formats/TSKVRowInputStream.cpp | 2 +- dbms/src/Functions/arrayIntersect.cpp | 8 +- dbms/src/Functions/transform.cpp | 18 +- dbms/src/Interpreters/Aggregator.cpp | 44 +- dbms/src/Interpreters/Aggregator.h | 17 +- dbms/src/Interpreters/Join.cpp | 4 +- dbms/src/Interpreters/Join.h | 5 +- dbms/src/Interpreters/SetVariants.h | 12 +- dbms/src/Interpreters/tests/CMakeLists.txt | 4 + dbms/src/Interpreters/tests/hash_map.cpp | 12 +- dbms/src/Interpreters/tests/hash_map3.cpp | 4 +- .../Interpreters/tests/hash_map_lookup.cpp | 124 ++++++ .../Interpreters/tests/hash_map_string.cpp | 16 +- .../Interpreters/tests/hash_map_string_2.cpp | 4 +- .../Interpreters/tests/hash_map_string_3.cpp | 4 +- .../tests/hash_map_string_small.cpp | 8 +- .../Interpreters/tests/two_level_hash_map.cpp | 16 +- .../MergeTree/MergeTreeDataWriter.cpp | 4 +- dbms/src/Storages/StorageJoin.cpp | 8 +- utils/test-data-generator/MarkovModel.h | 12 +- 56 files changed, 1020 insertions(+), 245 deletions(-) create mode 100644 dbms/src/Common/HashTable/FixedClearableHashMap.h create mode 100644 dbms/src/Common/HashTable/FixedClearableHashSet.h create mode 100644 
dbms/src/Common/HashTable/FixedHashMap.h create mode 100644 dbms/src/Common/HashTable/FixedHashSet.h create mode 100644 dbms/src/Common/HashTable/FixedHashTable.h create mode 100644 dbms/src/Interpreters/tests/hash_map_lookup.cpp diff --git a/dbms/programs/obfuscator/Obfuscator.cpp b/dbms/programs/obfuscator/Obfuscator.cpp index a6fd5bdaff8..943c1ef2f90 100644 --- a/dbms/programs/obfuscator/Obfuscator.cpp +++ b/dbms/programs/obfuscator/Obfuscator.cpp @@ -577,7 +577,7 @@ public: { for (auto & elem : table) { - Histogram & histogram = elem.second; + Histogram & histogram = elem.getSecond(); if (histogram.buckets.size() < params.num_buckets_cutoff) { @@ -591,7 +591,7 @@ public: { for (auto & elem : table) { - Histogram & histogram = elem.second; + Histogram & histogram = elem.getSecond(); if (!histogram.total) continue; @@ -623,7 +623,7 @@ public: { for (auto & elem : table) { - Histogram & histogram = elem.second; + Histogram & histogram = elem.getSecond(); if (!histogram.total) continue; @@ -639,7 +639,7 @@ public: { for (auto & elem : table) { - Histogram & histogram = elem.second; + Histogram & histogram = elem.getSecond(); if (!histogram.total) continue; @@ -674,7 +674,7 @@ public: while (true) { it = table.find(hashContext(code_points.data() + code_points.size() - context_size, code_points.data() + code_points.size())); - if (table.end() != it && it->second.total + it->second.count_end != 0) + if (table.end() != it && it->getSecond().total + it->getSecond().count_end != 0) break; if (context_size == 0) @@ -708,7 +708,7 @@ public: if (num_bytes_after_desired_size > 0) end_probability_multiplier = std::pow(1.25, num_bytes_after_desired_size); - CodePoint code = it->second.sample(determinator, end_probability_multiplier); + CodePoint code = it->getSecond().sample(determinator, end_probability_multiplier); if (code == END) break; diff --git a/dbms/src/AggregateFunctions/AggregateFunctionEntropy.h b/dbms/src/AggregateFunctions/AggregateFunctionEntropy.h index 
3041f1781aa..720dd12d2da 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionEntropy.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionEntropy.h @@ -54,7 +54,7 @@ struct EntropyData void merge(const EntropyData & rhs) { for (const auto & pair : rhs.map) - map[pair.first] += pair.second; + map[pair.getFirst()] += pair.getSecond(); } void serialize(WriteBuffer & buf) const @@ -68,7 +68,7 @@ struct EntropyData while (reader.next()) { const auto & pair = reader.get(); - map[pair.first] = pair.second; + map[pair.getFirst()] = pair.getSecond(); } } @@ -76,12 +76,12 @@ struct EntropyData { UInt64 total_value = 0; for (const auto & pair : map) - total_value += pair.second; + total_value += pair.getSecond(); Float64 shannon_entropy = 0; for (const auto & pair : map) { - Float64 frequency = Float64(pair.second) / total_value; + Float64 frequency = Float64(pair.getSecond()) / total_value; shannon_entropy -= frequency * log2(frequency); } diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h index f2ae9e77438..e1c730a4e49 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h @@ -94,7 +94,7 @@ public: size_t i = 0; for (auto it = set.begin(); it != set.end(); ++it, ++i) - data_to[old_size + i] = *it; + data_to[old_size + i] = it->getValue(); } const char * getHeaderFilePath() const override { return __FILE__; } @@ -150,7 +150,7 @@ public: for (const auto & elem : set) { - writeStringBinary(elem, buf); + writeStringBinary(elem.getValue(), buf); } } @@ -185,7 +185,7 @@ public: else { if (inserted) - it->data = arena->insert(str_serialized.data, str_serialized.size); + it->getValueMutable().data = arena->insert(str_serialized.data, str_serialized.size); } } @@ -198,9 +198,9 @@ public: State::Set::iterator it; for (auto & rhs_elem : rhs_set) { - cur_set.emplace(rhs_elem, it, inserted); - if (inserted 
&& it->size) - it->data = arena->insert(it->data, it->size); + cur_set.emplace(rhs_elem.getValue(), it, inserted); + if (inserted && it->getValue().size) + it->getValueMutable().data = arena->insert(it->getValue().data, it->getValue().size); } } @@ -215,7 +215,7 @@ public: for (auto & elem : set) { - deserializeAndInsert(elem, data_to); + deserializeAndInsert(elem.getValue(), data_to); } } diff --git a/dbms/src/AggregateFunctions/QuantileExactWeighted.h b/dbms/src/AggregateFunctions/QuantileExactWeighted.h index d62646b5974..80986c258d1 100644 --- a/dbms/src/AggregateFunctions/QuantileExactWeighted.h +++ b/dbms/src/AggregateFunctions/QuantileExactWeighted.h @@ -48,7 +48,7 @@ struct QuantileExactWeighted void merge(const QuantileExactWeighted & rhs) { for (const auto & pair : rhs.map) - map[pair.first] += pair.second; + map[pair.getFirst()] += pair.getSecond(); } void serialize(WriteBuffer & buf) const @@ -62,7 +62,7 @@ struct QuantileExactWeighted while (reader.next()) { const auto & pair = reader.get(); - map[pair.first] = pair.second; + map[pair.getFirst()] = pair.getSecond(); } } @@ -83,12 +83,12 @@ struct QuantileExactWeighted UInt64 sum_weight = 0; for (const auto & pair : map) { - sum_weight += pair.second; - array[i] = pair; + sum_weight += pair.getSecond(); + array[i] = pair.getValue(); ++i; } - std::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; }); + std::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.getFirst() < b.getFirst(); }); UInt64 threshold = std::ceil(sum_weight * level); UInt64 accumulated = 0; @@ -97,7 +97,7 @@ struct QuantileExactWeighted const Pair * end = array + size; while (it < end) { - accumulated += it->second; + accumulated += it->getSecond(); if (accumulated >= threshold) break; @@ -108,7 +108,7 @@ struct QuantileExactWeighted if (it == end) --it; - return it->first; + return it->getFirst(); } /// Get the `size` values of `levels` quantiles. 
Write `size` results starting with `result` address. @@ -133,12 +133,12 @@ struct QuantileExactWeighted UInt64 sum_weight = 0; for (const auto & pair : map) { - sum_weight += pair.second; - array[i] = pair; + sum_weight += pair.getSecond(); + array[i] = pair.getValue(); ++i; } - std::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; }); + std::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.getFirst() < b.getFirst(); }); UInt64 accumulated = 0; @@ -150,11 +150,11 @@ struct QuantileExactWeighted while (it < end) { - accumulated += it->second; + accumulated += it->getSecond(); while (accumulated >= threshold) { - result[indices[level_index]] = it->first; + result[indices[level_index]] = it->getFirst(); ++level_index; if (level_index == num_levels) @@ -168,7 +168,7 @@ struct QuantileExactWeighted while (level_index < num_levels) { - result[indices[level_index]] = array[size - 1].first; + result[indices[level_index]] = array[size - 1].getFirst(); ++level_index; } } diff --git a/dbms/src/Columns/ColumnLowCardinality.cpp b/dbms/src/Columns/ColumnLowCardinality.cpp index b7cc67de71a..c919116112c 100644 --- a/dbms/src/Columns/ColumnLowCardinality.cpp +++ b/dbms/src/Columns/ColumnLowCardinality.cpp @@ -33,7 +33,7 @@ namespace data.resize(hash_map.size()); for (auto val : hash_map) - data[val.second] = val.first; + data[val.getSecond()] = val.getFirst(); for (auto & ind : index) ind = hash_map[ind]; diff --git a/dbms/src/Columns/ReverseIndex.h b/dbms/src/Columns/ReverseIndex.h index 6ae780fdd99..ef08f3b2da5 100644 --- a/dbms/src/Columns/ReverseIndex.h +++ b/dbms/src/Columns/ReverseIndex.h @@ -414,7 +414,7 @@ UInt64 ReverseIndex::insert(const StringRef & data) column->popBack(1); } - return *iterator; + return iterator->getValue(); } template @@ -429,7 +429,7 @@ UInt64 ReverseIndex::getInsertionPoint(const StringRef & auto hash = getHash(data); iterator = index->find(data, hash); - return iterator == index->end() 
? size() + base_index : *iterator; + return iterator == index->end() ? size() + base_index : iterator->getValue(); } } diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index 436ed43660e..e09692bc330 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -56,7 +56,7 @@ struct HashMethodOneNumber /// Get StringRef from value which can be inserted into column. static StringRef getValueRef(const Value & value) { - return StringRef(reinterpret_cast(&value.first), sizeof(value.first)); + return StringRef(reinterpret_cast(&value.getFirst()), sizeof(value.getFirst())); } }; @@ -85,7 +85,7 @@ struct HashMethodString return StringRef(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1); } - static StringRef getValueRef(const Value & value) { return StringRef(value.first.data, value.first.size); } + static StringRef getValueRef(const Value & value) { return StringRef(value.getFirst().data, value.getFirst().size); } protected: friend class columns_hashing_impl::HashMethodBase; @@ -122,7 +122,7 @@ struct HashMethodFixedString StringRef getKey(size_t row, Arena &) const { return StringRef(&(*chars)[row * n], n); } - static StringRef getValueRef(const Value & value) { return StringRef(value.first.data, value.first.size); } + static StringRef getValueRef(const Value & value) { return StringRef(value.getFirst().data, value.getFirst().size); } protected: friend class columns_hashing_impl::HashMethodBase; @@ -356,8 +356,8 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod { if constexpr (has_mapped) { - new(&it->second) Mapped(); - Base::onNewKey(it->first, pool); + new(&it->getSecond()) Mapped(); + Base::onNewKey(it->getFirstMutable(), pool); } else Base::onNewKey(*it, pool); @@ -365,8 +365,8 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod if constexpr (has_mapped) { - mapped_cache[row] = it->second; - return EmplaceResult(it->second, mapped_cache[row], 
inserted); + mapped_cache[row] = it->getSecond(); + return EmplaceResult(it->getSecond(), mapped_cache[row], inserted); } else return EmplaceResult(inserted); diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index e853ff66e64..1f826640899 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -39,7 +39,7 @@ struct LastElementCache bool check(const Value & value_) { return !empty && value == value_; } template - bool check(const Key & key) { return !empty && value.first == key; } + bool check(const Key & key) { return !empty && value.getFirst() == key; } }; template @@ -147,9 +147,8 @@ protected: if constexpr (has_mapped) { /// Init PairNoInit elements. - cache.value.second = Mapped(); - using Key = decltype(cache.value.first); - cache.value.first = Key(); + cache.value.getSecond() = Mapped(); + cache.value.getFirstMutable() = {}; } else cache.value = Value(); @@ -171,7 +170,7 @@ protected: static_cast(*this).onExistingKey(key, pool); if constexpr (has_mapped) - return EmplaceResult(cache.value.second, cache.value.second, false); + return EmplaceResult(cache.value.getSecond(), cache.value.getSecond(), false); else return EmplaceResult(false); } @@ -183,33 +182,33 @@ protected: [[maybe_unused]] Mapped * cached = nullptr; if constexpr (has_mapped) - cached = &it->second; + cached = &it->getSecond(); if (inserted) { if constexpr (has_mapped) { - new(&it->second) Mapped(); - static_cast(*this).onNewKey(it->first, pool); + new(&it->getSecond()) Mapped(); + static_cast(*this).onNewKey(it->getFirstMutable(), pool); } else - static_cast(*this).onNewKey(*it, pool); + static_cast(*this).onNewKey(it->getValueMutable(), pool); } else static_cast(*this).onExistingKey(key, pool); if constexpr (consecutive_keys_optimization) { - cache.value = *it; + cache.value = it->getValue(); cache.found = true; cache.empty = false; if constexpr (has_mapped) - cached = &cache.value.second; + cached = 
&cache.value.getSecond(); } if constexpr (has_mapped) - return EmplaceResult(it->second, *cached, inserted); + return EmplaceResult(it->getSecond(), *cached, inserted); else return EmplaceResult(inserted); } @@ -222,7 +221,7 @@ protected: if (cache.check(key)) { if constexpr (has_mapped) - return FindResult(&cache.value.second, cache.found); + return FindResult(&cache.value.getSecond(), cache.found); else return FindResult(cache.found); } @@ -237,18 +236,18 @@ protected: cache.empty = false; if (found) - cache.value = *it; + cache.value = it->getValue(); else { if constexpr (has_mapped) - cache.value.first = key; + cache.value.getFirstMutable() = key; else cache.value = key; } } if constexpr (has_mapped) - return FindResult(found ? &it->second : nullptr, found); + return FindResult(found ? &it->getSecond() : nullptr, found); else return FindResult(found); } diff --git a/dbms/src/Common/CombinedCardinalityEstimator.h b/dbms/src/Common/CombinedCardinalityEstimator.h index 17e9bf4a454..824f0a8c018 100644 --- a/dbms/src/Common/CombinedCardinalityEstimator.h +++ b/dbms/src/Common/CombinedCardinalityEstimator.h @@ -137,12 +137,12 @@ public: if (rhs.getContainerType() == details::ContainerType::SMALL) { for (const auto & x : rhs.small) - insert(x); + insert(x.getValue()); } else if (rhs.getContainerType() == details::ContainerType::MEDIUM) { for (const auto & x : rhs.getContainer()) - insert(x); + insert(x.getValue()); } else if (rhs.getContainerType() == details::ContainerType::LARGE) getContainer().merge(rhs.getContainer()); @@ -234,7 +234,7 @@ private: auto tmp_medium = std::make_unique(); for (const auto & x : small) - tmp_medium->insert(x); + tmp_medium->insert(x.getValue()); medium = tmp_medium.release(); setContainerType(details::ContainerType::MEDIUM); @@ -253,12 +253,12 @@ private: if (container_type == details::ContainerType::SMALL) { for (const auto & x : small) - tmp_large->insert(x); + tmp_large->insert(x.getValue()); } else if (container_type == 
details::ContainerType::MEDIUM) { for (const auto & x : getContainer()) - tmp_large->insert(x); + tmp_large->insert(x.getValue()); destroy(); } diff --git a/dbms/src/Common/HashTable/ClearableHashMap.h b/dbms/src/Common/HashTable/ClearableHashMap.h index b68ed180927..c7084e56e4c 100644 --- a/dbms/src/Common/HashTable/ClearableHashMap.h +++ b/dbms/src/Common/HashTable/ClearableHashMap.h @@ -37,9 +37,9 @@ public: this->emplace(x, it, inserted); if (inserted) - new(&it->second) mapped_type(); + new(&it->getSecond()) mapped_type(); - return it->second; + return it->getSecond(); } void clear() diff --git a/dbms/src/Common/HashTable/FixedClearableHashMap.h b/dbms/src/Common/HashTable/FixedClearableHashMap.h new file mode 100644 index 00000000000..7f8f739893b --- /dev/null +++ b/dbms/src/Common/HashTable/FixedClearableHashMap.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include + + +template +struct FixedClearableHashMapCell +{ + using Mapped = TMapped; + using State = ClearableHashSetState; + + using value_type = PairNoInit; + UInt32 version; + Mapped mapped; + + FixedClearableHashMapCell() {} + FixedClearableHashMapCell(const Key &, const State & state) : version(state.version) {} + FixedClearableHashMapCell(const value_type & value_, const State & state) : version(state.version), mapped(value_.second) {} + + Mapped & getSecond() { return mapped; } + const Mapped & getSecond() const { return mapped; } + bool isZero(const State & state) const { return version != state.version; } + void setZero() { version = 0; } + static constexpr bool need_zero_value_storage = false; + void setMapped(const value_type & value) { mapped = value.getSecond(); } + + struct CellExt + { + CellExt() {} + CellExt(Key && key_, FixedClearableHashMapCell * ptr_) : key(key_), ptr(ptr_) {} + void update(Key && key_, FixedClearableHashMapCell * ptr_) + { + key = key_; + ptr = ptr_; + } + Key key; + FixedClearableHashMapCell * ptr; + Key & getFirstMutable() { return key; } + const Key & getFirst() 
const { return key; } + Mapped & getSecond() { return ptr->mapped; } + const Mapped & getSecond() const { return *ptr->mapped; } + const value_type getValue() const { return {key, *ptr->mapped}; } + }; +}; + + +template +class FixedClearableHashMap : public FixedHashMap, Allocator> +{ +public: + using key_type = Key; + using mapped_type = Mapped; + using value_type = typename FixedClearableHashMap::cell_type::value_type; + + mapped_type & operator[](Key x) + { + typename FixedClearableHashMap::iterator it; + bool inserted; + this->emplace(x, it, inserted); + + if (inserted) + new (&it->second) mapped_type(); + + return it->second; + } + + void clear() + { + ++this->version; + this->m_size = 0; + } +}; diff --git a/dbms/src/Common/HashTable/FixedClearableHashSet.h b/dbms/src/Common/HashTable/FixedClearableHashSet.h new file mode 100644 index 00000000000..915875ebda4 --- /dev/null +++ b/dbms/src/Common/HashTable/FixedClearableHashSet.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include + + +template +struct FixedClearableHashTableCell +{ + using State = ClearableHashSetState; + + using value_type = Key; + UInt32 version; + + FixedClearableHashTableCell() {} + FixedClearableHashTableCell(const Key &, const State & state) : version(state.version) {} + + bool isZero(const State & state) const { return version != state.version; } + void setZero() { version = 0; } + static constexpr bool need_zero_value_storage = false; + void setMapped(const value_type & /*value*/) {} + + struct CellExt + { + Key key; + value_type & getValueMutable() { return key; } + const value_type & getValue() const { return key; } + void update(Key && key_, FixedClearableHashTableCell *) { key = key_; } + }; +}; + + +template +class FixedClearableHashSet : public FixedHashTable, Allocator> +{ +public: + using key_type = Key; + using value_type = typename FixedClearableHashSet::cell_type::value_type; + + void clear() + { + ++this->version; + this->m_size = 0; + } +}; diff --git 
a/dbms/src/Common/HashTable/FixedHashMap.h b/dbms/src/Common/HashTable/FixedHashMap.h new file mode 100644 index 00000000000..ae076ddb877 --- /dev/null +++ b/dbms/src/Common/HashTable/FixedHashMap.h @@ -0,0 +1,72 @@ +#pragma once + +#include +#include + + +template +struct FixedHashMapCell +{ + using Mapped = TMapped; + using State = TState; + + using value_type = PairNoInit; + bool full; + Mapped mapped; + + FixedHashMapCell() {} + FixedHashMapCell(const Key &, const State &) : full(true) {} + FixedHashMapCell(const value_type & value_, const State &) : full(true), mapped(value_.second) {} + + Mapped & getSecond() { return mapped; } + const Mapped & getSecond() const { return mapped; } + bool isZero(const State &) const { return !full; } + void setZero() { full = false; } + static constexpr bool need_zero_value_storage = false; + void setMapped(const value_type & value) { mapped = value.getSecond(); } + + /// Similar to FixedHashSetCell except that we need to contain a pointer to the Mapped field. + /// Note that we have to assemble a continuous layout for the value_type on each call of getValue(). 
+ struct CellExt + { + CellExt() {} + CellExt(Key && key_, const FixedHashMapCell * ptr_) : key(key_), ptr(const_cast(ptr_)) {} + void update(Key && key_, const FixedHashMapCell * ptr_) + { + key = key_; + ptr = const_cast(ptr_); + } + Key key; + FixedHashMapCell * ptr; + + Key & getFirstMutable() { return key; } + const Key & getFirst() const { return key; } + Mapped & getSecond() { return ptr->mapped; } + const Mapped & getSecond() const { return ptr->mapped; } + const value_type getValue() const { return {key, ptr->mapped}; } + }; +}; + + +template +class FixedHashMap : public FixedHashTable, Allocator> +{ +public: + using Base = FixedHashTable, Allocator>; + using key_type = Key; + using mapped_type = Mapped; + using value_type = typename Base::cell_type::value_type; + + using Base::Base; + + mapped_type & ALWAYS_INLINE operator[](Key x) + { + typename Base::iterator it; + bool inserted; + this->emplace(x, it, inserted); + if (inserted) + new (&it->getSecond()) mapped_type(); + + return it->getSecond(); + } +}; diff --git a/dbms/src/Common/HashTable/FixedHashSet.h b/dbms/src/Common/HashTable/FixedHashSet.h new file mode 100644 index 00000000000..14e92b5c5fd --- /dev/null +++ b/dbms/src/Common/HashTable/FixedHashSet.h @@ -0,0 +1,25 @@ +#pragma once + +#include + +template +class FixedHashSet : public FixedHashTable, Allocator> +{ +public: + using Base = FixedHashTable, Allocator>; + using Self = FixedHashSet; + + void merge(const Self & rhs) + { + for (size_t i = 0; i < Base::BUFFER_SIZE; ++i) + if (Base::buf[i].isZero(*this) && !rhs.buf[i].isZero(*this)) + Base::buf[i] = rhs.buf[i]; + } + + /// NOTE: Currently this method isn't used. When it does, the ReadBuffer should + /// contain the Key explicitly. 
+ // void readAndMerge(DB::ReadBuffer & rb) + // { + + // } +}; diff --git a/dbms/src/Common/HashTable/FixedHashTable.h b/dbms/src/Common/HashTable/FixedHashTable.h new file mode 100644 index 00000000000..663848865da --- /dev/null +++ b/dbms/src/Common/HashTable/FixedHashTable.h @@ -0,0 +1,420 @@ +#pragma once + +#include + +template +struct FixedHashTableCell +{ + using State = TState; + + using value_type = Key; + bool full; + + FixedHashTableCell() {} + FixedHashTableCell(const Key &, const State &) : full(true) {} + + bool isZero(const State &) const { return !full; } + void setZero() { full = false; } + static constexpr bool need_zero_value_storage = false; + void setMapped(const value_type & /*value*/) {} + + /// This Cell is only stored inside an iterator. It's used to accomodate the fact + /// that the iterator based API always provide a reference to a continuous memory + /// containing the Key. As a result, we have to instantiate a real Key field. + /// All methods that return a mutable reference to the Key field are named with + /// -Mutable suffix, indicating this is uncommon usage. As this is only for lookup + /// tables, it's totally fine to discard the Key mutations. + struct CellExt + { + Key key; + + value_type & getValueMutable() { return key; } + const value_type & getValue() const { return key; } + void update(Key && key_, FixedHashTableCell *) { key = key_; } + }; +}; + + +/** Used as a lookup table for small keys such as UInt8, UInt16. It's different + * than a HashTable in that keys are not stored in the Cell buf, but inferred + * inside each iterator. There are a bunch of to make it faster than using + * HashTable: a) It doesn't have a conflict chain; b) There is no key + * comparision; c) The number of cycles for checking cell empty is halved; d) + * Memory layout is tighter, especially the Clearable variants. + * + * NOTE: For Set variants this should always be better. 
For Map variants + * however, as we need to assemble the real cell inside each iterator, there + * might be some cases we fall short. + * + * TODO: Deprecate the cell API so that end users don't rely on the structure + * of cell. Instead iterator should be used for operations such as cell + * transfer, key updates (f.g. StringRef) and serde. This will allow + * TwoLevelHashSet(Map) to contain different type of sets(maps). + */ +template +class FixedHashTable : private boost::noncopyable, protected Allocator, protected Cell::State +{ + static constexpr size_t BUFFER_SIZE = 1ULL << (sizeof(Key) * 8); + +protected: + friend class const_iterator; + friend class iterator; + friend class Reader; + + using Self = FixedHashTable; + using cell_type = Cell; + + size_t m_size = 0; /// Amount of elements + Cell * buf; /// A piece of memory for all elements except the element with zero key. + + void alloc() { buf = reinterpret_cast(Allocator::alloc(BUFFER_SIZE * sizeof(Cell))); } + + void free() + { + if (buf) + { + Allocator::free(buf, getBufferSizeInBytes()); + buf = nullptr; + } + } + + void destroyElements() + { + if (!std::is_trivially_destructible_v) + for (iterator it = begin(), it_end = end(); it != it_end; ++it) + it.ptr->~Cell(); + } + + + template + class iterator_base + { + using Container = std::conditional_t; + using cell_type = std::conditional_t; + + Container * container; + cell_type * ptr; + + friend class FixedHashTable; + + public: + iterator_base() {} + iterator_base(Container * container_, cell_type * ptr_) : container(container_), ptr(ptr_) + { + cell.update(ptr - container->buf, ptr); + } + + bool operator==(const iterator_base & rhs) const { return ptr == rhs.ptr; } + bool operator!=(const iterator_base & rhs) const { return ptr != rhs.ptr; } + + Derived & operator++() + { + ++ptr; + + /// Skip empty cells in the main buffer. 
+ auto buf_end = container->buf + container->BUFFER_SIZE; + while (ptr < buf_end && ptr->isZero(*container)) + ++ptr; + + return static_cast(*this); + } + + auto & operator*() + { + if (cell.key != ptr - container->buf) + cell.update(ptr - container->buf, ptr); + return cell; + } + auto * operator-> () + { + if (cell.key != ptr - container->buf) + cell.update(ptr - container->buf, ptr); + return &cell; + } + + auto getPtr() const { return ptr; } + size_t getHash() const { return ptr - container->buf; } + size_t getCollisionChainLength() const { return 0; } + typename cell_type::CellExt cell; + }; + + +public: + using key_type = Key; + using value_type = typename Cell::value_type; + + size_t hash(const Key & x) const { return x; } + + FixedHashTable() { alloc(); } + + FixedHashTable(FixedHashTable && rhs) : buf(nullptr) { *this = std::move(rhs); } + + ~FixedHashTable() + { + destroyElements(); + free(); + } + + FixedHashTable & operator=(FixedHashTable && rhs) + { + destroyElements(); + free(); + + std::swap(buf, rhs.buf); + std::swap(m_size, rhs.m_size); + + Allocator::operator=(std::move(rhs)); + Cell::State::operator=(std::move(rhs)); + + return *this; + } + + class Reader final : private Cell::State + { + public: + Reader(DB::ReadBuffer & in_) : in(in_) {} + + Reader(const Reader &) = delete; + Reader & operator=(const Reader &) = delete; + + bool next() + { + if (!is_initialized) + { + Cell::State::read(in); + DB::readVarUInt(size, in); + is_initialized = true; + } + + if (read_count == size) + { + is_eof = true; + return false; + } + + cell.read(in); + ++read_count; + + return true; + } + + inline const value_type & get() const + { + if (!is_initialized || is_eof) + throw DB::Exception("No available data", DB::ErrorCodes::NO_AVAILABLE_DATA); + + return cell.getValue(); + } + + private: + DB::ReadBuffer & in; + Cell cell; + size_t read_count = 0; + size_t size; + bool is_eof = false; + bool is_initialized = false; + }; + + + class iterator : public 
iterator_base + { + public: + using iterator_base::iterator_base; + }; + + class const_iterator : public iterator_base + { + public: + using iterator_base::iterator_base; + }; + + + const_iterator begin() const + { + if (!buf) + return end(); + + const Cell * ptr = buf; + auto buf_end = buf + BUFFER_SIZE; + while (ptr < buf_end && ptr->isZero(*this)) + ++ptr; + + return const_iterator(this, ptr); + } + + const_iterator cbegin() const { return begin(); } + + iterator begin() + { + if (!buf) + return end(); + + Cell * ptr = buf; + auto buf_end = buf + BUFFER_SIZE; + while (ptr < buf_end && ptr->isZero(*this)) + ++ptr; + + return iterator(this, ptr); + } + + const_iterator end() const { return const_iterator(this, buf + BUFFER_SIZE); } + const_iterator cend() const { return end(); } + iterator end() { return iterator(this, buf + BUFFER_SIZE); } + + +protected: + void ALWAYS_INLINE emplaceImpl(Key x, iterator & it, bool & inserted) + { + it = iterator(this, &buf[x]); + + if (!buf[x].isZero(*this)) + { + inserted = false; + return; + } + + new (&buf[x]) Cell(x, *this); + inserted = true; + ++m_size; + } + + +public: + std::pair ALWAYS_INLINE insert(const value_type & x) + { + std::pair res; + emplaceImpl(Cell::getKey(x), res.first, res.second); + if (res.second) + res.first.ptr->setMapped(x); + + return res; + } + + + void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted) { emplaceImpl(x, it, inserted); } + void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t) { emplaceImpl(x, it, inserted); } + + template + iterator ALWAYS_INLINE find(ObjectToCompareWith x) + { + return !buf[x].isZero(*this) ? iterator(this, &buf[x]) : end(); + } + + template + const_iterator ALWAYS_INLINE find(ObjectToCompareWith x) const + { + return !buf[x].isZero(*this) ? const_iterator(this, &buf[x]) : end(); + } + + template + iterator ALWAYS_INLINE find(ObjectToCompareWith, size_t hash_value) + { + return !buf[hash_value].isZero(*this) ? 
iterator(this, &buf[hash_value]) : end(); + } + + template + const_iterator ALWAYS_INLINE find(ObjectToCompareWith, size_t hash_value) const + { + return !buf[hash_value].isZero(*this) ? const_iterator(this, &buf[hash_value]) : end(); + } + + bool ALWAYS_INLINE has(Key x) const { return !buf[x].isZero(*this); } + bool ALWAYS_INLINE has(Key, size_t hash_value) const { return !buf[hash_value].isZero(*this); } + + void write(DB::WriteBuffer & wb) const + { + Cell::State::write(wb); + DB::writeVarUInt(m_size, wb); + + for (auto ptr = buf, buf_end = buf + BUFFER_SIZE; ptr < buf_end; ++ptr) + if (!ptr->isZero(*this)) + { + DB::writeVarUInt(ptr - buf, wb); + ptr->write(wb); + } + } + + void writeText(DB::WriteBuffer & wb) const + { + Cell::State::writeText(wb); + DB::writeText(m_size, wb); + + for (auto ptr = buf, buf_end = buf + BUFFER_SIZE; ptr < buf_end; ++ptr) + { + if (!ptr->isZero(*this)) + { + DB::writeChar(',', wb); + DB::writeText(ptr - buf, wb); + DB::writeChar(',', wb); + ptr->writeText(wb); + } + } + } + + void read(DB::ReadBuffer & rb) + { + Cell::State::read(rb); + destroyElements(); + DB::readVarUInt(m_size, rb); + free(); + alloc(); + + for (size_t i = 0; i < m_size; ++i) + { + size_t place_value = 0; + DB::readVarUInt(place_value, rb); + Cell x; + x.read(rb); + new (&buf[place_value]) Cell(x, *this); + } + } + + void readText(DB::ReadBuffer & rb) + { + Cell::State::readText(rb); + destroyElements(); + DB::readText(m_size, rb); + free(); + alloc(); + + for (size_t i = 0; i < m_size; ++i) + { + size_t place_value = 0; + DB::assertChar(',', rb); + DB::readText(place_value, rb); + Cell x; + DB::assertChar(',', rb); + x.readText(rb); + new (&buf[place_value]) Cell(x, *this); + } + } + + size_t size() const { return m_size; } + + bool empty() const { return 0 == m_size; } + + void clear() + { + destroyElements(); + m_size = 0; + + memset(static_cast(buf), 0, BUFFER_SIZE * sizeof(*buf)); + } + + /// After executing this function, the table can only be destroyed, + 
/// and also you can use the methods `size`, `empty`, `begin`, `end`. + void clearAndShrink() + { + destroyElements(); + m_size = 0; + free(); + } + + size_t getBufferSizeInBytes() const { return BUFFER_SIZE * sizeof(Cell); } + + size_t getBufferSizeInCells() const { return BUFFER_SIZE; } + +#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS + size_t getCollisions() const { return 0; } +#endif +}; diff --git a/dbms/src/Common/HashTable/HashMap.h b/dbms/src/Common/HashTable/HashMap.h index c799444622d..445b2a9887c 100644 --- a/dbms/src/Common/HashTable/HashMap.h +++ b/dbms/src/Common/HashTable/HashMap.h @@ -15,11 +15,14 @@ struct NoInitTag {}; /// A pair that does not initialize the elements, if not needed. template -struct PairNoInit +class PairNoInit { First first; Second second; + template + friend class HashMapCell; +public: PairNoInit() {} template @@ -29,6 +32,11 @@ struct PairNoInit template PairNoInit(First_ && first_, Second_ && second_) : first(std::forward(first_)), second(std::forward(second_)) {} + + First & getFirstMutable() { return first; } + const First & getFirst() const { return first; } + Second & getSecond() { return second; } + const Second & getSecond() const { return second; } }; @@ -45,10 +53,14 @@ struct HashMapCell HashMapCell(const Key & key_, const State &) : value(key_, NoInitTag()) {} HashMapCell(const value_type & value_, const State &) : value(value_) {} - value_type & getValue() { return value; } + Key & getFirstMutable() { return value.first; } + const Key & getFirst() const { return value.first; } + Mapped & getSecond() { return value.second; } + const Mapped & getSecond() const { return value.second; } + + value_type & getValueMutable() { return value; } const value_type & getValue() const { return value; } - static Key & getKey(value_type & value) { return value.first; } static const Key & getKey(const value_type & value) { return value.first; } bool keyEquals(const Key & key_) const { return value.first == key_; } @@ -111,8 +123,8 @@ struct 
HashMapCellWithSavedHash : public HashMapCell using Base::Base; - bool keyEquals(const Key & key_) const { return this->value.first == key_; } - bool keyEquals(const Key & key_, size_t hash_) const { return saved_hash == hash_ && this->value.first == key_; } + bool keyEquals(const Key & key_) const { return this->value.getFirst() == key_; } + bool keyEquals(const Key & key_, size_t hash_) const { return saved_hash == hash_ && this->value.getFirst() == key_; } bool keyEquals(const Key & key_, size_t hash_, const typename Base::State &) const { return keyEquals(key_, hash_); } void setHash(size_t hash_value) { saved_hash = hash_value; } @@ -158,9 +170,9 @@ public: * the compiler can not guess about this, and generates the `load`, `increment`, `store` code. */ if (inserted) - new(&it->second) mapped_type(); + new(&it->getSecond()) mapped_type(); - return it->second; + return it->getSecond(); } }; diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index 2586a7f7750..795c05a2a3e 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -98,11 +98,10 @@ struct HashTableCell /// HashTableCell(const value_type & value_, const State & state) : key(value_) {} /// Get what the value_type of the container will be. - value_type & getValue() { return key; } + value_type & getValueMutable() { return key; } const value_type & getValue() const { return key; } /// Get the key. - static Key & getKey(value_type & value) { return value; } static const Key & getKey(const value_type & value) { return value; } /// Are the keys at the cells equal? 
@@ -459,8 +458,8 @@ protected: return static_cast(*this); } - auto & operator* () const { return ptr->getValue(); } - auto * operator->() const { return &ptr->getValue(); } + auto & operator* () const { return *ptr; } + auto * operator->() const { return ptr; } auto getPtr() const { return ptr; } size_t getHash() const { return ptr->getHash(*container); } diff --git a/dbms/src/Common/HashTable/SmallTable.h b/dbms/src/Common/HashTable/SmallTable.h index 27dc8c00332..8f02c29c31e 100644 --- a/dbms/src/Common/HashTable/SmallTable.h +++ b/dbms/src/Common/HashTable/SmallTable.h @@ -148,8 +148,8 @@ public: return *this; } - value_type & operator* () const { return ptr->getValue(); } - value_type * operator->() const { return &ptr->getValue(); } + Cell & operator* () const { return *ptr; } + Cell * operator->() const { return ptr; } Cell * getPtr() const { return ptr; } }; @@ -176,8 +176,8 @@ public: return *this; } - const value_type & operator* () const { return ptr->getValue(); } - const value_type * operator->() const { return &ptr->getValue(); } + const Cell & operator* () const { return *ptr; } + const Cell * operator->() const { return ptr; } const Cell * getPtr() const { return ptr; } }; @@ -399,8 +399,8 @@ public: typename SmallMapTable::iterator it; bool inserted; this->emplace(x, it, inserted); - new(&it->second) mapped_type(); - return it->second; + new(&it->getSecond()) mapped_type(); + return it->getSecond(); } }; diff --git a/dbms/src/Common/HashTable/TwoLevelHashMap.h b/dbms/src/Common/HashTable/TwoLevelHashMap.h index 617774a0aa7..a5e2467b131 100644 --- a/dbms/src/Common/HashTable/TwoLevelHashMap.h +++ b/dbms/src/Common/HashTable/TwoLevelHashMap.h @@ -29,9 +29,9 @@ public: this->emplace(x, it, inserted); if (inserted) - new(&it->second) mapped_type(); + new(&it->getSecond()) mapped_type(); - return it->second; + return it->getSecond(); } }; diff --git a/dbms/src/Common/HashTable/TwoLevelHashTable.h b/dbms/src/Common/HashTable/TwoLevelHashTable.h index 
df565d1a68c..c2f85d10b5d 100644 --- a/dbms/src/Common/HashTable/TwoLevelHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelHashTable.h @@ -98,7 +98,7 @@ public: /// It is assumed that the zero key (stored separately) is first in iteration order. if (it != src.end() && it.getPtr()->isZero(src)) { - insert(*it); + insert(it->getValue()); ++it; } @@ -141,8 +141,8 @@ public: return *this; } - value_type & operator* () const { return *current_it; } - value_type * operator->() const { return &*current_it; } + Cell & operator* () const { return *current_it; } + Cell * operator->() const { return current_it.getPtr(); } Cell * getPtr() const { return current_it.getPtr(); } size_t getHash() const { return current_it.getHash(); } @@ -179,8 +179,8 @@ public: return *this; } - const value_type & operator* () const { return *current_it; } - const value_type * operator->() const { return &*current_it; } + const Cell & operator* () const { return *current_it; } + const Cell * operator->() const { return current_it->getPtr(); } const Cell * getPtr() const { return current_it.getPtr(); } size_t getHash() const { return current_it.getHash(); } diff --git a/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h b/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h index db27d126c5c..836fbda222e 100644 --- a/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h +++ b/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h @@ -45,7 +45,7 @@ private: Large * tmp_large = new Large; for (const auto & x : small) - tmp_large->insert(x); + tmp_large->insert(x.getValue()); large = tmp_large; } @@ -99,7 +99,7 @@ public: else { for (const auto & x : rhs.small) - insert(x); + insert(x.getValue()); } } diff --git a/dbms/src/Common/SpaceSaving.h b/dbms/src/Common/SpaceSaving.h index 6e446d1c938..a681279a515 100644 --- a/dbms/src/Common/SpaceSaving.h +++ b/dbms/src/Common/SpaceSaving.h @@ -152,7 +152,7 @@ public: auto it = counter_map.find(key, hash); if (it != counter_map.end()) { - auto c = 
it->second; + auto c = it->getSecond(); c->count += increment; c->error += error; percolate(c); @@ -189,8 +189,8 @@ public: min->error = alpha + error; percolate(min); - it->second = min; - it->first = min->key; + it->getSecond() = min; + it->getFirstMutable() = min->key; counter_map.reinsert(it, hash); } } diff --git a/dbms/src/Common/tests/auto_array.cpp b/dbms/src/Common/tests/auto_array.cpp index e5aacb87842..11a4b79aa3a 100644 --- a/dbms/src/Common/tests/auto_array.cpp +++ b/dbms/src/Common/tests/auto_array.cpp @@ -155,10 +155,10 @@ int main(int argc, char ** argv) map.emplace(rand(), it, inserted); if (inserted) { - new(&it->second) Arr(n); + new(&it->getSecond()) Arr(n); for (size_t j = 0; j < n; ++j) - it->second[j] = field; + it->getSecond()[j] = field; } } diff --git a/dbms/src/Common/tests/hash_table.cpp b/dbms/src/Common/tests/hash_table.cpp index 048e9755928..291b7e7167e 100644 --- a/dbms/src/Common/tests/hash_table.cpp +++ b/dbms/src/Common/tests/hash_table.cpp @@ -21,13 +21,13 @@ int main(int, char **) bool inserted; cont.emplace(3, it, inserted); - std::cerr << inserted << ", " << *it << std::endl; + std::cerr << inserted << ", " << it->getValue() << std::endl; cont.emplace(3, it, inserted); - std::cerr << inserted << ", " << *it << std::endl; + std::cerr << inserted << ", " << it->getValue() << std::endl; for (auto x : cont) - std::cerr << x << std::endl; + std::cerr << x.getValue() << std::endl; DB::WriteBufferFromOwnString wb; cont.writeText(wb); diff --git a/dbms/src/Common/tests/parallel_aggregation.cpp b/dbms/src/Common/tests/parallel_aggregation.cpp index ba430b0c58c..36bbe6e66d5 100644 --- a/dbms/src/Common/tests/parallel_aggregation.cpp +++ b/dbms/src/Common/tests/parallel_aggregation.cpp @@ -82,14 +82,14 @@ void aggregate12(Map & map, Source::const_iterator begin, Source::const_iterator { if (*it == *prev_it) { - ++found->second; + ++found->getSecond(); continue; } prev_it = it; bool inserted; map.emplace(*it, found, inserted); - 
++found->second; + ++found->getSecond(); } } @@ -107,14 +107,14 @@ void aggregate22(MapTwoLevel & map, Source::const_iterator begin, Source::const_ { if (*it == *prev_it) { - ++found->second; + ++found->getSecond(); continue; } prev_it = it; bool inserted; map.emplace(*it, found, inserted); - ++found->second; + ++found->getSecond(); } } @@ -126,7 +126,7 @@ void merge2(MapTwoLevel * maps, size_t num_threads, size_t bucket) { for (size_t i = 1; i < num_threads; ++i) for (auto it = maps[i].impls[bucket].begin(); it != maps[i].impls[bucket].end(); ++it) - maps[0].impls[bucket][it->first] += it->second; + maps[0].impls[bucket][it->getFirst()] += it->getSecond(); } void aggregate3(Map & local_map, Map & global_map, Mutex & mutex, Source::const_iterator begin, Source::const_iterator end) @@ -138,7 +138,7 @@ void aggregate3(Map & local_map, Map & global_map, Mutex & mutex, Source::const_ Map::iterator found = local_map.find(*it); if (found != local_map.end()) - ++found->second; + ++found->getSecond(); else if (local_map.size() < threshold) ++local_map[*it]; /// TODO You could do one lookup, not two. 
else @@ -163,13 +163,13 @@ void aggregate33(Map & local_map, Map & global_map, Mutex & mutex, Source::const Map::iterator found; bool inserted; local_map.emplace(*it, found, inserted); - ++found->second; + ++found->getSecond(); if (inserted && local_map.size() == threshold) { std::lock_guard lock(mutex); for (auto & value_type : local_map) - global_map[value_type.first] += value_type.second; + global_map[value_type.getFirst()] += value_type.getSecond(); local_map.clear(); } @@ -198,7 +198,7 @@ void aggregate4(Map & local_map, MapTwoLevel & global_map, Mutex * mutexes, Sour Map::iterator found = local_map.find(*it); if (found != local_map.end()) - ++found->second; + ++found->getSecond(); else { size_t hash_value = global_map.hash(*it); @@ -311,7 +311,7 @@ int main(int argc, char ** argv) for (size_t i = 1; i < num_threads; ++i) for (auto it = maps[i].begin(); it != maps[i].end(); ++it) - maps[0][it->first] += it->second; + maps[0][it->getFirst()] += it->getSecond(); watch.stop(); double time_merged = watch.elapsedSeconds(); @@ -365,7 +365,7 @@ int main(int argc, char ** argv) for (size_t i = 1; i < num_threads; ++i) for (auto it = maps[i].begin(); it != maps[i].end(); ++it) - maps[0][it->first] += it->second; + maps[0][it->getFirst()] += it->getSecond(); watch.stop(); @@ -435,7 +435,7 @@ int main(int argc, char ** argv) continue; finish = false; - maps[0][iterators[i]->first] += iterators[i]->second; + maps[0][iterators[i]->getFirst()] += iterators[i]->getSecond(); ++iterators[i]; } @@ -623,7 +623,7 @@ int main(int argc, char ** argv) for (size_t i = 0; i < num_threads; ++i) for (auto it = local_maps[i].begin(); it != local_maps[i].end(); ++it) - global_map[it->first] += it->second; + global_map[it->getFirst()] += it->getSecond(); pool.wait(); @@ -689,7 +689,7 @@ int main(int argc, char ** argv) for (size_t i = 0; i < num_threads; ++i) for (auto it = local_maps[i].begin(); it != local_maps[i].end(); ++it) - global_map[it->first] += it->second; + 
global_map[it->getFirst()] += it->getSecond(); pool.wait(); @@ -760,7 +760,7 @@ int main(int argc, char ** argv) for (size_t i = 0; i < num_threads; ++i) for (auto it = local_maps[i].begin(); it != local_maps[i].end(); ++it) - global_map[it->first] += it->second; + global_map[it->getFirst()] += it->getSecond(); pool.wait(); diff --git a/dbms/src/Common/tests/parallel_aggregation2.cpp b/dbms/src/Common/tests/parallel_aggregation2.cpp index 699fb65e9dc..5bee292f58d 100644 --- a/dbms/src/Common/tests/parallel_aggregation2.cpp +++ b/dbms/src/Common/tests/parallel_aggregation2.cpp @@ -51,9 +51,9 @@ struct AggregateIndependent map.emplace(*it, place, inserted); if (inserted) - creator(place->second); + creator(place->getSecond()); else - updater(place->second); + updater(place->getSecond()); } }); } @@ -93,7 +93,7 @@ struct AggregateIndependentWithSequentialKeysOptimization { if (it != begin && *it == prev_key) { - updater(place->second); + updater(place->getSecond()); continue; } prev_key = *it; @@ -102,9 +102,9 @@ struct AggregateIndependentWithSequentialKeysOptimization map.emplace(*it, place, inserted); if (inserted) - creator(place->second); + creator(place->getSecond()); else - updater(place->second); + updater(place->getSecond()); } }); } @@ -131,7 +131,7 @@ struct MergeSequential auto begin = source_maps[i]->begin(); auto end = source_maps[i]->end(); for (auto it = begin; it != end; ++it) - merger((*source_maps[0])[it->first], it->second); + merger((*source_maps[0])[it->getFirst()], it->getSecond()); } result_map = source_maps[0]; @@ -161,7 +161,7 @@ struct MergeSequentialTransposed /// In practice not better than usual. 
continue; finish = false; - merger((*result_map)[iterators[i]->first], iterators[i]->second); + merger((*result_map)[iterators[i]->getFirst()], iterators[i]->getSecond()); ++iterators[i]; } diff --git a/dbms/src/Common/tests/small_table.cpp b/dbms/src/Common/tests/small_table.cpp index 852f1b3f30d..32b4e8c48fe 100644 --- a/dbms/src/Common/tests/small_table.cpp +++ b/dbms/src/Common/tests/small_table.cpp @@ -20,13 +20,13 @@ int main(int, char **) bool inserted; cont.emplace(3, it, inserted); - std::cerr << inserted << ", " << *it << std::endl; + std::cerr << inserted << ", " << it->getValue() << std::endl; cont.emplace(3, it, inserted); - std::cerr << inserted << ", " << *it << std::endl; + std::cerr << inserted << ", " << it->getValue() << std::endl; for (auto x : cont) - std::cerr << x << std::endl; + std::cerr << x.getValue() << std::endl; DB::WriteBufferFromOwnString wb; cont.writeText(wb); @@ -42,7 +42,7 @@ int main(int, char **) cont[1] = "Goodbye."; for (auto x : cont) - std::cerr << x.first << " -> " << x.second << std::endl; + std::cerr << x.getFirst() << " -> " << x.getSecond() << std::endl; DB::WriteBufferFromOwnString wb; cont.writeText(wb); diff --git a/dbms/src/Core/tests/string_pool.cpp b/dbms/src/Core/tests/string_pool.cpp index caaa6fd81c5..04fa33f3abb 100644 --- a/dbms/src/Core/tests/string_pool.cpp +++ b/dbms/src/Core/tests/string_pool.cpp @@ -222,7 +222,7 @@ int main(int argc, char ** argv) size_t i = 0; for (auto it = set.begin(); i < elems_show && it != set.end(); ++it, ++i) { - devnull.write(it->first.data, it->first.size); + devnull.write(it->getFirst().data, it->getFirst().size); devnull << std::endl; } @@ -249,7 +249,7 @@ int main(int argc, char ** argv) size_t i = 0; for (auto it = set.begin(); i < elems_show && it != set.end(); ++it, ++i) { - devnull.write(it->first.data, it->first.size); + devnull.write(it->getFirst().data, it->getFirst().size); devnull << std::endl; } } diff --git a/dbms/src/DataTypes/DataTypeEnum.cpp 
b/dbms/src/DataTypes/DataTypeEnum.cpp index 5cb091fe6bb..bd93105a288 100644 --- a/dbms/src/DataTypes/DataTypeEnum.cpp +++ b/dbms/src/DataTypes/DataTypeEnum.cpp @@ -74,7 +74,7 @@ void DataTypeEnum::fillMaps() if (!name_to_value_pair.second) throw Exception{"Duplicate names in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second) - + " and '" + name_to_value_pair.first->first.toString() + "' = " + toString(name_to_value_pair.first->second), + + " and '" + name_to_value_pair.first->getFirst().toString() + "' = " + toString(name_to_value_pair.first->getSecond()), ErrorCodes::SYNTAX_ERROR}; const auto value_to_name_pair = value_to_name_map.insert( diff --git a/dbms/src/DataTypes/DataTypeEnum.h b/dbms/src/DataTypes/DataTypeEnum.h index c0c686ab007..19d4ad691dc 100644 --- a/dbms/src/DataTypes/DataTypeEnum.h +++ b/dbms/src/DataTypes/DataTypeEnum.h @@ -81,7 +81,7 @@ public: if (it == std::end(name_to_value_map)) throw Exception{"Unknown element '" + field_name.toString() + "' for type " + getName(), ErrorCodes::LOGICAL_ERROR}; - return it->second; + return it->getSecond(); } Field castToName(const Field & value_or_name) const override; diff --git a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp index 42aa0c943c7..fcb07e81264 100644 --- a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp +++ b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp @@ -222,7 +222,7 @@ void ComplexKeyCacheDictionary::has(const Columns & key_columns, const DataTypes std::vector required_rows(outdated_keys.size()); std::transform( - std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) { return pair.second.front(); }); + std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) { return pair.getSecond().front(); }); /// request new values update( diff --git a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h 
b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h index 92666158015..6d00597533c 100644 --- a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h +++ b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h @@ -342,7 +342,7 @@ private: std::vector required_rows(outdated_keys.size()); std::transform( - std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) { return pair.second.front(); }); + std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) { return pair.getSecond().front(); }); /// request new values update( @@ -468,7 +468,7 @@ private: std::vector required_rows(outdated_keys.size()); std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) { - return pair.second.front(); + return pair.getSecond().front(); }); update( @@ -500,7 +500,7 @@ private: { const StringRef key = keys_array[row]; const auto it = map.find(key); - const auto string_ref = it != std::end(map) ? it->second : get_default(row); + const auto string_ref = it != std::end(map) ? 
it->getSecond() : get_default(row); out->insertData(string_ref.data, string_ref.size); } } @@ -607,7 +607,7 @@ private: /// Check which ids have not been found and require setting null_value for (const auto & key_found_pair : remaining_keys) { - if (key_found_pair.second) + if (key_found_pair.getSecond()) { ++found_num; continue; @@ -615,7 +615,7 @@ private: ++not_found_num; - auto key = key_found_pair.first; + auto key = key_found_pair.getFirst(); const auto hash = StringRefHash{}(key); const auto find_result = findCellIdx(key, now, hash); const auto & cell_idx = find_result.cell_idx; diff --git a/dbms/src/Dictionaries/ComplexKeyHashedDictionary.cpp b/dbms/src/Dictionaries/ComplexKeyHashedDictionary.cpp index a36b225680d..67056909945 100644 --- a/dbms/src/Dictionaries/ComplexKeyHashedDictionary.cpp +++ b/dbms/src/Dictionaries/ComplexKeyHashedDictionary.cpp @@ -611,7 +611,7 @@ void ComplexKeyHashedDictionary::getItemsImpl( const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool); const auto it = attr.find(key); - set_value(i, it != attr.end() ? static_cast(it->second) : get_default(i)); + set_value(i, it != attr.end() ? 
static_cast(it->getSecond()) : get_default(i)); /// free memory allocated for the key temporary_keys_pool.rollback(key.size); @@ -779,7 +779,7 @@ std::vector ComplexKeyHashedDictionary::getKeys(const Attribute & att std::vector keys; keys.reserve(attr.size()); for (const auto & key : attr) - keys.push_back(key.first); + keys.push_back(key.getFirst()); return keys; } diff --git a/dbms/src/Dictionaries/HashedDictionary.cpp b/dbms/src/Dictionaries/HashedDictionary.cpp index 6154b9243ff..43c75ba0434 100644 --- a/dbms/src/Dictionaries/HashedDictionary.cpp +++ b/dbms/src/Dictionaries/HashedDictionary.cpp @@ -83,7 +83,7 @@ void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType { auto it = attr.find(id); if (it != std::end(attr)) - id = it->second; + id = it->getSecond(); else break; } @@ -605,7 +605,7 @@ void HashedDictionary::getItemsImpl( for (const auto i : ext::range(0, rows)) { const auto it = attr.find(ids[i]); - set_value(i, it != attr.end() ? static_cast(it->second) : get_default(i)); + set_value(i, it != attr.end() ? 
static_cast(it->getSecond()) : get_default(i)); } query_count.fetch_add(rows, std::memory_order_relaxed); @@ -707,7 +707,7 @@ PaddedPODArray HashedDictionary::getIds(const Attribute & PaddedPODArray ids; ids.reserve(attr.size()); for (const auto & value : attr) - ids.push_back(value.first); + ids.push_back(value.getFirst()); return ids; } diff --git a/dbms/src/Dictionaries/RangeHashedDictionary.cpp b/dbms/src/Dictionaries/RangeHashedDictionary.cpp index 48c884fa773..b1cae6956f2 100644 --- a/dbms/src/Dictionaries/RangeHashedDictionary.cpp +++ b/dbms/src/Dictionaries/RangeHashedDictionary.cpp @@ -137,7 +137,7 @@ void RangeHashedDictionary::getString( if (it != std::end(attr)) { const auto date = dates[i]; - const auto & ranges_and_values = it->second; + const auto & ranges_and_values = it->getSecond(); const auto val_it = std::find_if(std::begin(ranges_and_values), std::end(ranges_and_values), [date](const Value & v) { @@ -408,7 +408,7 @@ void RangeHashedDictionary::getItemsImpl( if (it != std::end(attr)) { const auto date = dates[i]; - const auto & ranges_and_values = it->second; + const auto & ranges_and_values = it->getSecond(); const auto val_it = std::find_if(std::begin(ranges_and_values), std::end(ranges_and_values), [date](const Value & v) { @@ -435,7 +435,7 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const K if (it != map.end()) { - auto & values = it->second; + auto & values = it->getSecond(); const auto insert_it = std::lower_bound(std::begin(values), std::end(values), range, [](const Value & lhs, const Range & rhs_range) @@ -508,7 +508,7 @@ void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const Key i if (it != map.end()) { - auto & values = it->second; + auto & values = it->getSecond(); const auto insert_it = std::lower_bound( std::begin(values), std::end(values), range, [](const Value & lhs, const Range & rhs_range) @@ -620,9 +620,9 @@ void RangeHashedDictionary::getIdsAndDates( for (const auto & key : 
attr) { - for (const auto & value : key.second) + for (const auto & value : key.getSecond()) { - ids.push_back(key.first); + ids.push_back(key.getFirst()); start_dates.push_back(value.range.left); end_dates.push_back(value.range.right); diff --git a/dbms/src/Formats/JSONEachRowRowInputStream.cpp b/dbms/src/Formats/JSONEachRowRowInputStream.cpp index 9bc2f073ff5..545db8908a0 100644 --- a/dbms/src/Formats/JSONEachRowRowInputStream.cpp +++ b/dbms/src/Formats/JSONEachRowRowInputStream.cpp @@ -64,9 +64,9 @@ inline size_t JSONEachRowRowInputStream::columnIndex(const StringRef & name, siz if (prev_positions.size() > key_index && prev_positions[key_index] != name_map.end() - && name == prev_positions[key_index]->first) + && name == prev_positions[key_index]->getFirst()) { - return prev_positions[key_index]->second; + return prev_positions[key_index]->getSecond(); } else { @@ -77,7 +77,7 @@ inline size_t JSONEachRowRowInputStream::columnIndex(const StringRef & name, siz if (key_index < prev_positions.size()) prev_positions[key_index] = it; - return it->second; + return it->getSecond(); } else return UNKNOWN_FIELD; diff --git a/dbms/src/Formats/TSKVRowInputStream.cpp b/dbms/src/Formats/TSKVRowInputStream.cpp index 710fad00ea6..cb49c6d0543 100644 --- a/dbms/src/Formats/TSKVRowInputStream.cpp +++ b/dbms/src/Formats/TSKVRowInputStream.cpp @@ -128,7 +128,7 @@ bool TSKVRowInputStream::read(MutableColumns & columns, RowReadExtension & ext) } else { - index = it->second; + index = it->getSecond(); if (read_columns[index]) throw Exception("Duplicate field found while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA); diff --git a/dbms/src/Functions/arrayIntersect.cpp b/dbms/src/Functions/arrayIntersect.cpp index e5f1d4f0dac..86e93ef5ec2 100644 --- a/dbms/src/Functions/arrayIntersect.cpp +++ b/dbms/src/Functions/arrayIntersect.cpp @@ -425,15 +425,15 @@ ColumnPtr FunctionArrayIntersect::execute(const UnpackedArrays & arrays, Mutable for (const auto & pair : 
map) { - if (pair.second == args) + if (pair.getSecond() == args) { ++result_offset; if constexpr (is_numeric_column) - result_data.insertValue(pair.first); + result_data.insertValue(pair.getFirst()); else if constexpr (std::is_same::value || std::is_same::value) - result_data.insertData(pair.first.data, pair.first.size); + result_data.insertData(pair.getFirst().data, pair.getFirst().size); else - result_data.deserializeAndInsertFromArena(pair.first.data); + result_data.deserializeAndInsertFromArena(pair.getFirst().data); if (all_nullable) null_map.push_back(0); diff --git a/dbms/src/Functions/transform.cpp b/dbms/src/Functions/transform.cpp index 07e92b1356b..7f9a5d922a5 100644 --- a/dbms/src/Functions/transform.cpp +++ b/dbms/src/Functions/transform.cpp @@ -508,7 +508,7 @@ private: { auto it = table.find(src[i]); if (it != table.end()) - memcpy(&dst[i], &it->second, sizeof(dst[i])); /// little endian. + memcpy(&dst[i], &it->getSecond(), sizeof(dst[i])); /// little endian. else dst[i] = dst_default; } @@ -524,7 +524,7 @@ private: { auto it = table.find(src[i]); if (it != table.end()) - memcpy(&dst[i], &it->second, sizeof(dst[i])); /// little endian. + memcpy(&dst[i], &it->getSecond(), sizeof(dst[i])); /// little endian. else dst[i] = dst_default[i]; } @@ -540,7 +540,7 @@ private: { auto it = table.find(src[i]); if (it != table.end()) - memcpy(&dst[i], &it->second, sizeof(dst[i])); + memcpy(&dst[i], &it->getSecond(), sizeof(dst[i])); else dst[i] = src[i]; } @@ -557,7 +557,7 @@ private: for (size_t i = 0; i < size; ++i) { auto it = table.find(src[i]); - StringRef ref = it != table.end() ? it->second : dst_default; + StringRef ref = it != table.end() ? 
it->getSecond() : dst_default; dst_data.resize(current_dst_offset + ref.size); memcpy(&dst_data[current_dst_offset], ref.data, ref.size); current_dst_offset += ref.size; @@ -581,7 +581,7 @@ private: StringRef ref; if (it != table.end()) - ref = it->second; + ref = it->getSecond(); else { ref.data = reinterpret_cast(&dst_default_data[current_dst_default_offset]); @@ -611,7 +611,7 @@ private: current_src_offset = src_offsets[i]; auto it = table.find(ref); if (it != table.end()) - memcpy(&dst[i], &it->second, sizeof(dst[i])); + memcpy(&dst[i], &it->getSecond(), sizeof(dst[i])); else dst[i] = dst_default; } @@ -632,7 +632,7 @@ private: current_src_offset = src_offsets[i]; auto it = table.find(ref); if (it != table.end()) - memcpy(&dst[i], &it->second, sizeof(dst[i])); + memcpy(&dst[i], &it->getSecond(), sizeof(dst[i])); else dst[i] = dst_default[i]; } @@ -655,7 +655,7 @@ private: auto it = table.find(src_ref); - StringRef dst_ref = it != table.end() ? it->second : (with_default ? dst_default : src_ref); + StringRef dst_ref = it != table.end() ? it->getSecond() : (with_default ? 
dst_default : src_ref); dst_data.resize(current_dst_offset + dst_ref.size); memcpy(&dst_data[current_dst_offset], dst_ref.data, dst_ref.size); current_dst_offset += dst_ref.size; @@ -697,7 +697,7 @@ private: StringRef dst_ref; if (it != table.end()) - dst_ref = it->second; + dst_ref = it->getSecond(); else { dst_ref.data = reinterpret_cast(&dst_default_data[current_dst_default_offset]); diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 222b6cc796e..40d06c3228b 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -1120,11 +1120,11 @@ void NO_INLINE Aggregator::convertToBlockImplFinal( for (const auto & value : data) { - method.insertKeyIntoColumns(value, key_columns, key_sizes); + method.insertKeyIntoColumns(value.getValue(), key_columns, key_sizes); for (size_t i = 0; i < params.aggregates_size; ++i) aggregate_functions[i]->insertResultInto( - value.second + offsets_of_aggregate_states[i], + value.getSecond() + offsets_of_aggregate_states[i], *final_aggregate_columns[i]); } @@ -1151,13 +1151,13 @@ void NO_INLINE Aggregator::convertToBlockImplNotFinal( for (auto & value : data) { - method.insertKeyIntoColumns(value, key_columns, key_sizes); + method.insertKeyIntoColumns(value.getValue(), key_columns, key_sizes); /// reserved, so push_back does not throw exceptions for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_columns[i]->push_back(value.second + offsets_of_aggregate_states[i]); + aggregate_columns[i]->push_back(value.getSecond() + offsets_of_aggregate_states[i]); - value.second = nullptr; + value.getSecond() = nullptr; } } @@ -1495,26 +1495,26 @@ void NO_INLINE Aggregator::mergeDataImpl( { typename Table::iterator res_it; bool inserted; - table_dst.emplace(it->first, res_it, inserted, it.getHash()); + table_dst.emplace(it->getFirst(), res_it, inserted, it.getHash()); if (!inserted) { for (size_t i = 0; i < params.aggregates_size; ++i) 
aggregate_functions[i]->merge( - res_it->second + offsets_of_aggregate_states[i], - it->second + offsets_of_aggregate_states[i], + res_it->getSecond() + offsets_of_aggregate_states[i], + it->getSecond() + offsets_of_aggregate_states[i], arena); for (size_t i = 0; i < params.aggregates_size; ++i) aggregate_functions[i]->destroy( - it->second + offsets_of_aggregate_states[i]); + it->getSecond() + offsets_of_aggregate_states[i]); } else { - res_it->second = it->second; + res_it->getSecond() = it->getSecond(); } - it->second = nullptr; + it->getSecond() = nullptr; } table_src.clearAndShrink(); @@ -1534,22 +1534,22 @@ void NO_INLINE Aggregator::mergeDataNoMoreKeysImpl( for (auto it = table_src.begin(), end = table_src.end(); it != end; ++it) { - typename Table::iterator res_it = table_dst.find(it->first, it.getHash()); + typename Table::iterator res_it = table_dst.find(it->getFirst(), it.getHash()); AggregateDataPtr res_data = table_dst.end() == res_it ? overflows - : res_it->second; + : res_it->getSecond(); for (size_t i = 0; i < params.aggregates_size; ++i) aggregate_functions[i]->merge( res_data + offsets_of_aggregate_states[i], - it->second + offsets_of_aggregate_states[i], + it->getSecond() + offsets_of_aggregate_states[i], arena); for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->destroy(it->second + offsets_of_aggregate_states[i]); + aggregate_functions[i]->destroy(it->getSecond() + offsets_of_aggregate_states[i]); - it->second = nullptr; + it->getSecond() = nullptr; } table_src.clearAndShrink(); @@ -1567,23 +1567,23 @@ void NO_INLINE Aggregator::mergeDataOnlyExistingKeysImpl( for (auto it = table_src.begin(); it != table_src.end(); ++it) { - decltype(it) res_it = table_dst.find(it->first, it.getHash()); + decltype(it) res_it = table_dst.find(it->getFirst(), it.getHash()); if (table_dst.end() == res_it) continue; - AggregateDataPtr res_data = res_it->second; + AggregateDataPtr res_data = res_it->getSecond(); for (size_t i = 0; i < 
params.aggregates_size; ++i) aggregate_functions[i]->merge( res_data + offsets_of_aggregate_states[i], - it->second + offsets_of_aggregate_states[i], + it->getSecond() + offsets_of_aggregate_states[i], arena); for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->destroy(it->second + offsets_of_aggregate_states[i]); + aggregate_functions[i]->destroy(it->getSecond() + offsets_of_aggregate_states[i]); - it->second = nullptr; + it->getSecond() = nullptr; } table_src.clearAndShrink(); @@ -2428,7 +2428,7 @@ void NO_INLINE Aggregator::destroyImpl(Table & table) const { for (auto elem : table) { - AggregateDataPtr & data = elem.second; + AggregateDataPtr & data = elem.getSecond(); /** If an exception (usually a lack of memory, the MemoryTracker throws) arose * after inserting the key into a hash table, but before creating all states of aggregate functions, diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 87febbc77e8..5a13599bf89 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -64,8 +65,8 @@ class IBlockOutputStream; using AggregatedDataWithoutKey = AggregateDataPtr; -using AggregatedDataWithUInt8Key = HashMap>; -using AggregatedDataWithUInt16Key = HashMap>; +using AggregatedDataWithUInt8Key = FixedHashMap; +using AggregatedDataWithUInt16Key = FixedHashMap; using AggregatedDataWithUInt64Key = HashMap>; using AggregatedDataWithStringKey = HashMapWithSavedHash; @@ -178,7 +179,7 @@ struct AggregationMethodOneNumber // Insert the key from the hash table into columns. 
static void insertKeyIntoColumns(const typename Data::value_type & value, MutableColumns & key_columns, const Sizes & /*key_sizes*/) { - static_cast(key_columns[0].get())->insertRawData(reinterpret_cast(&value.first)); + static_cast(key_columns[0].get())->insertRawData(reinterpret_cast(&value.getFirst())); } }; @@ -206,7 +207,7 @@ struct AggregationMethodString static void insertKeyIntoColumns(const typename Data::value_type & value, MutableColumns & key_columns, const Sizes &) { - key_columns[0]->insertData(value.first.data, value.first.size); + key_columns[0]->insertData(value.getFirst().data, value.getFirst().size); } }; @@ -234,7 +235,7 @@ struct AggregationMethodFixedString static void insertKeyIntoColumns(const typename Data::value_type & value, MutableColumns & key_columns, const Sizes &) { - key_columns[0]->insertData(value.first.data, value.first.size); + key_columns[0]->insertData(value.getFirst().data, value.getFirst().size); } }; @@ -326,7 +327,7 @@ struct AggregationMethodKeysFixed /// corresponding key is nullable. Update the null map accordingly. 
size_t bucket = i / 8; size_t offset = i % 8; - UInt8 val = (reinterpret_cast(&value.first)[bucket] >> offset) & 1; + UInt8 val = (reinterpret_cast(&value.getFirst())[bucket] >> offset) & 1; null_map->insertValue(val); is_null = val == 1; } @@ -338,7 +339,7 @@ struct AggregationMethodKeysFixed else { size_t size = key_sizes[i]; - observed_column->insertData(reinterpret_cast(&value.first) + pos, size); + observed_column->insertData(reinterpret_cast(&value.getFirst()) + pos, size); pos += size; } } @@ -373,7 +374,7 @@ struct AggregationMethodSerialized static void insertKeyIntoColumns(const typename Data::value_type & value, MutableColumns & key_columns, const Sizes &) { - auto pos = value.first.data; + auto pos = value.getFirst().data; for (auto & column : key_columns) pos = column->deserializeAndInsertFromArena(pos); } diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index e1a30c5778d..a5bdb27fb23 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -1317,10 +1317,10 @@ private: for (; it != end; ++it) { - if (it->second.getUsed()) + if (it->getSecond().getUsed()) continue; - AdderNonJoined::add(it->second, rows_added, columns_left, columns_keys_and_right); + AdderNonJoined::add(it->getSecond(), rows_added, columns_left, columns_keys_and_right); if (rows_added >= max_block_size) { diff --git a/dbms/src/Interpreters/Join.h b/dbms/src/Interpreters/Join.h index c4dd3e24e63..47c58a17dc0 100644 --- a/dbms/src/Interpreters/Join.h +++ b/dbms/src/Interpreters/Join.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -218,8 +219,8 @@ public: template struct MapsTemplate { - std::unique_ptr>> key8; - std::unique_ptr>> key16; + std::unique_ptr> key8; + std::unique_ptr> key16; std::unique_ptr>> key32; std::unique_ptr>> key64; std::unique_ptr> key_string; diff --git a/dbms/src/Interpreters/SetVariants.h b/dbms/src/Interpreters/SetVariants.h index e6c75da91fb..8b74f3dd09a 100644 --- 
a/dbms/src/Interpreters/SetVariants.h +++ b/dbms/src/Interpreters/SetVariants.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include @@ -182,9 +184,8 @@ struct SetMethodHashed */ struct NonClearableSet { - /// TODO Use either bit- or byte-set for these two options. - std::unique_ptr>>> key8; - std::unique_ptr>>> key16; + std::unique_ptr>> key8; + std::unique_ptr>> key16; /** Also for the experiment was tested the ability to use SmallSet, * as long as the number of elements in the set is small (and, if necessary, converted to a full-fledged HashSet). @@ -209,9 +210,8 @@ struct NonClearableSet struct ClearableSet { - /// TODO Use either bit- or byte-set for these two options. - std::unique_ptr>>> key8; - std::unique_ptr>>> key16; + std::unique_ptr>> key8; + std::unique_ptr>> key16; std::unique_ptr>>> key32; std::unique_ptr>>> key64; diff --git a/dbms/src/Interpreters/tests/CMakeLists.txt b/dbms/src/Interpreters/tests/CMakeLists.txt index 0cf33595335..3de6c321de2 100644 --- a/dbms/src/Interpreters/tests/CMakeLists.txt +++ b/dbms/src/Interpreters/tests/CMakeLists.txt @@ -14,6 +14,10 @@ add_executable (hash_map hash_map.cpp) target_include_directories (hash_map SYSTEM BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR}) target_link_libraries (hash_map PRIVATE dbms clickhouse_compression) +add_executable (hash_map_lookup hash_map_lookup.cpp) +target_include_directories (hash_map_lookup SYSTEM BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR}) +target_link_libraries (hash_map_lookup PRIVATE dbms clickhouse_compression) + add_executable (hash_map3 hash_map3.cpp) target_include_directories(hash_map3 SYSTEM BEFORE PRIVATE ${METROHASH_INCLUDE_DIR}) target_link_libraries (hash_map3 PRIVATE dbms ${FARMHASH_LIBRARIES} ${METROHASH_LIBRARIES}) diff --git a/dbms/src/Interpreters/tests/hash_map.cpp b/dbms/src/Interpreters/tests/hash_map.cpp index a3e1cad8d12..275cd367179 100644 --- a/dbms/src/Interpreters/tests/hash_map.cpp +++ b/dbms/src/Interpreters/tests/hash_map.cpp @@ -162,8 
+162,8 @@ int main(int argc, char ** argv) map.emplace(data[i], it, inserted); if (inserted) { - new(&it->second) Value; - std::swap(it->second, value); + new(&it->getSecond()) Value; + std::swap(it->getSecond(), value); INIT } } @@ -193,8 +193,8 @@ int main(int argc, char ** argv) map.emplace(data[i], it, inserted); if (inserted) { - new(&it->second) Value; - std::swap(it->second, value); + new(&it->getSecond()) Value; + std::swap(it->getSecond(), value); INIT } } @@ -225,8 +225,8 @@ int main(int argc, char ** argv) map.emplace(data[i], it, inserted); if (inserted) { - new(&it->second) Value; - std::swap(it->second, value); + new(&it->getSecond()) Value; + std::swap(it->getSecond(), value); INIT } } diff --git a/dbms/src/Interpreters/tests/hash_map3.cpp b/dbms/src/Interpreters/tests/hash_map3.cpp index 59e6d329958..8b32db85e70 100644 --- a/dbms/src/Interpreters/tests/hash_map3.cpp +++ b/dbms/src/Interpreters/tests/hash_map3.cpp @@ -38,7 +38,7 @@ public: if (this->buf[i].isZero(*this)) std::cerr << "[ ]"; else - std::cerr << '[' << this->buf[i].getValue().first.data << ", " << this->buf[i].getValue().second << ']'; + std::cerr << '[' << this->buf[i].getValue().getFirst().data << ", " << this->buf[i].getValue().getSecond() << ']'; } std::cerr << std::endl; } @@ -85,7 +85,7 @@ int main(int, char **) std::cerr << "Collisions: " << map.getCollisions() << std::endl; for (auto x : map) - std::cerr << x.first.toString() << " -> " << x.second << std::endl; + std::cerr << x.getFirst().toString() << " -> " << x.getSecond() << std::endl; return 0; } diff --git a/dbms/src/Interpreters/tests/hash_map_lookup.cpp b/dbms/src/Interpreters/tests/hash_map_lookup.cpp new file mode 100644 index 00000000000..1aceec7b18f --- /dev/null +++ b/dbms/src/Interpreters/tests/hash_map_lookup.cpp @@ -0,0 +1,124 @@ +#include +#include +#include + +#include + +#define DBMS_HASH_MAP_COUNT_COLLISIONS +#define DBMS_HASH_MAP_DEBUG_RESIZES + +#include +#include +#include +#include +#include +#include 
+#include + +/** Do this: +for file in ResolutionWidth ResolutionDepth; do + for size in 30000 100000 300000 1000000 5000000; do + echo + BEST_METHOD=0 + BEST_RESULT=0 + for method in {1..2}; do + echo -ne $file $size $method ''; + TOTAL_ELEMS=0 + for i in {0..1000}; do + TOTAL_ELEMS=$(( $TOTAL_ELEMS + $size )) + if [[ $TOTAL_ELEMS -gt 25000000 ]]; then break; fi + ./hash_map_lookup $size $method < ${file}.bin 2>&1 | + grep HashMap | grep -oE '[0-9\.]+ elem'; + done | awk -W interactive '{ if ($1 > x) { x = $1 }; printf(".") } END { print x }' | tee /tmp/hash_map_lookup_res; + CUR_RESULT=$(cat /tmp/hash_map_lookup_res | tr -d '.') + if [[ $CUR_RESULT -gt $BEST_RESULT ]]; then + BEST_METHOD=$method + BEST_RESULT=$CUR_RESULT + fi; + done; + echo Best: $BEST_METHOD - $BEST_RESULT + done; +done +*/ + + +template +void NO_INLINE bench(const std::vector & data, const char * name) +{ + Map map; + typename Map::iterator it; + bool inserted; + + Stopwatch watch; + for (size_t i = 0, size = data.size(); i < size; ++i) + { + map.emplace(data[i], it, inserted); + if (inserted) + it->getSecond() = 1; + else + ++it->getSecond(); + } + + for (size_t i = 0, size = data.size(); i < size; ++i) + { + it = map.find(data[i]); + ++it->getSecond(); + } + watch.stop(); + std::cerr << std::fixed << std::setprecision(2) << "HashMap (" << name << "). 
Size: " << map.size() + << ", elapsed: " << watch.elapsedSeconds() << " (" << data.size() / watch.elapsedSeconds() << " elem/sec.)" +#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS + << ", collisions: " << map.getCollisions() +#endif + << std::endl; +} + +template +void insert(Map & map, StringRef & k) +{ + bool inserted; + typename Map::iterator it; + map.emplace(k, it, inserted, nullptr); + if (inserted) + *it = 1; + else + ++*it; + std::cout << *map.find(k) << std::endl; +} + +int main(int argc, char ** argv) +{ + if (argc < 3) + { + std::cerr << "Usage: program n m\n"; + return 1; + } + + size_t n = atoi(argv[1]); + size_t m = atoi(argv[2]); + + std::vector data(n); + + { + Stopwatch watch; + DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO); + DB::CompressedReadBuffer in2(in1); + for (size_t i = 0; i < n && !in2.eof(); ++i) + { + DB::readBinary(data[i], in2); + } + + watch.stop(); + std::cerr << std::fixed << std::setprecision(2) << "Vector. Size: " << n << ", elapsed: " << watch.elapsedSeconds() << " (" + << n / watch.elapsedSeconds() << " elem/sec.)" << std::endl; + } + + using OldLookup = HashMap>; + using NewLookup = FixedHashMap; + + if (!m || m == 1) + bench(data, "Old Lookup"); + if (!m || m == 2) + bench(data, "New Lookup"); + return 0; +} diff --git a/dbms/src/Interpreters/tests/hash_map_string.cpp b/dbms/src/Interpreters/tests/hash_map_string.cpp index 3ccd1d710a3..9076a1e582e 100644 --- a/dbms/src/Interpreters/tests/hash_map_string.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string.cpp @@ -337,8 +337,8 @@ int main(int argc, char ** argv) { map.emplace(data[i], it, inserted); if (inserted) - it->second = 0; - ++it->second; + it->getSecond() = 0; + ++it->getSecond(); } watch.stop(); @@ -366,8 +366,8 @@ int main(int argc, char ** argv) { map.emplace(data[i], it, inserted); if (inserted) - it->second = 0; - ++it->second; + it->getSecond() = 0; + ++it->getSecond(); } watch.stop(); @@ -396,8 +396,8 @@ int main(int argc, char ** argv) { map.emplace(data[i], it, 
inserted); if (inserted) - it->second = 0; - ++it->second; + it->getSecond() = 0; + ++it->getSecond(); } watch.stop(); @@ -426,8 +426,8 @@ int main(int argc, char ** argv) { map.emplace(data[i], it, inserted); if (inserted) - it->second = 0; - ++it->second; + it->getSecond() = 0; + ++it->getSecond(); } watch.stop(); diff --git a/dbms/src/Interpreters/tests/hash_map_string_2.cpp b/dbms/src/Interpreters/tests/hash_map_string_2.cpp index 330d80af8af..da9619c638d 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_2.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_2.cpp @@ -595,8 +595,8 @@ void NO_INLINE bench(const std::vector & data, const char * name) { map.emplace(static_cast(data[i]), it, inserted); if (inserted) - it->second = 0; - ++it->second; + it->getSecond() = 0; + ++it->getSecond(); } watch.stop(); diff --git a/dbms/src/Interpreters/tests/hash_map_string_3.cpp b/dbms/src/Interpreters/tests/hash_map_string_3.cpp index f58d79d0db7..850a9268c5d 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_3.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_3.cpp @@ -442,8 +442,8 @@ void NO_INLINE bench(const std::vector & data, const char * name) { map.emplace(static_cast(data[i]), it, inserted); if (inserted) - it->second = 0; - ++it->second; + it->getSecond() = 0; + ++it->getSecond(); } watch.stop(); diff --git a/dbms/src/Interpreters/tests/hash_map_string_small.cpp b/dbms/src/Interpreters/tests/hash_map_string_small.cpp index c50e7f68a2d..0fa8854fa8a 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_small.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_small.cpp @@ -144,8 +144,8 @@ int main(int argc, char ** argv) { map.emplace(data[i], it, inserted); if (inserted) - it->second = 0; - ++it->second; + it->getSecond() = 0; + ++it->getSecond(); } watch.stop(); @@ -173,8 +173,8 @@ int main(int argc, char ** argv) { map.emplace(SmallStringRef(data[i].data, data[i].size), it, inserted); if (inserted) - it->second = 0; - ++it->second; + 
it->getSecond() = 0; + ++it->getSecond(); } watch.stop(); diff --git a/dbms/src/Interpreters/tests/two_level_hash_map.cpp b/dbms/src/Interpreters/tests/two_level_hash_map.cpp index 7b793d4f33a..475475f9c7a 100644 --- a/dbms/src/Interpreters/tests/two_level_hash_map.cpp +++ b/dbms/src/Interpreters/tests/two_level_hash_map.cpp @@ -67,8 +67,8 @@ int main(int argc, char ** argv) { map.emplace(data[i], it, inserted); if (inserted) - it->second = 0; - ++it->second; + it->getSecond() = 0; + ++it->getSecond(); } watch.stop(); @@ -82,7 +82,7 @@ int main(int argc, char ** argv) size_t elems = 0; for (const auto & kv : map) { - sum_counts += kv.second; + sum_counts += kv.getSecond(); ++elems; } @@ -103,8 +103,8 @@ int main(int argc, char ** argv) { map.emplace(i, it, inserted); if (inserted) - it->second = 0; - ++it->second; + it->getSecond() = 0; + ++it->getSecond(); } watch.stop(); @@ -118,11 +118,11 @@ int main(int argc, char ** argv) size_t elems = 0; for (const auto & kv : map) { - sum_counts += kv.second; + sum_counts += kv.getSecond(); ++elems; - if (kv.first > n) - std::cerr << kv.first << std::endl; + if (kv.getFirst() > n) + std::cerr << kv.getFirst() << std::endl; } std::cerr << "sum_counts: " << sum_counts << ", elems: " << elems << std::endl; diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp index f5a4ac74a6d..bd293c224a0 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -48,7 +48,7 @@ void buildScatterSelector( if (inserted) { partition_num_to_first_row.push_back(i); - it->second = partitions_count; + it->getSecond() = partitions_count; ++partitions_count; @@ -61,7 +61,7 @@ void buildScatterSelector( } if (partitions_count > 1) - selector[i] = it->second; + selector[i] = it->getSecond(); } } diff --git a/dbms/src/Storages/StorageJoin.cpp b/dbms/src/Storages/StorageJoin.cpp index 34625ca9a79..3a734fdb9b1 100644 --- 
a/dbms/src/Storages/StorageJoin.cpp +++ b/dbms/src/Storages/StorageJoin.cpp @@ -327,18 +327,18 @@ private: { for (size_t j = 0; j < columns.size(); ++j) if (j == key_pos) - columns[j]->insertData(rawData(it->first), rawSize(it->first)); + columns[j]->insertData(rawData(it->getFirst()), rawSize(it->getFirst())); else - columns[j]->insertFrom(*it->second.block->getByPosition(column_indices[j]).column.get(), it->second.row_num); + columns[j]->insertFrom(*it->getSecond().block->getByPosition(column_indices[j]).column.get(), it->getSecond().row_num); ++rows_added; } else - for (auto current = &static_cast(it->second); current != nullptr; + for (auto current = &static_cast(it->getSecond()); current != nullptr; current = current->next) { for (size_t j = 0; j < columns.size(); ++j) if (j == key_pos) - columns[j]->insertData(rawData(it->first), rawSize(it->first)); + columns[j]->insertData(rawData(it->getFirst()), rawSize(it->getFirst())); else columns[j]->insertFrom(*current->block->getByPosition(column_indices[j]).column.get(), current->row_num); ++rows_added; diff --git a/utils/test-data-generator/MarkovModel.h b/utils/test-data-generator/MarkovModel.h index 19bde2cb31d..7ef69b2a1f0 100644 --- a/utils/test-data-generator/MarkovModel.h +++ b/utils/test-data-generator/MarkovModel.h @@ -105,7 +105,7 @@ public: if (table.end() == it) return pos - data; - *pos = it->second.sample(random()); + *pos = it->getSecond().sample(random()); /// Zero byte marks end of string. 
if (0 == *pos) @@ -125,12 +125,12 @@ public: for (auto & elem : table) { UInt32 new_total = 0; - for (auto & frequency : elem.second.data) + for (auto & frequency : elem.getSecond().data) { frequency.count = transform(frequency.count); new_total += frequency.count; } - elem.second.total = new_total; + elem.getSecond().total = new_total; } } @@ -142,10 +142,10 @@ public: for (const auto & elem : table) { - writeBinary(elem.first, out); - writeBinary(UInt8(elem.second.data.size()), out); + writeBinary(elem.getFirst(), out); + writeBinary(UInt8(elem.getSecond().data.size()), out); - for (const auto & frequency : elem.second.data) + for (const auto & frequency : elem.getSecond().data) { writeBinary(frequency.byte, out); writeVarUInt(frequency.count, out);