ClickHouse/dbms/src/Interpreters/tests/hash_map2.cpp

#include <iostream>
#include <iomanip>
#include <vector>

#include <unordered_map>

#include <sparsehash/dense_hash_map>
#include <sparsehash/sparse_hash_map>

#include <DB/Common/Stopwatch.h>

//#define DBMS_HASH_MAP_COUNT_COLLISIONS
#define DBMS_HASH_MAP_DEBUG_RESIZES

#include <DB/Core/Types.h>
#include <DB/IO/ReadBufferFromFile.h>
#include <DB/IO/CompressedReadBuffer.h>
#include <DB/Common/HashTable/HashMap.h>


using Key = UInt64;
using Value = UInt64;

/** Hash map cell used by the benchmark below.
  *
  * It behaves like the plain HashMapCell it derives from, except that it
  * declares need_zero_value_storage = false.
  * NOTE(review): presumably this tells the hash table not to keep separate
  * storage for the zero key - confirm against the HashTable implementation.
  *
  * Despite the "WithSavedHash" suffix, the saved-hash part of the experiment is
  * currently disabled: see the commented-out members at the end of the struct.
  */
struct CellWithoutZeroWithSavedHash : public HashMapCell<Key, Value, DefaultHash<Key>>
{
    static constexpr bool need_zero_value_storage = false;

    /// All constructors simply forward to the corresponding HashMapCell constructor.
    CellWithoutZeroWithSavedHash()
        : HashMapCell()
    {
    }

    CellWithoutZeroWithSavedHash(const Key & key_, const State & state)
        : HashMapCell(key_, state)
    {
    }

    CellWithoutZeroWithSavedHash(const value_type & value_, const State & state)
        : HashMapCell(value_, state)
    {
    }

    /// Disabled experiment: keep the hash value inside the cell and use it
    /// to short-circuit cell-to-cell key comparison.
    //
    //  size_t saved_hash;
    //
    //  bool keyEquals(const Key & key_) const { return value.first == key_; }
    //  bool keyEquals(const CellWithoutZeroWithSavedHash & other) const { return saved_hash == other.saved_hash && value.first == other.value.first; }
    //
    //  void setHash(size_t hash_value) { saved_hash = hash_value; }
    //  size_t getHash(const DefaultHash<Key> & hash) const { return saved_hash; }
};

/** Growth policy for the hash table benchmarked in main().
  *
  * The table size is always a power of two, stored as its exponent
  * (size_degree, inherited from HashTableGrower<>). The table starts with
  * 2^16 cells and is considered overflowed when it is more than half full.
  *
  * All sizes are computed in size_t: shifting a plain int literal
  * (1 << size_degree) overflows once size_degree reaches 31.
  */
struct Grower : public HashTableGrower<>
{
    /// The state of this structure is enough to get the buffer size of the hash table.

    /// Determines the initial size of the hash table (as a power of two).
    static const size_t initial_size_degree = 16;
    Grower() { size_degree = initial_size_degree; }

    /// Disabled alternative: an explicitly stored fill limit of 90% of the buffer.
//    size_t max_fill = (1 << initial_size_degree) * 0.9;

    /// The size of the hash table in cells.
    size_t bufSize() const                { return static_cast<size_t>(1) << size_degree; }

    /// Maximum number of elements before the table must grow: half of the buffer.
    size_t maxFill() const                { return static_cast<size_t>(1) << (size_degree - 1); }
//    size_t maxFill() const                { return max_fill; }

    /// Bit mask that maps any value into [0, bufSize()).
    size_t mask() const                    { return bufSize() - 1; }

    /// From the hash value, get the cell number in the hash table.
    size_t place(size_t x) const         { return x & mask(); }

    /// The next cell in the collision resolution chain (wraps around at the end of the buffer).
    size_t next(size_t pos) const        { ++pos; return pos & mask(); }

    /// Whether the hash table is sufficiently full. You need to increase the size of the hash table, or remove something unnecessary from it.
    bool overflow(size_t elems) const    { return elems > maxFill(); }

    /// Increase the size of the hash table: x4 while the table is small (degree < 23), x2 afterwards.
    void increaseSize()
    {
        size_degree += size_degree >= 23 ? 1 : 2;
//        max_fill = (1 << size_degree) * 0.9;
    }

    /// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table.
    /// Not supported by this benchmark grower: always throws Poco::Exception carrying the function signature.
    void set(size_t num_elems)
    {
        throw Poco::Exception(__PRETTY_FUNCTION__);
    }
};


int main(int argc, char ** argv)
{
    size_t n = atoi(argv[1]);
    size_t m = atoi(argv[2]);

    std::vector<Key> data(n);

    std::cerr << "sizeof(Key) = " << sizeof(Key) << ", sizeof(Value) = " << sizeof(Value) << std::endl;

    {
        Stopwatch watch;
        DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);
        DB::CompressedReadBuffer in2(in1);

        in2.readStrict(reinterpret_cast<char*>(&data[0]), sizeof(data[0]) * n);

        watch.stop();
        std::cerr << std::fixed << std::setprecision(2)
            << "Vector. Size: " << n
            << ", elapsed: " << watch.elapsedSeconds()
            << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
            << std::endl;
    }

    if (m == 1)
    {
        Stopwatch watch;

//        using Map = HashMap<Key, Value>;

        /// Из-за WithoutZero быстрее на 0.7% (для не влезающей в L3-кэш) - 2.3% (для влезающей в L3-кэш).
        using Map = HashMapTable<Key, CellWithoutZeroWithSavedHash, DefaultHash<Key>, Grower>;

        Map map;
        Map::iterator it;
        bool inserted;

        for (size_t i = 0; i < n; ++i)
        {
            map.emplace(data[i], it, inserted);
            if (inserted)
                it->second = 0;
            ++it->second;
        }

        watch.stop();
        std::cerr << std::fixed << std::setprecision(2)
            << "HashMap. Size: " << map.size()
            << ", elapsed: " << watch.elapsedSeconds()
            << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
            << ", collisions: " << map.getCollisions()
#endif
            << std::endl;
    }

    if (m == 2)
    {
        Stopwatch watch;

        std::unordered_map<Key, Value, DefaultHash<Key> > map;
        for (size_t i = 0; i < n; ++i)
            ++map[data[i]];

        watch.stop();
        std::cerr << std::fixed << std::setprecision(2)
            << "std::unordered_map. Size: " << map.size()
            << ", elapsed: " << watch.elapsedSeconds()
            << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
            << std::endl;
    }

    if (m == 3)
    {
        Stopwatch watch;

        google::dense_hash_map<Key, Value, DefaultHash<Key> > map;
        map.set_empty_key(-1ULL);
        for (size_t i = 0; i < n; ++i)
              ++map[data[i]];

        watch.stop();
        std::cerr << std::fixed << std::setprecision(2)
            << "google::dense_hash_map. Size: " << map.size()
            << ", elapsed: " << watch.elapsedSeconds()
            << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
            << std::endl;
    }

    if (m == 4)
    {
        Stopwatch watch;

        google::sparse_hash_map<Key, Value, DefaultHash<Key> > map;
        for (size_t i = 0; i < n; ++i)
            ++map[data[i]];

        watch.stop();
        std::cerr << std::fixed << std::setprecision(2)
            << "google::sparse_hash_map. Size: " << map.size()
            << ", elapsed: " << watch.elapsedSeconds()
            << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
            << std::endl;
    }

    return 0;
}
dbms: development [#CONV-2944]. 2011-12-12 10:05:35 +00:00			`#include <iostream>`
			`#include <iomanip>`
			`#include <vector>`

Removed 'tr1' [#METR-2807]. 2014-01-08 16:33:28 +00:00			`#include <unordered_map>`
dbms: development [#CONV-2944]. 2011-12-12 10:05:35 +00:00
Using one file name to include sparse/dense hash (error if you use different style in one translation unit) [#CONV-7042]. 2013-04-09 19:15:52 +00:00			`#include <sparsehash/dense_hash_map>`
			`#include <sparsehash/sparse_hash_map>`
dbms: development [#CONV-2944]. 2011-12-19 02:00:40 +00:00
Moved file [#METR-17973]. 2015-10-05 00:44:40 +00:00			`#include <DB/Common/Stopwatch.h>`
dbms: development [#CONV-2944]. 2011-12-19 02:00:40 +00:00
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00			`//#define DBMS_HASH_MAP_COUNT_COLLISIONS`
			`#define DBMS_HASH_MAP_DEBUG_RESIZES`
dbms: development [#CONV-2944]. 2012-06-01 10:45:29 +00:00
dbms: development [#CONV-2944]. 2011-12-19 02:00:40 +00:00			`#include <DB/Core/Types.h>`
dbms: development [#CONV-2944]. 2012-06-01 10:45:29 +00:00			`#include <DB/IO/ReadBufferFromFile.h>`
			`#include <DB/IO/CompressedReadBuffer.h>`
dbms: unified hash tables: development [#METR-2944]. 2014-04-28 01:48:24 +00:00			`#include <DB/Common/HashTable/HashMap.h>`
dbms: development [#CONV-2944]. 2011-12-12 10:05:35 +00:00

Using std::shared_ptr for data types [#METR-21503]. 2016-05-28 10:35:44 +00:00			`using Key = UInt64;`
			`using Value = UInt64;`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
			`struct CellWithoutZeroWithSavedHash : public HashMapCell<Key, Value, DefaultHash<Key> >`
			`{`
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`// size_t saved_hash;`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`static constexpr bool need_zero_value_storage = false;`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`CellWithoutZeroWithSavedHash() : HashMapCell() {}`
			`CellWithoutZeroWithSavedHash(const Key & key_, const State & state) : HashMapCell(key_, state) {}`
			`CellWithoutZeroWithSavedHash(const value_type & value_, const State & state) : HashMapCell(value_, state) {}`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`/* bool keyEquals(const Key & key_) const { return value.first == key_; }`
			`bool keyEquals(const CellWithoutZeroWithSavedHash & other) const { return saved_hash == other.saved_hash && value.first == other.value.first; }`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`void setHash(size_t hash_value) { saved_hash = hash_value; }`
			`size_t getHash(const DefaultHash<Key> & hash) const { return saved_hash; }*/`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00			`};`

dbms: fixed error [#METR-2944]. 2014-05-03 16:03:49 +00:00			`struct Grower : public HashTableGrower<>`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00			`{`
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`/// Состояние этой структуры достаточно, чтобы получить размер буфера хэш-таблицы.`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`/// Определяет начальный размер хэш-таблицы.`
			`static const size_t initial_size_degree = 16;`
			`Grower() { size_degree = initial_size_degree; }`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`// size_t max_fill = (1 << initial_size_degree) * 0.9;`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`/// Размер хэш-таблицы в ячейках.`
			`size_t bufSize() const { return 1 << size_degree; }`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`size_t maxFill() const { return 1 << (size_degree - 1); }`
			`// size_t maxFill() const { return max_fill; }`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`size_t mask() const { return bufSize() - 1; }`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`/// Из значения хэш-функции получить номер ячейки в хэш-таблице.`
			`size_t place(size_t x) const { return x & mask(); }`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`/// Следующая ячейка в цепочке разрешения коллизий.`
			`size_t next(size_t pos) const { ++pos; return pos & mask(); }`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`/// Является ли хэш-таблица достаточно заполненной. Нужно увеличить размер хэш-таблицы, или удалить из неё что-нибудь ненужное.`
			`bool overflow(size_t elems) const { return elems > maxFill(); }`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`/// Увеличить размер хэш-таблицы.`
			`void increaseSize()`
			`{`
			`size_degree += size_degree >= 23 ? 1 : 2;`
			`// max_fill = (1 << size_degree) * 0.9;`
			`}`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`/// Установить размер буфера по количеству элементов хэш-таблицы. Используется при десериализации хэш-таблицы.`
			`void set(size_t num_elems)`
			`{`
			`throw Poco::Exception(__PRETTY_FUNCTION__);`
			`}`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00			`};`


dbms: development [#CONV-2944]. 2011-12-12 10:05:35 +00:00			`int main(int argc, char ** argv)`
			`{`
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`size_t n = atoi(argv[1]);`
			`size_t m = atoi(argv[2]);`

			`std::vector<Key> data(n);`

			`std::cerr << "sizeof(Key) = " << sizeof(Key) << ", sizeof(Value) = " << sizeof(Value) << std::endl;`

			`{`
			`Stopwatch watch;`
			`DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);`
			`DB::CompressedReadBuffer in2(in1);`

			`in2.readStrict(reinterpret_cast<char*>(&data[0]), sizeof(data[0]) * n);`

			`watch.stop();`
			`std::cerr << std::fixed << std::setprecision(2)`
			`<< "Vector. Size: " << n`
			`<< ", elapsed: " << watch.elapsedSeconds()`
			`<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"`
			`<< std::endl;`
			`}`

			`if (m == 1)`
			`{`
			`Stopwatch watch;`

			`// using Map = HashMap<Key, Value>;`

			`/// Из-за WithoutZero быстрее на 0.7% (для не влезающей в L3-кэш) - 2.3% (для влезающей в L3-кэш).`
			`using Map = HashMapTable<Key, CellWithoutZeroWithSavedHash, DefaultHash<Key>, Grower>;`

			`Map map;`
			`Map::iterator it;`
			`bool inserted;`

			`for (size_t i = 0; i < n; ++i)`
			`{`
			`map.emplace(data[i], it, inserted);`
			`if (inserted)`
			`it->second = 0;`
			`++it->second;`
			`}`

			`watch.stop();`
			`std::cerr << std::fixed << std::setprecision(2)`
			`<< "HashMap. Size: " << map.size()`
			`<< ", elapsed: " << watch.elapsedSeconds()`
			`<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00			`#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS`
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`<< ", collisions: " << map.getCollisions()`
dbms: experiments with hash tables [#METR-2944]. 2014-05-02 12:49:39 +00:00			`#endif`
Changed tabs to spaces in code [#CLICKHOUSE-3]. 2017-04-01 07:20:54 +00:00			`<< std::endl;`
			`}`

			`if (m == 2)`
			`{`
			`Stopwatch watch;`

			`std::unordered_map<Key, Value, DefaultHash<Key> > map;`
			`for (size_t i = 0; i < n; ++i)`
			`++map[data[i]];`

			`watch.stop();`
			`std::cerr << std::fixed << std::setprecision(2)`
			`<< "std::unordered_map. Size: " << map.size()`
			`<< ", elapsed: " << watch.elapsedSeconds()`
			`<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"`
			`<< std::endl;`
			`}`

			`if (m == 3)`
			`{`
			`Stopwatch watch;`

			`google::dense_hash_map<Key, Value, DefaultHash<Key> > map;`
			`map.set_empty_key(-1ULL);`
			`for (size_t i = 0; i < n; ++i)`
			`++map[data[i]];`

			`watch.stop();`
			`std::cerr << std::fixed << std::setprecision(2)`
			`<< "google::dense_hash_map. Size: " << map.size()`
			`<< ", elapsed: " << watch.elapsedSeconds()`
			`<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"`
			`<< std::endl;`
			`}`

			`if (m == 4)`
			`{`
			`Stopwatch watch;`

			`google::sparse_hash_map<Key, Value, DefaultHash<Key> > map;`
			`for (size_t i = 0; i < n; ++i)`
			`++map[data[i]];`

			`watch.stop();`
			`std::cerr << std::fixed << std::setprecision(2)`
			`<< "google::sparse_hash_map. Size: " << map.size()`
			`<< ", elapsed: " << watch.elapsedSeconds()`
			`<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"`
			`<< std::endl;`
			`}`

			`return 0;`
dbms: development [#CONV-2944]. 2011-12-12 10:05:35 +00:00			`}`