ClickHouse/dbms/src/Interpreters/tests/hash_map_string_small.cpp

183 lines
3.8 KiB
C++

#include <iostream>
#include <iomanip>
#include <vector>
#include <unordered_map>
#include <sparsehash/dense_hash_map>
#include <sparsehash/sparse_hash_map>
#include <statdaemons/Stopwatch.h>
//#define DBMS_HASH_MAP_COUNT_COLLISIONS
#define DBMS_HASH_MAP_DEBUG_RESIZES
#include <DB/Core/Types.h>
#include <DB/IO/ReadBufferFromFile.h>
#include <DB/IO/ReadHelpers.h>
#include <DB/IO/CompressedReadBuffer.h>
#include <DB/Core/StringRef.h>
#include <DB/Common/HashTable/HashMap.h>
#include <DB/Interpreters/AggregationCommon.h>
struct SmallStringRef
{
UInt32 size;
union
{
const char * data_big;
char data_small[12];
};
bool isSmall() const { return size <= 12; }
const char * data() const
{
return isSmall() ? data_small : data_big;
}
SmallStringRef(const char * data_, size_t size_)
{
size = size_;
if (isSmall())
memcpy(data_small, data_, size_);
else
data_big = data_;
}
SmallStringRef(const unsigned char * data_, size_t size_) : SmallStringRef(reinterpret_cast<const char *>(data_), size_) {}
SmallStringRef(const std::string & s) : SmallStringRef(s.data(), s.size()) {}
SmallStringRef() {}
std::string toString() const { return std::string(data(), size); }
};
inline bool operator==(SmallStringRef lhs, SmallStringRef rhs)
{
if (lhs.size != rhs.size)
return false;
if (lhs.size == 0)
return true;
return memequalSSE2Wide(lhs.data(), rhs.data(), lhs.size);
}
namespace ZeroTraits
{
template <>
inline bool check<SmallStringRef>(SmallStringRef x) { return x.size == 0; }
template <>
inline void set<SmallStringRef>(SmallStringRef & x) { x.size = 0; }
};
template <>
struct DefaultHash<SmallStringRef>
{
size_t operator() (SmallStringRef x) const
{
return DefaultHash<StringRef>()(StringRef(x.data(), x.size));
}
};
typedef UInt64 Value;
int main(int argc, char ** argv)
{
size_t n = atoi(argv[1]);
size_t m = atoi(argv[2]);
DB::Arena pool;
std::vector<StringRef> data(n);
std::cerr << "sizeof(Key) = " << sizeof(SmallStringRef) << ", sizeof(Value) = " << sizeof(Value) << std::endl;
{
Stopwatch watch;
DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);
DB::CompressedReadBuffer in2(in1);
std::string tmp;
for (size_t i = 0; i < n && !in2.eof(); ++i)
{
DB::readStringBinary(tmp, in2);
data[i] = StringRef(pool.insert(tmp.data(), tmp.size()), tmp.size());
}
watch.stop();
std::cerr << std::fixed << std::setprecision(2)
<< "Vector. Size: " << n
<< ", elapsed: " << watch.elapsedSeconds()
<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
<< std::endl;
}
if (!m || m == 1)
{
Stopwatch watch;
typedef HashMapWithSavedHash<StringRef, Value> Map;
Map map;
Map::iterator it;
bool inserted;
for (size_t i = 0; i < n; ++i)
{
map.emplace(data[i], it, inserted);
if (inserted)
it->second = 0;
++it->second;
}
watch.stop();
std::cerr << std::fixed << std::setprecision(2)
<< "HashMap (StringRef). Size: " << map.size()
<< ", elapsed: " << watch.elapsedSeconds()
<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
<< ", collisions: " << map.getCollisions()
#endif
<< std::endl;
}
if (!m || m == 2)
{
Stopwatch watch;
typedef HashMapWithSavedHash<SmallStringRef, Value> Map;
Map map;
Map::iterator it;
bool inserted;
for (size_t i = 0; i < n; ++i)
{
map.emplace(SmallStringRef(data[i].data, data[i].size), it, inserted);
if (inserted)
it->second = 0;
++it->second;
}
watch.stop();
std::cerr << std::fixed << std::setprecision(2)
<< "HashMap (SmallStringRef). Size: " << map.size()
<< ", elapsed: " << watch.elapsedSeconds()
<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
<< ", collisions: " << map.getCollisions()
#endif
<< std::endl;
}
return 0;
}