2014-11-09 12:43:09 +00:00
|
|
|
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <iomanip>
#include <vector>
#include <unordered_map>

#include <sparsehash/dense_hash_map>
#include <sparsehash/sparse_hash_map>
|
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Common/Stopwatch.h>
|
2014-11-09 12:43:09 +00:00
|
|
|
|
|
|
|
//#define DBMS_HASH_MAP_COUNT_COLLISIONS
|
|
|
|
#define DBMS_HASH_MAP_DEBUG_RESIZES
|
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Core/Types.h>
|
|
|
|
#include <IO/ReadBufferFromFile.h>
|
|
|
|
#include <IO/ReadHelpers.h>
|
|
|
|
#include <IO/CompressedReadBuffer.h>
|
2017-06-23 20:22:35 +00:00
|
|
|
#include <common/StringRef.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Common/HashTable/HashMap.h>
|
|
|
|
#include <Interpreters/AggregationCommon.h>
|
2014-11-09 12:43:09 +00:00
|
|
|
|
|
|
|
|
|
|
|
struct SmallStringRef
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
UInt32 size;
|
2014-11-09 12:43:09 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
union
|
|
|
|
{
|
|
|
|
const char * data_big;
|
|
|
|
char data_small[12];
|
|
|
|
};
|
2014-11-09 12:43:09 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
bool isSmall() const { return size <= 12; }
|
2014-11-09 12:43:09 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
const char * data() const
|
|
|
|
{
|
|
|
|
return isSmall() ? data_small : data_big;
|
|
|
|
}
|
2014-11-09 12:43:09 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
SmallStringRef(const char * data_, size_t size_)
|
|
|
|
{
|
|
|
|
size = size_;
|
2014-11-09 12:43:09 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (isSmall())
|
|
|
|
memcpy(data_small, data_, size_);
|
|
|
|
else
|
|
|
|
data_big = data_;
|
|
|
|
}
|
2014-11-09 12:43:09 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
SmallStringRef(const unsigned char * data_, size_t size_) : SmallStringRef(reinterpret_cast<const char *>(data_), size_) {}
|
2017-09-07 21:04:48 +00:00
|
|
|
explicit SmallStringRef(const std::string & s) : SmallStringRef(s.data(), s.size()) {}
|
2017-04-01 07:20:54 +00:00
|
|
|
SmallStringRef() {}
|
2014-11-09 12:43:09 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
std::string toString() const { return std::string(data(), size); }
|
2014-11-09 12:43:09 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/** Byte-wise equality of two SmallStringRef values.
  *
  * Strings of different length are never equal; two empty strings are equal
  * without looking at their data. Otherwise the first `size` bytes are compared:
  * with memequalSSE2Wide when SSE2 is available, with memcmp otherwise.
  */
inline bool operator==(SmallStringRef lhs, SmallStringRef rhs)
{
    if (lhs.size != rhs.size)
        return false;

    if (lhs.size == 0)
        return true;

#if defined(__SSE2__)
    return memequalSSE2Wide(lhs.data(), rhs.data(), lhs.size);
#else
    /// Portable fallback: without it equal non-empty strings would compare as different.
    return 0 == memcmp(lhs.data(), rhs.data(), lhs.size);
#endif
}
|
|
|
|
|
|
|
|
|
|
|
|
namespace ZeroTraits
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
template <>
|
|
|
|
inline bool check<SmallStringRef>(SmallStringRef x) { return x.size == 0; }
|
2014-11-09 12:43:09 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
template <>
|
|
|
|
inline void set<SmallStringRef>(SmallStringRef & x) { x.size = 0; }
|
2014-11-09 12:43:09 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
template <>
|
|
|
|
struct DefaultHash<SmallStringRef>
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t operator() (SmallStringRef x) const
|
|
|
|
{
|
|
|
|
return DefaultHash<StringRef>()(StringRef(x.data(), x.size));
|
|
|
|
}
|
2014-11-09 12:43:09 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2016-05-28 10:35:44 +00:00
|
|
|
/// Mapped type of the hash maps benchmarked in main(); used there as a per-key occurrence counter.
using Value = UInt64;
|
2014-11-09 12:43:09 +00:00
|
|
|
|
|
|
|
|
|
|
|
int main(int argc, char ** argv)
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t n = atoi(argv[1]);
|
|
|
|
size_t m = atoi(argv[2]);
|
|
|
|
|
|
|
|
DB::Arena pool;
|
|
|
|
std::vector<StringRef> data(n);
|
|
|
|
|
|
|
|
std::cerr << "sizeof(Key) = " << sizeof(SmallStringRef) << ", sizeof(Value) = " << sizeof(Value) << std::endl;
|
|
|
|
|
|
|
|
{
|
|
|
|
Stopwatch watch;
|
|
|
|
DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);
|
|
|
|
DB::CompressedReadBuffer in2(in1);
|
|
|
|
|
|
|
|
std::string tmp;
|
|
|
|
for (size_t i = 0; i < n && !in2.eof(); ++i)
|
|
|
|
{
|
|
|
|
DB::readStringBinary(tmp, in2);
|
|
|
|
data[i] = StringRef(pool.insert(tmp.data(), tmp.size()), tmp.size());
|
|
|
|
}
|
|
|
|
|
|
|
|
watch.stop();
|
|
|
|
std::cerr << std::fixed << std::setprecision(2)
|
|
|
|
<< "Vector. Size: " << n
|
|
|
|
<< ", elapsed: " << watch.elapsedSeconds()
|
|
|
|
<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
|
|
|
|
<< std::endl;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!m || m == 1)
|
|
|
|
{
|
|
|
|
Stopwatch watch;
|
|
|
|
|
|
|
|
using Map = HashMapWithSavedHash<StringRef, Value>;
|
|
|
|
|
|
|
|
Map map;
|
|
|
|
Map::iterator it;
|
|
|
|
bool inserted;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n; ++i)
|
|
|
|
{
|
|
|
|
map.emplace(data[i], it, inserted);
|
|
|
|
if (inserted)
|
|
|
|
it->second = 0;
|
|
|
|
++it->second;
|
|
|
|
}
|
|
|
|
|
|
|
|
watch.stop();
|
|
|
|
std::cerr << std::fixed << std::setprecision(2)
|
|
|
|
<< "HashMap (StringRef). Size: " << map.size()
|
|
|
|
<< ", elapsed: " << watch.elapsedSeconds()
|
|
|
|
<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
|
2014-11-09 12:43:09 +00:00
|
|
|
#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
|
2017-04-01 07:20:54 +00:00
|
|
|
<< ", collisions: " << map.getCollisions()
|
2014-11-09 12:43:09 +00:00
|
|
|
#endif
|
2017-04-01 07:20:54 +00:00
|
|
|
<< std::endl;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!m || m == 2)
|
|
|
|
{
|
|
|
|
Stopwatch watch;
|
|
|
|
|
|
|
|
using Map = HashMapWithSavedHash<SmallStringRef, Value>;
|
|
|
|
|
|
|
|
Map map;
|
|
|
|
Map::iterator it;
|
|
|
|
bool inserted;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n; ++i)
|
|
|
|
{
|
|
|
|
map.emplace(SmallStringRef(data[i].data, data[i].size), it, inserted);
|
|
|
|
if (inserted)
|
|
|
|
it->second = 0;
|
|
|
|
++it->second;
|
|
|
|
}
|
|
|
|
|
|
|
|
watch.stop();
|
|
|
|
std::cerr << std::fixed << std::setprecision(2)
|
|
|
|
<< "HashMap (SmallStringRef). Size: " << map.size()
|
|
|
|
<< ", elapsed: " << watch.elapsedSeconds()
|
|
|
|
<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
|
2014-11-09 12:43:09 +00:00
|
|
|
#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
|
2017-04-01 07:20:54 +00:00
|
|
|
<< ", collisions: " << map.getCollisions()
|
2014-11-09 12:43:09 +00:00
|
|
|
#endif
|
2017-04-01 07:20:54 +00:00
|
|
|
<< std::endl;
|
|
|
|
}
|
2014-11-09 12:43:09 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
return 0;
|
2014-11-09 12:43:09 +00:00
|
|
|
}
|