mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
SipHash: allowed not to call update for empty strings (less than 0.3% performance degradation on all test data) [#METR-16781].
This commit is contained in:
parent
33825e4de6
commit
bc17c73ee4
@ -76,6 +76,7 @@ public:
|
||||
v3 = 0x7465646279746573ULL ^ k1;
|
||||
|
||||
cnt = 0;
|
||||
current_word = 0;
|
||||
}
|
||||
|
||||
void update(const char * data, u64 size)
|
||||
|
71
dbms/src/Common/tests/sip_hash_perf.cpp
Normal file
71
dbms/src/Common/tests/sip_hash_perf.cpp
Normal file
@ -0,0 +1,71 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <iomanip>
|
||||
|
||||
#include <DB/Common/SipHash.h>
|
||||
#include <DB/IO/ReadBufferFromFileDescriptor.h>
|
||||
#include <DB/IO/ReadHelpers.h>
|
||||
#include <statdaemons/Stopwatch.h>
|
||||
|
||||
|
||||
/** How to test:
|
||||
*
|
||||
* clickhouse-client --query="SELECT SearchPhrase AS k FROM test.hits WHERE k != ''" > phrases.tsv
|
||||
* clickhouse-client --query="SELECT URL AS k FROM test.hits" > urls.tsv
|
||||
* clickhouse-client --query="SELECT SearchPhrase AS k FROM test.hits" > phrases_with_empty.tsv
|
||||
* clickhouse-client --query="SELECT Title AS k FROM test.hits" > titles.tsv
|
||||
* clickhouse-client --query="SELECT PageCharset AS k FROM test.hits" > charset.tsv
|
||||
*
|
||||
* for i in {1..1000}; do ./sip_hash_perf < titles.tsv 2>&1 | grep Processed | grep -oP '\d+\.\d+ rows/sec'; done | awk '{ if ($1 > x) { x = $1; print x } }'
|
||||
*/
|
||||
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
std::vector<std::string> data;
|
||||
DB::ReadBufferFromFileDescriptor in(STDIN_FILENO);
|
||||
|
||||
std::cerr << std::fixed << std::setprecision(3);
|
||||
|
||||
{
|
||||
Stopwatch watch;
|
||||
|
||||
while (!in.eof())
|
||||
{
|
||||
data.emplace_back();
|
||||
DB::readEscapedString(data.back(), in);
|
||||
DB::assertString("\n", in);
|
||||
}
|
||||
|
||||
double seconds = watch.elapsedSeconds();
|
||||
std::cerr << "Read "
|
||||
<< data.size() << " rows, "
|
||||
<< (in.count() / 1048576.0) << " MiB "
|
||||
<< " in " << seconds << " sec., "
|
||||
<< (data.size() / seconds) << " rows/sec., "
|
||||
<< (in.count() / 1048576.0 / seconds) << " MiB/sec.\n";
|
||||
}
|
||||
|
||||
{
|
||||
size_t res = 0;
|
||||
Stopwatch watch;
|
||||
|
||||
for (const auto & s : data)
|
||||
{
|
||||
SipHash hash;
|
||||
hash.update(s.data(), s.size());
|
||||
res += hash.get64();
|
||||
}
|
||||
|
||||
double seconds = watch.elapsedSeconds();
|
||||
std::cerr << "Processed "
|
||||
<< data.size() << " rows, "
|
||||
<< (in.count() / 1048576.0) << " MiB "
|
||||
<< " in " << seconds << " sec., "
|
||||
<< (data.size() / seconds) << " rows/sec., "
|
||||
<< (in.count() / 1048576.0 / seconds) << " MiB/sec. "
|
||||
<< "(res = " << res << ")\n";
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user