HashingReadBuffer: added test. fixed bug [#METR-10969]

Pavel Kartavyy 2014-06-06 17:50:09 +04:00 committed by Michael Kolupaev
parent b8b4a4a721
commit 133cbbab37
4 changed files with 116 additions and 43 deletions
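For orientation: the bug fixed here is in the block accumulation that feeds CityHash128WithSeed. When topping up a partially filled block, the old code copied `len` bytes instead of the `n` bytes that actually fit, and the whole-block loop compared against `block_pos` instead of `block_size` (the `ignore_before_this` workaround is also dropped in favour of hashing already-read data directly). The standalone sketch below mirrors only the corrected accumulation logic; `State`, `hashBlock` and the tiny `block_size` are stand-ins made up for the illustration, not the real uint128 / CityHash128WithSeed / DBMS_DEFAULT_HASHING_BLOCK_SIZE.

#include <cstdio>
#include <cstring>
#include <string>
#include <utility>

using State = std::pair<unsigned long long, unsigned long long>;

/// Toy seeded block hash standing in for CityHash128WithSeed.
static State hashBlock(const char * data, size_t len, State seed)
{
    unsigned long long h = seed.first ^ (seed.second + 0x9e3779b97f4a7c15ULL);
    for (size_t i = 0; i < len; ++i)
        h = (h ^ static_cast<unsigned char>(data[i])) * 1099511628211ULL;
    return {h, seed.first + len};
}

struct BlockHasher
{
    static const size_t block_size = 8;    /// far smaller than the real default, to exercise every branch
    char memory[block_size];
    size_t block_pos = 0;
    State state {0, 0};

    /// Same shape as the fixed calculateHash() in the diff.
    void calculateHash(const char * data, size_t len)
    {
        if (!len)
            return;

        /// still less than one full block in total: stash it, hash it later
        if (block_pos + len < block_size)
        {
            memcpy(memory + block_pos, data, len);
            block_pos += len;
            return;
        }

        /// top up the pending block with exactly n bytes (the bug copied len bytes here)
        if (block_pos)
        {
            size_t n = block_size - block_pos;
            memcpy(memory + block_pos, data, n);
            state = hashBlock(memory, block_size, state);
            len -= n;
            data += n;
            block_pos = 0;
        }

        /// consume whole blocks; the loop must compare against block_size, not block_pos
        while (len >= block_size)
        {
            state = hashBlock(data, block_size, state);
            len -= block_size;
            data += block_size;
        }

        /// keep the tail for the next call
        if (len)
        {
            memcpy(memory, data, len);
            block_pos = len;
        }
    }
};

int main()
{
    std::string payload = "bytes fed to the hasher in uneven chunks!!!!!!!!";

    BlockHasher whole;
    whole.calculateHash(payload.data(), payload.size());

    BlockHasher chunked;                                   /// same bytes, awkward chunk sizes
    chunked.calculateHash(payload.data(), 3);
    chunked.calculateHash(payload.data() + 3, 13);
    chunked.calculateHash(payload.data() + 16, payload.size() - 16);

    std::printf("%llu vs %llu\n", whole.state.first, chunked.state.first);   /// must be equal
    return 0;
}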

View File

@@ -14,13 +14,16 @@ class HashingReadBuffer : public BufferWithOwnMemory<ReadBuffer>
{
public:
HashingReadBuffer(ReadBuffer & in_, size_t block_size = DBMS_DEFAULT_HASHING_BLOCK_SIZE) :
block_pos(0), block_size(DBMS_DEFAULT_HASHING_BLOCK_SIZE), state(0, 0), in(in_), ignore_before_this(nullptr)
block_pos(0), block_size(DBMS_DEFAULT_HASHING_BLOCK_SIZE), state(0, 0), in(in_)
{
working_buffer = in.buffer();
pos = in.position();
/// if some part of the data has already been read before us, don't let that data affect the hash
if (in.position() != in.buffer().begin())
ignore_before_this = in.position();
/// compute the hash of the data that has already been read
if (working_buffer.size())
{
calculateHash(pos, working_buffer.end() - pos);
}
}
uint128 getHash()
@@ -39,19 +42,8 @@
state = CityHash128WithSeed(data, block_size, state);
}
bool nextImpl() override
void calculateHash(Position data, size_t len)
{
size_t len = working_buffer.size();
Position data = working_buffer.begin();
/// adjustment for data read before us
if (ignore_before_this)
{
len -= ignore_before_this - working_buffer.begin();
data = ignore_before_this;
ignore_before_this = nullptr;
}
if (len)
{
/// if there is less data than block_size, store it in our own buffer and compute its hash later
@@ -66,18 +58,18 @@
if (block_pos)
{
size_t n = block_size - block_pos;
memcpy(&memory[block_pos], data, len);
memcpy(&memory[block_pos], data, n);
append(&memory[0]);
len -= n;
data += n;
block_pos = 0;
}
while (len >= block_pos)
while (len >= block_size)
{
append(data);
len -= block_pos;
data += block_pos;
len -= block_size;
data += block_size;
}
if (len)
@@ -87,9 +79,17 @@
}
}
}
}
bool nextImpl() override
{
in.position() = pos;
bool res = in.next();
working_buffer = in.buffer();
pos = in.position();
calculateHash(working_buffer.begin(), working_buffer.size());
return res;
}
@@ -98,8 +98,5 @@
size_t block_size;
uint128 state;
ReadBuffer & in;
/// ignore data that has already been read
Position ignore_before_this;
};
}

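As context for the new test below, a minimal usage sketch of DB::HashingReadBuffer: wrap an existing ReadBuffer, read through the wrapper, then ask it for the hash of everything that passed through. The includes match those used by the test; treating uint128 as the usual CityHash pair of 64-bit halves (.first/.second) is an assumption of this sketch, which is an illustration and not part of the commit.

#include <DB/IO/ReadBufferFromIStream.h>
#include <DB/IO/HashingReadBuffer.h>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main()
{
    std::string data = "payload whose checksum we want as a side effect of reading";
    std::stringstream io(data);

    DB::ReadBufferFromIStream source(io);
    DB::HashingReadBuffer in(source);        /// hashes every byte that flows through it

    std::vector<char> sink(data.size());
    in.read(sink.data(), sink.size());

    uint128 h = in.getHash();                /// 128-bit CityHash state, assumed to be a pair of 64-bit halves
    std::cout << h.first << "_" << h.second << std::endl;
    return 0;
}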
View File

@@ -0,0 +1,22 @@
#include <DB/IO/HashingWriteBuffer.h>
#include <DB/IO/WriteBufferFromFile.h>
#define FAIL(msg) { std::cout << msg; exit(1); }
uint128 referenceHash(const char * data, size_t len)
{
const size_t block_size = DBMS_DEFAULT_HASHING_BLOCK_SIZE;
uint128 state(0, 0);
size_t pos;
for (pos = 0; pos + block_size <= len; pos += block_size)
{
state = CityHash128WithSeed(data + pos, block_size, state);
}
if (pos < len)
state = CityHash128WithSeed(data + pos, len - pos, state);
return state;
}

View File

@@ -0,0 +1,73 @@
#include <DB/IO/ReadBufferFromIStream.h>
#include <DB/IO/HashingReadBuffer.h>
#include <DB/IO/WriteBufferFromOStream.h>
#include "hashing_buffer.h"
#include <iostream>
void test(size_t data_size)
{
std::vector<char> vec(data_size);
char * data = &vec[0];
for (size_t i = 0; i < data_size; ++i)
data[i] = rand() & 255;
uint128 reference = referenceHash(data, data_size);
std::vector<size_t> block_sizes = {56, 128, 513, 2048, 3055, 4097, 4096};
for (size_t read_buffer_block_size : block_sizes)
{
std::cout << "block size " << read_buffer_block_size << std::endl;
std::stringstream io;
DB::WriteBufferFromOStream out_(io);
DB::HashingWriteBuffer out(out_);
out.write(data, data_size);
out.next();
//std::cout.write(data, data_size);
//std::cout << std::endl;
//std::cout << io.str() << std::endl;
DB::ReadBufferFromIStream source(io, read_buffer_block_size);
DB::HashingReadBuffer buf(source);
std::vector<char> read_buf(data_size);
buf.read(read_buf.data(), data_size);
bool failed_to_read = false;
for (size_t i = 0; i < data_size; ++i)
if (read_buf[i] != vec[i])
{
failed_to_read = true;
}
if (failed_to_read)
{
std::cout.write(data, data_size);
std::cout << std::endl;
std::cout.write(read_buf.data(), data_size);
std::cout << std::endl;
FAIL("Fail to read data");
}
if (buf.getHash() != reference)
{
std::cout << uint128ToString(buf.getHash()) << " " << uint128ToString(reference) << std::endl;
FAIL("failed on data size " << data_size << " reading by blocks of size " << read_buffer_block_size);
}
if (buf.getHash() != out.getHash())
FAIL("Hash of HashingReadBuffer doesn't match with hash of HashingWriteBuffer on data size " << data_size << " reading by blocks of size " << read_buffer_block_size);
}
}
int main()
{
test(5);
test(100);
test(2048);
test(2049);
test(100000);
test(1 << 17);
return 0;
}

View File

@@ -1,26 +1,7 @@
#include <DB/IO/HashingWriteBuffer.h>
#include <DB/IO/WriteBufferFromFile.h>
#define FAIL(msg) { std::cout << msg; exit(1); }
uint128 referenceHash(char * data, size_t len)
{
const size_t block_size = DBMS_DEFAULT_HASHING_BLOCK_SIZE;
uint128 state(0, 0);
size_t pos;
for (pos = 0; pos + block_size <= len; pos += block_size)
{
state = CityHash128WithSeed(data + pos, block_size, state);
}
if (pos < len)
state = CityHash128WithSeed(data + pos, len - pos, state);
return state;
}
#include "hashing_buffer.h"
void test(size_t data_size)
{