From 0ebb145d32dbcdd2cbfc15bb69d56d6a35be5bd5 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Thu, 13 Jun 2019 17:04:38 +0300 Subject: [PATCH] Post-PR fixes: * BitHelpers.cpp was removed, corresponding code was moved to the header * BitIO test as GTest-based test binary * gtest-based unit test for DoubleDelta and Gorilla codecs * getLeadingZeroBits from SFINAE to constexpr if * removed a couple of unneeded if's * Fixed sql-test to use one table --- dbms/src/Common/BitHelpers.h | 50 +-- .../CompressionCodecDoubleDelta.cpp | 83 ++--- .../Compression/CompressionCodecGorilla.cpp | 21 +- dbms/src/Compression/ICompressionCodec.cpp | 4 +- dbms/src/Compression/ICompressionCodec.h | 4 +- .../tests/gtest_compressionCodec.cpp | 310 ++++++++++++++++++ dbms/src/IO/BitHelpers.cpp | 135 -------- dbms/src/IO/BitHelpers.h | 145 ++++++-- dbms/src/IO/tests/CMakeLists.txt | 3 - dbms/src/IO/tests/bit_io.cpp | 188 ----------- dbms/src/IO/tests/gtest_bit_io.cpp | 188 +++++++++++ .../00950_column_encoding_double_delta.sql | 211 ------------ .../00950_column_encoding_gorilla.sql | 89 ----- ...> 00950_test_double_delta_codec.reference} | 1 - .../00950_test_double_delta_codec.sql | 151 +++++++++ ...nce => 00950_test_gorilla_codec.reference} | 1 - .../0_stateless/00950_test_gorilla_codec.sql | 51 +++ 17 files changed, 910 insertions(+), 725 deletions(-) create mode 100644 dbms/src/Compression/tests/gtest_compressionCodec.cpp delete mode 100644 dbms/src/IO/BitHelpers.cpp delete mode 100644 dbms/src/IO/tests/bit_io.cpp create mode 100644 dbms/src/IO/tests/gtest_bit_io.cpp delete mode 100644 dbms/tests/queries/0_stateless/00950_column_encoding_double_delta.sql delete mode 100644 dbms/tests/queries/0_stateless/00950_column_encoding_gorilla.sql rename dbms/tests/queries/0_stateless/{00950_column_encoding_double_delta.reference => 00950_test_double_delta_codec.reference} (94%) create mode 100644 dbms/tests/queries/0_stateless/00950_test_double_delta_codec.sql rename 
dbms/tests/queries/0_stateless/{00950_column_encoding_gorilla.reference => 00950_test_gorilla_codec.reference} (80%) create mode 100644 dbms/tests/queries/0_stateless/00950_test_gorilla_codec.sql diff --git a/dbms/src/Common/BitHelpers.h b/dbms/src/Common/BitHelpers.h index 860b0e87971..50e28819e9b 100644 --- a/dbms/src/Common/BitHelpers.h +++ b/dbms/src/Common/BitHelpers.h @@ -34,29 +34,41 @@ inline size_t roundUpToPowerOfTwoOrZero(size_t n) template -inline std::enable_if_t && (sizeof(T) <= sizeof(unsigned int)), int> -getLeadingZeroBits(T x) +inline size_t getLeadingZeroBits(T x) { - return x == 0 ? sizeof(x) * 8 : __builtin_clz(x); + if (!x) + return sizeof(x) * 8; + + if constexpr (sizeof(T) <= sizeof(unsigned int)) + { + return __builtin_clz(x); + } + else if constexpr (sizeof(T) <= sizeof(unsigned long int)) + { + return __builtin_clzl(x); + } + else + { + return __builtin_clzll(x); + } } template -inline std::enable_if_t && (sizeof(T) == sizeof(unsigned long long int)), int> -getLeadingZeroBits(T x) +inline size_t getTrailingZeroBits(T x) { - return x == 0 ? sizeof(x) * 8 : __builtin_clzll(x); -} + if (!x) + return sizeof(x) * 8; -template -inline std::enable_if_t && (sizeof(T) <= sizeof(unsigned int)), int> -getTrailingZeroBits(T x) -{ - return x == 0 ? sizeof(x) * 8 : __builtin_ctz(x); -} - -template -inline std::enable_if_t && (sizeof(T) == sizeof(unsigned long long int)), int> -getTrailingZeroBits(T x) -{ - return x == 0 ? 
sizeof(x) * 8 : __builtin_ctzll(x); + if constexpr (sizeof(T) <= sizeof(unsigned int)) + { + return __builtin_ctz(x); + } + else if constexpr (sizeof(T) <= sizeof(unsigned long int)) + { + return __builtin_ctzl(x); + } + else + { + return __builtin_ctzll(x); + } } diff --git a/dbms/src/Compression/CompressionCodecDoubleDelta.cpp b/dbms/src/Compression/CompressionCodecDoubleDelta.cpp index 7a274119572..682c5c03015 100644 --- a/dbms/src/Compression/CompressionCodecDoubleDelta.cpp +++ b/dbms/src/Compression/CompressionCodecDoubleDelta.cpp @@ -48,9 +48,43 @@ UInt32 getCompressedDataSize(UInt8 data_bytes_size, UInt32 uncompressed_size) return (items_count * max_item_size_bits + 8) / 8; } +struct WriteSpec +{ + const UInt8 prefix_bits; + const UInt8 prefix; + const UInt8 data_bits; +}; + +template +WriteSpec getWriteSpec(const T & value) +{ + if (value > -63 && value < 64) + { + return WriteSpec{2, 0b10, 7}; + } + else if (value > -255 && value < 256) + { + return WriteSpec{3, 0b110, 9}; + } + else if (value > -2047 && value < 2048) + { + return WriteSpec{4, 0b1110, 12}; + } + else if (value > std::numeric_limits::min() && value < std::numeric_limits::max()) + { + return WriteSpec{5, 0b11110, 32}; + } + else + { + return WriteSpec{5, 0b11111, 64}; + } +} + template UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) { + using UnsignedDeltaType = typename std::make_unsigned::type; + if (source_size % sizeof(T) != 0) throw Exception("Cannot compress, data size " + toString(source_size) + " is not aligned to " + toString(sizeof(T)), ErrorCodes::CANNOT_COMPRESS); const char * source_end = source + source_size; @@ -85,10 +119,9 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) WriteBuffer buffer(dest, getCompressedDataSize(sizeof(T), source_size - sizeof(T)*2)); BitWriter writer(buffer); - while (source < source_end) + for (; source < source_end; source += sizeof(T)) { const T curr_value = unalignedLoad(source); - 
source += sizeof(curr_value); const auto delta = curr_value - prev_value; const DeltaType double_delta = static_cast(delta - static_cast(prev_delta)); @@ -103,37 +136,12 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) else { const auto sign = std::signbit(double_delta); - const auto abs_value = static_cast::type>(std::abs(double_delta)); - if (double_delta > -63 && double_delta < 64) - { - writer.writeBits(2, 0b10); - writer.writeBits(1, sign); - writer.writeBits(6, abs_value); - } - else if (double_delta > -255 && double_delta < 256) - { - writer.writeBits(3, 0b110); - writer.writeBits(1, sign); - writer.writeBits(8, abs_value); - } - else if (double_delta > -2047 && double_delta < 2048) - { - writer.writeBits(4, 0b1110); - writer.writeBits(1, sign); - writer.writeBits(11, abs_value); - } - else if (double_delta > std::numeric_limits::min() && double_delta < std::numeric_limits::max()) - { - writer.writeBits(5, 0b11110); - writer.writeBits(1, sign); - writer.writeBits(31, abs_value); - } - else - { - writer.writeBits(5, 0b11111); - writer.writeBits(1, sign); - writer.writeBits(63, abs_value); - } + const auto abs_value = static_cast(std::abs(double_delta)); + const auto write_spec = getWriteSpec(double_delta); + + writer.writeBits(write_spec.prefix_bits, write_spec.prefix); + writer.writeBits(1, sign); + writer.writeBits(write_spec.data_bits - 1, abs_value); } } @@ -180,13 +188,8 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest) for (UInt32 items_read = 2; items_read < items_count && !reader.eof(); ++items_read) { DeltaType double_delta = 0; - if (reader.readBit() == 0) + if (reader.readBit() == 1) { - double_delta = 0; - } - else - { - // first bit is 1 const UInt8 data_sizes[] = {6, 8, 11, 31, 63}; UInt8 i = 0; for (; i < sizeof(data_sizes) - 1; ++i) @@ -203,6 +206,8 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest) double_delta *= -1; } } + // else if first 
bit is zero, no need to read more data. + const T curr_value = static_cast(prev_value + prev_delta + double_delta); unalignedStore(dest, curr_value); dest += sizeof(curr_value); diff --git a/dbms/src/Compression/CompressionCodecGorilla.cpp b/dbms/src/Compression/CompressionCodecGorilla.cpp index 6cbc76d7450..f1c2f0f0abc 100644 --- a/dbms/src/Compression/CompressionCodecGorilla.cpp +++ b/dbms/src/Compression/CompressionCodecGorilla.cpp @@ -140,7 +140,6 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) writer.writeBits(curr_xored_info.data_bits, xored_data >> curr_xored_info.trailing_zero_bits); prev_xored_info = curr_xored_info; } - std::cerr << std::endl; prev_value = curr_value; } @@ -182,29 +181,20 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest) // we have to keep track of items to avoid reading more that there is. for (UInt32 items_read = 1; items_read < items_count && !reader.eof(); ++items_read) { - T curr_value{}; - binary_value_info curr_xored_info; + T curr_value = prev_value; + binary_value_info curr_xored_info = prev_xored_info; T xored_data{}; - if (reader.readBit() == 0) + if (reader.readBit() == 1) { - // 0b0 prefix - curr_value = prev_value; - } - else - { - if (reader.readBit() == 0) - { - // 0b10 prefix - curr_xored_info = prev_xored_info; - } - else + if (reader.readBit() == 1) { // 0b11 prefix curr_xored_info.leading_zero_bits = reader.readBits(LEADING_ZEROES_BIT_LENGTH); curr_xored_info.data_bits = reader.readBits(DATA_BIT_LENGTH); curr_xored_info.trailing_zero_bits = sizeof(T) * 8 - curr_xored_info.leading_zero_bits - curr_xored_info.data_bits; } + // else: 0b10 prefix - use prev_xored_info if (curr_xored_info.leading_zero_bits == 0 && curr_xored_info.data_bits == 0 @@ -218,6 +208,7 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest) xored_data <<= curr_xored_info.trailing_zero_bits; curr_value = prev_value ^ xored_data; } + // else: 0b0 prefix 
- use prev_value unalignedStore(dest, curr_value); dest += sizeof(curr_value); diff --git a/dbms/src/Compression/ICompressionCodec.cpp b/dbms/src/Compression/ICompressionCodec.cpp index f9707d7c9da..ddedf8a4c9c 100644 --- a/dbms/src/Compression/ICompressionCodec.cpp +++ b/dbms/src/Compression/ICompressionCodec.cpp @@ -29,7 +29,7 @@ namespace ErrorCodes } -UInt32 ICompressionCodec::compress(char * source, UInt32 source_size, char * dest) const +UInt32 ICompressionCodec::compress(const char * source, UInt32 source_size, char * dest) const { dest[0] = getMethodByte(); UInt8 header_size = getHeaderSize(); @@ -41,7 +41,7 @@ UInt32 ICompressionCodec::compress(char * source, UInt32 source_size, char * des } -UInt32 ICompressionCodec::decompress(char * source, UInt32 source_size, char * dest) const +UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, char * dest) const { UInt8 method = source[0]; if (method != getMethodByte()) diff --git a/dbms/src/Compression/ICompressionCodec.h b/dbms/src/Compression/ICompressionCodec.h index 040cb84c5eb..27630a79a0b 100644 --- a/dbms/src/Compression/ICompressionCodec.h +++ b/dbms/src/Compression/ICompressionCodec.h @@ -35,10 +35,10 @@ public: virtual String getCodecDesc() const = 0; /// Compressed bytes from uncompressed source to dest. Dest should preallocate memory - virtual UInt32 compress(char * source, UInt32 source_size, char * dest) const; + virtual UInt32 compress(const char * source, UInt32 source_size, char * dest) const; /// Decompress bytes from compressed source to dest. 
Dest should preallocate memory - virtual UInt32 decompress(char * source, UInt32 source_size, char * dest) const; + virtual UInt32 decompress(const char * source, UInt32 source_size, char * dest) const; /// Number of bytes, that will be used to compress uncompressed_size bytes with current codec virtual UInt32 getCompressedReserveSize(UInt32 uncompressed_size) const { return getHeaderSize() + getMaxCompressedDataSize(uncompressed_size); } diff --git a/dbms/src/Compression/tests/gtest_compressionCodec.cpp b/dbms/src/Compression/tests/gtest_compressionCodec.cpp new file mode 100644 index 00000000000..7cffcbf5a6b --- /dev/null +++ b/dbms/src/Compression/tests/gtest_compressionCodec.cpp @@ -0,0 +1,310 @@ +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace +{ +using namespace DB; + +template +::testing::AssertionResult EqualContainers(const ContainerLeft & left, const ContainerRight & right) +{ + const auto MAX_MISMATCHING_ITEMS = 5; + + const auto l_size = std::size(left); + const auto r_size = std::size(right); + const auto size = std::min(l_size, r_size); + + ::testing::AssertionResult result = ::testing::AssertionSuccess(); + size_t mismatching_items = 0; + + if (l_size != r_size) + { + result = ::testing::AssertionFailure() << "size mismatch" << " expected: " << l_size << " got:" << r_size; + } + + for (size_t i = 0; i < size; ++i) + { + if (left[i] != right[i]) + { + if (result) + { + result = ::testing::AssertionFailure(); + } + result << "pos " << i << ": " + << " expected: " << std::hex << left[i] + << " got:" << std::hex << right[i] + << std::endl; + + if (++mismatching_items >= MAX_MISMATCHING_ITEMS) + { + result << "..." 
<< std::endl; + break; + } + } + } + + return result; +} + +template +const char* type_name() +{ + return typeid(T).name(); +} + +template <> +const char* type_name() +{ + return "uint32"; +} + +template <> +const char* type_name() +{ + return "int32"; +} + +template <> +const char* type_name() +{ + return "uint64"; +} + +template <> +const char* type_name() +{ + return "int64"; +} + +struct CodecTestParam +{ + std::vector source_data; + UInt8 data_byte_size; + std::string case_name; +}; + +std::ostream & operator<<(std::ostream & ostr, const CodecTestParam & param) +{ + return ostr << "name: " << param.case_name + << "\nbyte size: " << static_cast(param.data_byte_size) + << "\ndata size: " << param.source_data.size(); +} + +template +std::string to_string(Args && ... args) +{ + std::ostringstream ostr; + (ostr << ... << std::forward(args)); + + return ostr.str(); +} + +template +CodecTestParam makeParam(Args && ... args) +{ + std::initializer_list vals{static_cast(args)...}; + std::vector data(sizeof(T) * std::size(vals)); + + char * write_pos = data.data(); + for (const auto & v : vals) + { + unalignedStore(write_pos, v); + write_pos += sizeof(v); + } + + return CodecTestParam{std::move(data), sizeof(T), to_string(data.size(), " predefined values")}; +} + +template +CodecTestParam generateParam(Generator gen, const char* gen_name) +{ + static_assert (End >= Begin, "End must be not less than Begin"); + + std::vector data(sizeof(T) * (End - Begin)); + char * write_pos = data.data(); + + for (size_t i = Begin; i < End; ++i) + { + const T v = gen(static_cast(i)); + unalignedStore(write_pos, v); + write_pos += sizeof(v); + } + + return CodecTestParam{std::move(data), sizeof(T), + to_string(type_name(), " from ", gen_name, "(", Begin, " => ", End, ")")}; +} + +void TestTranscoding(ICompressionCodec * codec, const CodecTestParam & param) +{ + const auto & source_data = param.source_data; + + const UInt32 encoded_max_size = 
codec->getCompressedReserveSize(source_data.size()); + PODArray encoded(encoded_max_size); + + const UInt32 encoded_size = codec->compress(source_data.data(), source_data.size(), encoded.data()); + encoded.resize(encoded_size); + + PODArray decoded(source_data.size()); + const UInt32 decoded_size = codec->decompress(encoded.data(), encoded.size(), decoded.data()); + decoded.resize(decoded_size); + + ASSERT_TRUE(EqualContainers(source_data, decoded)); +} + +class CodecTest : public ::testing::TestWithParam +{}; + +TEST_P(CodecTest, DoubleDelta) +{ + const auto & param = GetParam(); + auto codec = std::make_unique(param.data_byte_size); + + TestTranscoding(codec.get(), param); +} + +TEST_P(CodecTest, Gorilla) +{ + const auto & param = GetParam(); + auto codec = std::make_unique(param.data_byte_size); + + TestTranscoding(codec.get(), param); +} + +auto SameValueGenerator = [](auto value) +{ + return [=](auto i) + { + return static_cast(value); + }; +}; + +auto SequentialGenerator = [](auto stride = 1) +{ + return [=](auto i) + { + using ValueType = decltype(i); + return static_cast(stride * i); + }; +}; + +template +struct MonotonicGenerator +{ + MonotonicGenerator(T stride = 1, size_t max_step = 10) + : prev_value{}, + stride(stride), + max_step(max_step) + {} + + template + U operator()(U i) + { + if (!prev_value.has_value()) + { + prev_value = i * stride; + } + + const U result = *prev_value + static_cast(stride * (rand() % max_step)); + + prev_value = result; + return result; + } + + std::optional prev_value; + const T stride; + const size_t max_step; +}; + +auto MinMaxGenerator = [](auto i) +{ + if (i % 2 == 0) + { + return std::numeric_limits::min(); + } + else + { + return std::numeric_limits::max(); + } +}; + +auto RandomGenerator = [](auto i) {return static_cast(rand());}; + +INSTANTIATE_TEST_CASE_P(Basic, + CodecTest, + ::testing::Values( + makeParam(1, 2, 3, 4), + makeParam(1, 2, 3, 4), + makeParam(1.1, 2.2, 3.3, 4.4), + makeParam(1.1, 2.2, 3.3, 4.4) + ) 
+); + +#define G(generator) generator, #generator + +INSTANTIATE_TEST_CASE_P(Same, + CodecTest, + ::testing::Values( + generateParam(G(SameValueGenerator(1000))), + generateParam(G(SameValueGenerator(-1000))), + generateParam(G(SameValueGenerator(1000))), + generateParam(G(SameValueGenerator(-1000))), + generateParam(G(SameValueGenerator(M_E))), + generateParam(G(SameValueGenerator(M_E))) + ) +); + +INSTANTIATE_TEST_CASE_P(Sequential, + CodecTest, + ::testing::Values( + generateParam(G(SequentialGenerator(1))), + generateParam(G(SequentialGenerator(-1))), + generateParam(G(SequentialGenerator(1))), + generateParam(G(SequentialGenerator(-1))), + generateParam(G(SequentialGenerator(M_E))), + generateParam(G(SequentialGenerator(M_E))) + ) +); + +INSTANTIATE_TEST_CASE_P(Monotonic, + CodecTest, + ::testing::Values( + generateParam(G(MonotonicGenerator(1, 5))), + generateParam(G(MonotonicGenerator(-1, 5))), + generateParam(G(MonotonicGenerator(1, 5))), + generateParam(G(MonotonicGenerator(-1, 5))), + generateParam(G(MonotonicGenerator(M_E, 5))), + generateParam(G(MonotonicGenerator(M_E, 5))) + ) +); + +INSTANTIATE_TEST_CASE_P(Random, + CodecTest, + ::testing::Values( + generateParam(G(RandomGenerator)), + generateParam(G(RandomGenerator)) + ) +); + +INSTANTIATE_TEST_CASE_P(Overflow, + CodecTest, + ::testing::Values( + generateParam(G(MinMaxGenerator)), + generateParam(G(MinMaxGenerator)), + generateParam(G(MinMaxGenerator)), + generateParam(G(MinMaxGenerator)) + ) +); + +} diff --git a/dbms/src/IO/BitHelpers.cpp b/dbms/src/IO/BitHelpers.cpp deleted file mode 100644 index e735ef2ebbe..00000000000 --- a/dbms/src/IO/BitHelpers.cpp +++ /dev/null @@ -1,135 +0,0 @@ -#include "BitHelpers.h" - -#include - -namespace -{ -const DB::UInt8 MAX_BUFFER_SIZE_BITS = 8; -} - -namespace DB -{ - -BitReader::BitReader(ReadBuffer & buf_) - : buf(buf_), - bits_buffer(0), - bits_count(0) -{} - -BitReader::~BitReader() -{} - -UInt64 BitReader::readBits(UInt8 bits) -{ - UInt64 result = 0; - bits 
= std::min(static_cast(sizeof(result) * 8), bits); - - while (bits != 0) - { - if (bits_count == 0) - { - fillBuffer(); - if (bits_count == 0) - { - // EOF. - break; - } - } - - const auto to_read = std::min(bits, bits_count); - // read MSB bits from bits_bufer - const UInt8 v = bits_buffer >> (bits_count - to_read); - const UInt8 mask = static_cast(~(~0U << to_read)); - const UInt8 value = v & mask; - result |= value; - - // unset MSB that were read - bits_buffer &= ~(mask << (bits_count - to_read)); - bits_count -= to_read; - bits -= to_read; - - result <<= std::min(bits, static_cast(sizeof(bits_buffer)*8)); - } - - return result; -} - -UInt8 BitReader::readBit() -{ - return static_cast(readBits(1)); -} - -bool BitReader::eof() const -{ - return bits_count == 0 && buf.eof(); -} - -void BitReader::fillBuffer() -{ - auto read = buf.read(reinterpret_cast(&bits_buffer), MAX_BUFFER_SIZE_BITS/8); - bits_count = static_cast(read) * 8; -} - -BitWriter::BitWriter(WriteBuffer & buf_) - : buf(buf_), - bits_buffer(0), - bits_count(0) -{} - -BitWriter::~BitWriter() -{ - flush(); -} - -void BitWriter::writeBits(UInt8 bits, UInt64 value) -{ - bits = std::min(static_cast(sizeof(value) * 8), bits); - - while (bits > 0) - { - auto v = value; - auto to_write = bits; - - const UInt8 capacity = MAX_BUFFER_SIZE_BITS - bits_count; - if (capacity < bits) - { - // write MSB: - v >>= bits - capacity; - to_write = capacity; - } - - - const UInt64 mask = (1 << to_write) - 1; - v &= mask; - assert(v <= 255); - - bits_buffer <<= to_write; - bits_buffer |= v; - bits_count += to_write; - - if (bits_count < MAX_BUFFER_SIZE_BITS) - break; - - doFlush(); - bits -= to_write; - } -} - -void BitWriter::flush() -{ - if (bits_count != 0) - { - bits_buffer <<= (MAX_BUFFER_SIZE_BITS - bits_count); - doFlush(); - } -} - -void BitWriter::doFlush() -{ - buf.write(reinterpret_cast(&bits_buffer), MAX_BUFFER_SIZE_BITS/8); - - bits_count = 0; - bits_buffer = 0; -} - -} // namespace DB diff --git 
a/dbms/src/IO/BitHelpers.h b/dbms/src/IO/BitHelpers.h index 2228185ed30..a025e241882 100644 --- a/dbms/src/IO/BitHelpers.h +++ b/dbms/src/IO/BitHelpers.h @@ -7,10 +7,10 @@ namespace DB { -/** Reads data from underlying ReadBuffer in bit by bit, max 64 bits at once. +/** Reads data from underlying ReadBuffer bit by bit, max 64 bits at once. * * reads MSB bits first, imagine that you have a data: - * 11110000 10101010 00100100 11111111 + * 11110000 10101010 00100100 11111110 * * Given that r is BitReader created with a ReadBuffer that reads from data above: * r.readBits(3) => 0b111 @@ -19,31 +19,84 @@ namespace DB * r.readBit() => 0b1 * r.readBit() => 0b0 * r.readBits(15) => 0b10001001001111111 - * r.readBit() => 0b1 + * r.readBit() => 0b0 **/ + class BitReader { ReadBuffer & buf; UInt8 bits_buffer; UInt8 bits_count; + static constexpr UInt8 BIT_BUFFER_SIZE = sizeof(bits_buffer) * 8; public: - BitReader(ReadBuffer & buf_); - ~BitReader(); + BitReader(ReadBuffer & buf_) + : buf(buf_), + bits_buffer(0), + bits_count(0) + {} - BitReader(BitReader &&) = default; + ~BitReader() + {} - // bits is at most 64 - UInt64 readBits(UInt8 bits); - UInt8 readBit(); + inline UInt64 readBits(UInt8 bits) + { + UInt64 result = 0; + bits = std::min(static_cast(sizeof(result) * 8), bits); - // true when both bit-buffer and underlying byte-buffer are empty. - bool eof() const; + while (bits != 0) + { + if (bits_count == 0) + { + fillBuffer(); + if (bits_count == 0) + { + // EOF. 
+ break; + } + } + + const auto to_read = std::min(bits, bits_count); + // read MSB bits from bits_buffer + const UInt8 v = bits_buffer >> (bits_count - to_read); + const UInt8 mask = static_cast(~(~0U << to_read)); + const UInt8 value = v & mask; + result |= value; + + // unset MSB that were read + bits_buffer &= ~(mask << (bits_count - to_read)); + bits_count -= to_read; + bits -= to_read; + + result <<= std::min(bits, BIT_BUFFER_SIZE); + } + + return result; + } + + inline UInt64 peekBits(UInt8 /*bits*/) + { + return 0; + } + + inline UInt8 readBit() + { + return static_cast(readBits(1)); + } + + inline bool eof() const + { + return bits_count == 0 && buf.eof(); + } private: - void fillBuffer(); + void fillBuffer() + { + auto read = buf.read(reinterpret_cast(&bits_buffer), BIT_BUFFER_SIZE / 8); + bits_count = static_cast(read) * 8; + } }; class BitWriter @@ -53,19 +106,71 @@ class BitWriter UInt8 bits_buffer; UInt8 bits_count; + static constexpr UInt8 BIT_BUFFER_SIZE = sizeof(bits_buffer) * 8; + public: - BitWriter(WriteBuffer & buf_); - ~BitWriter(); + BitWriter(WriteBuffer & buf_) + : buf(buf_), + bits_buffer(0), + bits_count(0) + {} - BitWriter(BitWriter &&) = default; + ~BitWriter() + { + flush(); + } - // write `size` low bits of the `value`. 
- void writeBits(UInt8 size, UInt64 value); + inline void writeBits(UInt8 bits, UInt64 value) + { + bits = std::min(static_cast(sizeof(value) * 8), bits); - void flush(); + while (bits > 0) + { + auto v = value; + auto to_write = bits; + + const UInt8 capacity = BIT_BUFFER_SIZE - bits_count; + if (capacity < bits) + { + // write MSB: + v >>= bits - capacity; + to_write = capacity; + } + + + const UInt64 mask = (1 << to_write) - 1; + v &= mask; +// assert(v <= 255); + + bits_buffer <<= to_write; + bits_buffer |= v; + bits_count += to_write; + + if (bits_count < BIT_BUFFER_SIZE) + break; + + doFlush(); + bits -= to_write; + } + } + + inline void flush() + { + if (bits_count != 0) + { + bits_buffer <<= (BIT_BUFFER_SIZE - bits_count); + doFlush(); + } + } private: - void doFlush(); + void doFlush() + { + buf.write(reinterpret_cast(&bits_buffer), BIT_BUFFER_SIZE / 8); + + bits_count = 0; + bits_buffer = 0; + } }; -} // namespace DB +} diff --git a/dbms/src/IO/tests/CMakeLists.txt b/dbms/src/IO/tests/CMakeLists.txt index 71190d11942..127dc45d9bb 100644 --- a/dbms/src/IO/tests/CMakeLists.txt +++ b/dbms/src/IO/tests/CMakeLists.txt @@ -82,6 +82,3 @@ target_link_libraries (zlib_ng_bug PRIVATE ${Poco_Foundation_LIBRARY}) if(NOT USE_INTERNAL_POCO_LIBRARY) target_include_directories(zlib_ng_bug SYSTEM BEFORE PRIVATE ${Poco_INCLUDE_DIRS}) endif() - -add_executable(bit_io bit_io.cpp) -target_link_libraries (bit_io PRIVATE clickhouse_common_io) diff --git a/dbms/src/IO/tests/bit_io.cpp b/dbms/src/IO/tests/bit_io.cpp deleted file mode 100644 index 4bb2d5012d0..00000000000 --- a/dbms/src/IO/tests/bit_io.cpp +++ /dev/null @@ -1,188 +0,0 @@ - -#include - -#include -#include -#include - -#include -#include -#include -#include - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-const-variable" -#pragma GCC diagnostic ignored "-Wunused-variable" - -namespace -{ -using namespace DB; - -// Intentionally asymetric both byte and word-size to detect read and write 
inconsistencies -// each prime bit is set to 0. -// v-61 v-53 v-47 v-41 v-37 v-31 v-23 v-17 v-11 v-5 -const UInt64 BIT_PATTERN = 0b11101011'11101111'10111010'11101111'10101111'10111010'11101011'10101001; -const UInt8 PRIMES[] = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61}; -const UInt8 REPEAT_TIMES = 11; - -template -std::string bin(const T & value, size_t bits = sizeof(T)*8) -{ - static const UInt8 MAX_BITS = sizeof(T)*8; - assert(bits <= MAX_BITS); - - return std::bitset(static_cast(value)) - .to_string().substr(MAX_BITS - bits, bits); -} - -template -T getBits(UInt8 bits, const T & value) -{ - const T mask = ((static_cast(1) << static_cast(bits)) - 1); - return value & mask; -} - -std::ostream & dumpBuffer(const char * begin, const char * end, std::ostream * destination, const char* col_sep = " ", const char* row_sep = "\n", const size_t cols_in_row = 8, UInt32 max_bytes = 0xFFFFFFFF) -{ - size_t col = 0; - for (auto p = begin; p < end && p - begin < max_bytes; ++p) - { - *destination << bin(*p); - if (++col % cols_in_row == 0) - { - if (row_sep) - *destination << row_sep; - } - else if (col_sep) - { - *destination << col_sep; - } - } - - return *destination; -} - -std::ostream & dumpBufferContents(BufferBase & buffer, std::ostream * destination, const char* col_sep = " ", const char* row_sep = "\n", const size_t cols_in_row = 8, UInt32 max_bytes = 0xFFFFFFFF) -{ - const auto & data = buffer.buffer(); - return dumpBuffer(data.begin(), data.end(), destination, col_sep, row_sep, cols_in_row, max_bytes); -} - -std::string dumpBufferContents(BufferBase & buffer, const char* col_sep = " ", const char* row_sep = "\n", const size_t cols_in_row = 8) -{ - std::stringstream sstr; - dumpBufferContents(buffer, &sstr, col_sep, row_sep, cols_in_row); - - return sstr.str(); -} - - -bool test(const std::vector> & bits_and_vals, const char * expected_buffer_binary = nullptr) -{ - MemoryWriteBuffer memory_write_buffer(1024, 1024, 1.5, 20*1024); - - { - 
BitWriter writer(memory_write_buffer); - for (const auto & bv : bits_and_vals) - { - writer.writeBits(bv.first, bv.second); - } - writer.flush(); - } - - { - auto memory_read_buffer = memory_write_buffer.tryGetReadBuffer(); - - if (expected_buffer_binary != nullptr) - { - const auto actual_buffer_binary = dumpBufferContents(*memory_read_buffer, " ", " "); - if (actual_buffer_binary != expected_buffer_binary) - { - std::cerr << "Invalid buffer memory after writing\n" - << "expected: " << strlen(expected_buffer_binary) << "\n" << expected_buffer_binary - << "\ngot: " << actual_buffer_binary.size() << "\n" << actual_buffer_binary - << std::endl; - - return false; - } - } - - BitReader reader(*memory_read_buffer); - - int item = 0; - for (const auto & bv : bits_and_vals) - { - const auto expected_value = getBits(bv.first, bv.second); - - const auto actual_value = reader.readBits(bv.first); - - if (expected_value != actual_value) - { - std::cerr << "Invalid value #" << item << " with " << static_cast(bv.first) << ", " << bin(bv.second) << "\n" - << "\texpected: " << bin(expected_value) << "\n" - << "\tgot : " << bin(actual_value) << ".\n\n\nBuffer memory:\n"; - dumpBufferContents(*memory_read_buffer, &std::cerr) << std::endl << std::endl; - - return false; - } - ++item; - } - } - - return true; -} - -bool primes_test() -{ - std::vector> test_data; - MemoryWriteBuffer memory_write_buffer; - - { - for (UInt8 r = 0; r < REPEAT_TIMES; ++r) - { - for (const auto p : PRIMES) - { - test_data.emplace_back(p, BIT_PATTERN); - } - } - } - - return test(test_data); -} - -void simple_test(UInt8 bits, UInt64 value) -{ - test({{bits, value}}); -} - -} // namespace - -int main() -{ - UInt32 test_case = 0; - for (const auto p : PRIMES) - { - simple_test(p, 0xFFFFFFFFFFFFFFFF); - std::cout << ++test_case << " with all-ones and " << static_cast(p) << std::endl; - } - - for (const auto p : PRIMES) - { - simple_test(p, BIT_PATTERN); - std::cout << ++test_case << " with fancy bit pattern and 
" << static_cast(p) << std::endl; - } - - test({{9, 0xFFFFFFFF}, {9, 0x00}, {9, 0xFFFFFFFF}, {9, 0x00}, {9, 0xFFFFFFFF}}, - "11111111 10000000 00111111 11100000 00001111 11111000 "); - - test({{7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {3, 0xFFFF}}, - "01111110 11111101 11111011 11110111 11101111 11011111 10111111 01111111 11000000 "); - - test({{33, 0xFF110d0b07050300}, {33, 0xAAEE29251f1d1713}, }); - test({{33, BIT_PATTERN}, {33, BIT_PATTERN}}); - - std::cout << ++test_case << " primes " << std::endl; - primes_test(); - - return 0; -} diff --git a/dbms/src/IO/tests/gtest_bit_io.cpp b/dbms/src/IO/tests/gtest_bit_io.cpp new file mode 100644 index 00000000000..e2243dfb616 --- /dev/null +++ b/dbms/src/IO/tests/gtest_bit_io.cpp @@ -0,0 +1,188 @@ +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include + +//#pragma GCC diagnostic push +//#pragma GCC diagnostic ignored "-Wunused-const-variable" +//#pragma GCC diagnostic ignored "-Wunused-variable" +//#pragma GCC diagnostic ignored "-Wunused-function" + +namespace +{ +using namespace DB; + +// Intentionally asymetric both byte and word-size to detect read and write inconsistencies +// each prime bit is set to 0. 
+// v-61 v-53 v-47 v-41 v-37 v-31 v-23 v-17 v-11 v-5 +const UInt64 BIT_PATTERN = 0b11101011'11101111'10111010'11101111'10101111'10111010'11101011'10101001; +const UInt8 PRIMES[] = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61}; +const UInt8 REPEAT_TIMES = 11; + +template +std::string bin(const T & value, size_t bits = sizeof(T)*8) +{ + static const UInt8 MAX_BITS = sizeof(T)*8; + assert(bits <= MAX_BITS); + + return std::bitset(static_cast(value)) + .to_string().substr(MAX_BITS - bits, bits); +} + +template +T getBits(UInt8 bits, const T & value) +{ + const T mask = ((static_cast(1) << static_cast(bits)) - 1); + return value & mask; +} + +std::ostream & dumpBuffer(const char * begin, + const char * end, + std::ostream * destination, + const char* col_sep = " ", + const char* row_sep = "\n", + const size_t cols_in_row = 8, + UInt32 max_bytes = 0xFFFFFFFF) +{ + size_t col = 0; + for (auto p = begin; p < end && p - begin < max_bytes; ++p) + { + *destination << bin(*p); + if (++col % cols_in_row == 0) + { + if (row_sep) + *destination << row_sep; + } + else if (col_sep) + { + *destination << col_sep; + } + } + + return *destination; +} + +std::string dumpBufferContents(BufferBase & buf, + const char* col_sep = " ", + const char* row_sep = "\n", + const size_t cols_in_row = 8) + +{ + std::stringstream sstr; + dumpBuffer(buf.buffer().begin(), buf.buffer().end(), &sstr, col_sep, row_sep, cols_in_row); + + return sstr.str(); +} + +struct TestCaseParameter +{ + std::vector> bits_and_vals; + std::string expected_buffer_binary; + + explicit TestCaseParameter(std::vector> vals, std::string binary = std::string{}) + : bits_and_vals(std::move(vals)), + expected_buffer_binary(binary) + {} +}; + +class BitIO : public ::testing::TestWithParam +{}; + +TEST_P(BitIO, WriteAndRead) +{ + const auto & param = GetParam(); + const auto & bits_and_vals = param.bits_and_vals; + const auto & expected_buffer_binary = param.expected_buffer_binary; + + UInt64 
max_buffer_size = 0; + for (const auto & bv : bits_and_vals) + { + max_buffer_size += bv.first; + } + max_buffer_size = (max_buffer_size + 8) / 8; + SCOPED_TRACE(max_buffer_size); + + MemoryWriteBuffer memory_write_buffer(max_buffer_size * 2, max_buffer_size, 1.5, max_buffer_size); + + { + BitWriter writer(memory_write_buffer); + for (const auto & bv : bits_and_vals) + { + writer.writeBits(bv.first, bv.second); + } + writer.flush(); + } + + { + auto memory_read_buffer = memory_write_buffer.tryGetReadBuffer(); + + if (expected_buffer_binary != std::string{}) + { + const auto actual_buffer_binary = dumpBufferContents(*memory_read_buffer, " ", " "); + ASSERT_EQ(expected_buffer_binary, actual_buffer_binary); + } + + BitReader reader(*memory_read_buffer); + + int item = 0; + for (const auto & bv : bits_and_vals) + { + const auto actual_value = reader.readBits(bv.first); + + ASSERT_EQ(getBits(bv.first, bv.second), actual_value) + << "item #" << item << ", width: " << static_cast(bv.first) + << ", value: " << bin(bv.second) + << ".\n\n\nBuffer memory:\n" << dumpBufferContents(*memory_read_buffer); + + ++item; + } + } +} + +INSTANTIATE_TEST_CASE_P(Simple, + BitIO, + ::testing::Values( + TestCaseParameter( + {{9, 0xFFFFFFFF}, {9, 0x00}, {9, 0xFFFFFFFF}, {9, 0x00}, {9, 0xFFFFFFFF}}, + "11111111 10000000 00111111 11100000 00001111 11111000 "), + TestCaseParameter( + {{7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {3, 0xFFFF}}, + "01111110 11111101 11111011 11110111 11101111 11011111 10111111 01111111 11000000 "), + TestCaseParameter({{33, 0xFF110d0b07050300}, {33, 0xAAEE29251f1d1713}}), + TestCaseParameter({{33, BIT_PATTERN}, {33, BIT_PATTERN}}) +)); + +TestCaseParameter primes_case(UInt8 repeat_times, UInt64 pattern) +{ + std::vector> test_data; + + { + for (UInt8 r = 0; r < repeat_times; ++r) + { + for (const auto p : PRIMES) + { + test_data.emplace_back(p, pattern); + } + } + } + + return TestCaseParameter(test_data); +} 
+ +INSTANTIATE_TEST_CASE_P(Primes, + BitIO, + ::testing::Values( + primes_case(11, 0xFFFFFFFFFFFFFFFFULL), + primes_case(11, BIT_PATTERN) +)); + +} // namespace + diff --git a/dbms/tests/queries/0_stateless/00950_column_encoding_double_delta.sql b/dbms/tests/queries/0_stateless/00950_column_encoding_double_delta.sql deleted file mode 100644 index 335403c44ab..00000000000 --- a/dbms/tests/queries/0_stateless/00950_column_encoding_double_delta.sql +++ /dev/null @@ -1,211 +0,0 @@ -DROP TABLE IF EXISTS reference; -DROP TABLE IF EXISTS doubleDelta; - -CREATE TABLE reference ( - key UInt64, - valueU64 UInt64, - valueU32 UInt32, - valueU16 UInt16, - valueU8 UInt8, - valueI64 Int64, - valueI32 Int32, - valueI16 Int16, - valueI8 Int8, - valueDT DateTime, - valueD Date -) Engine = MergeTree ORDER BY key; - - -CREATE TABLE doubleDelta ( - key UInt64 CODEC(DoubleDelta), - valueU64 UInt64 CODEC(DoubleDelta), - valueU32 UInt32 CODEC(DoubleDelta), - valueU16 UInt16 CODEC(DoubleDelta), - valueU8 UInt8 CODEC(DoubleDelta), - valueI64 Int64 CODEC(DoubleDelta), - valueI32 Int32 CODEC(DoubleDelta), - valueI16 Int16 CODEC(DoubleDelta), - valueI8 Int8 CODEC(DoubleDelta), - valueDT DateTime CODEC(DoubleDelta), - valueD Date CODEC(DoubleDelta) -) Engine = MergeTree ORDER BY key; - - --- n^3 covers all double delta storage cases, from small difference between neighbour values (stride) to big. 
-INSERT INTO reference (key, valueU64, valueU32, valueU16, valueU8, valueI64, valueI32, valueI16, valueI8, valueDT, valueD) - SELECT number as n, n * n * n as v, v, v, v, v, v, v, v, toDateTime(v), toDate(v) FROM system.numbers LIMIT 1, 100; - --- best case - constant stride -INSERT INTO reference (key, valueU64, valueU32, valueU16, valueU8, valueI64, valueI32, valueI16, valueI8, valueDT, valueD) - SELECT number as n, n as v, v, v, v, v, v, v, v, toDateTime(v), toDate(v) FROM system.numbers LIMIT 101, 100; - --- checking for overflow -INSERT INTO reference (key, valueU64, valueI64) -VALUES (201, 18446744073709551616, 9223372036854775808), (202, 0, -9223372036854775808), (203, 18446744073709551616, 9223372036854775808); - --- worst case - random stride -INSERT INTO reference (key, valueU64, valueU32, valueU16, valueU8, valueI64, valueI32, valueI16, valueI8, valueDT, valueD) - SELECT number as n, n + (rand64() - 9223372036854775808)/1000 as v, v, v, v, v, v, v, v, toDateTime(v), toDate(v) FROM system.numbers LIMIT 301, 100; - - -INSERT INTO doubleDelta SELECT * FROM reference; - --- same number of rows -SELECT a[1] - a[2] FROM ( - SELECT groupArray(1) AS a FROM ( - SELECT count() FROM reference - UNION ALL - SELECT count() FROM doubleDelta - ) -); - -SELECT 'U64'; -SELECT - key, - r.valueU64, d.valueU64, r.valueU64 - d.valueU64 as dU64 -FROM reference as r, doubleDelta as d -WHERE - r.key == d.key -AND - dU64 != 0 -ORDER BY r.key -LIMIT 10; - - -SELECT 'U32'; -SELECT - key, - r.valueU32, d.valueU32, r.valueU32 - d.valueU32 as dU32 -FROM reference as r, doubleDelta as d -WHERE - r.key == d.key -AND - dU32 != 0 -ORDER BY r.key -LIMIT 10; - - -SELECT 'U16'; -SELECT - key, - r.valueU16, d.valueU16, r.valueU16 - d.valueU16 as dU16 -FROM reference as r, doubleDelta as d -WHERE - r.key == d.key -AND - dU16 != 0 -ORDER BY r.key -LIMIT 10; - - -SELECT 'U8'; -SELECT - key, - r.valueU8, d.valueU8, r.valueU8 - d.valueU8 as dU8 -FROM reference as r, doubleDelta as d -WHERE - 
r.key == d.key -AND - dU8 != 0 -ORDER BY r.key -LIMIT 10; - - -SELECT 'I64'; -SELECT - key, - r.valueI64, d.valueI64, r.valueI64 - d.valueI64 as dI64 -FROM reference as r, doubleDelta as d -WHERE - r.key == d.key -AND - dI64 != 0 -ORDER BY r.key -LIMIT 10; - - -SELECT 'I32'; -SELECT - key, - r.valueI32, d.valueI32, r.valueI32 - d.valueI32 as dI32 -FROM reference as r, doubleDelta as d -WHERE - r.key == d.key -AND - dI32 != 0 -ORDER BY r.key -LIMIT 10; - - -SELECT 'I16'; -SELECT - key, - r.valueI16, d.valueI16, r.valueI16 - d.valueI16 as dI16 -FROM reference as r, doubleDelta as d -WHERE - r.key == d.key -AND - dI16 != 0 -ORDER BY r.key -LIMIT 10; - - -SELECT 'I8'; -SELECT - key, - r.valueI8, d.valueI8, r.valueI8 - d.valueI8 as dI8 -FROM reference as r, doubleDelta as d -WHERE - r.key == d.key -AND - dI8 != 0 -ORDER BY r.key -LIMIT 10; - - -SELECT 'DT'; -SELECT - key, - r.valueDT, d.valueDT, r.valueDT - d.valueDT as dDT -FROM reference as r, doubleDelta as d -WHERE - r.key == d.key -AND - dDT != 0 -ORDER BY r.key -LIMIT 10; - - -SELECT 'D'; -SELECT - key, - r.valueD, d.valueD, r.valueD - d.valueD as dD -FROM reference as r, doubleDelta as d -WHERE - r.key == d.key -AND - dD != 0 -ORDER BY r.key -LIMIT 10; - --- Compatibity with other codecs -DROP TABLE IF EXISTS dd_lz4_codec; -CREATE TABLE dd_lz4_codec ( - key UInt64 CODEC(DoubleDelta, LZ4), - valueU64 UInt64 CODEC(DoubleDelta, LZ4), - valueU32 UInt32 CODEC(DoubleDelta, LZ4), - valueU16 UInt16 CODEC(DoubleDelta, LZ4), - valueU8 UInt8 CODEC(DoubleDelta, LZ4), - valueI64 Int64 CODEC(DoubleDelta, LZ4), - valueI32 Int32 CODEC(DoubleDelta, LZ4), - valueI16 Int16 CODEC(DoubleDelta, LZ4), - valueI8 Int8 CODEC(DoubleDelta, LZ4), - valueDT DateTime CODEC(DoubleDelta, LZ4), - valueD Date CODEC(DoubleDelta, LZ4) -) Engine = MergeTree ORDER BY key; - -INSERT INTO dd_lz4_codec SELECT * FROM reference; - -DROP TABLE IF EXISTS reference; -DROP TABLE IF EXISTS doubleDelta; -DROP TABLE IF EXISTS dd_lz4_codec; \ No newline at end of 
file diff --git a/dbms/tests/queries/0_stateless/00950_column_encoding_gorilla.sql b/dbms/tests/queries/0_stateless/00950_column_encoding_gorilla.sql deleted file mode 100644 index 5b41a70a50a..00000000000 --- a/dbms/tests/queries/0_stateless/00950_column_encoding_gorilla.sql +++ /dev/null @@ -1,89 +0,0 @@ -DROP DATABASE IF EXISTS codec_test; -CREATE DATABASE codec_test; -USE codec_test; - - -DROP TABLE IF EXISTS reference; -DROP TABLE IF EXISTS gorilla; - -CREATE TABLE reference ( - key UInt64, - valueF64 Float64, - valueF32 Float32 -) Engine = MergeTree ORDER BY key; - - -CREATE TABLE gorilla ( - key UInt64, - valueF64 Float64 CODEC(Gorilla), - valueF32 Float32 CODEC(Gorilla) -) Engine = MergeTree ORDER BY key; - --- best case - same value -INSERT INTO reference (key, valueF64, valueF32) - SELECT number AS n, e() AS v, v FROM system.numbers LIMIT 1, 100; - --- good case - values that grow insignificantly -INSERT INTO reference (key, valueF64, valueF32) - SELECT number AS n, log2(n) AS v, v FROM system.numbers LIMIT 1001, 100; - --- bad case - values differ significantly -INSERT INTO reference (key, valueF64, valueF32) - SELECT number AS n, n*sqrt(n) AS v, v FROM system.numbers LIMIT 2001, 100; - --- worst case - random values -INSERT INTO reference (key, valueF64, valueF32) - SELECT number AS n, (rand64() - 9223372036854775808)/10000000000000 AS v, v FROM system.numbers LIMIT 3001, 100; - - -INSERT INTO gorilla SELECT * FROM reference; - -SELECT a[1] - a[2] FROM ( - SELECT groupArray(1) AS a FROM ( - SELECT count() FROM reference - UNION ALL - SELECT count() FROM gorilla - ) -); - --- These floating-point values are expected to be BINARY equal, hence comparing the values are safe. 
- -SELECT 'F64'; -SELECT - key, - r.valueF64, g.valueF64, r.valueF64 - g.valueF64 AS dU64 -FROM reference AS r, gorilla AS g -WHERE - r.key == g.key -AND - dU64 != 0 -ORDER BY r.key -LIMIT 10; - - -SELECT 'F32'; -SELECT - key, - r.valueF32, g.valueF32, r.valueF32 - g.valueF32 AS dU32 -FROM reference AS r, gorilla AS g -WHERE - r.key == g.key -AND - dU32 != 0 -ORDER BY r.key -LIMIT 10; - - --- Compatibity with other codecs -DROP TABLE IF EXISTS g_lz4_codec; -CREATE TABLE g_lz4_codec ( - key UInt64 CODEC(Gorilla, LZ4), - valueU64 Float64 CODEC(Gorilla, LZ4), - valueU32 Float32 CODEC(Gorilla, LZ4) -) Engine = MergeTree ORDER BY key; - -INSERT INTO g_lz4_codec SELECT * FROM reference; - -DROP TABLE IF EXISTS reference; -DROP TABLE IF EXISTS gorilla; -DROP TABLE IF EXISTS g_lz4_codec; \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/00950_column_encoding_double_delta.reference b/dbms/tests/queries/0_stateless/00950_test_double_delta_codec.reference similarity index 94% rename from dbms/tests/queries/0_stateless/00950_column_encoding_double_delta.reference rename to dbms/tests/queries/0_stateless/00950_test_double_delta_codec.reference index 6b25688b4dd..5fb3bfb3629 100644 --- a/dbms/tests/queries/0_stateless/00950_column_encoding_double_delta.reference +++ b/dbms/tests/queries/0_stateless/00950_test_double_delta_codec.reference @@ -1,4 +1,3 @@ -0 U64 U32 U16 diff --git a/dbms/tests/queries/0_stateless/00950_test_double_delta_codec.sql b/dbms/tests/queries/0_stateless/00950_test_double_delta_codec.sql new file mode 100644 index 00000000000..a5ae4766ece --- /dev/null +++ b/dbms/tests/queries/0_stateless/00950_test_double_delta_codec.sql @@ -0,0 +1,151 @@ +USE test; + +DROP TABLE IF EXISTS codecTest; + +CREATE TABLE codecTest ( + key UInt64, + ref_valueU64 UInt64, + ref_valueU32 UInt32, + ref_valueU16 UInt16, + ref_valueU8 UInt8, + ref_valueI64 Int64, + ref_valueI32 Int32, + ref_valueI16 Int16, + ref_valueI8 Int8, + ref_valueDT DateTime, + ref_valueD 
 Date, + valueU64 UInt64 CODEC(DoubleDelta), + valueU32 UInt32 CODEC(DoubleDelta), + valueU16 UInt16 CODEC(DoubleDelta), + valueU8 UInt8 CODEC(DoubleDelta), + valueI64 Int64 CODEC(DoubleDelta), + valueI32 Int32 CODEC(DoubleDelta), + valueI16 Int16 CODEC(DoubleDelta), + valueI8 Int8 CODEC(DoubleDelta), + valueDT DateTime CODEC(DoubleDelta), + valueD Date CODEC(DoubleDelta) +) Engine = MergeTree ORDER BY key; + + +-- checking for overflow +INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueI64, valueI64) + VALUES (101, 18446744073709551615, 18446744073709551615, 9223372036854775807, 9223372036854775807), (202, 0, 0, -9223372036854775808, -9223372036854775808), (203, 18446744073709551615, 18446744073709551615, 9223372036854775807, 9223372036854775807); + +-- n^3 covers all double delta storage cases, from small difference between neighbour values (stride) to big. +INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueU32, valueU32, ref_valueU16, valueU16, ref_valueU8, valueU8, ref_valueI64, valueI64, ref_valueI32, valueI32, ref_valueI16, valueI16, ref_valueI8, valueI8, ref_valueDT, valueDT, ref_valueD, valueD) + SELECT number as n, n * n * n as v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, toDateTime(v), toDateTime(v), toDate(v), toDate(v) + FROM system.numbers LIMIT 101, 100; + +-- best case - constant stride +INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueU32, valueU32, ref_valueU16, valueU16, ref_valueU8, valueU8, ref_valueI64, valueI64, ref_valueI32, valueI32, ref_valueI16, valueI16, ref_valueI8, valueI8, ref_valueDT, valueDT, ref_valueD, valueD) + SELECT number as n, n as v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, toDateTime(v), toDateTime(v), toDate(v), toDate(v) + FROM system.numbers LIMIT 201, 100; + + +-- worst case - random stride +INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueU32, valueU32, ref_valueU16, valueU16, ref_valueU8, valueU8, ref_valueI64, valueI64, ref_valueI32, valueI32, ref_valueI16, 
valueI16, ref_valueI8, valueI8, ref_valueDT, valueDT, ref_valueD, valueD) + SELECT number as n, n + (rand64() - 9223372036854775807)/1000 as v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, toDateTime(v), toDateTime(v), toDate(v), toDate(v) + FROM system.numbers LIMIT 301, 100; + + +SELECT 'U64'; +SELECT + key, + ref_valueU64, valueU64, ref_valueU64 - valueU64 as dU64 +FROM codecTest +WHERE + dU64 != 0 +LIMIT 10; + + +SELECT 'U32'; +SELECT + key, + ref_valueU32, valueU32, ref_valueU32 - valueU32 as dU32 +FROM codecTest +WHERE + dU32 != 0 +LIMIT 10; + + +SELECT 'U16'; +SELECT + key, + ref_valueU16, valueU16, ref_valueU16 - valueU16 as dU16 +FROM codecTest +WHERE + dU16 != 0 +LIMIT 10; + + +SELECT 'U8'; +SELECT + key, + ref_valueU8, valueU8, ref_valueU8 - valueU8 as dU8 +FROM codecTest +WHERE + dU8 != 0 +LIMIT 10; + + +SELECT 'I64'; +SELECT + key, + ref_valueI64, valueI64, ref_valueI64 - valueI64 as dI64 +FROM codecTest +WHERE + dI64 != 0 +LIMIT 10; + + +SELECT 'I32'; +SELECT + key, + ref_valueI32, valueI32, ref_valueI32 - valueI32 as dI32 +FROM codecTest +WHERE + dI32 != 0 +LIMIT 10; + + +SELECT 'I16'; +SELECT + key, + ref_valueI16, valueI16, ref_valueI16 - valueI16 as dI16 +FROM codecTest +WHERE + dI16 != 0 +LIMIT 10; + + +SELECT 'I8'; +SELECT + key, + ref_valueI8, valueI8, ref_valueI8 - valueI8 as dI8 +FROM codecTest +WHERE + dI8 != 0 +LIMIT 10; + + +SELECT 'DT'; +SELECT + key, + ref_valueDT, valueDT, ref_valueDT - valueDT as dDT +FROM codecTest +WHERE + dDT != 0 +LIMIT 10; + + +SELECT 'D'; +SELECT + key, + ref_valueD, valueD, ref_valueD - valueD as dD +FROM codecTest +WHERE + dD != 0 +LIMIT 10; + + +DROP TABLE IF EXISTS codecTest; \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/00950_column_encoding_gorilla.reference b/dbms/tests/queries/0_stateless/00950_test_gorilla_codec.reference similarity index 80% rename from dbms/tests/queries/0_stateless/00950_column_encoding_gorilla.reference rename to 
dbms/tests/queries/0_stateless/00950_test_gorilla_codec.reference index daacf52c9b1..5e871ea0329 100644 --- a/dbms/tests/queries/0_stateless/00950_column_encoding_gorilla.reference +++ b/dbms/tests/queries/0_stateless/00950_test_gorilla_codec.reference @@ -1,3 +1,2 @@ -0 F64 F32 diff --git a/dbms/tests/queries/0_stateless/00950_test_gorilla_codec.sql b/dbms/tests/queries/0_stateless/00950_test_gorilla_codec.sql new file mode 100644 index 00000000000..23e4e85e212 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00950_test_gorilla_codec.sql @@ -0,0 +1,51 @@ +USE test; + +DROP TABLE IF EXISTS codecTest; + +CREATE TABLE codecTest ( + key UInt64, + ref_valueF64 Float64, + ref_valueF32 Float32, + valueF64 Float64 CODEC(Gorilla), + valueF32 Float32 CODEC(Gorilla) +) Engine = MergeTree ORDER BY key; + +-- best case - same value +INSERT INTO codecTest (key, ref_valueF64, valueF64, ref_valueF32, valueF32) + SELECT number AS n, e() AS v, v, v, v FROM system.numbers LIMIT 1, 100; + +-- good case - values that grow insignificantly +INSERT INTO codecTest (key, ref_valueF64, valueF64, ref_valueF32, valueF32) + SELECT number AS n, log2(n) AS v, v, v, v FROM system.numbers LIMIT 101, 100; + +-- bad case - values differ significantly +INSERT INTO codecTest (key, ref_valueF64, valueF64, ref_valueF32, valueF32) + SELECT number AS n, n*sqrt(n) AS v, v, v, v FROM system.numbers LIMIT 201, 100; + +-- worst case - random values +INSERT INTO codecTest (key, ref_valueF64, valueF64, ref_valueF32, valueF32) + SELECT number AS n, (rand64() - 9223372036854775808)/10000000000000 AS v, v, v, v FROM system.numbers LIMIT 3001, 100; + + +-- These floating-point values are expected to be BINARY equal, hence comparing the values is safe. 
+ +SELECT 'F64'; +SELECT + key, + ref_valueF64, valueF64, ref_valueF64 - valueF64 AS dF64 +FROM codecTest +WHERE + dF64 != 0 +LIMIT 10; + + +SELECT 'F32'; +SELECT + key, + ref_valueF32, valueF32, ref_valueF32 - valueF32 AS dF32 +FROM codecTest +WHERE + dF32 != 0 +LIMIT 10; + +DROP TABLE IF EXISTS codecTest; \ No newline at end of file