From 923c6889e8bbe4aa5166e949c967dbdb5a493ea2 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Sun, 11 Aug 2019 11:01:02 +0300 Subject: [PATCH 01/18] Fixed Gorilla encoding error on small sequences. Added test cases for small sequences; Refurbished test cases for codecs; --- .../Compression/CompressionCodecGorilla.cpp | 20 +- dbms/src/Compression/ICompressionCodec.cpp | 2 +- .../tests/gtest_compressionCodec.cpp | 790 +++++++++++++----- 3 files changed, 592 insertions(+), 220 deletions(-) diff --git a/dbms/src/Compression/CompressionCodecGorilla.cpp b/dbms/src/Compression/CompressionCodecGorilla.cpp index 79cc6d27e81..8af6c8bfd39 100644 --- a/dbms/src/Compression/CompressionCodecGorilla.cpp +++ b/dbms/src/Compression/CompressionCodecGorilla.cpp @@ -78,11 +78,12 @@ binary_value_info getLeadingAndTrailingBits(const T & value) const UInt8 lz = getLeadingZeroBits(value); const UInt8 tz = getTrailingZeroBits(value); const UInt8 data_size = value == 0 ? 0 : static_cast(bit_size - lz - tz); + return binary_value_info{lz, data_size, tz}; } template -UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) +UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 dest_size) { static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T)); // -1 since there must be at least 1 non-zero bit. @@ -91,6 +92,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) if (source_size % sizeof(T) != 0) throw Exception("Cannot compress, data size " + toString(source_size) + " is not aligned to " + toString(sizeof(T)), ErrorCodes::CANNOT_COMPRESS); const char * source_end = source + source_size; + const char * dest_end = dest + dest_size; const UInt32 items_count = source_size / sizeof(T); @@ -110,7 +112,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) dest += sizeof(prev_value); } - WriteBuffer buffer(dest, getCompressedDataSize(sizeof(T), source_size - sizeof(items_count) - sizeof(prev_value))); + WriteBuffer buffer(dest, dest_end - dest); BitWriter writer(buffer); while (source < source_end) @@ -265,24 +267,26 @@ UInt32 CompressionCodecGorilla::doCompressData(const char * source, UInt32 sourc dest[1] = bytes_to_skip; memcpy(&dest[2], source, bytes_to_skip); size_t start_pos = 2 + bytes_to_skip; - UInt32 compressed_size = 0; + UInt32 result_size = 0; + + const UInt32 compressed_size = getMaxCompressedDataSize(source_size); switch (data_bytes_size) { case 1: - compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); + result_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size); break; case 2: - compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); + result_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size); break; case 4: - compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); + result_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size); break; case 8: - compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); + result_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size); break; } - return 1 + 1 + compressed_size; + return 1 + 
1 + result_size; } void CompressionCodecGorilla::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 /* uncompressed_size */) const diff --git a/dbms/src/Compression/ICompressionCodec.cpp b/dbms/src/Compression/ICompressionCodec.cpp index a50001238da..aafca2f5eb3 100644 --- a/dbms/src/Compression/ICompressionCodec.cpp +++ b/dbms/src/Compression/ICompressionCodec.cpp @@ -49,8 +49,8 @@ UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, ch UInt8 header_size = getHeaderSize(); UInt32 decompressed_size = unalignedLoad(&source[5]); doDecompressData(&source[header_size], source_size - header_size, dest, decompressed_size); - return decompressed_size; + return decompressed_size; } UInt32 ICompressionCodec::readCompressedBlockSize(const char * source) diff --git a/dbms/src/Compression/tests/gtest_compressionCodec.cpp b/dbms/src/Compression/tests/gtest_compressionCodec.cpp index 0f03070fff3..7fca3d98da7 100644 --- a/dbms/src/Compression/tests/gtest_compressionCodec.cpp +++ b/dbms/src/Compression/tests/gtest_compressionCodec.cpp @@ -1,10 +1,13 @@ -#include -#include +#include + #include #include #include #include +#include +#include +#include #include @@ -24,6 +27,33 @@ using namespace DB; +namespace std +{ +template +std::ostream & operator<<(std::ostream & ostr, const std::optional & opt) +{ + if (!opt) + { + return ostr << ""; + } + + return ostr << *opt; +} + +template +std::vector operator+(std::vector && left, std::vector && right) +{ + std::vector result(std::move(left)); + std::move(std::begin(right), std::end(right), std::back_inserter(result)); + + return result; +} + +} + +namespace +{ + template std::string bin(const T & value, size_t bits = sizeof(T)*8) { @@ -37,43 +67,46 @@ std::string bin(const T & value, size_t bits = sizeof(T)*8) template const char* type_name() { +#define MAKE_TYPE_NAME(TYPE) \ + if constexpr (std::is_same_v) return #TYPE; + + MAKE_TYPE_NAME(UInt8); + MAKE_TYPE_NAME(UInt16); + MAKE_TYPE_NAME(UInt32); + MAKE_TYPE_NAME(UInt64); + MAKE_TYPE_NAME(Int8); + MAKE_TYPE_NAME(Int16); + MAKE_TYPE_NAME(Int32); + MAKE_TYPE_NAME(Int64); + MAKE_TYPE_NAME(Float32); + MAKE_TYPE_NAME(Float64); + +#undef MAKE_TYPE_NAME + return typeid(T).name(); } -template <> -const char* type_name() +template +DataTypePtr makeDataType() { - return "uint32"; -} +#define MAKE_DATA_TYPE(TYPE) \ + if constexpr (std::is_same_v) return std::make_shared() -template <> -const char* type_name() -{ - return "int32"; -} + MAKE_DATA_TYPE(UInt8); + MAKE_DATA_TYPE(UInt16); + MAKE_DATA_TYPE(UInt32); + MAKE_DATA_TYPE(UInt64); + MAKE_DATA_TYPE(Int8); + MAKE_DATA_TYPE(Int16); + MAKE_DATA_TYPE(Int32); + MAKE_DATA_TYPE(Int64); + MAKE_DATA_TYPE(Float32); + MAKE_DATA_TYPE(Float64); -template <> -const char* type_name() -{ - return "uint64"; -} +#undef MAKE_DATA_TYPE -template <> -const char* type_name() -{ - return "int64"; -} - -template <> -const char* type_name() -{ - return "float"; -} - -template <> -const char* type_name() -{ - return "double"; + assert(false && "unsupported size"); + return nullptr; } @@ -135,52 +168,83 @@ template return result; } -struct CodecTestParam +struct CodecTestSequence { - std::string type_name; - std::vector source_data; - UInt8 data_byte_size; - double min_compression_ratio; - std::string case_name; + std::string name; + std::vector serialized_data; - // to allow setting ratio after building with complex builder functions. 
- CodecTestParam && setRatio(const double & ratio) && - { - this->min_compression_ratio = ratio; - return std::move(*this); - } + DataTypePtr data_type; }; -CodecTestParam operator+(CodecTestParam && left, CodecTestParam && right) +struct Codec { - assert(left.type_name == right.type_name); - assert(left.data_byte_size == right.data_byte_size); + std::string codec_statement; + std::optional expected_compression_ratio; - std::vector data(std::move(left.source_data)); - data.insert(data.end(), right.source_data.begin(), right.source_data.end()); + explicit Codec(std::string codec_statement_, std::optional expected_compression_ratio_ = std::nullopt) + : codec_statement(std::move(codec_statement_)), + expected_compression_ratio(expected_compression_ratio_) + {} - return CodecTestParam{ - left.type_name, - std::move(data), - left.data_byte_size, - std::min(left.min_compression_ratio, right.min_compression_ratio), - left.case_name + " + " + right.case_name + Codec() + : Codec(std::string()) + {} +}; + +CodecTestSequence operator+(CodecTestSequence && left, CodecTestSequence && right) +{ + assert(left.data_type->equals(*right.data_type)); + + std::vector data(std::move(left.serialized_data)); + data.insert(data.end(), right.serialized_data.begin(), right.serialized_data.end()); + + return CodecTestSequence{ + left.name + " + " + right.name, + std::move(data), + std::move(left.data_type) }; } -std::ostream & operator<<(std::ostream & ostr, const CodecTestParam & param) +template +CodecTestSequence operator*(CodecTestSequence && left, T times) { - return ostr << "name: " << param.case_name - << "\ntype name:" << param.type_name - << "\nbyte size: " << static_cast(param.data_byte_size) - << "\ndata size: " << param.source_data.size(); + std::vector data(std::move(left.serialized_data)); + const size_t initial_size = data.size(); + const size_t final_size = initial_size * times; + + data.reserve(final_size); + + for (T i = 0; i < times; ++i) + { + data.insert(data.end(), data.begin(), data.begin() + initial_size); + } + + return CodecTestSequence{ + left.name + " x " + std::to_string(times), + std::move(data), + std::move(left.data_type) + }; } -// compression ratio < 1.0 means that codec output is smaller than input. -const double DEFAULT_MIN_COMPRESSION_RATIO = 1.0; +std::ostream & operator<<(std::ostream & ostr, const Codec & codec) +{ + return ostr << "Codec{" + << "name: " << codec.codec_statement + << ", expected_compression_ratio: " << codec.expected_compression_ratio + << "}"; +} + +std::ostream & operator<<(std::ostream & ostr, const CodecTestSequence & seq) +{ + return ostr << "CodecTestSequence{" + << "name: " << seq.name + << ", type name: " << seq.data_type->getName() + << ", data size: " << seq.serialized_data.size() << " bytes" + << "}"; +} template -CodecTestParam makeParam(Args && ... args) +CodecTestSequence makeSeq(Args && ... args) { std::initializer_list vals{static_cast(args)...}; std::vector data(sizeof(T) * std::size(vals)); @@ -192,14 +256,17 @@ CodecTestParam makeParam(Args && ... 
args) write_pos += sizeof(v); } - return CodecTestParam{type_name(), std::move(data), sizeof(T), DEFAULT_MIN_COMPRESSION_RATIO, - (boost::format("%1% values of %2%") % std::size(vals) % type_name()).str()}; + return CodecTestSequence{ + (boost::format("%1% values of %2%") % std::size(vals) % type_name()).str(), + std::move(data), + makeDataType() + }; } -template -CodecTestParam generateParam(Generator gen, const char* gen_name) +template +CodecTestSequence generateSeq(Generator gen, const char* gen_name, size_t Begin = 0, size_t End = 10000) { - static_assert (End >= Begin, "End must be not less than Begin"); + assert (End >= Begin); std::vector data(sizeof(T) * (End - Begin)); char * write_pos = data.data(); @@ -211,89 +278,104 @@ CodecTestParam generateParam(Generator gen, const char* gen_name) write_pos += sizeof(v); } - return CodecTestParam{type_name(), std::move(data), sizeof(T), DEFAULT_MIN_COMPRESSION_RATIO, - (boost::format("%1% values of %2% from %3%") % (End - Begin) % type_name() % gen_name).str()}; + return CodecTestSequence{ + (boost::format("%1% values of %2% from %3%") % (End - Begin) % type_name() % gen_name).str(), + std::move(data), + makeDataType() + }; } -void TestTranscoding(ICompressionCodec * codec, const CodecTestParam & param) -{ - const auto & source_data = param.source_data; - const UInt32 encoded_max_size = codec->getCompressedReserveSize(source_data.size()); - PODArray encoded(encoded_max_size); - - const UInt32 encoded_size = codec->compress(source_data.data(), source_data.size(), encoded.data()); - encoded.resize(encoded_size); - - PODArray decoded(source_data.size()); - const UInt32 decoded_size = codec->decompress(encoded.data(), encoded.size(), decoded.data()); - decoded.resize(decoded_size); - - switch (param.data_byte_size) - { - case 1: - ASSERT_TRUE(EqualByteContainersAs(source_data, decoded)); - break; - case 2: - ASSERT_TRUE(EqualByteContainersAs(source_data, decoded)); - break; - case 4: - ASSERT_TRUE(EqualByteContainersAs(source_data, decoded)); - break; - case 8: - ASSERT_TRUE(EqualByteContainersAs(source_data, decoded)); - break; - default: - FAIL() << "Invalid data_byte_size: " << param.data_byte_size; - } - const auto header_size = codec->getHeaderSize(); - const auto compression_ratio = (encoded_size - header_size) / (source_data.size() * 1.0); - - ASSERT_LE(compression_ratio, param.min_compression_ratio) - << "\n\tdecoded size: " << source_data.size() - << "\n\tencoded size: " << encoded_size - << "(no header: " << encoded_size - header_size << ")"; -} - -class CodecTest : public ::testing::TestWithParam +class CodecTest : public ::testing::TestWithParam> { public: - static void SetUpTestCase() + enum MakeCodecParam { - // To make random predicatble and avoid failing test "out of the blue". - srand(0); + CODEC_WITH_DATA_TYPE, + CODEC_WITHOUT_DATA_TYPE, + }; + + CompressionCodecPtr makeCodec(MakeCodecParam with_data_type) const + { + const auto & codec_string = std::get<0>(GetParam()).codec_statement; + const auto & data_type = with_data_type == CODEC_WITH_DATA_TYPE ? 
std::get<1>(GetParam()).data_type : nullptr; + + const std::string codec_statement = "(" + codec_string + ")"; + Tokens tokens(codec_statement.begin().base(), codec_statement.end().base()); + TokenIterator token_iterator(tokens); + + Expected expected; + ASTPtr codec_ast; + ParserCodec parser; + parser.parse(token_iterator, codec_ast, expected); + + + return CompressionCodecFactory::instance().get(codec_ast, data_type); + } + + void testTranscoding(ICompressionCodec & codec) + { + const auto & test_sequence = std::get<1>(GetParam()); + const auto & source_data = test_sequence.serialized_data; + + const UInt32 encoded_max_size = codec.getCompressedReserveSize(source_data.size()); + PODArray encoded(encoded_max_size); + + const UInt32 encoded_size = codec.compress(source_data.data(), source_data.size(), encoded.data()); + encoded.resize(encoded_size); + + PODArray decoded(source_data.size()); + const UInt32 decoded_size = codec.decompress(encoded.data(), encoded.size(), decoded.data()); + decoded.resize(decoded_size); + + switch (test_sequence.data_type->getSizeOfValueInMemory()) + { + case 1: + ASSERT_TRUE(EqualByteContainersAs(source_data, decoded)); + break; + case 2: + ASSERT_TRUE(EqualByteContainersAs(source_data, decoded)); + break; + case 4: + ASSERT_TRUE(EqualByteContainersAs(source_data, decoded)); + break; + case 8: + ASSERT_TRUE(EqualByteContainersAs(source_data, decoded)); + break; + default: + FAIL() << "Invalid test sequence data type: " << test_sequence.data_type->getName(); + } + const auto header_size = codec.getHeaderSize(); + const auto compression_ratio = (encoded_size - header_size) / (source_data.size() * 1.0); + + const auto & codec_spec = std::get<0>(GetParam()); + if (codec_spec.expected_compression_ratio) + { + ASSERT_LE(compression_ratio, *codec_spec.expected_compression_ratio) + << "\n\tdecoded size: " << source_data.size() + << "\n\tencoded size: " << encoded_size + << "(no header: " << encoded_size - header_size << ")"; + } } }; -TEST_P(CodecTest, DoubleDelta) +TEST_P(CodecTest, TranscodingWithDataType) { - auto param = GetParam(); - auto codec = std::make_unique(param.data_byte_size); - if (param.type_name == type_name() || param.type_name == type_name()) - { - // dd doesn't work great with many cases of integers and may result in very poor compression rate. - param.min_compression_ratio *= 1.5; - } - - TestTranscoding(codec.get(), param); + const auto codec = makeCodec(CODEC_WITH_DATA_TYPE); + testTranscoding(*codec); } -TEST_P(CodecTest, Gorilla) +TEST_P(CodecTest, TranscodingWithoutDataType) { - auto param = GetParam(); - auto codec = std::make_unique(param.data_byte_size); - if (param.type_name == type_name() || param.type_name == type_name() - || param.type_name == type_name() || param.type_name == type_name()) - { - // gorilla doesn't work great with many cases of integers and may result in very poor compression rate. - param.min_compression_ratio *= 1.5; - } - - TestTranscoding(codec.get(), param); + const auto codec = makeCodec(CODEC_WITHOUT_DATA_TYPE); + testTranscoding(*codec); } +/////////////////////////////////////////////////////////////////////////////////////////////////// // Here we use generators to produce test payload for codecs. -// Generator is a callable that should produce output value of the same type as input value. +// Generator is a callable that can produce infinite number of values, +// output value MUST be of the same type input value. 
+/////////////////////////////////////////////////////////////////////////////////////////////////// auto SameValueGenerator = [](auto value) { @@ -332,141 +414,427 @@ auto SequentialGenerator = [](auto stride = 1) //}; template +using uniform_distribution = +typename std::conditional_t, std::uniform_real_distribution, + typename std::conditional_t, std::uniform_int_distribution, void>>; + + +template struct MonotonicGenerator { - MonotonicGenerator(T stride_ = 1, size_t max_step_ = 10) + MonotonicGenerator(T stride_ = 1, T max_step = 10) : prev_value(0), stride(stride_), - max_step(max_step_) + random_engine(0), + distribution(0, max_step) {} template U operator()(U) { - const U result = prev_value + static_cast(stride * (rand() % max_step)); - - prev_value = result; - return result; + prev_value = prev_value + stride * distribution(random_engine); + return static_cast(prev_value); } +private: T prev_value; const T stride; - const size_t max_step; -}; - -auto MinMaxGenerator = [](auto i) -{ - if (i % 2 == 0) - { - return std::numeric_limits::min(); - } - else - { - return std::numeric_limits::max(); - } + std::default_random_engine random_engine; + uniform_distribution distribution; }; template struct RandomGenerator { - RandomGenerator(T seed = 0, T value_cap_ = std::numeric_limits::max()) - : e(seed), - value_cap(value_cap_) + RandomGenerator(T seed = 0, T value_min = std::numeric_limits::min(), T value_max = std::numeric_limits::max()) + : random_engine(seed), + distribution(value_min, value_max) { } template - U operator()(U i) + U operator()(U) { - return static_cast(distribution(e) % value_cap); + return static_cast(distribution(random_engine)); } private: - std::default_random_engine e; - std::uniform_int_distribution distribution; - const T value_cap; + std::default_random_engine random_engine; + uniform_distribution distribution; }; auto RandomishGenerator = [](auto i) { - return static_cast(sin(static_cast(i) * i) * i); + return static_cast(sin(static_cast(i * i)) * i); }; -// helper macro to produce human-friendly test case name +auto MinMaxGenerator = []() +{ + return [step = 0](auto i) mutable + { + if (step++ % 2 == 0) + { + return std::numeric_limits::min(); + } + else + { + return std::numeric_limits::max(); + } + }; +}; + +// Fill dest value with 0x00 or 0xFF +auto FFand0Generator = []() +{ + return [step = 0](auto i) mutable + { + decltype(i) result; + if (step++ % 2 == 0) + { + memset(&result, 0, sizeof(result)); + } + else + { + memset(&result, 0xFF, sizeof(result)); + } + + return result; + }; +}; + + +// Makes many sequences with generator, first sequence length is 1, second is 2... up to `sequences_count`. 
+template +std::vector generatePyramidOfSequences(const size_t sequences_count, Generator && generator, const char* generator_name) +{ + std::vector sequences; + sequences.reserve(sequences_count); + for (size_t i = 1; i < sequences_count; ++i) + { + std::string name = generator_name + std::string(" from 0 to ") + std::to_string(i); + sequences.push_back(generateSeq(std::forward(generator), name.c_str(), 0, i)); + } + + return sequences; +}; + + +// helper macro to produce human-friendly sequence name from generator #define G(generator) generator, #generator +const auto DefaultCodecsToTest = ::testing::Values( + Codec("DoubleDelta"), + Codec("DoubleDelta, LZ4"), + Codec("DoubleDelta, ZSTD"), + Codec("Gorilla"), + Codec("Gorilla, LZ4"), + Codec("Gorilla, ZSTD") +); + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// test cases +/////////////////////////////////////////////////////////////////////////////////////////////////// + +INSTANTIATE_TEST_CASE_P(Simple, + CodecTest, + ::testing::Combine( + DefaultCodecsToTest, + ::testing::Values( + makeSeq(1, 2, 3, 5, 7, 11, 13, 17, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97) + ) + ), +); + +INSTANTIATE_TEST_CASE_P(SmallSequences, + CodecTest, + ::testing::Combine( + DefaultCodecsToTest, + ::testing::ValuesIn( + generatePyramidOfSequences(42, G(SequentialGenerator(1))) + + generatePyramidOfSequences(42, G(SequentialGenerator(1))) + + generatePyramidOfSequences(42, G(SequentialGenerator(1))) + + generatePyramidOfSequences(42, G(SequentialGenerator(1))) + + generatePyramidOfSequences(42, G(SequentialGenerator(1))) + + generatePyramidOfSequences(42, G(SequentialGenerator(1))) + + generatePyramidOfSequences(42, G(SequentialGenerator(1))) + + generatePyramidOfSequences(42, G(SequentialGenerator(1))) + ) + ), +); + INSTANTIATE_TEST_CASE_P(Mixed, CodecTest, - ::testing::Values( - generateParam(G(MinMaxGenerator)) + generateParam(G(SequentialGenerator(1))).setRatio(1), - generateParam(G(MinMaxGenerator)) + generateParam(G(SequentialGenerator(1))).setRatio(1), - generateParam(G(MinMaxGenerator)) + generateParam(G(SequentialGenerator(1))).setRatio(1), - generateParam(G(MinMaxGenerator)) + generateParam(G(SequentialGenerator(1))).setRatio(1) + ::testing::Combine( + DefaultCodecsToTest, + ::testing::Values( + generateSeq(G(MinMaxGenerator()), 1, 5) + generateSeq(G(SequentialGenerator(1)), 1, 1001), + generateSeq(G(MinMaxGenerator()), 1, 5) + generateSeq(G(SequentialGenerator(1)), 1, 1001), + generateSeq(G(MinMaxGenerator()), 1, 5) + generateSeq(G(SequentialGenerator(1)), 1, 1001), + generateSeq(G(MinMaxGenerator()), 1, 5) + generateSeq(G(SequentialGenerator(1)), 1, 1001), + generateSeq(G(MinMaxGenerator()), 1, 5) + generateSeq(G(SequentialGenerator(1)), 1, 1001), + generateSeq(G(MinMaxGenerator()), 1, 5) + generateSeq(G(SequentialGenerator(1)), 1, 1001), + generateSeq(G(MinMaxGenerator()), 1, 5) + generateSeq(G(SequentialGenerator(1)), 1, 1001), + generateSeq(G(MinMaxGenerator()), 1, 5) + generateSeq(G(SequentialGenerator(1)), 1, 1001) + ) ), ); -INSTANTIATE_TEST_CASE_P(Same, +INSTANTIATE_TEST_CASE_P(SameValueInt, CodecTest, - ::testing::Values( - generateParam(G(SameValueGenerator(1000))), - generateParam(G(SameValueGenerator(-1000))), - generateParam(G(SameValueGenerator(1000))), - generateParam(G(SameValueGenerator(-1000))), - generateParam(G(SameValueGenerator(M_E))), - generateParam(G(SameValueGenerator(M_E))) + ::testing::Combine( + DefaultCodecsToTest, + ::testing::Values( + 
generateSeq(G(SameValueGenerator(1000))), + generateSeq(G(SameValueGenerator(1000))), + generateSeq(G(SameValueGenerator(1000))), + generateSeq(G(SameValueGenerator(1000))), + generateSeq(G(SameValueGenerator(1000))), + generateSeq(G(SameValueGenerator(1000))), + generateSeq(G(SameValueGenerator(1000))), + generateSeq(G(SameValueGenerator(1000))) + ) ), ); -INSTANTIATE_TEST_CASE_P(Sequential, +INSTANTIATE_TEST_CASE_P(SameNegativeValueInt, CodecTest, - ::testing::Values( - generateParam(G(SequentialGenerator(1))), - generateParam(G(SequentialGenerator(-1))), - generateParam(G(SequentialGenerator(1))), - generateParam(G(SequentialGenerator(-1))), - generateParam(G(SequentialGenerator(M_E))), - generateParam(G(SequentialGenerator(M_E))) + ::testing::Combine( + DefaultCodecsToTest, + ::testing::Values( + generateSeq(G(SameValueGenerator(-1000))), + generateSeq(G(SameValueGenerator(-1000))), + generateSeq(G(SameValueGenerator(-1000))), + generateSeq(G(SameValueGenerator(-1000))), + generateSeq(G(SameValueGenerator(-1000))), + generateSeq(G(SameValueGenerator(-1000))), + generateSeq(G(SameValueGenerator(-1000))), + generateSeq(G(SameValueGenerator(-1000))) + ) ), ); -INSTANTIATE_TEST_CASE_P(Monotonic, +INSTANTIATE_TEST_CASE_P(SameValueFloat, CodecTest, - ::testing::Values( - generateParam(G(MonotonicGenerator(1, 5))), - generateParam(G(MonotonicGenerator(-1, 5))), - generateParam(G(MonotonicGenerator(1, 5))), - generateParam(G(MonotonicGenerator(-1, 5))), - generateParam(G(MonotonicGenerator(M_E, 5))), - generateParam(G(MonotonicGenerator(M_E, 5))) + ::testing::Combine( + ::testing::Values( + Codec("Gorilla"), + Codec("Gorilla, LZ4") + ), + ::testing::Values( + generateSeq(G(SameValueGenerator(M_E))), + generateSeq(G(SameValueGenerator(M_E))) + ) ), ); -INSTANTIATE_TEST_CASE_P(Random, +INSTANTIATE_TEST_CASE_P(SameNegativeValueFloat, CodecTest, - ::testing::Values( - generateParam(G(RandomGenerator(0, 1000'000'000))).setRatio(1.2), - generateParam(G(RandomGenerator(0, 1000'000'000))).setRatio(1.1) + ::testing::Combine( + ::testing::Values( + Codec("Gorilla"), + Codec("Gorilla, LZ4") + ), + ::testing::Values( + generateSeq(G(SameValueGenerator(-1 * M_E))), + generateSeq(G(SameValueGenerator(-1 * M_E))) + ) ), ); -INSTANTIATE_TEST_CASE_P(Randomish, +INSTANTIATE_TEST_CASE_P(SequentialInt, CodecTest, - ::testing::Values( - generateParam(G(RandomishGenerator)).setRatio(1.1), - generateParam(G(RandomishGenerator)).setRatio(1.1), - generateParam(G(RandomishGenerator)).setRatio(1.1), - generateParam(G(RandomishGenerator)).setRatio(1.1), - generateParam(G(RandomishGenerator)).setRatio(1.1), - generateParam(G(RandomishGenerator)).setRatio(1.1) + ::testing::Combine( + DefaultCodecsToTest, + ::testing::Values( + generateSeq(G(SequentialGenerator(1))), + generateSeq(G(SequentialGenerator(1))), + generateSeq(G(SequentialGenerator(1))), + generateSeq(G(SequentialGenerator(1))), + generateSeq(G(SequentialGenerator(1))), + generateSeq(G(SequentialGenerator(1))), + generateSeq(G(SequentialGenerator(1))), + generateSeq(G(SequentialGenerator(1))) + ) ), ); -INSTANTIATE_TEST_CASE_P(Overflow, +// -1, -2, -3, ... etc for signed +// 0xFF, 0xFE, 0xFD, ... 
for unsigned +INSTANTIATE_TEST_CASE_P(SequentialReverseInt, CodecTest, - ::testing::Values( - generateParam(G(MinMaxGenerator)), - generateParam(G(MinMaxGenerator)), - generateParam(G(MinMaxGenerator)), - generateParam(G(MinMaxGenerator)) + ::testing::Combine( + DefaultCodecsToTest, + ::testing::Values( + generateSeq(G(SequentialGenerator(-1))), + generateSeq(G(SequentialGenerator(-1))), + generateSeq(G(SequentialGenerator(-1))), + generateSeq(G(SequentialGenerator(-1))), + generateSeq(G(SequentialGenerator(-1))), + generateSeq(G(SequentialGenerator(-1))), + generateSeq(G(SequentialGenerator(-1))), + generateSeq(G(SequentialGenerator(-1))) + ) ), ); + +INSTANTIATE_TEST_CASE_P(SequentialFloat, + CodecTest, + ::testing::Combine( + ::testing::Values( + Codec("Gorilla"), + Codec("Gorilla, LZ4") + ), + ::testing::Values( + generateSeq(G(SequentialGenerator(M_E))), + generateSeq(G(SequentialGenerator(M_E))) + ) + ), +); + +INSTANTIATE_TEST_CASE_P(SequentialReverseFloat, + CodecTest, + ::testing::Combine( + ::testing::Values( + Codec("Gorilla"), + Codec("Gorilla, LZ4") + ), + ::testing::Values( + generateSeq(G(SequentialGenerator(-1 * M_E))), + generateSeq(G(SequentialGenerator(-1 * M_E))) + ) + ), +); + +INSTANTIATE_TEST_CASE_P(MonotonicInt, + CodecTest, + ::testing::Combine( + DefaultCodecsToTest, + ::testing::Values( + generateSeq(G(MonotonicGenerator(1, 5))), + generateSeq(G(MonotonicGenerator(1, 5))), + generateSeq(G(MonotonicGenerator(1, 5))), + generateSeq(G(MonotonicGenerator(1, 5))), + generateSeq(G(MonotonicGenerator(1, 5))), + generateSeq(G(MonotonicGenerator(1, 5))), + generateSeq(G(MonotonicGenerator(1, 5))), + generateSeq(G(MonotonicGenerator(1, 5))) + ) + ), +); + +INSTANTIATE_TEST_CASE_P(MonotonicReverseInt, + CodecTest, + ::testing::Combine( + DefaultCodecsToTest, + ::testing::Values( + generateSeq(G(MonotonicGenerator(-1, 5))), + generateSeq(G(MonotonicGenerator(-1, 5))), + generateSeq(G(MonotonicGenerator(-1, 5))), + generateSeq(G(MonotonicGenerator(-1, 5))), + generateSeq(G(MonotonicGenerator(-1, 5))), + generateSeq(G(MonotonicGenerator(-1, 5))), + generateSeq(G(MonotonicGenerator(-1, 5))), + generateSeq(G(MonotonicGenerator(-1, 5))) + ) + ), +); + +INSTANTIATE_TEST_CASE_P(MonotonicFloat, + CodecTest, + ::testing::Combine( + ::testing::Values( + Codec("Gorilla") + ), + ::testing::Values( + generateSeq(G(MonotonicGenerator(M_E, 5))), + generateSeq(G(MonotonicGenerator(M_E, 5))) + ) + ), +); + +INSTANTIATE_TEST_CASE_P(MonotonicReverseFloat, + CodecTest, + ::testing::Combine( + ::testing::Values( + Codec("Gorilla") + ), + ::testing::Values( + generateSeq(G(MonotonicGenerator(-1 * M_E, 5))), + generateSeq(G(MonotonicGenerator(-1 * M_E, 5))) + ) + ), +); + +INSTANTIATE_TEST_CASE_P(RandomInt, + CodecTest, + ::testing::Combine( + DefaultCodecsToTest, + ::testing::Values( + generateSeq(G(RandomGenerator(0))), + generateSeq(G(RandomGenerator(0))), + generateSeq(G(RandomGenerator(0, 0, 1000'000'000))), + generateSeq(G(RandomGenerator(0, 0, 1000'000'000))) + ) + ), +); + +INSTANTIATE_TEST_CASE_P(RandomishInt, + CodecTest, + ::testing::Combine( + DefaultCodecsToTest, + ::testing::Values( + generateSeq(G(RandomishGenerator)), + generateSeq(G(RandomishGenerator)), + generateSeq(G(RandomishGenerator)), + generateSeq(G(RandomishGenerator)), + generateSeq(G(RandomishGenerator)), + generateSeq(G(RandomishGenerator)) + ) + ), +); + +INSTANTIATE_TEST_CASE_P(RandomishFloat, + CodecTest, + ::testing::Combine( + DefaultCodecsToTest, + ::testing::Values( + generateSeq(G(RandomishGenerator)), + 
generateSeq(G(RandomishGenerator)) + ) + ), +); + +// Double delta overflow case, deltas are out of bounds for target type +INSTANTIATE_TEST_CASE_P(OverflowInt, + CodecTest, + ::testing::Combine( + ::testing::Values( + Codec("DoubleDelta", 1.2), + Codec("DoubleDelta, LZ4", 1.0) + ), + ::testing::Values( + generateSeq(G(MinMaxGenerator())), + generateSeq(G(MinMaxGenerator())), + generateSeq(G(MinMaxGenerator())), + generateSeq(G(MinMaxGenerator())) + ) + ), +); + +INSTANTIATE_TEST_CASE_P(OverflowFloat, + CodecTest, + ::testing::Combine( + ::testing::Values( + Codec("Gorilla", 1.1), + Codec("Gorilla, LZ4", 1.0) + ), + ::testing::Values( + generateSeq(G(MinMaxGenerator())), + generateSeq(G(MinMaxGenerator())), + generateSeq(G(FFand0Generator())), + generateSeq(G(FFand0Generator())) + ) + ), +); + +} From 3f572a45b73655d837194a4cd17313b5620bfc6f Mon Sep 17 00:00:00 2001 From: Guillaume Tassery Date: Mon, 12 Aug 2019 10:46:26 +0200 Subject: [PATCH 02/18] Fix nullIf when we have a null constant on the right argument --- dbms/src/Functions/nullIf.cpp | 6 +++--- dbms/tests/queries/0_stateless/00395_nullable.reference | 1 + dbms/tests/queries/0_stateless/00395_nullable.sql | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/dbms/src/Functions/nullIf.cpp b/dbms/src/Functions/nullIf.cpp index 59ce4b54c82..a68537c9c7e 100644 --- a/dbms/src/Functions/nullIf.cpp +++ b/dbms/src/Functions/nullIf.cpp @@ -43,7 +43,7 @@ public: void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override { - /// nullIf(col1, col2) == if(col1 == col2, NULL, col1) + /// nullIf(col1, col2) == if(col1 != col2, col1, NULL) Block temp_block = block; @@ -51,7 +51,7 @@ public: temp_block.insert({nullptr, std::make_shared(), ""}); { - auto equals_func = FunctionFactory::instance().get("equals", context)->build( + auto equals_func = FunctionFactory::instance().get("notEquals", context)->build( {temp_block.getByPosition(arguments[0]), temp_block.getByPosition(arguments[1])}); equals_func->execute(temp_block, {arguments[0], arguments[1]}, res_pos, input_rows_count); } @@ -69,7 +69,7 @@ public: auto func_if = FunctionFactory::instance().get("if", context)->build( {temp_block.getByPosition(res_pos), temp_block.getByPosition(null_pos), temp_block.getByPosition(arguments[0])}); - func_if->execute(temp_block, {res_pos, null_pos, arguments[0]}, result, input_rows_count); + func_if->execute(temp_block, {res_pos, arguments[0], null_pos}, result, input_rows_count); block.getByPosition(result).column = std::move(temp_block.getByPosition(result).column); } diff --git a/dbms/tests/queries/0_stateless/00395_nullable.reference b/dbms/tests/queries/0_stateless/00395_nullable.reference index 5ab0e1843ca..f620a811216 100644 --- a/dbms/tests/queries/0_stateless/00395_nullable.reference +++ b/dbms/tests/queries/0_stateless/00395_nullable.reference @@ -70,6 +70,7 @@ 42 42 \N \N 6 \N \N \N \N +1 ----- coalesce ----- \N 1 diff --git a/dbms/tests/queries/0_stateless/00395_nullable.sql b/dbms/tests/queries/0_stateless/00395_nullable.sql index 65be2b45150..71dc045ad09 100644 --- a/dbms/tests/queries/0_stateless/00395_nullable.sql +++ b/dbms/tests/queries/0_stateless/00395_nullable.sql @@ -125,6 +125,7 @@ SELECT '----- ifNull, nullIf -----'; SELECT col1, col2, ifNull(col1,col2) FROM test1_00395 ORDER BY col1,col2 ASC; SELECT col1, col2, nullIf(col1,col2) FROM test1_00395 ORDER BY col1,col2 ASC; +SELECT nullIf(1, NULL); SELECT '----- coalesce -----'; From 2ca6c0b1bfa210c65b488e40b381b5dd2e4834cf 
Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 12 Aug 2019 12:37:48 +0300 Subject: [PATCH 03/18] Fix bug with memory allocation for string fields in complex key cache dictionary --- ...exKeyCacheDictionary_setAttributeValue.cpp | 6 +-- .../test_cached_dictionary_string/__init__.py | 0 .../configs/config.xml | 30 +++++++++++++ .../dictionaries/complex_key_cache_string.xml | 45 +++++++++++++++++++ .../configs/users.xml | 23 ++++++++++ .../test_cached_dictionary_string/test.py | 36 +++++++++++++++ 6 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 dbms/tests/integration/test_cached_dictionary_string/__init__.py create mode 100644 dbms/tests/integration/test_cached_dictionary_string/configs/config.xml create mode 100644 dbms/tests/integration/test_cached_dictionary_string/configs/dictionaries/complex_key_cache_string.xml create mode 100644 dbms/tests/integration/test_cached_dictionary_string/configs/users.xml create mode 100644 dbms/tests/integration/test_cached_dictionary_string/test.py diff --git a/dbms/src/Dictionaries/ComplexKeyCacheDictionary_setAttributeValue.cpp b/dbms/src/Dictionaries/ComplexKeyCacheDictionary_setAttributeValue.cpp index cf2eef82347..24b7d83f14e 100644 --- a/dbms/src/Dictionaries/ComplexKeyCacheDictionary_setAttributeValue.cpp +++ b/dbms/src/Dictionaries/ComplexKeyCacheDictionary_setAttributeValue.cpp @@ -63,9 +63,9 @@ void ComplexKeyCacheDictionary::setAttributeValue(Attribute & attribute, const s const auto str_size = string.size(); if (str_size != 0) { - auto string_ptr = string_arena->alloc(str_size + 1); - std::copy(string.data(), string.data() + str_size + 1, string_ptr); - string_ref = StringRef{string_ptr, str_size}; + auto str_ptr = string_arena->alloc(str_size); + std::copy(string.data(), string.data() + str_size, str_ptr); + string_ref = StringRef{str_ptr, str_size}; } else string_ref = {}; diff --git a/dbms/tests/integration/test_cached_dictionary_string/__init__.py b/dbms/tests/integration/test_cached_dictionary_string/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/integration/test_cached_dictionary_string/configs/config.xml b/dbms/tests/integration/test_cached_dictionary_string/configs/config.xml new file mode 100644 index 00000000000..a1518083be3 --- /dev/null +++ b/dbms/tests/integration/test_cached_dictionary_string/configs/config.xml @@ -0,0 +1,30 @@ + + + + trace + /var/log/clickhouse-server/clickhouse-server.log + /var/log/clickhouse-server/clickhouse-server.err.log + 1000M + 10 + + + 9000 + 127.0.0.1 + + + + true + none + + AcceptCertificateHandler + + + + + 500 + 5368709120 + ./clickhouse/ + users.xml + + /etc/clickhouse-server/config.d/*.xml + diff --git a/dbms/tests/integration/test_cached_dictionary_string/configs/dictionaries/complex_key_cache_string.xml b/dbms/tests/integration/test_cached_dictionary_string/configs/dictionaries/complex_key_cache_string.xml new file mode 100644 index 00000000000..0c3ba112215 --- /dev/null +++ b/dbms/tests/integration/test_cached_dictionary_string/configs/dictionaries/complex_key_cache_string.xml @@ -0,0 +1,45 @@ + + + radars + + + localhost + 9000 + default + + default + radars_table
+            </clickhouse>
+        </source>
+        <structure>
+            <key>
+                <attribute>
+                    <name>radar_id</name>
+                    <type>String</type>
+                    <hierarchical>False</hierarchical>
+                    <injective>False</injective>
+                </attribute>
+            </key>
+            <attribute>
+                <name>radar_ip</name>
+                <type>String</type>
+                <null_value></null_value>
+                <hierarchical>False</hierarchical>
+                <injective>True</injective>
+            </attribute>
+            <attribute>
+                <name>client_id</name>
+                <type>String</type>
+                <null_value></null_value>
+                <hierarchical>False</hierarchical>
+                <injective>True</injective>
+            </attribute>
+        </structure>
+        <layout>
+            <complex_key_cache>
+                <size_in_cells>20</size_in_cells>
+            </complex_key_cache>
+        </layout>
+        <lifetime>1</lifetime>
+    </dictionary>
+</dictionaries>
diff --git a/dbms/tests/integration/test_cached_dictionary_string/configs/users.xml b/dbms/tests/integration/test_cached_dictionary_string/configs/users.xml new file mode 100644 index 00000000000..6061af8e33d --- /dev/null +++ b/dbms/tests/integration/test_cached_dictionary_string/configs/users.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + ::/0 + + default + default + + + + + + + + diff --git a/dbms/tests/integration/test_cached_dictionary_string/test.py b/dbms/tests/integration/test_cached_dictionary_string/test.py new file mode 100644 index 00000000000..8774a6a1040 --- /dev/null +++ b/dbms/tests/integration/test_cached_dictionary_string/test.py @@ -0,0 +1,36 @@ +import pytest +import os +import time +from helpers.cluster import ClickHouseCluster +import random + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs')) + +node = cluster.add_instance('node', main_configs=['configs/dictionaries/complex_key_cache_string.xml']) + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + node.query("create table radars_table (radar_id String, radar_ip String, client_id String) engine=MergeTree() order by radar_id") + + yield cluster + finally: + cluster.shutdown() + + +def test_memory_consumption(started_cluster): + node.query("insert into radars_table select toString(number), 'xxxxxxxx', 'yyyyyyyy' from numbers(100000)") + + for i in xrange(30): + node.query("select dictGetString('radars', 'client_id', tuple('{}'))".format(random.randint(0, 10000))) + + allocated_first = int(node.query("select bytes_allocated from system.dictionaries where name = 'radars'").strip()) + + for i in xrange(100): + node.query("select dictGetString('radars', 'client_id', tuple(toString(number))) from numbers({}, 1000)".format(random.randint(0, 10000))) + + allocated_second = int(node.query("select bytes_allocated from system.dictionaries where name = 'radars'").strip()) + one_element_size = allocated_first / 30 # number of elemnts in dict + assert abs(allocated_first - allocated_second) <= one_element_size * 2 # less than two elements From 2b47839a989513b8809335d4bb4d667e2487f472 Mon Sep 17 00:00:00 2001 From: chertus Date: Mon, 12 Aug 2019 15:21:07 +0300 Subject: [PATCH 04/18] fix memory tracking under sanitizers --- dbms/src/Common/new_delete.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dbms/src/Common/new_delete.cpp b/dbms/src/Common/new_delete.cpp index 9da6ccf492f..f2a85163035 100644 --- a/dbms/src/Common/new_delete.cpp +++ b/dbms/src/Common/new_delete.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -49,6 +50,11 @@ ALWAYS_INLINE void untrackMemory(void * ptr [[maybe_unused]], std::size_t size [ #else if (size) CurrentMemoryTracker::free(size); +#ifdef _GNU_SOURCE + /// It's innaccurate resource free for sanitizers. malloc_usable_size() result is greater or equal to allocated size. + else + CurrentMemoryTracker::free(malloc_usable_size(ptr)); +#endif #endif } catch (...) 
From 98d38e041f7655a14d8f3cbec596758f52d4c9af Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 12 Aug 2019 16:30:29 +0300 Subject: [PATCH 05/18] Fix recreation of replicated table with fixed granularity --- dbms/src/Storages/MergeTree/MergeTreeData.h | 6 +- .../Storages/StorageReplicatedMergeTree.cpp | 19 ++++ .../src/Storages/StorageReplicatedMergeTree.h | 10 ++ .../__init__.py | 0 .../test.py | 97 +++++++++++++++++++ 5 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 dbms/tests/integration/test_adaptive_granularity_replicated/__init__.py create mode 100644 dbms/tests/integration/test_adaptive_granularity_replicated/test.py diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.h b/dbms/src/Storages/MergeTree/MergeTreeData.h index b43851eb7d9..8c2217324f2 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.h +++ b/dbms/src/Storages/MergeTree/MergeTreeData.h @@ -576,7 +576,9 @@ public: virtual std::vector getMutationsStatus() const = 0; - bool canUseAdaptiveGranularity() const + /// Returns true if table can create new parts with adaptive granularity + /// Has additional constraint in replicated version + virtual bool canUseAdaptiveGranularity() const { return settings.index_granularity_bytes != 0 && (settings.enable_mixed_granularity_parts || !has_non_adaptive_index_granularity_parts); @@ -632,7 +634,7 @@ public: String sampling_expr_column_name; Names columns_required_for_sampling; - const MergeTreeSettings settings; + MergeTreeSettings settings; /// Limiting parallel sends per one table, used in DataPartsExchange std::atomic_uint current_table_sends {0}; diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 5addd26482c..a436d426202 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -297,6 +297,17 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( } createNewZooKeeperNodes(); + + other_replicas_fixed_granularity = checkFixedGranualrityInZookeeper(); +} + + +bool StorageReplicatedMergeTree::checkFixedGranualrityInZookeeper() +{ + auto zookeeper = getZooKeeper(); + String metadata_str = zookeeper->get(zookeeper_path + "/metadata"); + auto metadata_from_zk = ReplicatedMergeTreeTableMetadata::parse(metadata_str); + return metadata_from_zk.index_granularity_bytes == 0; } @@ -5143,4 +5154,12 @@ CheckResults StorageReplicatedMergeTree::checkData(const ASTPtr & query, const C return results; } +bool StorageReplicatedMergeTree::canUseAdaptiveGranularity() const +{ + return settings.index_granularity_bytes != 0 && + (settings.enable_mixed_granularity_parts || + (!has_non_adaptive_index_granularity_parts && !other_replicas_fixed_granularity)); +} + + } diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.h b/dbms/src/Storages/StorageReplicatedMergeTree.h index 878c5ce0619..7f632ab4cb4 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.h +++ b/dbms/src/Storages/StorageReplicatedMergeTree.h @@ -172,6 +172,9 @@ public: CheckResults checkData(const ASTPtr & query, const Context & context) override; + /// Checks ability to use granularity + bool canUseAdaptiveGranularity() const override; + private: /// Delete old parts from disk and from ZooKeeper. void clearOldPartsAndRemoveFromZK(); @@ -285,6 +288,9 @@ private: /// An event that awakens `alter` method from waiting for the completion of the ALTER query. 
zkutil::EventPtr alter_query_event = std::make_shared(); + /// True if replica was created for existing table with fixed granularity + bool other_replicas_fixed_granularity = false; + /** Creates the minimum set of nodes in ZooKeeper. */ void createTableIfNotExists(); @@ -506,6 +512,10 @@ private: void replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, const Context & query_context); void fetchPartition(const ASTPtr & partition, const String & from, const Context & query_context); + /// Check granularity of already existing replicated table in zookeeper if it exists + /// return true if it's fixed + bool checkFixedGranualrityInZookeeper(); + protected: /** If not 'attach', either creates a new table in ZK, or adds a replica to an existing table. */ diff --git a/dbms/tests/integration/test_adaptive_granularity_replicated/__init__.py b/dbms/tests/integration/test_adaptive_granularity_replicated/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/integration/test_adaptive_granularity_replicated/test.py b/dbms/tests/integration/test_adaptive_granularity_replicated/test.py new file mode 100644 index 00000000000..a0bfa2e0b76 --- /dev/null +++ b/dbms/tests/integration/test_adaptive_granularity_replicated/test.py @@ -0,0 +1,97 @@ +import time +import pytest + +from helpers.cluster import ClickHouseCluster +from multiprocessing.dummy import Pool +from helpers.client import QueryRuntimeException, QueryTimeoutExceedException + +from helpers.test_tools import assert_eq_with_retry + +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance('node1', with_zookeeper=True) +node2 = cluster.add_instance('node2', with_zookeeper=True) +node3 = cluster.add_instance('node3', with_zookeeper=True, image='yandex/clickhouse-server:19.1.14', with_installed_binary=True) + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + time.sleep(1) + + yield cluster + finally: + cluster.shutdown() + +def test_creating_table_different_setting(start_cluster): + node1.query("CREATE TABLE t1 (c1 String, c2 String) ENGINE=ReplicatedMergeTree('/clickhouse/t1', '1') ORDER BY tuple(c1) SETTINGS index_granularity_bytes = 0") + node1.query("INSERT INTO t1 VALUES('x', 'y')") + + node2.query("CREATE TABLE t1 (c1 String, c2 String) ENGINE=ReplicatedMergeTree('/clickhouse/t1', '2') ORDER BY tuple(c1)") + + node1.query("INSERT INTO t1 VALUES('a', 'b')") + node2.query("SYSTEM SYNC REPLICA t1", timeout=5) + + node1.query("SELECT count() FROM t1") == "2\n" + node2.query("SELECT count() FROM t1") == "1\n" + + node2.query("INSERT INTO t1 VALUES('c', 'd')") + node1.query("SYSTEM SYNC REPLICA t1", timeout=5) + + # replication works + node1.query("SELECT count() FROM t1") == "3\n" + node2.query("SELECT count() FROM t1") == "2\n" + + # OPTIMIZE also works correctly + node2.query("OPTIMIZE TABLE t1 FINAL") == "3\n" + node1.query("SYSTEM SYNC REPLICA t1", timeout=5) + + node1.query("SELECT count() FROM t1") == "3\n" + node2.query("SELECT count() FROM t1") == "2\n" + + path_part = node1.query("SELECT path FROM system.parts WHERE table = 't1' AND active=1 ORDER BY partition DESC LIMIT 1").strip() + + with pytest.raises(Exception): # check that we have no adaptive files + node1.exec_in_container(["bash", "-c", "find {p} -name '*.mrk2' | grep '.*'".format(p=path_part)]) + + path_part = node2.query("SELECT path FROM system.parts WHERE table = 't1' AND active=1 ORDER BY partition DESC LIMIT 1").strip() + + with pytest.raises(Exception): # 
check that we have no adaptive files + node2.exec_in_container(["bash", "-c", "find {p} -name '*.mrk2' | grep '.*'".format(p=path_part)]) + + +def test_old_node_with_new_node(start_cluster): + node3.query("CREATE TABLE t2 (c1 String, c2 String) ENGINE=ReplicatedMergeTree('/clickhouse/t2', '3') ORDER BY tuple(c1)") + node3.query("INSERT INTO t2 VALUES('x', 'y')") + + node2.query("CREATE TABLE t2 (c1 String, c2 String) ENGINE=ReplicatedMergeTree('/clickhouse/t2', '2') ORDER BY tuple(c1)") + + node3.query("INSERT INTO t2 VALUES('a', 'b')") + node2.query("SYSTEM SYNC REPLICA t2", timeout=5) + + node3.query("SELECT count() FROM t2") == "2\n" + node2.query("SELECT count() FROM t2") == "1\n" + + node2.query("INSERT INTO t2 VALUES('c', 'd')") + node3.query("SYSTEM SYNC REPLICA t2", timeout=5) + + # replication works + node3.query("SELECT count() FROM t2") == "3\n" + node2.query("SELECT count() FROM t2") == "2\n" + + # OPTIMIZE also works correctly + node3.query("OPTIMIZE table t2 FINAL") + node2.query("SYSTEM SYNC REPLICA t2", timeout=5) + + node3.query("SELECT count() FROM t2") == "3\n" + node2.query("SELECT count() FROM t2") == "2\n" + + path_part = node3.query("SELECT path FROM system.parts WHERE table = 't2' AND active=1 ORDER BY partition DESC LIMIT 1").strip() + + with pytest.raises(Exception): # check that we have no adaptive files + node3.exec_in_container(["bash", "-c", "find {p} -name '*.mrk2' | grep '.*'".format(p=path_part)]) + + path_part = node2.query("SELECT path FROM system.parts WHERE table = 't2' AND active=1 ORDER BY partition DESC LIMIT 1").strip() + + with pytest.raises(Exception): # check that we have no adaptive files + node2.exec_in_container(["bash", "-c", "find {p} -name '*.mrk2' | grep '.*'".format(p=path_part)]) From 34b96c19de9fad471895a2b5a58257eeb4c27484 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 12 Aug 2019 17:06:17 +0300 Subject: [PATCH 06/18] Better test --- .../test_cached_dictionary_string/test.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/dbms/tests/integration/test_cached_dictionary_string/test.py b/dbms/tests/integration/test_cached_dictionary_string/test.py index 8774a6a1040..f3381581121 100644 --- a/dbms/tests/integration/test_cached_dictionary_string/test.py +++ b/dbms/tests/integration/test_cached_dictionary_string/test.py @@ -21,16 +21,31 @@ def started_cluster(): def test_memory_consumption(started_cluster): - node.query("insert into radars_table select toString(number), 'xxxxxxxx', 'yyyyyyyy' from numbers(100000)") + node.query("insert into radars_table select toString(rand() % 5000), '{0}', '{0}' from numbers(1000)".format('w' * 8)) + node.query("insert into radars_table select toString(rand() % 5000), '{0}', '{0}' from numbers(1000)".format('x' * 16)) + node.query("insert into radars_table select toString(rand() % 5000), '{0}', '{0}' from numbers(1000)".format('y' * 32)) + node.query("insert into radars_table select toString(rand() % 5000), '{0}', '{0}' from numbers(1000)".format('z' * 64)) - for i in xrange(30): - node.query("select dictGetString('radars', 'client_id', tuple('{}'))".format(random.randint(0, 10000))) + # Fill dictionary + node.query("select dictGetString('radars', 'client_id', tuple(toString(number))) from numbers(0, 5000)") allocated_first = int(node.query("select bytes_allocated from system.dictionaries where name = 'radars'").strip()) - for i in xrange(100): - node.query("select dictGetString('radars', 'client_id', tuple(toString(number))) from numbers({}, 
1000)".format(random.randint(0, 10000))) + alloc_array = [] + for i in xrange(5): + node.query("select dictGetString('radars', 'client_id', tuple(toString(number))) from numbers(0, 5000)") - allocated_second = int(node.query("select bytes_allocated from system.dictionaries where name = 'radars'").strip()) - one_element_size = allocated_first / 30 # number of elemnts in dict - assert abs(allocated_first - allocated_second) <= one_element_size * 2 # less than two elements + allocated = int(node.query("select bytes_allocated from system.dictionaries where name = 'radars'").strip()) + alloc_array.append(allocated) + + # size doesn't grow + assert all(allocated_first >= a for a in alloc_array) + + for i in xrange(5): + node.query("select dictGetString('radars', 'client_id', tuple(toString(number))) from numbers(0, 5000)") + + allocated = int(node.query("select bytes_allocated from system.dictionaries where name = 'radars'").strip()) + alloc_array.append(allocated) + + # size doesn't grow + assert all(allocated_first >= a for a in alloc_array) From ad5c7c0b651879538e6722869141af59da0980be Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 12 Aug 2019 18:16:23 +0300 Subject: [PATCH 07/18] done --- dbms/src/Common/CurrentThread.cpp | 5 +++++ dbms/src/Common/CurrentThread.h | 3 +++ libs/libloggers/loggers/OwnSplitChannel.cpp | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/dbms/src/Common/CurrentThread.cpp b/dbms/src/Common/CurrentThread.cpp index 84d63a04b96..d9343070808 100644 --- a/dbms/src/Common/CurrentThread.cpp +++ b/dbms/src/Common/CurrentThread.cpp @@ -26,6 +26,11 @@ void CurrentThread::updatePerformanceCounters() current_thread->updatePerformanceCounters(); } +bool CurrentThread::isInitialized() +{ + return unlikely(current_thread); +} + ThreadStatus & CurrentThread::get() { if (unlikely(!current_thread)) diff --git a/dbms/src/Common/CurrentThread.h b/dbms/src/Common/CurrentThread.h index 30f1d567040..638f3657eed 100644 --- a/dbms/src/Common/CurrentThread.h +++ b/dbms/src/Common/CurrentThread.h @@ -33,6 +33,9 @@ class InternalTextLogsQueue; class CurrentThread { public: + //Return true in case of successful initializaiton + static bool isInitialized(); + /// Handler to current thread static ThreadStatus & get(); diff --git a/libs/libloggers/loggers/OwnSplitChannel.cpp b/libs/libloggers/loggers/OwnSplitChannel.cpp index a2f332a8cae..227709bf450 100644 --- a/libs/libloggers/loggers/OwnSplitChannel.cpp +++ b/libs/libloggers/loggers/OwnSplitChannel.cpp @@ -60,7 +60,8 @@ void OwnSplitChannel::log(const Poco::Message & msg) elem.thread_number = msg_ext.thread_number; try { - elem.os_thread_id = CurrentThread::get().os_thread_id; + if (CurrentThread::isInitialized()) + elem.os_thread_id = CurrentThread::get().os_thread_id; } catch (...) 
{ elem.os_thread_id = 0; From 845f4482aaca5195277c965521ba0520a5a4905c Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 12 Aug 2019 18:28:28 +0300 Subject: [PATCH 08/18] better --- libs/libloggers/loggers/OwnSplitChannel.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/libs/libloggers/loggers/OwnSplitChannel.cpp b/libs/libloggers/loggers/OwnSplitChannel.cpp index 227709bf450..e4f0c7e473a 100644 --- a/libs/libloggers/loggers/OwnSplitChannel.cpp +++ b/libs/libloggers/loggers/OwnSplitChannel.cpp @@ -58,14 +58,11 @@ void OwnSplitChannel::log(const Poco::Message & msg) elem.thread_name = getThreadName(); elem.thread_number = msg_ext.thread_number; - try - { - if (CurrentThread::isInitialized()) - elem.os_thread_id = CurrentThread::get().os_thread_id; - } catch (...) - { + + if (CurrentThread::isInitialized()) + elem.os_thread_id = CurrentThread::get().os_thread_id; + else elem.os_thread_id = 0; - } elem.query_id = msg_ext.query_id; From 1cab72450b7f8aec7a834e34a1fb099c359f1690 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 12 Aug 2019 19:06:34 +0300 Subject: [PATCH 09/18] Update CurrentThread.cpp --- dbms/src/Common/CurrentThread.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Common/CurrentThread.cpp b/dbms/src/Common/CurrentThread.cpp index d9343070808..cad7fe7cb94 100644 --- a/dbms/src/Common/CurrentThread.cpp +++ b/dbms/src/Common/CurrentThread.cpp @@ -28,7 +28,7 @@ void CurrentThread::updatePerformanceCounters() bool CurrentThread::isInitialized() { - return unlikely(current_thread); + return likely(current_thread); } ThreadStatus & CurrentThread::get() From ae18a6e4380b57303be487f59a3c1b6618c4f6b1 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 12 Aug 2019 19:07:53 +0300 Subject: [PATCH 10/18] Update CurrentThread.h --- dbms/src/Common/CurrentThread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Common/CurrentThread.h b/dbms/src/Common/CurrentThread.h index 638f3657eed..01e46fbeadc 100644 --- a/dbms/src/Common/CurrentThread.h +++ b/dbms/src/Common/CurrentThread.h @@ -33,7 +33,7 @@ class InternalTextLogsQueue; class CurrentThread { public: - //Return true in case of successful initializaiton + /// Return true in case of successful initializaiton static bool isInitialized(); /// Handler to current thread From 7bab9468b562e51fd67192ca66668ca3a46df909 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Mon, 12 Aug 2019 19:23:12 +0300 Subject: [PATCH 11/18] Skip introduction/info.md in single page docs (#6292) --- docs/tools/concatenate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/tools/concatenate.py b/docs/tools/concatenate.py index bddf63cbc23..4eb8fcf9562 100755 --- a/docs/tools/concatenate.py +++ b/docs/tools/concatenate.py @@ -23,6 +23,8 @@ def concatenate(lang, docs_path, single_page_file): logging.debug('Concatenating: ' + ', '.join(files_to_concatenate)) for path in files_to_concatenate: + if path.endswith('introduction/info.md'): + continue try: with open(os.path.join(lang_path, path)) as f: anchors = set() From 292758a321c00d4ab665cb87adbd7acac4106653 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 12 Aug 2019 19:33:21 +0300 Subject: [PATCH 12/18] Update CurrentThread.cpp --- dbms/src/Common/CurrentThread.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Common/CurrentThread.cpp b/dbms/src/Common/CurrentThread.cpp index cad7fe7cb94..446772f218d 100644 --- a/dbms/src/Common/CurrentThread.cpp +++ 
b/dbms/src/Common/CurrentThread.cpp @@ -28,7 +28,7 @@ void CurrentThread::updatePerformanceCounters() bool CurrentThread::isInitialized() { - return likely(current_thread); + return current_thread; } ThreadStatus & CurrentThread::get() From 95a38b9e1939b6ddb57b1fbc190af3e71a6de461 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Mon, 12 Aug 2019 19:43:15 +0300 Subject: [PATCH 13/18] Fixed builds on GCC-9 and Clang-8 --- .../tests/gtest_compressionCodec.cpp | 53 +++++++++++++------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/dbms/src/Compression/tests/gtest_compressionCodec.cpp b/dbms/src/Compression/tests/gtest_compressionCodec.cpp index 7fca3d98da7..bbfe20d8813 100644 --- a/dbms/src/Compression/tests/gtest_compressionCodec.cpp +++ b/dbms/src/Compression/tests/gtest_compressionCodec.cpp @@ -1,13 +1,14 @@ #include - -#include -#include -#include #include -#include +#include #include +#include +#include +#include #include +#include +#include #include @@ -68,7 +69,7 @@ template const char* type_name() { #define MAKE_TYPE_NAME(TYPE) \ - if constexpr (std::is_same_v) return #TYPE; + if constexpr (std::is_same_v) return #TYPE MAKE_TYPE_NAME(UInt8); MAKE_TYPE_NAME(UInt16); @@ -168,14 +169,6 @@ template return result; } -struct CodecTestSequence -{ - std::string name; - std::vector serialized_data; - - DataTypePtr data_type; -}; - struct Codec { std::string codec_statement; @@ -189,6 +182,36 @@ struct Codec Codec() : Codec(std::string()) {} + + Codec(const Codec &) = default; + Codec & operator=(const Codec &) = default; + Codec(Codec &&) = default; + Codec & operator=(Codec &&) = default; +}; + + +struct CodecTestSequence +{ + std::string name; + std::vector serialized_data; + DataTypePtr data_type; + + CodecTestSequence() + : name(), + serialized_data(), + data_type() + {} + + CodecTestSequence(std::string name_, std::vector serialized_data_, DataTypePtr data_type_) + : name(name_), + serialized_data(serialized_data_), + data_type(data_type_) + {} + + CodecTestSequence(const CodecTestSequence &) = default; + CodecTestSequence & operator=(const CodecTestSequence &) = default; + CodecTestSequence(CodecTestSequence &&) = default; + CodecTestSequence & operator=(CodecTestSequence &&) = default; }; CodecTestSequence operator+(CodecTestSequence && left, CodecTestSequence && right) @@ -307,8 +330,8 @@ public: Expected expected; ASTPtr codec_ast; ParserCodec parser; - parser.parse(token_iterator, codec_ast, expected); + parser.parse(token_iterator, codec_ast, expected); return CompressionCodecFactory::instance().get(codec_ast, data_type); } From 6ebd0029265b2766a4e20ce323c400da6e3cba31 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 13 Aug 2019 01:19:20 +0300 Subject: [PATCH 14/18] Fixed build --- dbms/src/Compression/tests/gtest_compressionCodec.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Compression/tests/gtest_compressionCodec.cpp b/dbms/src/Compression/tests/gtest_compressionCodec.cpp index bbfe20d8813..5ff5519f5cf 100644 --- a/dbms/src/Compression/tests/gtest_compressionCodec.cpp +++ b/dbms/src/Compression/tests/gtest_compressionCodec.cpp @@ -325,7 +325,7 @@ public: const std::string codec_statement = "(" + codec_string + ")"; Tokens tokens(codec_statement.begin().base(), codec_statement.end().base()); - TokenIterator token_iterator(tokens); + IParser::Pos token_iterator(tokens); Expected expected; ASTPtr codec_ast; From c96fa2c08030ab0fdbd31f9b054bbaec51bbe13f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 13 Aug 2019 
01:53:19 +0300 Subject: [PATCH 15/18] Fixed build --- .../src/Compression/tests/gtest_compressionCodec.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/dbms/src/Compression/tests/gtest_compressionCodec.cpp b/dbms/src/Compression/tests/gtest_compressionCodec.cpp index 5ff5519f5cf..4ed547c54e5 100644 --- a/dbms/src/Compression/tests/gtest_compressionCodec.cpp +++ b/dbms/src/Compression/tests/gtest_compressionCodec.cpp @@ -24,6 +24,13 @@ #include +/// For the expansion of gtest macros. +#if defined(__clang__) + #pragma clang diagnostic ignored "-Wdeprecated" +#elif defined (__GNUC__) + #pragma GCC diagnostic ignored "-Wdeprecated-copy" +#endif + #include using namespace DB; @@ -182,11 +189,6 @@ struct Codec Codec() : Codec(std::string()) {} - - Codec(const Codec &) = default; - Codec & operator=(const Codec &) = default; - Codec(Codec &&) = default; - Codec & operator=(Codec &&) = default; }; From 60504bc2c85928a0fe83240f2026b0705d7c56c3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 13 Aug 2019 02:47:15 +0300 Subject: [PATCH 16/18] Change logger_name column in text_log to LowCardinality #6037 --- dbms/src/Interpreters/TextLog.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Interpreters/TextLog.cpp b/dbms/src/Interpreters/TextLog.cpp index a6d21524c29..489d0469ca0 100644 --- a/dbms/src/Interpreters/TextLog.cpp +++ b/dbms/src/Interpreters/TextLog.cpp @@ -39,7 +39,7 @@ Block TextLogElement::createBlock() {std::move(priority_datatype), "level"}, {std::make_shared(), "query_id"}, - {std::make_shared(), "logger_name"}, + {std::make_shared(std::make_shared()), "logger_name"}, {std::make_shared(), "message"}, {std::make_shared(), "revision"}, From 1edc2a264729e54bf791d5a092e5b472dc1d807a Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Tue, 13 Aug 2019 14:02:26 +0300 Subject: [PATCH 17/18] Add link to Hong Kong Meetup --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8f25b2156b6..dccb75b4282 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ ClickHouse is an open-source column-oriented database management system that all * You can also [fill this form](https://forms.yandex.com/surveys/meet-yandex-clickhouse-team/) to meet Yandex ClickHouse team in person. ## Upcoming Events -* [ClickHouse Meetup in Mountain View](https://www.eventbrite.com/e/meetup-clickhouse-in-the-south-bay-registration-65935505873) on August 13. * [ClickHouse Meetup in Moscow](https://yandex.ru/promo/clickhouse/moscow-2019) on September 5. +* [ClickHouse Meetup in Hong Kong](https://www.meetup.com/Hong-Kong-Machine-Learning-Meetup/events/263580542/) on October 17. * [ClickHouse Meetup in Shenzhen](https://www.huodongxing.com/event/3483759917300) on October 20. * [ClickHouse Meetup in Shanghai](https://www.huodongxing.com/event/4483760336000) on October 27. From 684580e3311c834eb1b2d899c6a98df057b76208 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Tue, 13 Aug 2019 14:04:23 +0300 Subject: [PATCH 18/18] Add link to Hong Kong Meetup to website front page (#6465) --- website/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/index.html b/website/index.html index 26090e5d857..017080d5647 100644 --- a/website/index.html +++ b/website/index.html @@ -94,7 +94,7 @@
- Upcoming Meetups: Mountain View on August 13, Moscow on September 5, Shenzhen on October 20 and Shanghai on October 27
+ Upcoming Meetups: Moscow on September 5, Hong Kong on October 17, Shenzhen on October 20 and Shanghai on October 27