diff --git a/contrib/cppkafka b/contrib/cppkafka index 860c90e92ee..9b184d881c1 160000 --- a/contrib/cppkafka +++ b/contrib/cppkafka @@ -1 +1 @@ -Subproject commit 860c90e92eee6690aa74a2ca7b7c5c6930dffecd +Subproject commit 9b184d881c15cc50784b28688c7c99d3d764db24 diff --git a/contrib/librdkafka b/contrib/librdkafka index 363dcad5a23..51ae5f5fd8b 160000 --- a/contrib/librdkafka +++ b/contrib/librdkafka @@ -1 +1 @@ -Subproject commit 363dcad5a23dc29381cc626620e68ae418b3af19 +Subproject commit 51ae5f5fd8b742e56f47a8bb0136344868818285 diff --git a/dbms/programs/server/CMakeLists.txt b/dbms/programs/server/CMakeLists.txt index 217447413d5..5cb08018065 100644 --- a/dbms/programs/server/CMakeLists.txt +++ b/dbms/programs/server/CMakeLists.txt @@ -10,7 +10,7 @@ set(CLICKHOUSE_SERVER_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/TCPHandler.cpp ) -set(CLICKHOUSE_SERVER_LINK PRIVATE clickhouse_dictionaries clickhouse_common_io daemon clickhouse_storages_system clickhouse_functions clickhouse_aggregate_functions clickhouse_table_functions ${Poco_Net_LIBRARY}) +set(CLICKHOUSE_SERVER_LINK PRIVATE clickhouse_dictionaries clickhouse_common_io PUBLIC daemon PRIVATE clickhouse_storages_system clickhouse_functions clickhouse_aggregate_functions clickhouse_table_functions ${Poco_Net_LIBRARY}) if (USE_POCO_NETSSL) set(CLICKHOUSE_SERVER_LINK ${CLICKHOUSE_SERVER_LINK} PRIVATE ${Poco_NetSSL_LIBRARY} ${Poco_Crypto_LIBRARY}) endif () diff --git a/dbms/src/AggregateFunctions/QuantileTDigest.h b/dbms/src/AggregateFunctions/QuantileTDigest.h index ca7d4f2fb1a..c4ee76b6eed 100644 --- a/dbms/src/AggregateFunctions/QuantileTDigest.h +++ b/dbms/src/AggregateFunctions/QuantileTDigest.h @@ -85,7 +85,7 @@ class QuantileTDigest Params params; /// The memory will be allocated to several elements at once, so that the state occupies 64 bytes. 
- static constexpr size_t bytes_in_arena = 64 - sizeof(PODArray) - sizeof(Count) - sizeof(UInt32); + static constexpr size_t bytes_in_arena = 128 - sizeof(PODArray) - sizeof(Count) - sizeof(UInt32); using Summary = PODArray, bytes_in_arena>>; diff --git a/dbms/src/Columns/ColumnAggregateFunction.cpp b/dbms/src/Columns/ColumnAggregateFunction.cpp index 69bcdac2ab7..4652e4a08c8 100644 --- a/dbms/src/Columns/ColumnAggregateFunction.cpp +++ b/dbms/src/Columns/ColumnAggregateFunction.cpp @@ -255,6 +255,11 @@ size_t ColumnAggregateFunction::allocatedBytes() const return res; } +void ColumnAggregateFunction::protect() +{ + data.protect(); +} + MutableColumnPtr ColumnAggregateFunction::cloneEmpty() const { return create(func, Arenas(1, std::make_shared())); diff --git a/dbms/src/Columns/ColumnAggregateFunction.h b/dbms/src/Columns/ColumnAggregateFunction.h index 3fc76b4c047..a028a95d68c 100644 --- a/dbms/src/Columns/ColumnAggregateFunction.h +++ b/dbms/src/Columns/ColumnAggregateFunction.h @@ -157,6 +157,8 @@ public: size_t allocatedBytes() const override; + void protect() override; + void insertRangeFrom(const IColumn & from, size_t start, size_t length) override; void popBack(size_t n) override; diff --git a/dbms/src/Columns/ColumnArray.cpp b/dbms/src/Columns/ColumnArray.cpp index 4ceda666db7..eeb06b64f49 100644 --- a/dbms/src/Columns/ColumnArray.cpp +++ b/dbms/src/Columns/ColumnArray.cpp @@ -311,6 +311,13 @@ size_t ColumnArray::allocatedBytes() const } +void ColumnArray::protect() +{ + getData().protect(); + getOffsets().protect(); +} + + bool ColumnArray::hasEqualOffsets(const ColumnArray & other) const { if (offsets == other.offsets) diff --git a/dbms/src/Columns/ColumnArray.h b/dbms/src/Columns/ColumnArray.h index 3e1b586e755..d58dfba025a 100644 --- a/dbms/src/Columns/ColumnArray.h +++ b/dbms/src/Columns/ColumnArray.h @@ -78,6 +78,7 @@ public: void reserve(size_t n) override; size_t byteSize() const override; size_t allocatedBytes() const override; + void protect() 
override; ColumnPtr replicate(const Offsets & replicate_offsets) const override; ColumnPtr convertToFullColumnIfConst() const override; void getExtremes(Field & min, Field & max) const override; diff --git a/dbms/src/Columns/ColumnDecimal.h b/dbms/src/Columns/ColumnDecimal.h index 50a6d9d67fb..372b0c245c0 100644 --- a/dbms/src/Columns/ColumnDecimal.h +++ b/dbms/src/Columns/ColumnDecimal.h @@ -87,6 +87,7 @@ public: size_t size() const override { return data.size(); } size_t byteSize() const override { return data.size() * sizeof(data[0]); } size_t allocatedBytes() const override { return data.allocated_bytes(); } + void protect() override { data.protect(); } void reserve(size_t n) override { data.reserve(n); } void insertFrom(const IColumn & src, size_t n) override { data.push_back(static_cast(src).getData()[n]); } diff --git a/dbms/src/Columns/ColumnFixedString.h b/dbms/src/Columns/ColumnFixedString.h index 941314b8888..b773d7c8eb4 100644 --- a/dbms/src/Columns/ColumnFixedString.h +++ b/dbms/src/Columns/ColumnFixedString.h @@ -57,6 +57,11 @@ public: return chars.allocated_bytes() + sizeof(n); } + void protect() override + { + chars.protect(); + } + Field operator[](size_t index) const override { return String(reinterpret_cast(&chars[n * index]), n); diff --git a/dbms/src/Columns/ColumnLowCardinality.cpp b/dbms/src/Columns/ColumnLowCardinality.cpp index c919116112c..c9a475fd8a6 100644 --- a/dbms/src/Columns/ColumnLowCardinality.cpp +++ b/dbms/src/Columns/ColumnLowCardinality.cpp @@ -363,7 +363,6 @@ ColumnPtr ColumnLowCardinality::countKeys() const } - ColumnLowCardinality::Index::Index() : positions(ColumnUInt8::create()), size_of_type(sizeof(UInt8)) {} ColumnLowCardinality::Index::Index(MutableColumnPtr && positions) : positions(std::move(positions)) diff --git a/dbms/src/Columns/ColumnNullable.cpp b/dbms/src/Columns/ColumnNullable.cpp index b88cf60581b..d9a8ea4f825 100644 --- a/dbms/src/Columns/ColumnNullable.cpp +++ b/dbms/src/Columns/ColumnNullable.cpp @@ -291,6 
+291,12 @@ size_t ColumnNullable::allocatedBytes() const return getNestedColumn().allocatedBytes() + getNullMapColumn().allocatedBytes(); } +void ColumnNullable::protect() +{ + getNestedColumn().protect(); + getNullMapColumn().protect(); +} + namespace { diff --git a/dbms/src/Columns/ColumnNullable.h b/dbms/src/Columns/ColumnNullable.h index c8453a29689..8012d03b0e8 100644 --- a/dbms/src/Columns/ColumnNullable.h +++ b/dbms/src/Columns/ColumnNullable.h @@ -71,6 +71,7 @@ public: void reserve(size_t n) override; size_t byteSize() const override; size_t allocatedBytes() const override; + void protect() override; ColumnPtr replicate(const Offsets & replicate_offsets) const override; void updateHashWithValue(size_t n, SipHash & hash) const override; void getExtremes(Field & min, Field & max) const override; diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp index 1717c02f1df..1443283783a 100644 --- a/dbms/src/Columns/ColumnString.cpp +++ b/dbms/src/Columns/ColumnString.cpp @@ -412,4 +412,11 @@ void ColumnString::getPermutationWithCollation(const Collator & collator, bool r } } + +void ColumnString::protect() +{ + getChars().protect(); + getOffsets().protect(); +} + } diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h index 5ca05079bd5..a30a4ceb5a1 100644 --- a/dbms/src/Columns/ColumnString.h +++ b/dbms/src/Columns/ColumnString.h @@ -68,6 +68,8 @@ public: return chars.allocated_bytes() + offsets.allocated_bytes(); } + void protect() override; + MutableColumnPtr cloneResized(size_t to_size) const override; Field operator[](size_t n) const override diff --git a/dbms/src/Columns/ColumnTuple.cpp b/dbms/src/Columns/ColumnTuple.cpp index c235cd07c31..ec0bcc1f5b5 100644 --- a/dbms/src/Columns/ColumnTuple.cpp +++ b/dbms/src/Columns/ColumnTuple.cpp @@ -315,6 +315,12 @@ size_t ColumnTuple::allocatedBytes() const return res; } +void ColumnTuple::protect() +{ + for (auto & column : columns) + 
column->assumeMutableRef().protect(); +} + void ColumnTuple::getExtremes(Field & min, Field & max) const { const size_t tuple_size = columns.size(); diff --git a/dbms/src/Columns/ColumnTuple.h b/dbms/src/Columns/ColumnTuple.h index d146c8bff6c..c39a92e3c8c 100644 --- a/dbms/src/Columns/ColumnTuple.h +++ b/dbms/src/Columns/ColumnTuple.h @@ -71,6 +71,7 @@ public: void reserve(size_t n) override; size_t byteSize() const override; size_t allocatedBytes() const override; + void protect() override; void forEachSubcolumn(ColumnCallback callback) override; size_t tupleSize() const { return columns.size(); } diff --git a/dbms/src/Columns/ColumnUnique.h b/dbms/src/Columns/ColumnUnique.h index 85a9c498a94..5eee80dc9d8 100644 --- a/dbms/src/Columns/ColumnUnique.h +++ b/dbms/src/Columns/ColumnUnique.h @@ -80,6 +80,7 @@ public: bool isNumeric() const override { return column_holder->isNumeric(); } size_t byteSize() const override { return column_holder->byteSize(); } + void protect() override { column_holder->assumeMutableRef().protect(); } size_t allocatedBytes() const override { return column_holder->allocatedBytes() diff --git a/dbms/src/Columns/ColumnVector.h b/dbms/src/Columns/ColumnVector.h index 1c5a45ef6ad..9de84f95b4a 100644 --- a/dbms/src/Columns/ColumnVector.h +++ b/dbms/src/Columns/ColumnVector.h @@ -163,6 +163,11 @@ public: return data.allocated_bytes(); } + void protect() override + { + data.protect(); + } + void insertValue(const T value) { data.push_back(value); diff --git a/dbms/src/Columns/ColumnVectorHelper.h b/dbms/src/Columns/ColumnVectorHelper.h index 8a25812ffe7..d805f44218c 100644 --- a/dbms/src/Columns/ColumnVectorHelper.h +++ b/dbms/src/Columns/ColumnVectorHelper.h @@ -24,9 +24,10 @@ namespace DB class ColumnVectorHelper : public IColumn { public: + template const char * getRawDataBegin() const { - return *reinterpret_cast(reinterpret_cast(this) + sizeof(*this)); + return reinterpret_cast, 15, 16> *>(reinterpret_cast(this) + sizeof(*this))->raw_data(); 
} template diff --git a/dbms/src/Columns/IColumn.h b/dbms/src/Columns/IColumn.h index 2560b9639ad..86a1097d368 100644 --- a/dbms/src/Columns/IColumn.h +++ b/dbms/src/Columns/IColumn.h @@ -253,6 +253,10 @@ public: /// Zero, if could be determined. virtual size_t allocatedBytes() const = 0; + /// Make memory region readonly with mprotect if it is large enough. + /// The operation is slow and performed only for debug builds. + virtual void protect() {} + /// If the column contains subcolumns (such as Array, Nullable, etc), do callback on them. /// Shallow: doesn't do recursive calls; don't do call for itself. using ColumnCallback = std::function; diff --git a/dbms/src/Common/Allocator.cpp b/dbms/src/Common/Allocator.cpp index ba0c7820187..92ff10eafb7 100644 --- a/dbms/src/Common/Allocator.cpp +++ b/dbms/src/Common/Allocator.cpp @@ -43,11 +43,30 @@ namespace ErrorCodes * * PS. This is also required, because tcmalloc can not allocate a chunk of memory greater than 16 GB. */ -static constexpr size_t MMAP_THRESHOLD = 64 * (1ULL << 20); +#ifdef NDEBUG + static constexpr size_t MMAP_THRESHOLD = 64 * (1ULL << 20); +#else + /// In debug build, use small mmap threshold to reproduce more memory stomping bugs. + /// Along with ASLR it will hopefully detect more issues than ASan. + /// The program may fail due to the limit on number of memory mappings. 
+ static constexpr size_t MMAP_THRESHOLD = 4096; +#endif + static constexpr size_t MMAP_MIN_ALIGNMENT = 4096; static constexpr size_t MALLOC_MIN_ALIGNMENT = 8; +template +void * Allocator::mmap_hint() +{ +#if ALLOCATOR_ASLR + return reinterpret_cast(std::uniform_int_distribution(0x100000000000UL, 0x700000000000UL)(rng)); +#else + return nullptr; +#endif +} + + template void * Allocator::alloc(size_t size, size_t alignment) { @@ -61,7 +80,7 @@ void * Allocator::alloc(size_t size, size_t alignment) throw DB::Exception("Too large alignment " + formatReadableSizeWithBinarySuffix(alignment) + ": more than page size when allocating " + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::BAD_ARGUMENTS); - buf = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + buf = mmap(mmap_hint(), size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (MAP_FAILED == buf) DB::throwFromErrno("Allocator: Cannot mmap " + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY); diff --git a/dbms/src/Common/Allocator.h b/dbms/src/Common/Allocator.h index 9a2ab0b975c..d2a81f77b62 100644 --- a/dbms/src/Common/Allocator.h +++ b/dbms/src/Common/Allocator.h @@ -2,6 +2,19 @@ #include +#ifdef NDEBUG + /// If set to 1 - randomize memory mappings manually (address space layout randomization) to reproduce more memory stomping bugs. + /// Note that Linux doesn't do it by default. This may lead to worse TLB performance. + #define ALLOCATOR_ASLR 0 +#else + #define ALLOCATOR_ASLR 1 +#endif + +#if ALLOCATOR_ASLR + #include + #include +#endif + /** Responsible for allocating / freeing memory. Used, for example, in PODArray, Arena. * Also used in hash tables. 
@@ -14,6 +27,12 @@ template class Allocator { +#if ALLOCATOR_ASLR +private: + pcg64 rng{randomSeed()}; +#endif + void * mmap_hint(); + protected: static constexpr bool clear_memory = clear_memory_; diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index d3401427037..f974b2bdaf6 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -419,6 +419,7 @@ namespace ErrorCodes extern const int BAD_DATABASE_FOR_TEMPORARY_TABLE = 442; extern const int NO_COMMON_COLUMNS_WITH_PROTOBUF_SCHEMA = 443; extern const int UNKNOWN_PROTOBUF_FORMAT = 444; + extern const int CANNOT_MPROTECT = 445; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/src/Common/PODArray.h b/dbms/src/Common/PODArray.h index 462842f8236..a7b8b02bb98 100644 --- a/dbms/src/Common/PODArray.h +++ b/dbms/src/Common/PODArray.h @@ -17,10 +17,19 @@ #include #include +#ifndef NDEBUG + #include +#endif + namespace DB { +namespace ErrorCodes +{ + extern const int CANNOT_MPROTECT; +} + inline constexpr size_t integerRoundUp(size_t value, size_t dividend) { return ((value + dividend - 1) / dividend) * dividend; @@ -108,6 +117,8 @@ protected: if (c_start == null) return; + unprotect(); + TAllocator::free(c_start - pad_left, allocated_bytes()); } @@ -120,6 +131,8 @@ protected: return; } + unprotect(); + ptrdiff_t end_diff = c_end - c_start; c_start = reinterpret_cast( @@ -155,6 +168,28 @@ protected: realloc(allocated_bytes() * 2, std::forward(allocator_params)...); } +#ifndef NDEBUG + /// Make memory region readonly with mprotect if it is large enough. + /// The operation is slow and performed only for debug builds. 
+ void protectImpl(int prot) + { + static constexpr size_t PAGE_SIZE = 4096; + + char * left_rounded_up = reinterpret_cast((reinterpret_cast(c_start) - pad_left + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE); + char * right_rounded_down = reinterpret_cast((reinterpret_cast(c_end_of_storage) + pad_right) / PAGE_SIZE * PAGE_SIZE); + + if (right_rounded_down > left_rounded_up) + { + size_t length = right_rounded_down - left_rounded_up; + if (0 != mprotect(left_rounded_up, length, prot)) + throwFromErrno("Cannot mprotect memory region", ErrorCodes::CANNOT_MPROTECT); + } + } + + /// Restore memory protection in destructor or realloc for further reuse by allocator. + bool mprotected = false; +#endif + public: bool empty() const { return c_end == c_start; } size_t size() const { return (c_end - c_start) / ELEMENT_SIZE; } @@ -199,6 +234,23 @@ public: c_end += byte_size(1); } + void protect() + { +#ifndef NDEBUG + protectImpl(PROT_READ); + mprotected = true; +#endif + } + + void unprotect() + { +#ifndef NDEBUG + if (mprotected) + protectImpl(PROT_WRITE); + mprotected = false; +#endif + } + ~PODArrayBase() { dealloc(); @@ -402,6 +454,11 @@ public: void swap(PODArray & rhs) { +#ifndef NDEBUG + this->unprotect(); + rhs.unprotect(); +#endif + /// Swap two PODArray objects, arr1 and arr2, that satisfy the following conditions: /// - The elements of arr1 are stored on stack. /// - The elements of arr2 are stored on heap. 
@@ -450,7 +507,9 @@ public: }; if (!this->isInitialized() && !rhs.isInitialized()) + { return; + } else if (!this->isInitialized() && rhs.isInitialized()) { do_move(rhs, *this); @@ -494,9 +553,13 @@ public: rhs.c_end = rhs.c_start + this->byte_size(lhs_size); } else if (this->isAllocatedFromStack() && !rhs.isAllocatedFromStack()) + { swap_stack_heap(*this, rhs); + } else if (!this->isAllocatedFromStack() && rhs.isAllocatedFromStack()) + { swap_stack_heap(rhs, *this); + } else { std::swap(this->c_start, rhs.c_start); diff --git a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h index bb2f81fc81f..00bd2f4b67e 100644 --- a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h +++ b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h @@ -113,6 +113,7 @@ namespace Graphite struct Pattern { std::shared_ptr regexp; + std::string regexp_str; AggregateFunctionPtr function; Retentions retentions; /// Must be ordered by 'age' descending. enum { TypeUndef, TypeRetention, TypeAggregation, TypeAll } type = TypeAll; /// The type of defined pattern, filled automatically @@ -124,6 +125,7 @@ namespace Graphite struct Params { + String config_name; String path_column_name; String time_column_name; String value_column_name; @@ -215,6 +217,7 @@ private: const Graphite::Pattern undef_pattern = { /// temporary empty pattern for selectPatternForPath nullptr, + "", nullptr, DB::Graphite::Retentions(), undef_pattern.TypeUndef, diff --git a/dbms/src/Functions/FunctionsStringSimilarity.cpp b/dbms/src/Functions/FunctionsStringSimilarity.cpp index 7f0267d6d59..7c77857345a 100644 --- a/dbms/src/Functions/FunctionsStringSimilarity.cpp +++ b/dbms/src/Functions/FunctionsStringSimilarity.cpp @@ -8,164 +8,271 @@ #include +#include + #include +#include #include #include #include +#include #ifdef __SSE4_2__ -#include +# include #endif namespace DB { /** Distance function implementation. 
- * We calculate all the trigrams from left string and count by the index of + * We calculate all the n-grams from left string and count by the index of * 16 bits hash of them in the map. - * Then calculate all the trigrams from the right string and calculate - * the trigram distance on the flight by adding and subtracting from the hashmap. + * Then calculate all the n-grams from the right string and calculate + * the n-gram distance on the flight by adding and subtracting from the hashmap. * Then return the map into the condition of which it was after the left string * calculation. If the right string size is big (more than 2**15 bytes), * the strings are not similar at all and we return 1. */ -struct TrigramDistanceImpl +template +struct NgramDistanceImpl { using ResultType = Float32; - using CodePoint = UInt32; - /// map_size for trigram difference + /// map_size for ngram difference. static constexpr size_t map_size = 1u << 16; - /// If the haystack size is bigger than this, behaviour is unspecified for this function + /// If the haystack size is bigger than this, behaviour is unspecified for this function. static constexpr size_t max_string_size = 1u << 15; + /// Default padding to read safely. + static constexpr size_t default_padding = 16; + + /// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding. + static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1; + /** This fits mostly in L2 cache all the time. * Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed * integer array. 
*/ - using TrigramStats = UInt16[map_size]; + using NgramStats = UInt16[map_size]; - static ALWAYS_INLINE UInt16 trigramHash(CodePoint one, CodePoint two, CodePoint three) + static ALWAYS_INLINE UInt16 ASCIIHash(const CodePoint * code_points) { - UInt64 combined = (static_cast(one) << 32) | two; + return intHashCRC32(unalignedLoad(code_points)) & 0xFFFFu; + } + + static ALWAYS_INLINE UInt16 UTF8Hash(const CodePoint * code_points) + { + UInt64 combined = (static_cast(code_points[0]) << 32) | code_points[1]; #ifdef __SSE4_2__ - return _mm_crc32_u64(three, combined) & 0xFFFFu; + return _mm_crc32_u64(code_points[2], combined) & 0xFFFFu; #else - return (intHashCRC32(combined) ^ intHashCRC32(three)) & 0xFFFFu; + return (intHashCRC32(combined) ^ intHashCRC32(code_points[2])) & 0xFFFFu; #endif } - static ALWAYS_INLINE CodePoint readCodePoint(const char *& pos, const char * end) noexcept + template + static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence &) { - size_t length = UTF8::seqLength(*pos); - - if (pos + length > end) - length = end - pos; - - CodePoint res; - /// This is faster than just memcpy because of compiler optimizations with moving bytes. - switch (length) - { - case 1: - res = 0; - memcpy(&res, pos, 1); - break; - case 2: - res = 0; - memcpy(&res, pos, 2); - break; - case 3: - res = 0; - memcpy(&res, pos, 3); - break; - default: - memcpy(&res, pos, 4); - } - - pos += length; - return res; + ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); } - static inline size_t calculateNeedleStats(const char * data, const size_t size, TrigramStats & trigram_stats) noexcept + static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char *& pos, const char * end) { - size_t len = 0; - const char * start = data; - const char * end = data + size; - CodePoint cp1 = 0; - CodePoint cp2 = 0; - CodePoint cp3 = 0; + /// Offset before which we copy some data. 
+ constexpr size_t padding_offset = default_padding - N + 1; + /// We have an array like this for ASCII (N == 4, other cases are similar) + /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| + /// And we copy ^^^^^^^^^^^^^^^ these bytes to the start + /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction + memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); + /// Now we have an array + /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| + /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + /// Doing unaligned read of 16 bytes and copy them like above + /// 16 is also chosen to do two `movups`. + /// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them. + memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint)); - while (start != end) + if constexpr (CaseInsensitive) { - cp1 = cp2; - cp2 = cp3; - cp3 = readCodePoint(start, end); - ++len; - if (len < 3) - continue; - ++trigram_stats[trigramHash(cp1, cp2, cp3)]; + /// We really need template lambdas with C++20 to do it inline + unrollLowering(code_points, std::make_index_sequence()); } - return std::max(static_cast(0), static_cast(len) - 2); + pos += padding_offset; + if (pos > end) + return default_padding - (pos - end); + return default_padding; } - static inline UInt64 calculateHaystackStatsAndMetric(const char * data, const size_t size, TrigramStats & trigram_stats, size_t & distance) + static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char *& pos, const char * end) { - size_t len = 0; - size_t trigram_cnt = 0; + /// The same copying as described in the function above. 
+ memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); + + size_t num = N - 1; + while (num < default_padding && pos < end) + { + size_t length = UTF8::seqLength(*pos); + + if (pos + length > end) + length = end - pos; + + CodePoint res; + /// This is faster than just memcpy because of compiler optimizations with moving bytes. + switch (length) + { + case 1: + res = 0; + memcpy(&res, pos, 1); + break; + case 2: + res = 0; + memcpy(&res, pos, 2); + break; + case 3: + res = 0; + memcpy(&res, pos, 3); + break; + default: + memcpy(&res, pos, 4); + } + + /// This is not a really true case insensitive utf8. We zero the 5-th bit of every byte. + /// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. For most cyrrilic letters also does. + /// For others, we don't care now. Lowering UTF is not a cheap operation. + if constexpr (CaseInsensitive) + { + switch (length) + { + case 4: + res &= ~(1u << (5 + 3 * CHAR_BIT)); + [[fallthrough]]; + case 3: + res &= ~(1u << (5 + 2 * CHAR_BIT)); + [[fallthrough]]; + case 2: + res &= ~(1u << (5 + CHAR_BIT)); + [[fallthrough]]; + default: + res &= ~(1u << 5); + } + } + + pos += length; + code_points[num++] = res; + } + return num; + } + + static ALWAYS_INLINE inline size_t calculateNeedleStats( + const char * data, + const size_t size, + NgramStats & ngram_stats, + size_t (*read_code_points)(CodePoint *, const char *&, const char *), + UInt16 (*hash_functor)(const CodePoint *)) + { + // To prevent size_t overflow below. + if (size < N) + return 0; + const char * start = data; const char * end = data + size; - CodePoint cp1 = 0; - CodePoint cp2 = 0; - CodePoint cp3 = 0; + CodePoint cp[simultaneously_codepoints_num] = {}; + + /// read_code_points returns the position of cp where it stopped reading codepoints. + size_t found = read_code_points(cp, start, end); + /// We need to start for the first time here, because first N - 1 codepoints mean nothing. 
+ size_t i = N - 1; + /// Initialize with this value because for the first time `found` does not initialize first N - 1 codepoints. + size_t len = -N + 1; + do + { + len += found - N + 1; + for (; i + N <= found; ++i) + ++ngram_stats[hash_functor(cp + i)]; + i = 0; + } while (start < end && (found = read_code_points(cp, start, end))); + + return len; + } + + static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric( + const char * data, + const size_t size, + NgramStats & ngram_stats, + size_t & distance, + size_t (*read_code_points)(CodePoint *, const char *&, const char *), + UInt16 (*hash_functor)(const CodePoint *)) + { + size_t ngram_cnt = 0; + const char * start = data; + const char * end = data + size; + CodePoint cp[simultaneously_codepoints_num] = {}; /// allocation tricks, most strings are relatively small static constexpr size_t small_buffer_size = 256; std::unique_ptr big_buffer; UInt16 small_buffer[small_buffer_size]; - UInt16 * trigram_storage = small_buffer; + UInt16 * ngram_storage = small_buffer; if (size > small_buffer_size) { - trigram_storage = new UInt16[size]; - big_buffer.reset(trigram_storage); + ngram_storage = new UInt16[size]; + big_buffer.reset(ngram_storage); } - while (start != end) + /// read_code_points returns the position of cp where it stopped reading codepoints. + size_t found = read_code_points(cp, start, end); + /// We need to start for the first time here, because first N - 1 codepoints mean nothing. 
+ size_t iter = N - 1; + + do { - cp1 = cp2; - cp2 = cp3; - cp3 = readCodePoint(start, end); - ++len; - if (len < 3) - continue; + for (; iter + N <= found; ++iter) + { + UInt16 hash = hash_functor(cp + iter); + if (static_cast(ngram_stats[hash]) > 0) + --distance; + else + ++distance; - UInt16 hash = trigramHash(cp1, cp2, cp3); - - if (static_cast(trigram_stats[hash]) > 0) - --distance; - else - ++distance; - - trigram_storage[trigram_cnt++] = hash; - --trigram_stats[hash]; - } + ngram_storage[ngram_cnt++] = hash; + --ngram_stats[hash]; + } + iter = 0; + } while (start < end && (found = read_code_points(cp, start, end))); /// Return the state of hash map to its initial. - for (size_t i = 0; i < trigram_cnt; ++i) - ++trigram_stats[trigram_storage[i]]; - - return trigram_cnt; + for (size_t i = 0; i < ngram_cnt; ++i) + ++ngram_stats[ngram_storage[i]]; + return ngram_cnt; } - static void constant_constant(const std::string & data, const std::string & needle, Float32 & res) + template + static inline size_t dispatchSearcher(Callback callback, Args &&... args) { - TrigramStats common_stats; + if constexpr (!UTF8) + return callback(std::forward(args)..., readASCIICodePoints, ASCIIHash); + else + return callback(std::forward(args)..., readUTF8CodePoints, UTF8Hash); + } + + static void constant_constant(std::string data, std::string needle, Float32 & res) + { + NgramStats common_stats; memset(common_stats, 0, sizeof(common_stats)); - size_t second_size = calculateNeedleStats(needle.data(), needle.size(), common_stats); + + /// We use unsafe versions of getting ngrams, so I decided to use padded strings. 
+ const size_t needle_size = needle.size(); + const size_t data_size = data.size(); + needle.resize(needle_size + default_padding); + data.resize(data_size + default_padding); + + size_t second_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats); size_t distance = second_size; - if (data.size() <= max_string_size) + if (data_size <= max_string_size) { - size_t first_size = calculateHaystackStatsAndMetric(data.data(), data.size(), common_stats, distance); + size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance); res = distance * 1.f / std::max(first_size + second_size, size_t(1)); } else @@ -175,11 +282,18 @@ struct TrigramDistanceImpl } static void vector_constant( - const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & needle, PaddedPODArray & res) + const ColumnString::Chars & data, const ColumnString::Offsets & offsets, std::string needle, PaddedPODArray & res) { - TrigramStats common_stats; + /// zeroing our map + NgramStats common_stats; memset(common_stats, 0, sizeof(common_stats)); - const size_t needle_stats_size = calculateNeedleStats(needle.data(), needle.size(), common_stats); + + /// We use unsafe versions of getting ngrams, so I decided to use padded_data even in needle case. 
+ const size_t needle_size = needle.size(); + needle.resize(needle_size + default_padding); + + const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats); + size_t distance = needle_stats_size; size_t prev_offset = 0; for (size_t i = 0; i < offsets.size(); ++i) @@ -188,12 +302,13 @@ struct TrigramDistanceImpl const size_t haystack_size = offsets[i] - prev_offset - 1; if (haystack_size <= max_string_size) { - size_t haystack_stats_size - = calculateHaystackStatsAndMetric(reinterpret_cast(haystack), haystack_size, common_stats, distance); + size_t haystack_stats_size = dispatchSearcher( + calculateHaystackStatsAndMetric, reinterpret_cast(haystack), haystack_size, common_stats, distance); res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1)); } else { + /// if the strings are too big, we say they are completely not the same res[i] = 1.f; } distance = needle_stats_size; @@ -203,16 +318,39 @@ struct TrigramDistanceImpl }; -struct TrigramDistanceName +struct NgramDistanceName { - static constexpr auto name = "trigramDistance"; + static constexpr auto name = "ngramDistance"; }; -using FunctionTrigramsDistance = FunctionsStringSimilarity; +struct NgramDistanceCaseInsensitiveName +{ + static constexpr auto name = "ngramDistanceCaseInsensitive"; +}; + +struct NgramDistanceUTF8Name +{ + static constexpr auto name = "ngramDistanceUTF8"; +}; + +struct NgramDistanceUTF8CaseInsensitiveName +{ + static constexpr auto name = "ngramDistanceCaseInsensitiveUTF8"; +}; + +using FunctionNgramDistance = FunctionsStringSimilarity, NgramDistanceName>; +using FunctionNgramDistanceCaseInsensitive + = FunctionsStringSimilarity, NgramDistanceCaseInsensitiveName>; +using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity, NgramDistanceUTF8Name>; +using FunctionNgramDistanceCaseInsensitiveUTF8 + = FunctionsStringSimilarity, NgramDistanceUTF8CaseInsensitiveName>; void 
registerFunctionsStringSimilarity(FunctionFactory & factory) { - factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); } } diff --git a/dbms/src/Functions/FunctionsStringSimilarity.h b/dbms/src/Functions/FunctionsStringSimilarity.h index 00c90e20569..c23d9be999a 100644 --- a/dbms/src/Functions/FunctionsStringSimilarity.h +++ b/dbms/src/Functions/FunctionsStringSimilarity.h @@ -12,8 +12,9 @@ namespace DB /** Calculate similarity metrics: * - * trigramDistance(haystack, needle) --- calculate so called 3-gram distance between haystack and needle. + * ngramDistance(haystack, needle) --- calculate n-gram distance between haystack and needle. * Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. + * Also support CaseInsensitive and UTF8 formats. */ namespace ErrorCodes diff --git a/dbms/src/Interpreters/AggregationCommon.h b/dbms/src/Interpreters/AggregationCommon.h index 12c2d53819b..74836d4463d 100644 --- a/dbms/src/Interpreters/AggregationCommon.h +++ b/dbms/src/Interpreters/AggregationCommon.h @@ -102,23 +102,23 @@ static inline T ALWAYS_INLINE packFixed( switch (key_sizes[j]) { case 1: - memcpy(bytes + offset, static_cast(column)->getRawDataBegin() + index, 1); + memcpy(bytes + offset, static_cast(column)->getRawDataBegin<1>() + index, 1); offset += 1; break; case 2: - memcpy(bytes + offset, static_cast(column)->getRawDataBegin() + index * 2, 2); + memcpy(bytes + offset, static_cast(column)->getRawDataBegin<2>() + index * 2, 2); offset += 2; break; case 4: - memcpy(bytes + offset, static_cast(column)->getRawDataBegin() + index * 4, 4); + memcpy(bytes + offset, static_cast(column)->getRawDataBegin<4>() + index * 4, 4); offset += 4; break; case 8: - memcpy(bytes + offset, static_cast(column)->getRawDataBegin() + index * 8, 8); + memcpy(bytes + offset, static_cast(column)->getRawDataBegin<8>() + index * 8, 8); offset += 8; break; 
default: - memcpy(bytes + offset, static_cast(column)->getRawDataBegin() + index * key_sizes[j], key_sizes[j]); + memcpy(bytes + offset, static_cast(column)->getRawDataBegin<1>() + index * key_sizes[j], key_sizes[j]); offset += key_sizes[j]; } } @@ -168,23 +168,23 @@ static inline T ALWAYS_INLINE packFixed( switch (key_sizes[j]) { case 1: - memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin() + i, 1); + memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin<1>() + i, 1); offset += 1; break; case 2: - memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin() + i * 2, 2); + memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin<2>() + i * 2, 2); offset += 2; break; case 4: - memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin() + i * 4, 4); + memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin<4>() + i * 4, 4); offset += 4; break; case 8: - memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin() + i * 8, 8); + memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin<8>() + i * 8, 8); offset += 8; break; default: - memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin() + i * key_sizes[j], key_sizes[j]); + memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin<1>() + i * key_sizes[j], key_sizes[j]); offset += key_sizes[j]; } } diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp index bf9c5b3409d..01ff4c4cdac 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp @@ -513,13 +513,16 @@ void MergeTreeDataPart::loadIndex() for (size_t i = 0; i < marks_count; ++i) //-V756 for (size_t j = 0; j < key_size; ++j) - storage.primary_key_data_types[j]->deserializeBinary(*loaded_index[j].get(), index_file); + storage.primary_key_data_types[j]->deserializeBinary(*loaded_index[j], index_file); for (size_t i = 0; i < 
key_size; ++i) + { + loaded_index[i]->protect(); if (loaded_index[i]->size() != marks_count) throw Exception("Cannot read all data from index file " + index_path + "(expected size: " + toString(marks_count) + ", read: " + toString(loaded_index[i]->size()) + ")", ErrorCodes::CANNOT_READ_ALL_DATA); + } if (!index_file.eof()) throw Exception("Index file " + index_path + " is unexpectedly long", ErrorCodes::EXPECTED_END_OF_FILE); diff --git a/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp b/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp index 9091228d80a..89f5aaeafd5 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp @@ -132,6 +132,7 @@ void MergeTreeReaderStream::loadMarks() if (buffer.eof() || buffer.buffer().size() != file_size) throw Exception("Cannot read all marks from file " + mrk_path, ErrorCodes::CANNOT_READ_ALL_DATA); + res->protect(); return res; }; diff --git a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp index 103be508564..a64f376e3de 100644 --- a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -101,7 +101,8 @@ static void appendGraphitePattern( { if (key == "regexp") { - pattern.regexp = std::make_shared(config.getString(config_element + ".regexp")); + pattern.regexp_str = config.getString(config_element + ".regexp"); + pattern.regexp = std::make_shared(pattern.regexp_str); } else if (key == "function") { @@ -165,6 +166,7 @@ static void setGraphitePatternsFromConfig(const Context & context, throw Exception("No '" + config_element + "' element in configuration file", ErrorCodes::NO_ELEMENTS_IN_CONFIG); + params.config_name = config_element; params.path_column_name = config.getString(config_element + ".path_column_name", "Path"); params.time_column_name = config.getString(config_element + ".time_column_name", "Time"); 
params.value_column_name = config.getString(config_element + ".value_column_name", "Value"); diff --git a/dbms/src/Storages/System/StorageSystemGraphite.cpp b/dbms/src/Storages/System/StorageSystemGraphite.cpp index d75eb71841e..fa1b768ac98 100644 --- a/dbms/src/Storages/System/StorageSystemGraphite.cpp +++ b/dbms/src/Storages/System/StorageSystemGraphite.cpp @@ -1,175 +1,139 @@ #include +#include +#include -#include -#include -#include -#include -#include -#include -#include #include -#include - namespace DB { -namespace ErrorCodes -{ - extern const int NO_ELEMENTS_IN_CONFIG; -} - -namespace -{ - -using namespace Poco::Util; - -struct Pattern -{ - struct Retention - { - UInt64 age; - UInt64 precision; - }; - - std::string regexp; - std::string function; - std::vector retentions; - UInt16 priority; - UInt8 is_default; -}; - -static Pattern readOnePattern( - const AbstractConfiguration & config, - const std::string & path) -{ - Pattern pattern; - AbstractConfiguration::Keys keys; - - config.keys(path, keys); - - if (keys.empty()) - throw Exception("Empty pattern in Graphite rollup configuration", ErrorCodes::NO_ELEMENTS_IN_CONFIG); - - for (const auto & key : keys) - { - const String key_path = path + "." 
+ key; - - if (startsWith(key, "regexp")) - { - pattern.regexp = config.getString(key_path); - } - else if (startsWith(key, "function")) - { - pattern.function = config.getString(key_path); - } - else if (startsWith(key, "retention")) - { - pattern.retentions.push_back(Pattern::Retention{0, 0}); - pattern.retentions.back().age = config.getUInt64(key_path + ".age", 0); - pattern.retentions.back().precision = config.getUInt64(key_path + ".precision", 0); - } - } - - return pattern; -} - -static std::vector readPatterns( - const AbstractConfiguration & config, - const std::string & section) -{ - AbstractConfiguration::Keys keys; - std::vector result; - size_t count = 0; - - config.keys(section, keys); - - for (const auto & key : keys) - { - if (startsWith(key, "pattern")) - { - Pattern pattern(readOnePattern(config, section + "." + key)); - pattern.is_default = false; - pattern.priority = ++count; - result.push_back(pattern); - } - else if (startsWith(key, "default")) - { - Pattern pattern(readOnePattern(config, section + "." 
+ key)); - pattern.is_default = true; - pattern.priority = std::numeric_limits::max(); - result.push_back(pattern); - } - } - - return result; -} - -static Strings getAllGraphiteSections(const AbstractConfiguration & config) -{ - Strings result; - - AbstractConfiguration::Keys keys; - config.keys(keys); - - for (const auto & key : keys) - { - if (startsWith(key, "graphite_")) - result.push_back(key); - } - - return result; -} - -} // namespace - NamesAndTypesList StorageSystemGraphite::getNamesAndTypes() { return { - {"config_name", std::make_shared()}, - {"regexp", std::make_shared()}, - {"function", std::make_shared()}, - {"age", std::make_shared()}, - {"precision", std::make_shared()}, - {"priority", std::make_shared()}, - {"is_default", std::make_shared()}, + {"config_name", std::make_shared()}, + {"regexp", std::make_shared()}, + {"function", std::make_shared()}, + {"age", std::make_shared()}, + {"precision", std::make_shared()}, + {"priority", std::make_shared()}, + {"is_default", std::make_shared()}, + {"Tables.database", std::make_shared(std::make_shared())}, + {"Tables.table", std::make_shared(std::make_shared())}, }; } +/* + * Looking for (Replicated)*GraphiteMergeTree and get all configuration parameters for them + */ +StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context & context) const +{ + const Databases databases = context.getDatabases(); + Configs graphite_configs; + + for (const auto & db : databases) + { + for (auto iterator = db.second->getIterator(context); iterator->isValid(); iterator->next()) + { + auto & table = iterator->table(); + const MergeTreeData * table_data = nullptr; + + if (const StorageMergeTree * merge_tree = dynamic_cast(table.get())) + { + table_data = &merge_tree->getData(); + } + else if (const StorageReplicatedMergeTree * replicated_merge_tree = dynamic_cast(table.get())) + { + table_data = &replicated_merge_tree->getData(); + } + else + { + continue; + } + + if (table_data->merging_params.mode == 
MergeTreeData::MergingParams::Graphite) + { + const String & config_name = table_data->merging_params.graphite_params.config_name; + + if (!graphite_configs.count(config_name)) + { + Config new_config = + { + table_data->merging_params.graphite_params, + { table_data->getDatabaseName() }, + { table_data->getTableName() }, + }; + graphite_configs.emplace(config_name, new_config); + } + else + { + graphite_configs[config_name].databases.emplace_back(table_data->getDatabaseName()); + graphite_configs[config_name].tables.emplace_back(table_data->getTableName()); + } + } + } + } + + return graphite_configs; +} + void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo &) const { - const auto & config = context.getConfigRef(); + Configs graphite_configs = StorageSystemGraphite::getConfigs(context); - Strings sections = getAllGraphiteSections(config); - for (const auto & section : sections) + for (const auto & config : graphite_configs) { - const auto patterns = readPatterns(config, section); - for (const auto & pattern : patterns) + UInt16 priority = 0; + for (const auto & pattern : config.second.graphite_params.patterns) { + bool is_default = pattern.regexp == nullptr; + String regexp; + String function; + + if (is_default) + { + priority = std::numeric_limits::max(); + } + else + { + priority++; + regexp = pattern.regexp_str; + } + + if (pattern.function) + { + function = pattern.function->getName(); + } + if (!pattern.retentions.empty()) { - for (const auto & ret : pattern.retentions) + for (const auto & retention : pattern.retentions) { - res_columns[0]->insert(section); - res_columns[1]->insert(pattern.regexp); - res_columns[2]->insert(pattern.function); - res_columns[3]->insert(ret.age); - res_columns[4]->insert(ret.precision); - res_columns[5]->insert(pattern.priority); - res_columns[6]->insert(pattern.is_default); + size_t i = 0; + res_columns[i++]->insert(config.first); + res_columns[i++]->insert(regexp); 
+ res_columns[i++]->insert(function); + res_columns[i++]->insert(retention.age); + res_columns[i++]->insert(retention.precision); + res_columns[i++]->insert(priority); + res_columns[i++]->insert(is_default); + res_columns[i++]->insert(config.second.databases); + res_columns[i++]->insert(config.second.tables); } } else { - res_columns[0]->insert(section); - res_columns[1]->insert(pattern.regexp); - res_columns[2]->insert(pattern.function); - res_columns[3]->insert(0); - res_columns[4]->insert(0); - res_columns[5]->insert(pattern.priority); - res_columns[6]->insert(pattern.is_default); + size_t i = 0; + res_columns[i++]->insert(config.first); + res_columns[i++]->insert(regexp); + res_columns[i++]->insert(function); + res_columns[i++]->insert(0); /* age: pattern has no retention rules, emit integer zero (NULL is a pointer constant, not a number) */ + res_columns[i++]->insert(0); /* precision: likewise zero when there are no retention rules */ + res_columns[i++]->insert(priority); + res_columns[i++]->insert(is_default); + res_columns[i++]->insert(config.second.databases); + res_columns[i++]->insert(config.second.tables); } } } diff --git a/dbms/src/Storages/System/StorageSystemGraphite.h b/dbms/src/Storages/System/StorageSystemGraphite.h index fa63c839857..b874e294782 100644 --- a/dbms/src/Storages/System/StorageSystemGraphite.h +++ b/dbms/src/Storages/System/StorageSystemGraphite.h @@ -1,7 +1,10 @@ #pragma once +#include #include +#include #include +#include #include namespace DB @@ -15,10 +18,21 @@ public: static NamesAndTypesList getNamesAndTypes(); + struct Config + { + Graphite::Params graphite_params; + Array databases; + Array tables; + }; + + using Configs = std::map; + + protected: using IStorageSystemOneBlock::IStorageSystemOneBlock; void fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo & query_info) const override; + StorageSystemGraphite::Configs getConfigs(const Context & context) const; }; } diff --git a/dbms/tests/integration/test_graphite_merge_tree/test.py b/dbms/tests/integration/test_graphite_merge_tree/test.py index 8e98c97e077..509fbac97d0 100644 ---
a/dbms/tests/integration/test_graphite_merge_tree/test.py +++ b/dbms/tests/integration/test_graphite_merge_tree/test.py @@ -231,6 +231,50 @@ SELECT * FROM test.graphite; assert TSV(result) == TSV(expected) +def test_system_graphite_retentions(graphite_table): + expected = ''' +graphite_rollup \\\\.count$ sum 0 0 1 0 ['test'] ['graphite'] +graphite_rollup \\\\.max$ max 0 0 2 0 ['test'] ['graphite'] +graphite_rollup ^five_min\\\\. 31536000 14400 3 0 ['test'] ['graphite'] +graphite_rollup ^five_min\\\\. 5184000 3600 3 0 ['test'] ['graphite'] +graphite_rollup ^five_min\\\\. 0 300 3 0 ['test'] ['graphite'] +graphite_rollup ^one_min avg 31536000 600 4 0 ['test'] ['graphite'] +graphite_rollup ^one_min avg 7776000 300 4 0 ['test'] ['graphite'] +graphite_rollup ^one_min avg 0 60 4 0 ['test'] ['graphite'] + ''' + result = q('SELECT * from system.graphite_retentions') + + assert TSV(result) == TSV(expected) + + q(''' +DROP TABLE IF EXISTS test.graphite2; +CREATE TABLE test.graphite2 + (metric String, value Float64, timestamp UInt32, date Date, updated UInt32) + ENGINE = GraphiteMergeTree('graphite_rollup') + PARTITION BY toYYYYMM(date) + ORDER BY (metric, timestamp) + SETTINGS index_granularity=8192; + ''') + expected = ''' +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] + ''' + result = q(''' + SELECT + config_name, + Tables.database, + Tables.table + FROM system.graphite_retentions + ''') + assert TSV(result) == TSV(expected) + + def test_path_dangling_pointer(graphite_table): q(''' DROP TABLE IF EXISTS test.graphite2; diff --git 
a/dbms/tests/performance/string_search/ngram_distance.xml b/dbms/tests/performance/string_search/ngram_distance.xml new file mode 100644 index 00000000000..16960811067 --- /dev/null +++ b/dbms/tests/performance/string_search/ngram_distance.xml @@ -0,0 +1,42 @@ + + Distance search performance test + + + search + + + + hits_100m_single + + + loop + + + + 5 + 10000 + + + 50 + 60000 + + + + SELECT DISTINCT Title, ngramDistance(Title, 'what is love') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistance(Title, 'baby dont hurt me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistance(Title, 'no more') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'wHAt Is lovE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'BABY DonT hUrT me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'nO MOrE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'чем занимаешься') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + + SELECT DISTINCT Title, ngramDistanceCaseInsensitiveUTF8(Title, 'Метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceCaseInsensitiveUTF8(URL, 'как дЕлА') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceCaseInsensitiveUTF8(URL, 'Чем зАнимаешЬся') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 
+ + + + + diff --git a/dbms/tests/queries/0_stateless/00909_ngram_distance.reference b/dbms/tests/queries/0_stateless/00909_ngram_distance.reference new file mode 100644 index 00000000000..356cc5db466 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00909_ngram_distance.reference @@ -0,0 +1,509 @@ +0 +0 +0 +0 +0 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +0 +0 +0 +0 +0 +77 +77 +77 +77 +77 +636 +636 +636 +636 +636 +1000 +1000 +1000 +1000 +1000 +0 +1000 +1000 +0 +77 +636 +1000 +привет как дела?... Херсон 297 +пап привет как дела - Яндекс.Видео 422 +привет как дела клип - Яндекс.Видео 435 +привет братан как дела - Яндекс.Видео 500 +привет 529 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +привет как дела?... Херсон 459 +пап привет как дела - Яндекс.Видео 511 +привет 529 +привет как дела клип - Яндекс.Видео 565 +привет братан как дела - Яндекс.Видео 583 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +http://metrika.ru/ 524 +http://metric.ru/ 700 +http://metris.ru/ 700 +http://autometric.ru/ 750 +http://metrica.yandex.com/ 793 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 600 +http://metrica.yandex.com/ 655 +http://autometric.ru/ 667 +http://metris.ru/ 700 +http://metrika.ru/ 714 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 619 +http://metric.ru/ 700 +http://metris.ru/ 700 +http://autometric.ru/ 750 +http://metrica.yandex.com/ 793 +привет как дела?... 
Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 600 +http://autometric.ru/ 667 +http://metris.ru/ 700 +http://metrika.ru/ 714 +http://metrica.yandex.com/ 724 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrica.yandex.com/ 714 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +0 +0 +0 +0 +0 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +0 +0 +0 +0 +0 +77 +77 +77 +77 +77 +636 +636 +636 +636 +636 +1000 +1000 +1000 +1000 +1000 +0 +1000 +1000 +429 +77 +636 +1000 +привет как дела?... Херсон 297 +пап привет как дела - Яндекс.Видео 422 +привет как дела клип - Яндекс.Видео 435 +привет братан как дела - Яндекс.Видео 500 +привет 529 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +привет как дела?... Херсон 676 +пап привет как дела - Яндекс.Видео 733 +привет как дела клип - Яндекс.Видео 739 +привет братан как дела - Яндекс.Видео 750 +привет 882 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +http://metrika.ru/ 524 +http://metric.ru/ 700 +http://metris.ru/ 700 +http://autometric.ru/ 750 +http://metrica.yandex.com/ 793 +привет как дела?... 
Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 524 +http://metric.ru/ 700 +http://metris.ru/ 700 +http://autometric.ru/ 750 +http://metrica.yandex.com/ 793 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 600 +http://metrica.yandex.com/ 655 +http://autometric.ru/ 667 +http://metris.ru/ 700 +http://metrika.ru/ 714 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 619 +http://metric.ru/ 700 +http://metris.ru/ 700 +http://autometric.ru/ 750 +http://metrica.yandex.com/ 793 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 600 +http://autometric.ru/ 667 +http://metris.ru/ 700 +http://metrika.ru/ 714 +http://metrica.yandex.com/ 724 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrica.yandex.com/ 714 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +привет как дела клип - Яндекс.Видео 182 +пап привет как дела - Яндекс.Видео 354 +привет братан как дела - Яндекс.Видео 382 +привет как дела?... 
Херсон 649 +привет 838 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +111 +111 +111 +111 +111 +429 +429 +429 +429 +429 +1000 +1000 +1000 +1000 +1000 +0 +0 +0 +0 +111 +429 +1000 +привет как дела?... Херсон 254 +пап привет как дела - Яндекс.Видео 398 +привет как дела клип - Яндекс.Видео 412 +привет братан как дела - Яндекс.Видео 461 +привет 471 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +привет как дела?... Херсон 343 +пап привет как дела - Яндекс.Видео 446 +привет 471 +привет как дела клип - Яндекс.Видео 482 +привет братан как дела - Яндекс.Видео 506 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +http://metrika.ru/ 579 +http://metric.ru/ 778 +http://metris.ru/ 778 +http://autometric.ru/ 818 +http://metrica.yandex.com/ 852 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 667 +http://metrica.yandex.com/ 704 +http://autometric.ru/ 727 +http://metris.ru/ 778 +http://metrika.ru/ 789 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 684 +http://metric.ru/ 778 +http://metris.ru/ 778 +http://autometric.ru/ 818 +http://metrica.yandex.com/ 852 +привет как дела?... 
Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 667 +http://autometric.ru/ 727 +http://metrica.yandex.com/ 778 +http://metris.ru/ 778 +http://metrika.ru/ 789 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrica.yandex.com/ 769 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +111 +111 +111 +111 +111 +600 +600 +600 +600 +600 +1000 +1000 +1000 +1000 +1000 +0 +0 +0 +0 +111 +600 +1000 +привет как дела?... Херсон 910 +пап привет как дела - Яндекс.Видео 928 +привет как дела клип - Яндекс.Видео 929 +привет братан как дела - Яндекс.Видео 955 +привет 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +привет как дела?... Херсон 672 +пап привет как дела - Яндекс.Видео 735 +привет как дела клип - Яндекс.Видео 741 +привет братан как дела - Яндекс.Видео 753 +привет 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +http://metrika.ru/ 579 +http://metric.ru/ 778 +http://metris.ru/ 778 +http://autometric.ru/ 818 +http://metrica.yandex.com/ 852 +привет как дела?... 
Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 579 +http://metric.ru/ 778 +http://metris.ru/ 778 +http://autometric.ru/ 818 +http://metrica.yandex.com/ 852 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 667 +http://metrica.yandex.com/ 704 +http://autometric.ru/ 727 +http://metris.ru/ 778 +http://metrika.ru/ 789 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 684 +http://metric.ru/ 778 +http://metris.ru/ 778 +http://autometric.ru/ 818 +http://metrica.yandex.com/ 852 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 667 +http://autometric.ru/ 727 +http://metrica.yandex.com/ 778 +http://metris.ru/ 778 +http://metrika.ru/ 789 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrica.yandex.com/ 769 +привет как дела?... 
Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 diff --git a/dbms/tests/queries/0_stateless/00909_ngram_distance.sql b/dbms/tests/queries/0_stateless/00909_ngram_distance.sql new file mode 100644 index 00000000000..867e69f4fe7 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00909_ngram_distance.sql @@ -0,0 +1,106 @@ +select round(1000 * ngramDistanceUTF8(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абв'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize(''), 'абв')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёжз')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёж')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5; + +select round(1000 * ngramDistanceUTF8('', '')); +select round(1000 * ngramDistanceUTF8('абв', '')); +select round(1000 * ngramDistanceUTF8('', 'абв')); +select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'абвгдеёжз')); +select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'абвгдеёж')); +select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'гдеёзд')); +select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'ёёёёёёёё')); + +drop table if exists test.test_distance; +create table test.test_distance (Title String) engine = Memory; +insert into test.test_distance values ('привет как дела?... 
Херсон'), ('привет как дела клип - Яндекс.Видео'), ('привет'), ('пап привет как дела - Яндекс.Видео'), ('привет братан как дела - Яндекс.Видео'), ('http://metric.ru/'), ('http://autometric.ru/'), ('http://metrica.yandex.com/'), ('http://metris.ru/'), ('http://metrika.ru/'), (''); + +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'привет как дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'как привет дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metrica') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metriks') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metrics') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'yandex') as distance; + + +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абв'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), 'абв')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвГДЕёжз'), 'АбвгдЕёжз')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('аБВГдеёЖз'), 'АбвГдеёж')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'гдеёЗД')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'ЁЁЁЁЁЁЁЁ')) from system.numbers limit 5; + +select round(1000 * 
ngramDistanceCaseInsensitiveUTF8('', '')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('абв', '')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('', 'абв')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвГДЕёжз', 'АбвгдЕЁжз')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('аБВГдеёЖз', 'АбвГдеёж')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвгдеёжз', 'гдеёЗД')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('АБВГДеёжз', 'ЁЁЁЁЁЁЁЁ')); + +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'ПрИвЕт кАК ДЕЛа') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'как ПРИВЕТ дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'Metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'mEtrica') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'metriKS') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'metrics') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'YanDEX') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'приВЕТ КАк ДеЛа КлИп - яндеКс.видео') as distance; + + +select round(1000 * ngramDistance(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abc'), '')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize(''), 'abc')) 
from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefgh')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefg')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), 'defgh')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), 'aaaaaaaa')) from system.numbers limit 5; + +select round(1000 * ngramDistance('', '')); +select round(1000 * ngramDistance('abc', '')); +select round(1000 * ngramDistance('', 'abc')); +select round(1000 * ngramDistance('abcdefgh', 'abcdefgh')); +select round(1000 * ngramDistance('abcdefgh', 'abcdefg')); +select round(1000 * ngramDistance('abcdefgh', 'defgh')); +select round(1000 * ngramDistance('abcdefgh', 'aaaaaaaa')); + +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'привет как дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'как привет дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metrica') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metriks') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metrics') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'yandex') as distance; + +select round(1000 * ngramDistanceCaseInsensitive(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('abc'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize(''), 'abc')) from system.numbers limit 5; +select round(1000 * 
ngramDistanceCaseInsensitive(materialize('abCdefgH'), 'Abcdefgh')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('abcdefgh'), 'abcdeFG')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('AAAAbcdefgh'), 'defgh')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('ABCdefgH'), 'aaaaaaaa')) from system.numbers limit 5; + +select round(1000 * ngramDistanceCaseInsensitive('', '')); +select round(1000 * ngramDistanceCaseInsensitive('abc', '')); +select round(1000 * ngramDistanceCaseInsensitive('', 'abc')); +select round(1000 * ngramDistanceCaseInsensitive('abCdefgH', 'Abcdefgh')); +select round(1000 * ngramDistanceCaseInsensitive('abcdefgh', 'abcdeFG')); +select round(1000 * ngramDistanceCaseInsensitive('AAAAbcdefgh', 'defgh')); +select round(1000 * ngramDistanceCaseInsensitive('ABCdefgH', 'aaaaaaaa')); + +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'ПрИвЕт кАК ДЕЛа') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'как ПРИВЕТ дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'Metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'mEtrica') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'metriKS') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'metrics') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'YanDEX') as 
distance; + +drop table if exists test.test_distance; diff --git a/dbms/tests/queries/0_stateless/00909_trigram_distance.reference b/dbms/tests/queries/0_stateless/00909_trigram_distance.reference deleted file mode 100644 index 14dba2a2dcf..00000000000 --- a/dbms/tests/queries/0_stateless/00909_trigram_distance.reference +++ /dev/null @@ -1,119 +0,0 @@ -0 -0 -0 -0 -0 -1000 -1000 -1000 -1000 -1000 -1000 -1000 -1000 -1000 -1000 -0 -0 -0 -0 -0 -77 -77 -77 -77 -77 -636 -636 -636 -636 -636 -1000 -1000 -1000 -1000 -1000 -0 -1000 -1000 -0 -77 -636 -1000 -привет как дела?... Херсон -пап привет как дела - Яндекс.Видео -привет как дела клип - Яндекс.Видео -привет братан как дела - Яндекс.Видео -привет -http://metric.ru/ -http://autometric.ru/ -http://metrica.yandex.com/ -http://metris.ru/ -http://metrika.ru/ - -привет как дела?... Херсон -пап привет как дела - Яндекс.Видео -привет -привет как дела клип - Яндекс.Видео -привет братан как дела - Яндекс.Видео -http://metric.ru/ -http://autometric.ru/ -http://metrica.yandex.com/ -http://metris.ru/ -http://metrika.ru/ - -http://metrika.ru/ -http://metric.ru/ -http://metris.ru/ -http://autometric.ru/ -http://metrica.yandex.com/ -привет как дела?... Херсон -привет как дела клип - Яндекс.Видео -привет -пап привет как дела - Яндекс.Видео -привет братан как дела - Яндекс.Видео - -http://metric.ru/ -http://metrica.yandex.com/ -http://autometric.ru/ -http://metris.ru/ -http://metrika.ru/ -привет как дела?... Херсон -привет как дела клип - Яндекс.Видео -привет -пап привет как дела - Яндекс.Видео -привет братан как дела - Яндекс.Видео - -http://metrika.ru/ -http://metric.ru/ -http://metris.ru/ -http://autometric.ru/ -http://metrica.yandex.com/ -привет как дела?... Херсон -привет как дела клип - Яндекс.Видео -привет -пап привет как дела - Яндекс.Видео -привет братан как дела - Яндекс.Видео - -http://metric.ru/ -http://autometric.ru/ -http://metris.ru/ -http://metrika.ru/ -http://metrica.yandex.com/ -привет как дела?... 
Херсон -привет как дела клип - Яндекс.Видео -привет -пап привет как дела - Яндекс.Видео -привет братан как дела - Яндекс.Видео - -http://metrica.yandex.com/ -привет как дела?... Херсон -привет как дела клип - Яндекс.Видео -привет -пап привет как дела - Яндекс.Видео -привет братан как дела - Яндекс.Видео -http://metric.ru/ -http://autometric.ru/ -http://metris.ru/ -http://metrika.ru/ - diff --git a/dbms/tests/queries/0_stateless/00909_trigram_distance.sql b/dbms/tests/queries/0_stateless/00909_trigram_distance.sql deleted file mode 100644 index ca6a18d2513..00000000000 --- a/dbms/tests/queries/0_stateless/00909_trigram_distance.sql +++ /dev/null @@ -1,29 +0,0 @@ -select round(1000 * trigramDistance(materialize(''), '')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize('абв'), '')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize(''), 'абв')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'абвгдеёжз')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'абвгдеёж')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5; - -select round(1000 * trigramDistance('', '')); -select round(1000 * trigramDistance('абв', '')); -select round(1000 * trigramDistance('', 'абв')); -select round(1000 * trigramDistance('абвгдеёжз', 'абвгдеёжз')); -select round(1000 * trigramDistance('абвгдеёжз', 'абвгдеёж')); -select round(1000 * trigramDistance('абвгдеёжз', 'гдеёзд')); -select round(1000 * trigramDistance('абвгдеёжз', 'ёёёёёёёё')); - -drop table if exists test.test_distance; -create table test.test_distance (Title String) engine = Memory; -insert into test.test_distance values ('привет как дела?... 
Херсон'), ('привет как дела клип - Яндекс.Видео'), ('привет'), ('пап привет как дела - Яндекс.Видео'), ('привет братан как дела - Яндекс.Видео'), ('http://metric.ru/'), ('http://autometric.ru/'), ('http://metrica.yandex.com/'), ('http://metris.ru/'), ('http://metrika.ru/'), (''); - -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'привет как дела'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'как привет дела'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metrika'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metrica'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metriks'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metrics'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'yandex'); - -drop table if exists test.test_distance; diff --git a/dbms/tests/queries/0_stateless/00915_tuple_orantius.reference b/dbms/tests/queries/0_stateless/00915_tuple_orantius.reference new file mode 100644 index 00000000000..6b303cbce8b --- /dev/null +++ b/dbms/tests/queries/0_stateless/00915_tuple_orantius.reference @@ -0,0 +1 @@ +1 (1,2,3) 1 diff --git a/dbms/tests/queries/0_stateless/00915_tuple_orantius.sql b/dbms/tests/queries/0_stateless/00915_tuple_orantius.sql new file mode 100644 index 00000000000..938260c5123 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00915_tuple_orantius.sql @@ -0,0 +1 @@ +select 1 as x, (1,2,3) as y, x in y; diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index 34b44419cce..c6d90c89cb1 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -85,6 +85,22 @@ Columns: - `name`(`String`) – The name of the function. - `is_aggregate`(`UInt8`) — Whether the function is aggregate. 
+## system.graphite_retentions + +Contains information about the [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup) parameters that are used in tables with [\*GraphiteMergeTree](table_engines/graphitemergetree.md) engines. + +Columns: +- `config_name` (String) - `graphite_rollup` parameter name. +- `regexp` (String) - A pattern for the metric name. +- `function` (String) - The name of the aggregating function. +- `age` (UInt64) - The minimum age of the data in seconds. +- `precision` (UInt64) - How precisely to define the age of the data in seconds. +- `priority` (UInt16) - Pattern priority. +- `is_default` (UInt8) - Whether the pattern is the default one. +- `Tables.database` (Array(String)) - Array of database names of the tables that use the `config_name` parameter. +- `Tables.table` (Array(String)) - Array of names of the tables that use the `config_name` parameter. + + ## system.merges Contains information about merges and part mutations currently in process for tables in the MergeTree family. diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index b3b8b63d136..dce9917776c 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -70,5 +70,13 @@ For other regular expressions, the code is the same as for the 'match' function. The same thing as 'like', but negative. +## ngramDistance(haystack, needle) + +Calculates the 4-gram distance between `haystack` and `needle`: counts the symmetric difference between two multisets of 4-grams and normalizes it by the sum of their cardinalities. Returns a float number from 0 to 1 -- the closer to zero, the more similar the strings are to each other. If the `needle` is more than 32 KB, throws an exception. If some of the `haystack` strings are more than 32 KB, the distance is always one.
+ +For case-insensitive search and/or search in UTF-8 format, use the functions `ngramDistanceCaseInsensitive`, `ngramDistanceUTF8`, `ngramDistanceCaseInsensitiveUTF8`. + +Notes: For the UTF-8 case we use the 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. In the UTF-8 case-insensitive variant we do not use a fair `tolower` function -- we zero the 5th bit (starting from zero) of each codepoint byte -- this works for Latin and for almost all Cyrillic letters. + [Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) diff --git a/docs/ru/operations/system_tables.md b/docs/ru/operations/system_tables.md index 82aec59ec29..7a4e69ca1cd 100644 --- a/docs/ru/operations/system_tables.md +++ b/docs/ru/operations/system_tables.md @@ -83,6 +83,23 @@ default_expression String - выражение для значения по ум - `name` (`String`) – Имя функции. - `is_aggregate` (`UInt8`) – Признак, является ли функция агрегатной. + +## system.graphite_retentions + +Содержит информацию о том, какие параметры [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup) используются в таблицах с движками [\*GraphiteMergeTree](table_engines/graphitemergetree.md). + +Столбцы: +- `config_name` (String) - Имя параметра, используемого для `graphite_rollup`. +- `regexp` (String) - Шаблон имени метрики. +- `function` (String) - Имя агрегирующей функции. +- `age` (UInt64) - Минимальный возраст данных в секундах. +- `precision` (UInt64) - Точность определения возраста данных в секундах. +- `priority` (UInt16) - Приоритет раздела pattern. +- `is_default` (UInt8) - Является ли раздел pattern дефолтным. +- `Tables.database` (Array(String)) - Массив имён баз данных таблиц, использующих параметр `config_name`. +- `Tables.table` (Array(String)) - Массив имён таблиц, использующих параметр `config_name`.
+ + ## system.merges Содержит информацию о производящихся прямо сейчас слияниях и мутациях кусков для таблиц семейства MergeTree. diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md index a79ea043716..4b335cce34c 100644 --- a/docs/ru/query_language/functions/string_search_functions.md +++ b/docs/ru/query_language/functions/string_search_functions.md @@ -9,7 +9,7 @@ Для поиска без учета регистра используйте функцию `positionCaseInsensitive`. ## positionUTF8(haystack, needle) -Так же, как `position`, но позиция возвращается в кодовых точках Unicode. Работает при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Если допущение не выполнено - то возвращает какой-нибудь результат (не кидает исключение). +Так же, как `position`, но позиция возвращается в кодовых точках Unicode. Работает при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Если допущение не выполнено -- то возвращает какой-нибудь результат (не кидает исключение). Для поиска без учета регистра используйте функцию `positionCaseInsensitiveUTF8`. @@ -59,4 +59,12 @@ ## notLike(haystack, pattern), оператор haystack NOT LIKE pattern То же, что like, но с отрицанием. +## ngramDistance(haystack, needle) + +Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя мультимножествами 4-грамм и нормализуется на сумму их мощностей. Возвращает число float от 0 до 1 -- чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строки из `haystack` больше 32КБ, расстояние всегда равно единице. + +Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. + +Примечание: для случая UTF-8 мы используем триграммное расстояние.
Вычисление n-граммного расстояния не совсем честное. Мы используем двухбайтные хэши для хэширования n-грамм, а затем вычисляем симметрическую разность между хэш-таблицами -- могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` -- мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки -- это работает для латиницы и почти для всех кириллических букв. + [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) diff --git a/libs/libcommon/include/common/mremap.h b/libs/libcommon/include/common/mremap.h index f569ff05d4e..31ca74da827 100644 --- a/libs/libcommon/include/common/mremap.h +++ b/libs/libcommon/include/common/mremap.h @@ -12,7 +12,8 @@ #define MREMAP_MAYMOVE 1 -void * mremap(void * old_address, +void * mremap( + void * old_address, size_t old_size, size_t new_size, int flags = 0, @@ -23,7 +24,8 @@ void * mremap(void * old_address, #endif -inline void * clickhouse_mremap(void * old_address, +inline void * clickhouse_mremap( + void * old_address, size_t old_size, size_t new_size, int flags = 0, @@ -32,7 +34,8 @@ inline void * clickhouse_mremap(void * old_address, [[maybe_unused]] int mmap_fd = -1, [[maybe_unused]] off_t mmap_offset = 0) { - return mremap(old_address, + return mremap( + old_address, old_size, new_size, flags