From 7bf7242ad3dea783a689d566e6913701917b4780 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 10 May 2017 00:00:19 -0400 Subject: [PATCH] Fixed translation errors; miscellaneous changes [#CLICKHOUSE-3]. --- .../src/Common/CombinedCardinalityEstimator.h | 2 +- dbms/src/Common/HashTable/Hash.h | 2 +- dbms/src/Common/HashTable/HashTable.h | 2 +- dbms/src/Common/HashTable/SmallTable.h | 2 +- dbms/src/Common/HashTable/TwoLevelHashTable.h | 12 ++--- .../HyperLogLogWithSmallSetOptimization.h | 2 +- dbms/src/Common/Increment.h | 18 +++---- dbms/src/Common/Macros.h | 3 +- dbms/src/Common/MemoryTracker.h | 8 +-- .../Common/OptimizedRegularExpression.inl.h | 34 ++++++------ dbms/src/Common/PODArray.h | 38 +++++++------- dbms/src/Common/PoolBase.h | 37 ++++++++----- dbms/src/Common/RadixSort.h | 19 +++---- dbms/src/Common/ShellCommand.h | 8 +-- dbms/src/Common/SimpleCache.h | 6 +-- dbms/src/Common/SipHash.h | 48 ++++++++--------- dbms/src/Common/StringSearcher.h | 36 ++++++------- dbms/src/Common/Throttler.h | 10 ++-- dbms/src/Common/VirtualColumnUtils.h | 4 +- dbms/src/Common/Volnitsky.h | 52 ++++++++++--------- dbms/src/Common/formatReadable.h | 4 +- dbms/src/Common/getFQDNOrHostName.h | 4 +- dbms/src/Common/typeid_cast.h | 2 +- 23 files changed, 184 insertions(+), 169 deletions(-) diff --git a/dbms/src/Common/CombinedCardinalityEstimator.h b/dbms/src/Common/CombinedCardinalityEstimator.h index 82c2951e44a..94d21064a42 100644 --- a/dbms/src/Common/CombinedCardinalityEstimator.h +++ b/dbms/src/Common/CombinedCardinalityEstimator.h @@ -23,7 +23,7 @@ static inline ContainerType max(const ContainerType & lhs, const ContainerType & } -/** For a small number of keys - an array of fixed size "on the stack." +/** For a small number of keys - an array of fixed size "on the stack". * For the average, HashSet is allocated. * For large, HyperLogLog is allocated. 
*/ diff --git a/dbms/src/Common/HashTable/Hash.h b/dbms/src/Common/HashTable/Hash.h index b2733058cb7..a9517e3e5e1 100644 --- a/dbms/src/Common/HashTable/Hash.h +++ b/dbms/src/Common/HashTable/Hash.h @@ -5,7 +5,7 @@ /** Hash functions that are better than the trivial function std::hash. * - * Example: when aggregated by the visitor ID, the performance increase is more than 5 times. + * Example: when we do aggregation by the visitor ID, the performance increase is more than 5 times. * This is because of following reasons: * - in Yandex, visitor identifier is an integer that has timestamp with seconds resolution in lower bits; * - in typical implementation of standard library, hash function for integers is trivial and just use lower bits; diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index be762733897..a965b8e6e9d 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -695,7 +695,7 @@ public: /** Insert the key, - * return the iterator to a position that can be used for `placement new` of value, + * return an iterator to a position that can be used for `placement new` of value, * as well as the flag - whether a new key was inserted. * * You have to make `placement new` of value if you inserted a new key, diff --git a/dbms/src/Common/HashTable/SmallTable.h b/dbms/src/Common/HashTable/SmallTable.h index 64e90f6208a..823bc93bf5d 100644 --- a/dbms/src/Common/HashTable/SmallTable.h +++ b/dbms/src/Common/HashTable/SmallTable.h @@ -212,7 +212,7 @@ public: /** Insert the key, - * return the iterator to a position that can be used for `placement new` of value, + * return an iterator to a position that can be used for `placement new` of value, * as well as the flag - whether a new key was inserted. 
* * You have to make `placement new` of value if you inserted a new key, diff --git a/dbms/src/Common/HashTable/TwoLevelHashTable.h b/dbms/src/Common/HashTable/TwoLevelHashTable.h index 29bbaca988a..6d4edf49fc7 100644 --- a/dbms/src/Common/HashTable/TwoLevelHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelHashTable.h @@ -9,9 +9,9 @@ * * Usually works a little slower than a simple hash table. * However, it has advantages in some cases: - * - if you need to measure two hash tables together, then you can easily parallelize them by buckets; - * - lag during resizes is spread, since the small hash tables will be resized separately; - * - in theory, the cache resize is local in a larger range of sizes. + * - if you need to merge two hash tables together, then you can easily parallelize it by buckets; + * - delay during resizes is amortized, since the small hash tables will be resized separately; + * - in theory, resizes are cache-local in a larger range of sizes. */ template @@ -52,7 +52,7 @@ public: size_t hash(const Key & x) const { return Hash::operator()(x); } - /// NOTE Bad for hash tables for more than 2^32 cells. + /// NOTE Bad for hash tables with more than 2^32 cells. static size_t getBucketFromHash(size_t hash_value) { return (hash_value >> (32 - BITS_FOR_BUCKET)) & MAX_BUCKET; } protected: @@ -95,7 +95,7 @@ public: { typename Source::const_iterator it = src.begin(); - /// It is assumed that the zero key (stored separately) when iterating is first. + /// It is assumed that the zero key (stored separately) is first in iteration order. if (it != src.end() && it.getPtr()->isZero(src)) { insert(*it); @@ -221,7 +221,7 @@ public: /** Insert the key, - * return the iterator to a position that can be used for `placement new` value, + * return an iterator to a position that can be used for `placement new` of value, * as well as the flag - whether a new key was inserted. 
* * You have to make `placement new` values if you inserted a new key, diff --git a/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h b/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h index b604d82d85b..5296a606121 100644 --- a/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h +++ b/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h @@ -9,7 +9,7 @@ namespace DB { -/** For a small number of keys - an array of fixed size "on the stack." +/** For a small number of keys - an array of fixed size "on the stack". * For large, HyperLogLog is allocated. * See also the more practical implementation in CombinedCardinalityEstimator.h, * where a hash table is also used for medium-sized sets. diff --git a/dbms/src/Common/Increment.h b/dbms/src/Common/Increment.h index fc8820975fa..c03c6ef5575 100644 --- a/dbms/src/Common/Increment.h +++ b/dbms/src/Common/Increment.h @@ -3,8 +3,8 @@ #include -/** Lets you receive an auto-increment number, storing it in a file. - * Designed for rare calls (not designed for performance). +/** Allows to get an auto-increment number, storing it in a file. + * Intended for rare calls (not designed for performance). */ class Increment { @@ -39,13 +39,13 @@ public: return getBunch(0, create_if_need); } - /** Get the next number and increase the count by `count`. - * If the `create_if_need` parameter is not set to true, then - * the file should already have a number written (if not - create the file manually with zero). - * - * To protect against race conditions between different processes, file locks are used. - * (But when the first file is created, the race condition is possible, so it's better to create the file in advance.) - */ + /** Get the next number and increase the counter by `count`. + * If the `create_if_need` parameter is not set to true, then + * the file should already have a number written (if not - create the file manually with zero). 
+ * + * To protect against race conditions between different processes, file locks are used. + * (But when the first file is created, the race condition is possible, so it's better to create the file in advance.) + */ UInt64 getBunch(UInt64 count, bool create_if_need = false) { return static_cast(counter.add(static_cast(count), create_if_need) - count + 1); diff --git a/dbms/src/Common/Macros.h b/dbms/src/Common/Macros.h index dec296fcbfe..0ebf52afd02 100644 --- a/dbms/src/Common/Macros.h +++ b/dbms/src/Common/Macros.h @@ -4,10 +4,11 @@ #include #include + namespace DB { -/** Apply the macros from the config in the line. +/** Apply substitutions from the macros in config to the string. */ class Macros { diff --git a/dbms/src/Common/MemoryTracker.h b/dbms/src/Common/MemoryTracker.h index 7066e20ee17..c06fc33444e 100644 --- a/dbms/src/Common/MemoryTracker.h +++ b/dbms/src/Common/MemoryTracker.h @@ -102,10 +102,10 @@ public: }; -/** The MemoryTracker object is quite difficult to drag to all places where significant amounts of memory are allocated. - * Therefore, a thread-local pointer to used MemoryTracker or nullptr is used, if it does not need to be used. - * This pointer is set when memory consumption is monitored in this thread. - * So, you just need to drag it to all the threads that handle one request. +/** The MemoryTracker object is quite difficult to pass to all places where significant amounts of memory are allocated. + * Therefore, a thread-local pointer to used MemoryTracker is set, or nullptr if MemoryTracker does not need to be used. + * This pointer is set when memory consumption is monitored in current thread. + * So, you just need to pass it to all the threads that handle one request. 
*/ extern __thread MemoryTracker * current_memory_tracker; diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 278d9d3814d..ef6cb781a39 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -8,8 +8,9 @@ #define MIN_LENGTH_FOR_STRSTR 3 #define MAX_SUBPATTERNS 5 -template -void OptimizedRegularExpressionImpl::analyze( + +template +void OptimizedRegularExpressionImpl::analyze( const std::string & regexp, std::string & required_substring, bool & is_trivial, @@ -20,7 +21,8 @@ void OptimizedRegularExpressionImpl::analyze( * a string outside parentheses, * in which all metacharacters are escaped, * and also if there are no '|' outside the brackets, - * and also avoid substrings of the form `http://` or `www`. + * and also avoid substrings of the form `http://` or `www` and some others + * (this is a hack for a typical use case in Yandex.Metrica). */ const char * begin = regexp.data(); const char * pos = begin; @@ -32,9 +34,9 @@ void OptimizedRegularExpressionImpl::analyze( bool has_alternative_on_depth_0 = false; /// Substring with a position. - typedef std::pair Substring; + using Substring = std::pair; + using Substrings = std::vector; - typedef std::vector Substrings; Substrings trivial_substrings(1); Substring * last_substring = &trivial_substrings.back(); @@ -157,7 +159,7 @@ void OptimizedRegularExpressionImpl::analyze( ++pos; break; - /// Quantifiers that allow a zero number. + /// Quantifiers that allow a zero number of occurrences.
case '{': in_curly_braces = true; case '?': case '*': @@ -208,7 +210,7 @@ void OptimizedRegularExpressionImpl::analyze( { if (((it->second == 0 && candidate_it->second != 0) || ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length)) - /// Tuning for the domain + /// Tuning for typical usage domain && (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://"))) && (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http"))) && (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www"))) @@ -241,12 +243,12 @@ void OptimizedRegularExpressionImpl::analyze( } -template -OptimizedRegularExpressionImpl::OptimizedRegularExpressionImpl(const std::string & regexp_, int options) +template +OptimizedRegularExpressionImpl::OptimizedRegularExpressionImpl(const std::string & regexp_, int options) { analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix); - /// 3 options are supported + /// Just three following options are supported if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL))) throw Poco::Exception("OptimizedRegularExpression: Unsupported option."); @@ -280,8 +282,8 @@ OptimizedRegularExpressionImpl::OptimizedRegularExpressionImpl(const std::str } -template -bool OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size) const +template +bool OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size) const { if (is_trivial) { @@ -309,8 +311,8 @@ bool OptimizedRegularExpressionImpl::match(const char * subject, size_t subje } -template -bool OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size, Match & match) const +template +bool OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size, Match & match) const { if (is_trivial) { @@ -357,8 +359,8 @@ bool OptimizedRegularExpressionImpl::match(const char * subject, size_t subje } -template 
-unsigned OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const +template +unsigned OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const { matches.clear(); diff --git a/dbms/src/Common/PODArray.h b/dbms/src/Common/PODArray.h index 0b46fecf5f9..e098756c8ae 100644 --- a/dbms/src/Common/PODArray.h +++ b/dbms/src/Common/PODArray.h @@ -24,7 +24,7 @@ namespace DB * To be more precise - for use in ColumnVector. * It differs from std::vector in that it does not initialize the elements. * - * Made uncopable so that there are no random copies. You can copy the data using `assign` method. + * Made noncopyable so that there are no accidental copies. You can copy the data using `assign` method. * * Only part of the std::vector interface is supported. * @@ -40,20 +40,20 @@ template (c_start); } - T * t_end() { return reinterpret_cast(c_end); } - T * t_end_of_storage() { return reinterpret_cast(c_end_of_storage); } + T * t_start() { return reinterpret_cast(c_start); } + T * t_end() { return reinterpret_cast(c_end); } + T * t_end_of_storage() { return reinterpret_cast(c_end_of_storage); } - const T * t_start() const { return reinterpret_cast(c_start); } - const T * t_end() const { return reinterpret_cast(c_end); } - const T * t_end_of_storage() const { return reinterpret_cast(c_end_of_storage); } + const T * t_start() const { return reinterpret_cast(c_start); } + const T * t_end() const { return reinterpret_cast(c_end); } + const T * t_end_of_storage() const { return reinterpret_cast(c_end_of_storage); } /// The amount of memory occupied by the num_elements of the elements.
static size_t byte_size(size_t num_elements) { return num_elements * sizeof(T); } @@ -173,16 +173,16 @@ public: const T & operator[] (size_t n) const { return t_start()[n]; } T & front() { return t_start()[0]; } - T & back() { return t_end()[-1]; } + T & back() { return t_end()[-1]; } const T & front() const { return t_start()[0]; } const T & back() const { return t_end()[-1]; } - iterator begin() { return t_start(); } - iterator end() { return t_end(); } - const_iterator begin() const { return t_start(); } - const_iterator end() const { return t_end(); } - const_iterator cbegin() const { return t_start(); } - const_iterator cend() const { return t_end(); } + iterator begin() { return t_start(); } + iterator end() { return t_end(); } + const_iterator begin() const { return t_start(); } + const_iterator end() const { return t_end(); } + const_iterator cbegin() const { return t_start(); } + const_iterator cend() const { return t_end(); } void reserve(size_t n) { @@ -209,7 +209,7 @@ public: c_end = c_start + byte_size(n); } - /// Same as resize, but zeros new elements. + /// Same as resize, but zeroes new elements. void resize_fill(size_t n) { size_t old_size = size(); @@ -261,7 +261,7 @@ public: c_end -= byte_size(1); } - /// Do not insert a piece of yourself into the array. Because with the resize, the iterators on themselves can be invalidated. + /// Do not insert into the array a piece of itself. Because with the resize, the iterators on themselves can be invalidated. template void insert(It1 from_begin, It2 from_end) { diff --git a/dbms/src/Common/PoolBase.h b/dbms/src/Common/PoolBase.h index 4b81197ee0a..194d7e421ad 100644 --- a/dbms/src/Common/PoolBase.h +++ b/dbms/src/Common/PoolBase.h @@ -8,8 +8,17 @@ #include #include + +namespace DB +{ + namespace ErrorCodes + { + extern const int LOGICAL_ERROR; + } +} + /** A class from which you can inherit and get a pool of something. Used for database connection pools. 
- * The heir must provide a method for creating a new object to place in the pool. + * Descendant class must provide a method for creating a new object to place in the pool. */ template @@ -63,27 +72,27 @@ public: Entry() {} /// For deferred initialization. /** The `Entry` object protects the resource from being used by another thread. - * The following methods are forbidden for `rvalue`, so you can not write a similar to - * - * auto q = pool.Get()->query("SELECT .."); // Oops, after this line Entry was destroyed - * q.execute (); // Someone else can use this Connection - */ + * The following methods are forbidden for `rvalue`, so you can not write a similar to + * + * auto q = pool.Get()->query("SELECT .."); // Oops, after this line Entry was destroyed + * q.execute (); // Someone else can use this Connection + */ Object * operator->() && = delete; const Object * operator->() const && = delete; Object & operator*() && = delete; const Object & operator*() const && = delete; - Object * operator->() & { return &*data->data.object; } - const Object * operator->() const & { return &*data->data.object; } - Object & operator*() & { return *data->data.object; } - const Object & operator*() const & { return *data->data.object; } + Object * operator->() & { return &*data->data.object; } + const Object * operator->() const & { return &*data->data.object; } + Object & operator*() & { return *data->data.object; } + const Object & operator*() const & { return *data->data.object; } bool isNull() const { return data == nullptr; } PoolBase * getPool() const { if (!data) - throw DB::Exception("attempt to get pool from uninitialized entry"); + throw DB::Exception("Attempt to get pool from uninitialized entry", DB::ErrorCodes::LOGICAL_ERROR); return &data->data.pool; } @@ -95,7 +104,7 @@ public: virtual ~PoolBase() {} - /** Allocates the object for the job. With timeout < 0, the timeout is infinite. */ + /** Allocates the object. Wait for free object in pool for 'timeout'. 
With 'timeout' < 0, the timeout is infinite. */ Entry get(Poco::Timespan::TimeDiff timeout) { std::unique_lock lock(mutex); @@ -137,7 +146,7 @@ private: /** Pool. */ Objects items; - /** Block to access the pool. */ + /** Lock to access the pool. */ std::mutex mutex; std::condition_variable available; @@ -151,7 +160,7 @@ protected: items.reserve(max_items); } - /** Creates a new object to put in the pool. */ + /** Creates a new object to put into the pool. */ virtual ObjectPtr allocObject() = 0; }; diff --git a/dbms/src/Common/RadixSort.h b/dbms/src/Common/RadixSort.h index 47232b1f19d..ee844fa83a8 100644 --- a/dbms/src/Common/RadixSort.h +++ b/dbms/src/Common/RadixSort.h @@ -13,10 +13,10 @@ #include -/** Bitwise sort, has the following functionality: +/** Radix sort, has the following functionality: * Can sort unsigned, signed numbers, and floats. * Can sort an array of fixed length elements that contain something else besides the key. - * Customizable digit size. + * Customizable radix size. * * LSB, stable. * NOTE For some applications it makes sense to add MSB-radix-sort, @@ -49,7 +49,7 @@ struct RadixSortMallocAllocator template struct RadixSortFloatTransform { - /// Is it worth writing the result in memory, or is it better to do it every time again? + /// Is it worth writing the result in memory, or is it better to do calculation every time again? static constexpr bool transform_is_simple = false; static KeyBits forward(KeyBits x) @@ -74,7 +74,7 @@ struct RadixSortFloatTraits /// The type to which the key is transformed to do bit operations. This UInt is the same size as the key. using KeyBits = typename std::conditional::type; - static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, it bits, to do one pass - reshuffle of the array. + static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, in bits, to do one pass - reshuffle of the array. 
/// Converting a key into KeyBits is such that the order relation over the key corresponds to the order relation over KeyBits. using Transform = RadixSortFloatTransform; @@ -95,7 +95,7 @@ struct RadixSortIdentityTransform static constexpr bool transform_is_simple = true; static KeyBits forward(KeyBits x) { return x; } - static KeyBits backward(KeyBits x) { return x; } + static KeyBits backward(KeyBits x) { return x; } }; @@ -105,7 +105,7 @@ struct RadixSortSignedTransform static constexpr bool transform_is_simple = true; static KeyBits forward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); } - static KeyBits backward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); } + static KeyBits backward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); } }; @@ -150,7 +150,7 @@ struct RadixSort private: using Element = typename Traits::Element; using Key = typename Traits::Key; - using CountType = typename Traits::CountType; + using CountType = typename Traits::CountType; using KeyBits = typename Traits::KeyBits; static constexpr size_t HISTOGRAM_SIZE = 1 << Traits::PART_SIZE_BITS; @@ -174,9 +174,9 @@ public: { /// If the array is smaller than 256, then it is better to use another algorithm. - /// There are loops of NUM_PASSES. It is very important that they unfold in compile-time. + /// There are loops of NUM_PASSES. It is very important that they are unfolded at compile-time. - /// For each of the NUM_PASSES bits of the key, consider how many times each value of this piece met. + /// For each of the NUM_PASSES bit ranges of the key, consider how many times each value of this bit range met. CountType histograms[HISTOGRAM_SIZE * NUM_PASSES] = {0}; typename Traits::Allocator allocator; @@ -230,6 +230,7 @@ public: } /// If the number of passes is odd, the result array is in a temporary buffer. Copy it to the place of the original array. 
+ /// NOTE Sometimes it will be more optimal to provide non-destructive interface, that will not modify original array. if (NUM_PASSES % 2) memcpy(arr, swap_buffer, size * sizeof(Element)); diff --git a/dbms/src/Common/ShellCommand.h b/dbms/src/Common/ShellCommand.h index ad2f4fdd0c6..a558216fcbf 100644 --- a/dbms/src/Common/ShellCommand.h +++ b/dbms/src/Common/ShellCommand.h @@ -10,8 +10,8 @@ namespace DB /** Lets you run the command, - * read it stdout, stderr, write to stdin, - * wait for completion. + * read its stdout and stderr; write to stdin; + * wait for completion. * * The implementation is similar to the popen function from POSIX (see libc source code). * @@ -20,8 +20,8 @@ namespace DB * with some overcommit settings, if the address space of the process is more than half the amount of available memory. * Also, changing memory maps - a fairly resource-intensive operation. * - * The second difference - allows to work simultaneously with stdin, and with stdout, and with stderr running process, - * and also find out the code and the completion status. + * The second difference - allows to work simultaneously with stdin, and with stdout, and with stderr of running process, + * and also to obtain the return code and completion status. */ class ShellCommand { diff --git a/dbms/src/Common/SimpleCache.h b/dbms/src/Common/SimpleCache.h index 4620ea7e626..4de92baa9f5 100644 --- a/dbms/src/Common/SimpleCache.h +++ b/dbms/src/Common/SimpleCache.h @@ -7,9 +7,9 @@ /** The simplest cache for a free function. - * You can also pass a static class method or lambda without capturing. - * The size is unlimited. Values are not obsolete. - * To synchronize, use mutex. + * You can also pass a static class method or lambda without captures. + * The size is unlimited. Values are stored permanently and never evicted. + * Mutex is used for synchronization. * Suitable only for the simplest cases.
* * Usage diff --git a/dbms/src/Common/SipHash.h b/dbms/src/Common/SipHash.h index f6f241df9d7..10e5d642ebb 100644 --- a/dbms/src/Common/SipHash.h +++ b/dbms/src/Common/SipHash.h @@ -3,17 +3,17 @@ /** SipHash is a fast cryptographic hash function for short strings. * Taken from here: https://www.131002.net/siphash/ * + * This is SipHash 2-4 variant. + * * Two changes are made: - * - returns 128 bits, not 64; + * - returns also 128 bits, not only 64; * - done streaming (can be calculated in parts). * * On short strings (URL, search phrases) more than 3 times faster than MD5 from OpenSSL. * (~ 700 MB/sec, 15 million strings per second) */ -#include -#include -#include +#include #define ROTL(x,b) static_cast( ((x) << (b)) | ( (x) >> (64 - (b))) ) @@ -30,23 +30,20 @@ class SipHash { private: - using u64 = DB::UInt64; - using u8 = DB::UInt8; - - /// Status. - u64 v0; - u64 v1; - u64 v2; - u64 v3; + /// State. + UInt64 v0; + UInt64 v1; + UInt64 v2; + UInt64 v3; /// How many bytes have been processed. - u64 cnt; + UInt64 cnt; /// The current 8 bytes of input data. union { - u64 current_word; - u8 current_bytes[8]; + UInt64 current_word; + UInt8 current_bytes[8]; }; void finalize() @@ -68,7 +65,7 @@ private: public: /// Arguments - seed. - SipHash(u64 k0 = 0, u64 k1 = 0) + SipHash(UInt64 k0 = 0, UInt64 k1 = 0) { /// Initialize the state with some random bytes and seed. v0 = 0x736f6d6570736575ULL ^ k0; @@ -80,7 +77,7 @@ public: current_word = 0; } - void update(const char * data, u64 size) + void update(const char * data, UInt64 size) { const char * end = data + size; @@ -94,7 +91,7 @@ public: ++cnt; } - /// If you still do not have enough bytes to an 8-byte word. + /// If we still do not have enough bytes to an 8-byte word. 
if (cnt & 7) return; @@ -108,7 +105,7 @@ public: while (data + 8 <= end) { - current_word = *reinterpret_cast(data); + current_word = *reinterpret_cast(data); v3 ^= current_word; SIPROUND; @@ -138,18 +135,18 @@ public: void get128(char * out) { finalize(); - reinterpret_cast(out)[0] = v0 ^ v1; - reinterpret_cast(out)[1] = v2 ^ v3; + reinterpret_cast(out)[0] = v0 ^ v1; + reinterpret_cast(out)[1] = v2 ^ v3; } - void get128(u64 & lo, u64 & hi) + void get128(UInt64 & lo, UInt64 & hi) { finalize(); lo = v0 ^ v1; hi = v2 ^ v3; } - u64 get64() + UInt64 get64() { finalize(); return v0 ^ v1 ^ v2 ^ v3; @@ -160,6 +157,7 @@ public: #undef ROTL #undef SIPROUND +#include inline void sipHash128(const char * data, const size_t size, char * out) { @@ -168,7 +166,7 @@ inline void sipHash128(const char * data, const size_t size, char * out) hash.get128(out); } -inline DB::UInt64 sipHash64(const char * data, const size_t size) +inline UInt64 sipHash64(const char * data, const size_t size) { SipHash hash; hash.update(data, size); @@ -177,7 +175,7 @@ inline DB::UInt64 sipHash64(const char * data, const size_t size) #include -inline DB::UInt64 sipHash64(const std::string & s) +inline UInt64 sipHash64(const std::string & s) { return sipHash64(s.data(), s.size()); } diff --git a/dbms/src/Common/StringSearcher.h b/dbms/src/Common/StringSearcher.h index 9d83d4d19f9..ba1947f515c 100644 --- a/dbms/src/Common/StringSearcher.h +++ b/dbms/src/Common/StringSearcher.h @@ -19,15 +19,14 @@ namespace DB { - namespace ErrorCodes { extern const int UNSUPPORTED_PARAMETER; } -/** Variants for finding a substring in a string. - * In most cases, less productive than Volnitsky (see Volnitsky.h). +/** Variants for searching a substring in a string. + * In most cases, performance is less than Volnitsky (see Volnitsky.h). 
*/ @@ -37,7 +36,7 @@ struct StringSearcherBase static constexpr auto n = sizeof(__m128i); const int page_size = getpagesize(); - bool page_safe(const void * const ptr) const + bool pageSafe(const void * const ptr) const { return ((page_size - 1) & reinterpret_cast(ptr)) <= page_size - n; } @@ -55,7 +54,7 @@ class StringSearcher : private StringSearcherBase private: using UTF8SequenceBuffer = UInt8[6]; - /// string to be searched for + /// substring to be searched for const UInt8 * const needle; const std::size_t needle_size; const UInt8 * const needle_end = needle + needle_size; @@ -135,8 +134,7 @@ public: if (!(dst_l_len == dst_u_len && dst_u_len == src_len)) throw DB::Exception{ "UTF8 sequences with different lowercase and uppercase lengths are not supported", - DB::ErrorCodes::UNSUPPORTED_PARAMETER - }; + DB::ErrorCodes::UNSUPPORTED_PARAMETER}; cache_actual_len += src_len; if (cache_actual_len < n) @@ -165,7 +163,7 @@ public: static const Poco::UTF8Encoding utf8; #if __SSE4_1__ - if (page_safe(pos)) + if (pageSafe(pos)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(pos)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel); @@ -230,7 +228,7 @@ public: while (haystack < haystack_end) { #if __SSE4_1__ - if (haystack + n <= haystack_end && page_safe(haystack)) + if (haystack + n <= haystack_end && pageSafe(haystack)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl); @@ -249,7 +247,7 @@ public: const auto offset = __builtin_ctz(mask); haystack += offset; - if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack)) + if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel); @@ -377,7 +375,7 @@ public: bool compare(const UInt8 * pos) const { #if __SSE4_1__ - if (page_safe(pos)) + if 
(pageSafe(pos)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(pos)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel); @@ -429,7 +427,7 @@ public: while (haystack < haystack_end) { #if __SSE4_1__ - if (haystack + n <= haystack_end && page_safe(haystack)) + if (haystack + n <= haystack_end && pageSafe(haystack)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl); @@ -447,7 +445,7 @@ public: const auto offset = __builtin_ctz(mask); haystack += offset; - if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack)) + if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel); @@ -559,7 +557,7 @@ public: bool compare(const UInt8 * pos) const { #if __SSE4_1__ - if (page_safe(pos)) + if (pageSafe(pos)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(pos)); const auto v_against_cache = _mm_cmpeq_epi8(v_haystack, cache); @@ -609,7 +607,7 @@ public: while (haystack < haystack_end) { #if __SSE4_1__ - if (haystack + n <= haystack_end && page_safe(haystack)) + if (haystack + n <= haystack_end && pageSafe(haystack)) { /// find first character const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); @@ -627,7 +625,7 @@ public: const auto offset = __builtin_ctz(mask); haystack += offset; - if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack)) + if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack)) { /// check for first 16 octets const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); @@ -694,9 +692,9 @@ using UTF8CaseInsensitiveStringSearcher = StringSearcher; /** Uses functions from libc. - * It makes sense to use short strings when cheap initialization is required. 
- * There is no option for register-independent search for UTF-8 strings. - * It is required that the end of the lines be zero byte. + * It makes sense to use only with short haystacks when cheap initialization is required. + * There is no option for case-insensitive search for UTF-8 strings. + * It is required that strings are zero-terminated. */ struct LibCASCIICaseSensitiveStringSearcher diff --git a/dbms/src/Common/Throttler.h b/dbms/src/Common/Throttler.h index 75bf6490849..0b242b25110 100644 --- a/dbms/src/Common/Throttler.h +++ b/dbms/src/Common/Throttler.h @@ -1,11 +1,13 @@ #pragma once +#include /// nanosleep #include #include #include #include #include + namespace DB { @@ -15,12 +17,12 @@ namespace ErrorCodes } -/** Allows you to limit the speed of something (in pieces per second) using sleep. +/** Allows you to limit the speed of something (in entities per second) using sleep. * Specifics of work: * - only the average speed is considered, from the moment of the first call of `add` function; * if there were periods with low speed, then during some time after them, the speed will be higher; * - * Also allows you to set a limit on the maximum number of pieces. If you exceed, an exception is thrown. + * Also allows you to set a limit on the maximum number of entities. If exceeded, an exception will be thrown. */ class Throttler { @@ -56,7 +58,7 @@ public: if (max_speed) { - /// How much time would have gone for the speed to become `max_speed`. + /// How much time to wait for the average speed to become `max_speed`. UInt64 desired_ns = new_count * 1000000000 / max_speed; if (desired_ns > elapsed_ns) @@ -65,7 +67,7 @@ public: timespec sleep_ts; sleep_ts.tv_sec = sleep_ns / 1000000000; sleep_ts.tv_nsec = sleep_ns % 1000000000; - nanosleep(&sleep_ts, nullptr); /// NOTE Ends early in case of a signal. This is considered normal. + nanosleep(&sleep_ts, nullptr); /// NOTE Returns early in case of a signal. This is considered normal. 
} } } diff --git a/dbms/src/Common/VirtualColumnUtils.h b/dbms/src/Common/VirtualColumnUtils.h index cbe0120ea7e..b70245f0333 100644 --- a/dbms/src/Common/VirtualColumnUtils.h +++ b/dbms/src/Common/VirtualColumnUtils.h @@ -16,10 +16,10 @@ class Context; namespace VirtualColumnUtils { -/// Calculate the minimum numeric suffix to add to the row so that it is not present in the set +/// Calculate the minimum numeric suffix to add to the string so that it is not present in the set String chooseSuffix(const NamesAndTypesList & columns, const String & name); -/// Calculate the minimum total numeric suffix to add to each row, +/// Calculate the minimum total numeric suffix to add to each string, /// so that none is present in the set. String chooseSuffixForSet(const NamesAndTypesList & columns, const std::vector & names); diff --git a/dbms/src/Common/Volnitsky.h b/dbms/src/Common/Volnitsky.h index 18c5b1538d1..e1fda9f0bb0 100644 --- a/dbms/src/Common/Volnitsky.h +++ b/dbms/src/Common/Volnitsky.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -12,7 +13,7 @@ /** Search for a substring in a string by Volnitsky's algorithm * http://volnitsky.com/project/str_search/ * - * `haystack` and `needle` can contain null bytes. + * `haystack` and `needle` can contain zero bytes. 
* * Algorithm: * - if the `needle` is too small or too large, or too small `haystack`, use std::search or memchr; @@ -23,7 +24,7 @@ * - bigrams can be inserted several times if they occur in the needle several times; * - when searching, take from haystack bigram, which should correspond to the last bigram of needle (comparing from the end); * - look for it in the hash table, if found - get the offset from the hash table and compare the string bytewise; - * - if it did not work, we check the next cell of the hash table from the collision resolution chain; + * - if it did not match, we check the next cell of the hash table from the collision resolution chain; * - if not found, skip to haystack almost the size of the needle bytes; * * Unaligned memory access is used. @@ -39,34 +40,35 @@ template class VolnitskyBase { protected: - using offset_t = uint8_t; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255. - using ngram_t = uint16_t; /// n-gram (2 bytes). + using Offset = UInt8; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255. + using Ngram = UInt16; /// n-gram (2 bytes). const UInt8 * const needle; const size_t needle_size; const UInt8 * const needle_end = needle + needle_size; /// For how long we move, if the n-gram from haystack is not found in the hash table. - const size_t step = needle_size - sizeof(ngram_t) + 1; + const size_t step = needle_size - sizeof(Ngram) + 1; /** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1) - * storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */ - static const size_t hash_size = 64 * 1024; /// Fits into the L2 cache. - offset_t hash[hash_size]; /// Hash table. + * storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */ + static const size_t hash_size = 64 * 1024; /// Fits into the L2 cache (of common Intel CPUs). 
+ Offset hash[hash_size]; /// Hash table. /// min haystack size to use main algorithm instead of fallback static constexpr auto min_haystack_size_for_algorithm = 20000; - const bool fallback; /// Do I need to use the fallback algorithm. + const bool fallback; /// Do we need to use the fallback algorithm. public: - /** haystack_size_hint - the expected total size of the haystack for `search` calls. Can not specify. + /** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified). * If you specify it small enough, the fallback algorithm will be used, * since it is considered that it's useless to waste time initializing the hash table. */ VolnitskyBase(const char * const needle, const size_t needle_size, size_t haystack_size_hint = 0) : needle{reinterpret_cast(needle)}, needle_size{needle_size}, fallback{ - needle_size < 2 * sizeof(ngram_t) || needle_size >= std::numeric_limits::max() || - (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm)} + needle_size < 2 * sizeof(Ngram) + || needle_size >= std::numeric_limits::max() + || (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm)} { if (fallback) return; @@ -74,7 +76,7 @@ public: memset(hash, 0, sizeof(hash)); /// int is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0 - for (auto i = static_cast(needle_size - sizeof(ngram_t)); i >= 0; --i) + for (auto i = static_cast(needle_size - sizeof(Ngram)); i >= 0; --i) self().putNGram(this->needle + i, i + 1, this->needle); } @@ -91,7 +93,7 @@ public: return self().search_fallback(haystack, haystack_end); /// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle. 
- const auto * pos = haystack + needle_size - sizeof(ngram_t); + const auto * pos = haystack + needle_size - sizeof(Ngram); for (; pos <= haystack_end - needle_size; pos += step) { /// We look at all the cells of the hash table that can correspond to the n-gram from haystack. @@ -119,12 +121,12 @@ protected: CRTP & self() { return static_cast(*this); } const CRTP & self() const { return const_cast(this)->self(); } - static const ngram_t & toNGram(const UInt8 * const pos) + static const Ngram & toNGram(const UInt8 * const pos) { - return *reinterpret_cast(pos); + return *reinterpret_cast(pos); } - void putNGramBase(const ngram_t ngram, const int offset) + void putNGramBase(const Ngram ngram, const int offset) { /// Put the offset for the n-gram in the corresponding cell or the nearest free cell. size_t cell_num = ngram % hash_size; @@ -145,7 +147,7 @@ protected: union { - ngram_t n; + Ngram n; Chars chars; }; @@ -260,7 +262,7 @@ template <> struct VolnitskyImpl : VolnitskyBase struct VolnitskyImpl : VolnitskyBase struct VolnitskyImpl : VolnitskyBase struct VolnitskyImpl : VolnitskyBase -/// Displays the transmitted size in bytes as 123.45 GiB. +/// Displays the passed size in bytes as 123.45 GiB. void formatReadableSizeWithBinarySuffix(double value, DB::WriteBuffer & out, int precision = 2); std::string formatReadableSizeWithBinarySuffix(double value, int precision = 2); -/// Displays the transmitted size in bytes as 132.55 GB. +/// Displays the passed size in bytes as 132.55 GB. 
void formatReadableSizeWithDecimalSuffix(double value, DB::WriteBuffer & out, int precision = 2); std::string formatReadableSizeWithDecimalSuffix(double value, int precision = 2); diff --git a/dbms/src/Common/getFQDNOrHostName.h b/dbms/src/Common/getFQDNOrHostName.h index ea796426c85..a4367a72622 100644 --- a/dbms/src/Common/getFQDNOrHostName.h +++ b/dbms/src/Common/getFQDNOrHostName.h @@ -2,7 +2,7 @@ #include -/** Get the FQDN for the local server by resolving DNS hostname - similar to calling the hostname utility with the -f flag. - * If it does not work, return hostname - similar to calling hostname without flags or uname -n. +/** Get the FQDN for the local server by resolving DNS hostname - similar to calling the 'hostname' tool with the -f flag. + * If it does not work, return hostname - similar to calling 'hostname' without flags or 'uname -n'. */ const std::string & getFQDNOrHostName(); diff --git a/dbms/src/Common/typeid_cast.h b/dbms/src/Common/typeid_cast.h index a83a414f70e..e335f8f9672 100644 --- a/dbms/src/Common/typeid_cast.h +++ b/dbms/src/Common/typeid_cast.h @@ -16,7 +16,7 @@ namespace DB } -/** Checks match of type by comparing typeid. +/** Checks type by comparing typeid. * The exact match of the type is checked. That is, cast in the ancestor will be unsuccessful. * In the rest, behaves like a dynamic_cast. */