diff --git a/dbms/src/AggregateFunctions/UniqCombinedBiasData.h b/dbms/src/AggregateFunctions/UniqCombinedBiasData.h
index 7206aabcf7e..0a69a211206 100644
--- a/dbms/src/AggregateFunctions/UniqCombinedBiasData.h
+++ b/dbms/src/AggregateFunctions/UniqCombinedBiasData.h
@@ -6,25 +6,25 @@ namespace DB
 {

 /** Data for HyperLogLogBiasEstimator in the uniqCombined function.
-  * The development plan is as follows:
-  * 1. Assemble ClickHouse.
-  * 2. Run the script src/dbms/scripts/gen-bias-data.py, which returns one array for getRawEstimates()
-  * and another array for getBiases().
-  * 3. Update `raw_estimates` and `biases` arrays. Also update the size of arrays in InterpolatedData.
-  * 4. Assemble ClickHouse.
-  * 5. Run the script src/dbms/scripts/linear-counting-threshold.py, which creates 3 files:
-  * - raw_graph.txt (1st column: the present number of unique values;
-  * 2nd column: relative error in the case of HyperLogLog without applying any corrections)
-  * - linear_counting_graph.txt (1st column: the present number of unique values;
-  * 2nd column: relative error in the case of HyperLogLog using LinearCounting)
-  * - bias_corrected_graph.txt (1st column: the present number of unique values;
-  * 2nd column: relative error in the case of HyperLogLog with the use of corrections from the algorithm HyperLogLog++)
-  * 6. Generate a graph with gnuplot based on this data.
-  * 7. Determine the minimum number of unique values at which it is better to correct the error
-  * using its evaluation (ie, using the HyperLogLog++ algorithm) than applying the LinearCounting algorithm.
-  * 7. Accordingly, update the constant in the function getThreshold()
-  * 8. Assemble ClickHouse.
-  */
+ * The development plan is as follows:
+ * 1. Build ClickHouse.
+ * 2. Run the script src/dbms/scripts/gen-bias-data.py, which returns one array for getRawEstimates()
+ *    and another array for getBiases().
+ * 3. Update the `raw_estimates` and `biases` arrays. Also update the size of the arrays in InterpolatedData.
+ * 4. Build ClickHouse.
+ * 5. Run the script src/dbms/scripts/linear-counting-threshold.py, which creates 3 files:
+ *    - raw_graph.txt (1st column: the true number of unique values;
+ *      2nd column: relative error of HyperLogLog without any corrections)
+ *    - linear_counting_graph.txt (1st column: the true number of unique values;
+ *      2nd column: relative error of HyperLogLog using LinearCounting)
+ *    - bias_corrected_graph.txt (1st column: the true number of unique values;
+ *      2nd column: relative error of HyperLogLog with the corrections from the HyperLogLog++ algorithm)
+ * 6. Generate a graph with gnuplot based on this data.
+ * 7. Determine the minimum number of unique values at which it is better to correct the error
+ *    using its estimate (i.e., using the HyperLogLog++ algorithm) than to apply the LinearCounting algorithm.
+ * 8. Update the constant in the getThreshold() function accordingly.
+ * 9. Build ClickHouse.
+ */
 struct UniqCombinedBiasData
 {
     using InterpolatedData = std::array;
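For orientation, here is a minimal sketch of how a bias table of this kind is typically consumed (HyperLogLog++ style): the raw estimate is located in the sorted raw-estimates array, the bias is linearly interpolated between the two neighboring points and subtracted, and the correction is only applied above the LinearCounting threshold chosen in steps 7 and 8. The array contents, sizes, and the `correctEstimate` name below are illustrative placeholders, not the values produced by gen-bias-data.py.

```cpp
#include <algorithm>
#include <array>
#include <cstddef>

/// Placeholder tables; the real ones come from gen-bias-data.py.
static constexpr std::array<double, 4> raw_estimates{{700.0, 800.0, 900.0, 1000.0}};
static constexpr std::array<double, 4> biases{{170.0, 150.0, 130.0, 110.0}};

/// Subtract the linearly interpolated bias from a raw HyperLogLog estimate.
double correctEstimate(double raw_estimate)
{
    if (raw_estimate <= raw_estimates.front())
        return raw_estimate - biases.front();
    if (raw_estimate >= raw_estimates.back())
        return raw_estimate - biases.back();

    /// First table point that is >= the estimate; interpolate between it and its predecessor.
    const auto it = std::lower_bound(raw_estimates.begin(), raw_estimates.end(), raw_estimate);
    const size_t i = static_cast<size_t>(it - raw_estimates.begin());
    const double t = (raw_estimate - raw_estimates[i - 1]) / (raw_estimates[i] - raw_estimates[i - 1]);
    return raw_estimate - (biases[i - 1] + t * (biases[i] - biases[i - 1]));
}
```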
diff --git a/dbms/src/AggregateFunctions/UniquesHashSet.h b/dbms/src/AggregateFunctions/UniquesHashSet.h
index b0036d4832d..bc9a65c1bb6 100644
--- a/dbms/src/AggregateFunctions/UniquesHashSet.h
+++ b/dbms/src/AggregateFunctions/UniquesHashSet.h
@@ -15,33 +15,33 @@
 /** Approximate calculation of anything, as usual, is constructed according to the following scheme:
-  * - some data structure is used to calculate the value of X;
-  * - Not all values are added to the data structure, but only selected ones (according to some selectivity criteria);
-  * - after processing all elements, the data structure is in some state S;
-  * - as an approximate value of X, the value calculated according to the maximum likelihood principle is returned:
-  * at what real value X, the probability of finding the data structure in the obtained state S is maximal.
-  */
+ * - some data structure is used to calculate the value of X;
+ * - not all values are added to the data structure, but only selected ones (according to some selectivity criterion);
+ * - after processing all elements, the data structure is in some state S;
+ * - as the approximate value of X, the value calculated by the maximum likelihood principle is returned:
+ *   the real value of X for which the probability of finding the data structure in the observed state S is maximal.
+ */

 /** In particular, what is described below can be found by the name of the BJKST algorithm.
-  */
+ */

 /** Very simple hash-set for approximate number of unique values.
-  * Works like this:
-  * - you can insert UInt64;
-  * - before insertion, first the hash function UInt64 -> UInt32 is calculated;
-  * - the original value is not saved (lost);
-  * - further all operations are made with these hashes;
-  * - hash table is constructed according to the scheme:
-  * - open addressing (one buffer, position in buffer is calculated by taking remainder of division by its size);
-  * - linear probing (if the cell already has a value, then the cell following it is taken, etc.);
-  * - the missing value is zero-encoded; to remember presence of zero in set, separate variable of type bool is used;
-  * - buffer growth by 2 times when filling more than 50%;
-  * - if the set has more UNIQUES_HASH_MAX_SIZE elements, then all the elements are removed from the set,
-  * not divisible by 2, and then all elements that do not divide by 2 are not inserted into the set;
-  * - if the situation repeats, then only elements dividing by 4, etc., are taken.
-  * - the size() method returns an approximate number of elements that have been inserted into the set;
-  * - there are methods for quick reading and writing in binary and text form.
-  */
+ * Works like this:
+ * - you can insert UInt64;
+ * - before insertion, a hash function UInt64 -> UInt32 is calculated;
+ * - the original value is not saved (it is lost);
+ * - all further operations are performed on these hashes;
+ * - the hash table is built according to the following scheme:
+ *   - open addressing (one buffer; the position in the buffer is the remainder of division by its size);
+ *   - linear probing (if the cell is already occupied, the next cell is taken, and so on);
+ *   - an empty cell is encoded as zero; to remember whether zero itself is present in the set, a separate bool variable is used;
+ *   - the buffer doubles in size when it is more than 50% full;
+ * - if the set has more than UNIQUES_HASH_MAX_SIZE elements, all elements not divisible by 2 are removed from the set,
+ *   and from then on elements not divisible by 2 are not inserted into the set;
+ * - if the situation repeats, only elements divisible by 4 are kept, and so on;
+ * - the size() method returns an approximate number of elements that have been inserted into the set;
+ * - there are methods for fast reading and writing in binary and text form.
+ */

 /// The maximum degree of buffer size before the values are discarded
 #define UNIQUES_HASH_MAX_SIZE_DEGREE 17
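The thinning scheme described in that comment can be illustrated with a small sketch. This is not the actual UniquesHashSet (which keeps its own open-addressing buffer and a more careful size() estimate); `ThinningSet`, `isKept`, and `max_size` are hypothetical names, and the hash is assumed to have been computed already.

```cpp
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <unordered_set>

/// Illustrative sketch of thinning by divisibility; not the real UniquesHashSet.
class ThinningSet
{
    std::unordered_set<uint32_t> hashes;
    unsigned skip_degree = 0;                      /// keep only hashes divisible by 2^skip_degree
    static constexpr size_t max_size = 1 << 16;    /// stand-in for UNIQUES_HASH_MAX_SIZE

    bool isKept(uint32_t hash) const { return (hash & ((1u << skip_degree) - 1)) == 0; }

public:
    /// `hash` is assumed to be an already computed UInt64 -> UInt32 hash of the value.
    void insert(uint32_t hash)
    {
        if (!isKept(hash))
            return;

        hashes.insert(hash);
        if (hashes.size() > max_size)
        {
            /// Overflow: drop every hash not divisible by the next power of two
            /// and filter future insertions the same way.
            ++skip_degree;
            for (auto it = hashes.begin(); it != hashes.end();)
                it = isKept(*it) ? std::next(it) : hashes.erase(it);
        }
    }

    /// Each stored hash stands for 2^skip_degree inserted hashes on average.
    size_t size() const { return hashes.size() << skip_degree; }
};
```

The trade-off is loss of precision in exchange for memory usage that stays bounded by max_size regardless of cardinality.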
@@ -50,8 +50,8 @@
 #define UNIQUES_HASH_MAX_SIZE (1ULL << (UNIQUES_HASH_MAX_SIZE_DEGREE - 1))

 /** The number of least significant bits used for thinning. The remaining high-order bits are used to determine the position in the hash table.
-  * (high-order bits are taken because the younger bits will be constant after dropping some of the values)
-  */
+ * (the high-order bits are used because the low-order bits become constant after some of the values have been dropped)
+ */
 #define UNIQUES_HASH_BITS_FOR_SKIP (32 - UNIQUES_HASH_MAX_SIZE_DEGREE)

 /// Initial buffer size degree
@@ -59,8 +59,8 @@

 /** This hash function is not the most optimal, but UniquesHashSet states counted with it,
-  * stored in many places on disks (in the Yandex.Metrika), so it continues to be used.
-  */
+ * are stored in many places on disk (in Yandex.Metrika), so it continues to be used.
+ */
 struct UniquesHashSetDefaultHash
 {
     size_t operator() (UInt64 x) const
diff --git a/dbms/src/Client/tests/test_connect.cpp b/dbms/src/Client/tests/test_connect.cpp
index 75eb606cc97..50075cc24a6 100644
--- a/dbms/src/Client/tests/test_connect.cpp
+++ b/dbms/src/Client/tests/test_connect.cpp
@@ -9,9 +9,9 @@

 /** In a loop it connects to the server and immediately breaks the connection.
-  * Using the SO_LINGER option, we ensure that the connection is terminated by sending a RST packet (not FIN).
-  * This behavior causes a bug in the TCPServer implementation in the Poco library.
-  */
+ * Using the SO_LINGER option, we ensure that the connection is terminated by sending an RST packet (not FIN).
+ * This behavior triggers a bug in the TCPServer implementation of the Poco library.
+ */
 int main(int argc, char ** argv)
 try
 {
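The RST trick mentioned in that comment can be reproduced in a few lines. This is a sketch, not the actual test_connect.cpp: the endpoint 127.0.0.1:9000 is a placeholder, and only a single connection is shown rather than a loop. Enabling SO_LINGER with a zero linger timeout makes close() abort the connection with an RST instead of the normal FIN handshake.

```cpp
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main()
{
    /// Placeholder endpoint; the real test presumably takes the server address from the command line.
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    sockaddr_in addr{};
    addr.sin_family = AF_INET;
    addr.sin_port = htons(9000);
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    if (connect(fd, reinterpret_cast<sockaddr *>(&addr), sizeof(addr)) != 0)
        return 1;

    /// l_onoff = 1 with l_linger = 0: close() discards pending data and sends RST instead of FIN.
    linger linger_value{};
    linger_value.l_onoff = 1;
    linger_value.l_linger = 0;
    setsockopt(fd, SOL_SOCKET, SO_LINGER, &linger_value, sizeof(linger_value));

    close(fd);
    return 0;
}
```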
diff --git a/dbms/src/Common/InterruptListener.h b/dbms/src/Common/InterruptListener.h
index eb818671037..37ec0533dd6 100644
--- a/dbms/src/Common/InterruptListener.h
+++ b/dbms/src/Common/InterruptListener.h
@@ -45,10 +45,10 @@ static int sigtimedwait(const sigset_t *set, siginfo_t *info, const struct timespec *timeout)

 /** As long as there exists an object of this class - it blocks the INT signal, at the same time it lets you know if it came.
-  * This is necessary so that you can interrupt the execution of the request with Ctrl+C.
-  * Use only one instance of this class at a time.
-  * If `check` method returns true (the signal has arrived), the next call will wait for the next signal.
-  */
+ * This is necessary so that the execution of a query can be interrupted with Ctrl+C.
+ * Use only one instance of this class at a time.
+ * If the `check` method returns true (a signal has arrived), the next call will wait for the next signal.
+ */
 class InterruptListener
 {
 private:
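The pattern this comment describes can be sketched roughly as follows. This is a simplified illustration, not the real InterruptListener; `SimpleInterruptListener` is a hypothetical stand-in. SIGINT is blocked for the lifetime of the object with pthread_sigmask, and check() polls for a pending signal with sigtimedwait using a zero timeout, so it never blocks.

```cpp
#include <csignal>
#include <ctime>
#include <stdexcept>

/// Simplified illustration: block SIGINT while the object exists and poll for it.
class SimpleInterruptListener
{
    sigset_t sig_set;

public:
    SimpleInterruptListener()
    {
        sigemptyset(&sig_set);
        sigaddset(&sig_set, SIGINT);
        if (pthread_sigmask(SIG_BLOCK, &sig_set, nullptr) != 0)
            throw std::runtime_error("Cannot block SIGINT");
    }

    ~SimpleInterruptListener()
    {
        pthread_sigmask(SIG_UNBLOCK, &sig_set, nullptr);
    }

    /// Returns true if SIGINT has arrived since the previous call; does not wait.
    bool check()
    {
        timespec timeout{};    /// zero timeout: poll instead of waiting
        return sigtimedwait(&sig_set, nullptr, &timeout) == SIGINT;
    }
};
```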