ClickHouse/dbms/AggregateFunctions/QuantileExact.h

#pragma once

#include <Common/PODArray.h>
#include <Common/NaNUtils.h>
#include <Core/Types.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/VarInt.h>


namespace DB
{

namespace ErrorCodes
{
    extern const int NOT_IMPLEMENTED;
    extern const int BAD_ARGUMENTS;
}

/** Calculates quantile by collecting all values into array
  *  and applying n-th element (introselect) algorithm for the resulting array.
  *
  * It uses O(N) memory and it is very inefficient in case of high amount of identical values.
  * But it is very CPU efficient for not large datasets.
  */
template <typename Value>
struct QuantileExact
{
    /// The memory will be allocated to several elements at once, so that the state occupies 64 bytes.
    static constexpr size_t bytes_in_arena = 64 - sizeof(PODArray<Value>);
    using Array = PODArrayWithStackMemory<Value, bytes_in_arena>;
    Array array;

    void add(const Value & x)
    {
        /// We must skip NaNs as they are not compatible with comparison sorting.
        if (!isNaN(x))
            array.push_back(x);
    }

    template <typename Weight>
    void add(const Value &, const Weight &)
    {
        throw Exception("Method add with weight is not implemented for QuantileExact", ErrorCodes::NOT_IMPLEMENTED);
    }

    void merge(const QuantileExact & rhs)
    {
        array.insert(rhs.array.begin(), rhs.array.end());
    }

    void serialize(WriteBuffer & buf) const
    {
        size_t size = array.size();
        writeVarUInt(size, buf);
        buf.write(reinterpret_cast<const char *>(array.data()), size * sizeof(array[0]));
    }

    void deserialize(ReadBuffer & buf)
    {
        size_t size = 0;
        readVarUInt(size, buf);
        array.resize(size);
        buf.read(reinterpret_cast<char *>(array.data()), size * sizeof(array[0]));
    }

    /// Get the value of the `level` quantile. The level must be between 0 and 1.
    Value get(Float64 level)
    {
        if (!array.empty())
        {
            size_t n = level < 1
                ? level * array.size()
                : (array.size() - 1);

            std::nth_element(array.begin(), array.begin() + n, array.end());    /// NOTE You can think of the radix-select algorithm.
            return array[n];
        }

        return std::numeric_limits<Value>::quiet_NaN();
    }

    /// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
    /// indices - an array of index levels such that the corresponding elements will go in ascending order.
    void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result)
    {
        if (!array.empty())
        {
            size_t prev_n = 0;
            for (size_t i = 0; i < size; ++i)
            {
                auto level = levels[indices[i]];

                size_t n = level < 1
                    ? level * array.size()
                    : (array.size() - 1);

                std::nth_element(array.begin() + prev_n, array.begin() + n, array.end());

                result[indices[i]] = array[n];
                prev_n = n;
            }
        }
        else
        {
            for (size_t i = 0; i < size; ++i)
                result[i] = Value();
        }
    }
};

/// QuantileExactExclusive is equivalent to Excel PERCENTILE.EXC, R-6, SAS-4, SciPy-(0,0)
template <typename Value>
struct QuantileExactExclusive : public QuantileExact<Value>
{
    using QuantileExact<Value>::array;

    /// Get the value of the `level` quantile. The level must be between 0 and 1 excluding bounds.
    Float64 getFloat(Float64 level)
    {
        if (!array.empty())
        {
            if (level == 0. || level == 1.)
                throw Exception("QuantileExactExclusive cannot interpolate for the percentiles 1 and 0", ErrorCodes::BAD_ARGUMENTS);

            Float64 h = level * (array.size() + 1);
            auto n = static_cast<size_t>(h);

            if (n >= array.size())
                return array[array.size() - 1];
            else if (n < 1)
                return array[0];

            std::nth_element(array.begin(), array.begin() + n - 1, array.end());
            auto nth_element = std::min_element(array.begin() + n, array.end());

            return array[n - 1] + (h - n) * (*nth_element - array[n - 1]);
        }

        return std::numeric_limits<Float64>::quiet_NaN();
    }

    void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)
    {
        if (!array.empty())
        {
            size_t prev_n = 0;
            for (size_t i = 0; i < size; ++i)
            {
                auto level = levels[indices[i]];
                if (level == 0. || level == 1.)
                    throw Exception("QuantileExactExclusive cannot interpolate for the percentiles 1 and 0", ErrorCodes::BAD_ARGUMENTS);

                Float64 h = level * (array.size() + 1);
                auto n = static_cast<size_t>(h);

                if (n >= array.size())
                    result[indices[i]] = array[array.size() - 1];
                else if (n < 1)
                    result[indices[i]] = array[0];
                else
                {
                    std::nth_element(array.begin() + prev_n, array.begin() + n - 1, array.end());
                    auto nth_element = std::min_element(array.begin() + n, array.end());

                    result[indices[i]] = array[n - 1] + (h - n) * (*nth_element - array[n - 1]);
                    prev_n = n - 1;
                }
            }
        }
        else
        {
            for (size_t i = 0; i < size; ++i)
                result[i] = std::numeric_limits<Float64>::quiet_NaN();
        }
    }
};

/// QuantileExactInclusive is equivalent to Excel PERCENTILE and PERCENTILE.INC, R-7, SciPy-(1,1)
template <typename Value>
struct QuantileExactInclusive : public QuantileExact<Value>
{
    using QuantileExact<Value>::array;

    /// Get the value of the `level` quantile. The level must be between 0 and 1 including bounds.
    Float64 getFloat(Float64 level)
    {
        if (!array.empty())
        {
            Float64 h = level * (array.size() - 1) + 1;
            auto n = static_cast<size_t>(h);

            if (n >= array.size())
                return array[array.size() - 1];
            else if (n < 1)
                return array[0];

            std::nth_element(array.begin(), array.begin() + n - 1, array.end());
            auto nth_element = std::min_element(array.begin() + n, array.end());

            return array[n - 1] + (h - n) * (*nth_element - array[n - 1]);
        }

        return std::numeric_limits<Float64>::quiet_NaN();
    }

    void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)
    {
        if (!array.empty())
        {
            size_t prev_n = 0;
            for (size_t i = 0; i < size; ++i)
            {
                auto level = levels[indices[i]];

                Float64 h = level * (array.size() - 1) + 1;
                auto n = static_cast<size_t>(h);

                if (n >= array.size())
                    result[indices[i]] = array[array.size() - 1];
                else if (n < 1)
                    result[indices[i]] = array[0];
                else
                {
                    std::nth_element(array.begin() + prev_n, array.begin() + n - 1, array.end());
                    auto nth_element = std::min_element(array.begin() + n, array.end());

                    result[indices[i]] = array[n - 1] + (h - n) * (*nth_element - array[n - 1]);
                    prev_n = n - 1;
                }
            }
        }
        else
        {
            for (size_t i = 0; i < size; ++i)
                result[i] = std::numeric_limits<Float64>::quiet_NaN();
        }
    }
};

}
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`#pragma once`

Fix missing includes 2017-12-26 19:00:20 +00:00			`#include <Common/PODArray.h>`
Fixed handling of NaNs in aggregate functions that use comparison based sorting #2012 2018-03-14 05:03:51 +00:00			`#include <Common/NaNUtils.h>`
Grouped includes (40 of 182) 2019-02-10 17:40:52 +00:00			`#include <Core/Types.h>`
Fix missing includes 2017-12-26 19:00:20 +00:00			`#include <IO/WriteBuffer.h>`
			`#include <IO/ReadBuffer.h>`
			`#include <IO/VarInt.h>`

Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00
			`namespace DB`
			`{`

			`namespace ErrorCodes`
			`{`
			`extern const int NOT_IMPLEMENTED;`
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00			`extern const int BAD_ARGUMENTS;`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`}`

Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 20:25:22 +00:00			`/** Calculates quantile by collecting all values into array`
			`* and applying n-th element (introselect) algorithm for the resulting array.`
			`*`
Aggregate function for entropy 2019-02-02 14:27:43 +00:00			`* It uses O(N) memory and it is very inefficient in case of high amount of identical values.`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 20:25:22 +00:00			`* But it is very CPU efficient for not large datasets.`
			`*/`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`template <typename Value>`
			`struct QuantileExact`
			`{`
			`/// The memory will be allocated to several elements at once, so that the state occupies 64 bytes.`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 08:39:21 +00:00			`static constexpr size_t bytes_in_arena = 64 - sizeof(PODArray<Value>);`
Fix initial size of some inline PODArray's. A template parameter of PODArray named INITIAL_SIZE didn't make its units clear, which made some callers to erroneously assume that it specifies the number of elements and not the number of bytes. Rename it, fix the wrong usages and, where possible, use the PODArrayWithStackMemory typedef for arrays with inline memory. 2019-06-28 12:51:01 +00:00			`using Array = PODArrayWithStackMemory<Value, bytes_in_arena>;`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`Array array;`

			`void add(const Value & x)`
			`{`
Fixed handling of NaNs in aggregate functions that use comparison based sorting #2012 2018-03-14 05:03:51 +00:00			`/// We must skip NaNs as they are not compatible with comparison sorting.`
			`if (!isNaN(x))`
			`array.push_back(x);`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`}`

			`template <typename Weight>`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 08:39:21 +00:00			`void add(const Value &, const Weight &)`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`{`
			`throw Exception("Method add with weight is not implemented for QuantileExact", ErrorCodes::NOT_IMPLEMENTED);`
			`}`

			`void merge(const QuantileExact & rhs)`
			`{`
			`array.insert(rhs.array.begin(), rhs.array.end());`
			`}`

			`void serialize(WriteBuffer & buf) const`
			`{`
			`size_t size = array.size();`
			`writeVarUInt(size, buf);`
Avoid temporary nullptr references [#CLICKHOUSE-2] 2018-09-02 03:00:04 +00:00			`buf.write(reinterpret_cast<const char >(array.data()), size sizeof(array[0]));`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`}`

			`void deserialize(ReadBuffer & buf)`
			`{`
			`size_t size = 0;`
			`readVarUInt(size, buf);`
			`array.resize(size);`
Avoid temporary nullptr references [#CLICKHOUSE-2] 2018-09-02 03:00:04 +00:00			`buf.read(reinterpret_cast<char >(array.data()), size sizeof(array[0]));`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`}`

			/// Get the value of the `level` quantile. The level must be between 0 and 1.
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00			`Value get(Float64 level)`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`{`
			`if (!array.empty())`
			`{`
			`size_t n = level < 1`
			`? level * array.size()`
			`: (array.size() - 1);`

			`std::nth_element(array.begin(), array.begin() + n, array.end()); /// NOTE You can think of the radix-select algorithm.`
			`return array[n];`
			`}`

return nan when quantileExact with empty float column 2018-08-13 08:33:51 +00:00			`return std::numeric_limits<Value>::quiet_NaN();`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`}`

			/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
			`/// indices - an array of index levels such that the corresponding elements will go in ascending order.`
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00			`void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result)`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`{`
			`if (!array.empty())`
			`{`
			`size_t prev_n = 0;`
			`for (size_t i = 0; i < size; ++i)`
			`{`
			`auto level = levels[indices[i]];`

			`size_t n = level < 1`
			`? level * array.size()`
			`: (array.size() - 1);`

			`std::nth_element(array.begin() + prev_n, array.begin() + n, array.end());`

			`result[indices[i]] = array[n];`
			`prev_n = n;`
			`}`
			`}`
			`else`
			`{`
			`for (size_t i = 0; i < size; ++i)`
			`result[i] = Value();`
			`}`
			`}`
QuantileExactExclusive function added. 2019-08-13 19:12:31 +00:00			`};`

Fixed comment 2019-08-14 11:18:46 +00:00			`/// QuantileExactExclusive is equivalent to Excel PERCENTILE.EXC, R-6, SAS-4, SciPy-(0,0)`
QuantileExactExclusive function added. 2019-08-13 19:12:31 +00:00			`template <typename Value>`
			`struct QuantileExactExclusive : public QuantileExact<Value>`
			`{`
			`using QuantileExact<Value>::array;`
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00
			/// Get the value of the `level` quantile. The level must be between 0 and 1 excluding bounds.
virtual functions deleted 2019-08-19 16:03:25 +00:00			`Float64 getFloat(Float64 level)`
QuantileExactExclusive function added. 2019-08-13 19:12:31 +00:00			`{`
			`if (!array.empty())`
			`{`
Fixed comment 2019-08-14 11:18:46 +00:00			`if (level == 0. \|\| level == 1.)`
			`throw Exception("QuantileExactExclusive cannot interpolate for the percentiles 1 and 0", ErrorCodes::BAD_ARGUMENTS);`

QuantileExactExclusive function added. 2019-08-13 19:12:31 +00:00			`Float64 h = level * (array.size() + 1);`
			`auto n = static_cast<size_t>(h);`

			`if (n >= array.size())`
			`return array[array.size() - 1];`
			`else if (n < 1)`
			`return array[0];`

			`std::nth_element(array.begin(), array.begin() + n - 1, array.end());`
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00			`auto nth_element = std::min_element(array.begin() + n, array.end());`
QuantileExactExclusive function added. 2019-08-13 19:12:31 +00:00
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00			`return array[n - 1] + (h - n) * (*nth_element - array[n - 1]);`
QuantileExactExclusive function added. 2019-08-13 19:12:31 +00:00			`}`

			`return std::numeric_limits<Float64>::quiet_NaN();`
			`}`

virtual functions deleted 2019-08-19 16:03:25 +00:00			`void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)`
QuantileExactExclusive function added. 2019-08-13 19:12:31 +00:00			`{`
			`if (!array.empty())`
			`{`
			`size_t prev_n = 0;`
			`for (size_t i = 0; i < size; ++i)`
			`{`
			`auto level = levels[indices[i]];`
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00			`if (level == 0. \|\| level == 1.)`
			`throw Exception("QuantileExactExclusive cannot interpolate for the percentiles 1 and 0", ErrorCodes::BAD_ARGUMENTS);`
QuantileExactExclusive function added. 2019-08-13 19:12:31 +00:00
			`Float64 h = level * (array.size() + 1);`
			`auto n = static_cast<size_t>(h);`

			`if (n >= array.size())`
			`result[indices[i]] = array[array.size() - 1];`
			`else if (n < 1)`
			`result[indices[i]] = array[0];`
			`else`
			`{`
			`std::nth_element(array.begin() + prev_n, array.begin() + n - 1, array.end());`
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00			`auto nth_element = std::min_element(array.begin() + n, array.end());`

			`result[indices[i]] = array[n - 1] + (h - n) * (*nth_element - array[n - 1]);`
			`prev_n = n - 1;`
			`}`
			`}`
			`}`
			`else`
			`{`
			`for (size_t i = 0; i < size; ++i)`
			`result[i] = std::numeric_limits<Float64>::quiet_NaN();`
			`}`
			`}`
			`};`

Added a check for double whitespaces 2019-08-23 18:30:04 +00:00			`/// QuantileExactInclusive is equivalent to Excel PERCENTILE and PERCENTILE.INC, R-7, SciPy-(1,1)`
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00			`template <typename Value>`
			`struct QuantileExactInclusive : public QuantileExact<Value>`
			`{`
			`using QuantileExact<Value>::array;`

			/// Get the value of the `level` quantile. The level must be between 0 and 1 including bounds.
virtual functions deleted 2019-08-19 16:03:25 +00:00			`Float64 getFloat(Float64 level)`
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00			`{`
			`if (!array.empty())`
			`{`
			`Float64 h = level * (array.size() - 1) + 1;`
			`auto n = static_cast<size_t>(h);`

			`if (n >= array.size())`
			`return array[array.size() - 1];`
			`else if (n < 1)`
			`return array[0];`

			`std::nth_element(array.begin(), array.begin() + n - 1, array.end());`
			`auto nth_element = std::min_element(array.begin() + n, array.end());`

			`return array[n - 1] + (h - n) * (*nth_element - array[n - 1]);`
			`}`

			`return std::numeric_limits<Float64>::quiet_NaN();`
			`}`

virtual functions deleted 2019-08-19 16:03:25 +00:00			`void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)`
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00			`{`
			`if (!array.empty())`
			`{`
			`size_t prev_n = 0;`
			`for (size_t i = 0; i < size; ++i)`
			`{`
			`auto level = levels[indices[i]];`

			`Float64 h = level * (array.size() - 1) + 1;`
			`auto n = static_cast<size_t>(h);`

			`if (n >= array.size())`
			`result[indices[i]] = array[array.size() - 1];`
			`else if (n < 1)`
			`result[indices[i]] = array[0];`
			`else`
			`{`
			`std::nth_element(array.begin() + prev_n, array.begin() + n - 1, array.end());`
			`auto nth_element = std::min_element(array.begin() + n, array.end());`
QuantileExactExclusive function added. 2019-08-13 19:12:31 +00:00
Added QuantileExactInclusive function. Deleted redundant virtuals. 2019-08-14 11:13:04 +00:00			`result[indices[i]] = array[n - 1] + (h - n) * (*nth_element - array[n - 1]);`
			`prev_n = n - 1;`
QuantileExactExclusive function added. 2019-08-13 19:12:31 +00:00			`}`
			`}`
			`}`
			`else`
			`{`
			`for (size_t i = 0; i < size; ++i)`
			`result[i] = std::numeric_limits<Float64>::quiet_NaN();`
			`}`
			`}`
Simplification of aggregate functions: development [#CLICKHOUSE-2]. 2017-12-20 07:36:30 +00:00			`};`

			`}`