From 2218ebebbfaac4d5114a476eafe40f71bdd25455 Mon Sep 17 00:00:00 2001
From: Gabriel Mendes <gabrielmcg44@gmail.com>
Date: Wed, 18 Sep 2024 05:15:57 -0300
Subject: [PATCH] initial commit, tested function

---
 .../functions/array-functions.md              |  35 +++
 .../functions/array-functions.md              |  37 +++
 .../functions/array-functions.md              |  35 +++
 src/Functions/array/arrayAUCUnscaled.cpp      | 212 ++++++++++++++++++
 tests/fuzz/all.dict                           |   1 +
 tests/fuzz/dictionaries/functions.dict        |   1 +
 tests/fuzz/dictionaries/old.dict              |   1 +
 .../03237_array_auc_unscaled.reference        |  25 +++
 .../0_stateless/03237_array_auc_unscaled.sql  |  30 +++
 .../aspell-ignore/en/aspell-dict.txt          |   1 +
 10 files changed, 378 insertions(+)
 create mode 100644 src/Functions/array/arrayAUCUnscaled.cpp
 create mode 100644 tests/queries/0_stateless/03237_array_auc_unscaled.reference
 create mode 100644 tests/queries/0_stateless/03237_array_auc_unscaled.sql

diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md
index ad971ae7554..89178cf8c5c 100644
--- a/docs/en/sql-reference/functions/array-functions.md
+++ b/docs/en/sql-reference/functions/array-functions.md
@@ -2116,6 +2116,41 @@ Result:
 └───────────────────────────────────────────────┘
 ```
 
+## arrayAUCUnscaled
+
+Calculates the unscaled AUC (Area Under the Curve, which is a concept in machine learning, see more details: <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>), i.e. without dividing it by total true positives and total false positives.
+
+**Syntax**
+
+``` sql
+arrayAUCUnscaled(arr_scores, arr_labels)
+```
+
+**Arguments**
+
+- `arr_scores` — scores prediction model gives.
+- `arr_labels` — labels of samples, usually 1 for positive sample and 0 for negative sample.
+
+**Returned value**
+
+Returns unscaled AUC value with type Float64.
+
+**Example**
+
+Query:
+
+``` sql
+select arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);
+```
+
+Result:
+
+``` text
+┌─arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1])─┐
+│                                                   3.0 │
+└───────────────────────────────────────────────────────┘
+```
+
 ## arrayMap(func, arr1, ...)
 
 Returns an array obtained from the original arrays by application of `func(arr1[i], ..., arrN[i])` for each element. Arrays `arr1` ... `arrN` must have the same number of elements.
diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md
index 825e3f06be2..7923e9af945 100644
--- a/docs/ru/sql-reference/functions/array-functions.md
+++ b/docs/ru/sql-reference/functions/array-functions.md
@@ -1654,6 +1654,43 @@ SELECT arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);
 └────────────────────────────────────────---──┘
 ```
 
+## arrayAUCUnscaled {#arrayaucunscaled}
+
+Вычисляет площадь под кривой без нормализации.
+
+**Синтаксис**
+
+``` sql
+arrayAUCUnscaled(arr_scores, arr_labels)
+```
+
+**Аргументы**
+
+- `arr_scores` — оценка, которую дает модель предсказания.
+- `arr_labels` — ярлыки выборок, обычно 1 для содержательных выборок и 0 для бессодержательных выборок.
+
+**Возвращаемое значение**
+
+Значение площади под кривой без нормализации.
+
+Тип данных: `Float64`.
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);
+```
+
+Результат:
+
+``` text
+┌─arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1])─┐
+│                                                   3.0 │
+└───────────────────────────────────────────────────────┘
+```
+
 ## arrayProduct {#arrayproduct}
 
 Возвращает произведение элементов [массива](../../sql-reference/data-types/array.md).
diff --git a/docs/zh/sql-reference/functions/array-functions.md b/docs/zh/sql-reference/functions/array-functions.md
index 69db34e4a36..5ff3e6a424c 100644
--- a/docs/zh/sql-reference/functions/array-functions.md
+++ b/docs/zh/sql-reference/functions/array-functions.md
@@ -1221,6 +1221,41 @@ select arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);
 └───────────────────────────────────────────────┘
 ```
 
+## arrayAUCUnscaled {#arrayaucunscaled}
+
+计算没有归一化的AUC (ROC曲线下的面积，这是机器学习中的一个概念，更多细节请查看：https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)。
+
+**语法**
+
+``` sql
+arrayAUCUnscaled(arr_scores, arr_labels)
+```
+
+**参数**
+
+- `arr_scores` — 预测模型给出的分数。
+- `arr_labels` — 样本的标签，通常为 1 表示正样本，0 表示负样本。
+
+**返回值**
+
+返回 Float64 类型的非标准化 AUC 值。
+
+**示例**
+
+查询语句:
+
+``` sql
+select arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);
+```
+
+结果:
+
+``` text
+┌─arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1])─┐
+│                                                   3.0 │
+└───────────────────────────────────────────────────────┘
+```
+
 ## arrayMap(func, arr1, ...) {#array-map}
 
 将从 `func` 函数的原始应用中获得的数组返回给 `arr` 数组中的每个元素。
diff --git a/src/Functions/array/arrayAUCUnscaled.cpp b/src/Functions/array/arrayAUCUnscaled.cpp
new file mode 100644
index 00000000000..2cf0d072218
--- /dev/null
+++ b/src/Functions/array/arrayAUCUnscaled.cpp
@@ -0,0 +1,212 @@
+#include <DataTypes/DataTypesNumber.h>
+#include <DataTypes/DataTypeArray.h>
+#include <Columns/ColumnVector.h>
+#include <Columns/ColumnArray.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/FunctionFactory.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT; /// An argument is not an array, or its elements are neither native numbers nor enums.
+    extern const int ILLEGAL_COLUMN; /// An argument column is not a ColumnArray at execution time.
+    extern const int BAD_ARGUMENTS; /// The two array arguments have different sizes.
+}
+
+
+/** The function takes two arrays: scores and labels.
+  * Label can be one of two values: positive and negative.
+  * Score can be arbitrary number.
+  *
+  * These values are considered as the output of classifier. We have some true labels for objects.
+  * And classifier assigns some scores to objects that predict these labels in the following way:
+  * - we can define arbitrary threshold on score and predict that the label is positive if the score is greater than the threshold:
+  *
+  * f(object) = score
+  * predicted_label = score > threshold
+  *
+  * This way classifier may predict positive or negative value correctly - true positive or true negative
+  *   or have false positive or false negative result.
+  * Varying the threshold we can get different probabilities of false positive or false negatives or true positives, etc...
+  *
+  * We can also calculate the True Positive Rate and the False Positive Rate:
+  *
+  * TPR (also called "sensitivity", "recall" or "probability of detection")
+  *  is the probability of classifier to give positive result if the object has positive label:
+  * TPR = P(score > threshold | label = positive)
+  *
+  * FPR is the probability of classifier to give positive result if the object has negative label:
+  * FPR = P(score > threshold | label = negative)
+  *
+  * We can draw a curve of values of FPR and TPR with different threshold on [0..1] x [0..1] unit square.
+  * This curve is named "ROC curve" (Receiver Operating Characteristic).
+  *
+  * For ROC we can calculate, literally, Area Under the Curve. Normalized, it lies in the range of [0..1]; this function returns it unnormalized (in raw counts).
+  * The higher the AUC the better the classifier.
+  *
+  * AUC is also equal to the probability that the score for positive label is greater than the score for negative label.
+  *
+  * https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc
+  * https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
+  *
+  * To calculate AUC, we will draw points of (FPR, TPR) for different thresholds = score_i.
+  * FPR_raw = countIf(score > score_i, label = negative) = count negative labels above certain score
+  * TPR_raw = countIf(score > score_i, label = positive) = count positive labels above certain score
+  *
+  * Let's look at the example:
+  * arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);
+  *
+  * 1. We have pairs: (-, 0.1), (-, 0.4), (+, 0.35), (+, 0.8)
+  *
+  * 2. Let's sort by score: (-, 0.1), (+, 0.35), (-, 0.4), (+, 0.8)
+  *
+  * 3. Let's draw the points:
+  *
+  * threshold = 0,       TPR_raw = 2,    FPR_raw = 2
+  * threshold = 0.1,     TPR_raw = 2,    FPR_raw = 1
+  * threshold = 0.35,    TPR_raw = 1,    FPR_raw = 1
+  * threshold = 0.4,     TPR_raw = 1,    FPR_raw = 0
+  * threshold = 0.8,     TPR_raw = 0,    FPR_raw = 0
+  *
+  * The "curve" will be represented by a line that moves one step either towards right or top on each threshold change.
+  */
+
+class FunctionArrayAUCUnscaled : public IFunction
+{
+public:
+    static constexpr auto name = "arrayAUCUnscaled";
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionArrayAUCUnscaled>(); }
+
+private:
+    /// Unnormalized area under the ROC curve for the elements [current_offset, next_offset) of the flattened columns.
+    static Float64 apply(
+        const IColumn & scores,
+        const IColumn & labels,
+        ColumnArray::Offset current_offset,
+        ColumnArray::Offset next_offset)
+    {
+        struct ScoreLabel
+        {
+            Float64 score;
+            bool label;
+        };
+
+        size_t size = next_offset - current_offset;
+        if (size == 0)
+            return 0.0; /// Empty array: there is no curve, and sorted_labels[0] below must not be read.
+
+        PODArrayWithStackMemory<ScoreLabel, 1024> sorted_labels(size);
+        for (size_t i = 0; i < size; ++i)
+        {
+            sorted_labels[i].score = scores.getFloat64(current_offset + i);
+            sorted_labels[i].label = labels.getFloat64(current_offset + i) > 0; /// Any value above zero counts as a positive label.
+        }
+
+        /// Sorting scores in descending order to traverse the ROC curve from left to right
+        std::sort(sorted_labels.begin(), sorted_labels.end(), [](const auto & lhs, const auto & rhs) { return lhs.score > rhs.score; });
+
+        Float64 area = 0.0;
+        Float64 prev_score = sorted_labels[0].score;
+        size_t prev_fp = 0, prev_tp = 0, curr_fp = 0, curr_tp = 0;
+        for (size_t i = 0; i < size; ++i)
+        {
+            /// Only increment the area when the score changes, so that elements with equal scores are handled as one group.
+            if (sorted_labels[i].score != prev_score)
+            {
+                area += (curr_fp - prev_fp) * (curr_tp + prev_tp) / 2.0; /// Trapezoidal area under curve (might degenerate to zero or to a rectangle)
+                prev_fp = curr_fp;
+                prev_tp = curr_tp;
+                prev_score = sorted_labels[i].score;
+            }
+
+            if (sorted_labels[i].label)
+                curr_tp += 1; /// The curve moves one step up.
+            else
+                curr_fp += 1; /// The curve moves one step right.
+        }
+
+        /// Add the last group of equal scores, which the loop above never flushes.
+        return area + (curr_fp - prev_fp) * (curr_tp + prev_tp) / 2.0;
+    }
+
+    /// Calls apply() for each of the input_rows_count arrays; offsets[i] is used as the end position of the i-th array.
+    static void vector(
+        const IColumn & scores,
+        const IColumn & labels,
+        const ColumnArray::Offsets & offsets,
+        PaddedPODArray<Float64> & result,
+        size_t input_rows_count)
+    {
+        result.resize(input_rows_count);
+
+        ColumnArray::Offset current_offset = 0;
+        for (size_t i = 0; i < input_rows_count; ++i)
+        {
+            result[i] = apply(scores, labels, current_offset, offsets[i]);
+            current_offset = offsets[i];
+        }
+    }
+
+public:
+    String getName() const override { return name; }
+    size_t getNumberOfArguments() const override { return 2; }
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const override { return false; }
+
+    /// Both arguments must be arrays of native numbers or enums; the result type is always Float64.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        for (size_t i = 0; i < getNumberOfArguments(); ++i)
+        {
+            const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[i].get());
+            if (!array_type)
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "All arguments for function {} must be an array.", getName());
+
+            const auto & nested_type = array_type->getNestedType();
+            if (!isNativeNumber(nested_type) && !isEnum(nested_type))
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} cannot process values of type {}", getName(), nested_type->getName());
+        }
+
+        return std::make_shared<DataTypeFloat64>();
+    }
+
+    /// Materializes constant arguments, checks that both are array columns with equal offsets, and computes one Float64 per row.
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
+    {
+        ColumnPtr col1 = arguments[0].column->convertToFullColumnIfConst();
+        ColumnPtr col2 = arguments[1].column->convertToFullColumnIfConst();
+
+        const ColumnArray * col_array1 = checkAndGetColumn<ColumnArray>(col1.get());
+        if (!col_array1)
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                "Illegal column {} of first argument of function {}", arguments[0].column->getName(), getName());
+
+        const ColumnArray * col_array2 = checkAndGetColumn<ColumnArray>(col2.get());
+        if (!col_array2)
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                "Illegal column {} of second argument of function {}", arguments[1].column->getName(), getName());
+
+        if (!col_array1->hasEqualOffsets(*col_array2))
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Array arguments for function {} must have equal sizes", getName());
+
+        auto col_res = ColumnVector<Float64>::create();
+        vector(
+            col_array1->getData(),
+            col_array2->getData(),
+            col_array1->getOffsets(),
+            col_res->getData(),
+            input_rows_count);
+
+        return col_res;
+    }
+};
+
+
+REGISTER_FUNCTION(ArrayAUCUnscaled)
+{
+    factory.registerFunction<FunctionArrayAUCUnscaled>(); /// Registers the class with `factory` (supplied by the macro; presumably the FunctionFactory - TODO confirm).
+}
+
+}
diff --git a/tests/fuzz/all.dict b/tests/fuzz/all.dict
index 30af3746fca..6cb198a3e48 100644
--- a/tests/fuzz/all.dict
+++ b/tests/fuzz/all.dict
@@ -1216,6 +1216,7 @@
 "argMinState"
 "array"
 "arrayAUC"
+"arrayAUCUnscaled"
 "arrayAll"
 "arrayAvg"
 "arrayCompact"
diff --git a/tests/fuzz/dictionaries/functions.dict b/tests/fuzz/dictionaries/functions.dict
index e562595fb67..302aab97c2d 100644
--- a/tests/fuzz/dictionaries/functions.dict
+++ b/tests/fuzz/dictionaries/functions.dict
@@ -529,6 +529,7 @@
 "argMinState"
 "array"
 "arrayAUC"
+"arrayAUCUnscaled"
 "arrayAll"
 "arrayAvg"
 "arrayCompact"
diff --git a/tests/fuzz/dictionaries/old.dict b/tests/fuzz/dictionaries/old.dict
index 61914c3b283..6ecb5503ca4 100644
--- a/tests/fuzz/dictionaries/old.dict
+++ b/tests/fuzz/dictionaries/old.dict
@@ -19,6 +19,7 @@
 "Array"
 "arrayAll"
 "arrayAUC"
+"arrayAUCUnscaled"
 "arrayCompact"
 "arrayConcat"
 "arrayCount"
diff --git a/tests/queries/0_stateless/03237_array_auc_unscaled.reference b/tests/queries/0_stateless/03237_array_auc_unscaled.reference
new file mode 100644
index 00000000000..63204682fd4
--- /dev/null
+++ b/tests/queries/0_stateless/03237_array_auc_unscaled.reference
@@ -0,0 +1,25 @@
+3
+3
+3
+3
+3
+3
+3
+3
+3
+1
+1
+1
+1
+1
+1
+1
+0
+0
+0
+0.5
+1
+0
+1.5
+2
+1.5
diff --git a/tests/queries/0_stateless/03237_array_auc_unscaled.sql b/tests/queries/0_stateless/03237_array_auc_unscaled.sql
new file mode 100644
index 00000000000..d4f07c42118
--- /dev/null
+++ b/tests/queries/0_stateless/03237_array_auc_unscaled.sql
@@ -0,0 +1,30 @@
+SELECT arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);
+SELECT arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], CAST([0, 0, 1, 1] AS Array(Int8)));
+SELECT arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], CAST([-1, -1, 1, 1] AS Array(Int8)));
+SELECT arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], CAST(['false', 'false', 'true', 'true'] AS Array(Enum8('false' = 0, 'true' = 1))));
+SELECT arrayAUCUnscaled([0.1, 0.4, 0.35, 0.8], CAST(['false', 'false', 'true', 'true'] AS Array(Enum8('false' = -1, 'true' = 1))));
+SELECT arrayAUCUnscaled(CAST([10, 40, 35, 80] AS Array(UInt8)), [0, 0, 1, 1]);
+SELECT arrayAUCUnscaled(CAST([10, 40, 35, 80] AS Array(UInt16)), [0, 0, 1, 1]);
+SELECT arrayAUCUnscaled(CAST([10, 40, 35, 80] AS Array(UInt32)), [0, 0, 1, 1]);
+SELECT arrayAUCUnscaled(CAST([10, 40, 35, 80] AS Array(UInt64)), [0, 0, 1, 1]);
+SELECT arrayAUCUnscaled(CAST([-10, -40, -35, -80] AS Array(Int8)), [0, 0, 1, 1]);
+SELECT arrayAUCUnscaled(CAST([-10, -40, -35, -80] AS Array(Int16)), [0, 0, 1, 1]);
+SELECT arrayAUCUnscaled(CAST([-10, -40, -35, -80] AS Array(Int32)), [0, 0, 1, 1]);
+SELECT arrayAUCUnscaled(CAST([-10, -40, -35, -80] AS Array(Int64)), [0, 0, 1, 1]);
+SELECT arrayAUCUnscaled(CAST([-0.1, -0.4, -0.35, -0.8] AS Array(Float32)), [0, 0, 1, 1]);
+SELECT arrayAUCUnscaled([0, 3, 5, 6, 7.5, 8], [1, 0, 1, 0, 0, 0]);
+SELECT arrayAUCUnscaled([0.1, 0.35, 0.4, 0.8], [1, 0, 1, 0]);
+SELECT arrayAUCUnscaled([], []); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+SELECT arrayAUCUnscaled([1], [1]);
+SELECT arrayAUCUnscaled([1], []); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+SELECT arrayAUCUnscaled([], [1]); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+SELECT arrayAUCUnscaled([1, 2], [3]); -- { serverError BAD_ARGUMENTS }
+SELECT arrayAUCUnscaled([1], [2, 3]); -- { serverError BAD_ARGUMENTS }
+SELECT arrayAUCUnscaled([1, 1], [1, 1]);
+SELECT arrayAUCUnscaled([1, 1], [0, 0]);
+SELECT arrayAUCUnscaled([1, 1], [0, 1]);
+SELECT arrayAUCUnscaled([0, 1], [0, 1]);
+SELECT arrayAUCUnscaled([1, 0], [0, 1]);
+SELECT arrayAUCUnscaled([0, 0, 1], [0, 1, 1]);
+SELECT arrayAUCUnscaled([0, 1, 1], [0, 1, 1]);
+SELECT arrayAUCUnscaled([0, 1, 1], [0, 0, 1]);
diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt
index 3467f21c812..f658b19e8a7 100644
--- a/utils/check-style/aspell-ignore/en/aspell-dict.txt
+++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt
@@ -1153,6 +1153,7 @@ argMin
 argmax
 argmin
 arrayAUC
+arrayAUCUnscaled
 arrayAll
 arrayAvg
 arrayCompact