Merge pull request #72823 from ucasfl/norm-gini

Add function `arrayNormalizedGini`
This commit is contained in:
Robert Schulze 2024-12-17 19:02:02 +00:00 committed by GitHub
commit 994e606153
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 490 additions and 0 deletions

View File

@ -3222,6 +3222,41 @@ Result:
└─────────┘
```
## arrayNormalizedGini
Calculates the normalized Gini coefficient.
**Syntax**
```sql
arrayNormalizedGini(predicted, label)
```
**Arguments**
- `predicted` — Predicted values ([Array(T)](../data-types/array.md))
- `label` — Actual values ([Array(T)](../data-types/array.md))
**Returned Value**
- A tuple (Tuple(Float64, Float64, Float64)) containing the Gini coefficient of the predicted values, the Gini coefficient of the label (actual) values, and the normalized Gini coefficient (= the ratio of the former two Gini coefficients).
**Examples**
Query:
```sql
SELECT arrayNormalizedGini([0.9, 0.3, 0.8, 0.7], [6, 1, 0, 2]);
```
Result:
```
┌─arrayNormalizedGini([0.9, 0.3, 0.8, 0.7], [6, 1, 0, 2])─────┐
│ (0.18055555555555558,0.2638888888888889,0.6842105263157896) │
└─────────────────────────────────────────────────────────────┘
```
## Distance functions
All supported functions are described in [distance functions documentation](../../sql-reference/functions/distance-functions.md).

View File

@ -0,0 +1,415 @@
#include <Columns/ColumnArray.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Functions/castTypeToEither.h>
#include <Interpreters/Context_fwd.h>
#include <numeric>
#include <pdqsort.h>
namespace DB
{
/// Upper bound on the number of elements of a single input array.
/// The kernels in `Impl` throw TOO_LARGE_ARRAY_SIZE when an array is larger than this.
static constexpr size_t MAX_ARRAY_SIZE = 1 << 20;
/// Error codes thrown by the code in this file; they are only declared here.
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int TOO_LARGE_ARRAY_SIZE;
}
struct Impl
{
template <typename T1, typename T2>
static void vectorConst(
const PaddedPODArray<T1> & array_predicted_data,
const ColumnArray::Offsets & array_predicted_offsets,
const PaddedPODArray<T2> & array_labels_const,
PaddedPODArray<Float64> & col_gini_predicted,
PaddedPODArray<Float64> & col_gini_labels,
PaddedPODArray<Float64> & col_gini_normalized)
{
size_t size = col_gini_predicted.size();
size_t array_size = array_labels_const.size();
if (array_size > MAX_ARRAY_SIZE)
throw Exception(
ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size in function arrayNormalizedGini: {}, maximum: {}",
array_size,
MAX_ARRAY_SIZE);
for (size_t i = 0; i < size; ++i)
{
size_t array1_size = array_predicted_offsets[i] - array_predicted_offsets[i - 1];
if (array1_size != array_size)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "All arrays in function arrayNormalizedGini should have same size");
// Why we need to create a new array here every loop, because array2 will be sorted in calculateNormalizedGini.
PODArrayWithStackMemory<T2, 1024> array2(array_labels_const.begin(), array_labels_const.end());
auto [gini_predicted, gini_labels, gini_normalized] = calculateNormalizedGini(array_predicted_data, array_predicted_offsets[i - 1], array2, array_size);
col_gini_predicted[i] = gini_predicted;
col_gini_labels[i] = gini_labels;
col_gini_normalized[i] = gini_normalized;
}
}
template <typename T1, typename T2>
static void vectorVector(
const PaddedPODArray<T1> & array_predicted_data,
const ColumnArray::Offsets & array_predicted_offsets,
const PaddedPODArray<T2> & array_labels_data,
const ColumnArray::Offsets & array_labels_offsets,
PaddedPODArray<Float64> & col_gini_predicted,
PaddedPODArray<Float64> & col_gini_labels,
PaddedPODArray<Float64> & col_gini_normalized)
{
size_t size = col_gini_predicted.size();
size_t array_size = size > 0 ? array_predicted_offsets[0] - array_predicted_offsets[-1] : 0;
if (array_size > MAX_ARRAY_SIZE)
throw Exception(
ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size in arrayNormalizedGini: {}, maximum: {}", array_size, MAX_ARRAY_SIZE);
for (size_t i = 0; i < size; ++i)
{
size_t array1_size = array_predicted_offsets[i] - array_predicted_offsets[i - 1];
size_t array2_size = array_labels_offsets[i] - array_labels_offsets[i - 1];
if (array1_size != array_size || array2_size != array_size)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "All arrays in function arrayNormalizedGini should have same size");
PODArrayWithStackMemory<T2, 1024> array2(array_labels_data.data() + array_labels_offsets[i - 1], array_labels_data.data() + array_labels_offsets[i]);
auto [gini_predicted, gini_labels, gini_normalized] = calculateNormalizedGini(array_predicted_data, array_predicted_offsets[i - 1], array2, array_size);
col_gini_predicted[i] = gini_predicted;
col_gini_labels[i] = gini_labels;
col_gini_normalized[i] = gini_normalized;
}
}
template <typename T1, typename T2>
static void constVector(
const PaddedPODArray<T1> & array_predicted_const,
const PaddedPODArray<T2> & array_labels_data,
const ColumnArray::Offsets & array_labels_offsets,
PaddedPODArray<Float64> & col_gini_predicted,
PaddedPODArray<Float64> & col_gini_labels,
PaddedPODArray<Float64> & col_gini_normalized)
{
size_t size = col_gini_predicted.size();
size_t array_size = array_predicted_const.size();
if (array_size > MAX_ARRAY_SIZE)
throw Exception(
ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size in arrayNormalizedGini: {}, maximum: {}", array_size, MAX_ARRAY_SIZE);
for (size_t i = 0; i < size; ++i)
{
size_t array1_size = array_labels_offsets[i] - array_labels_offsets[i - 1];
if (array1_size != array_size)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "All arrays in function arrayNormalizedGini should have same size");
PODArrayWithStackMemory<T2, 1024> array2(array_labels_data.data() + array_labels_offsets[i - 1], array_labels_data.data() + array_labels_offsets[i]);
auto [gini_predicted, gini_labels, gini_normalized] = calculateNormalizedGini(array_predicted_const, 0, array2, array_size);
col_gini_predicted[i] = gini_predicted;
col_gini_labels[i] = gini_labels;
col_gini_normalized[i] = gini_normalized;
}
}
private:
template <typename T1, typename T2>
static std::tuple<Float64, Float64, Float64> calculateNormalizedGini(
const PaddedPODArray<T1> & array1, size_t offset,
PODArrayWithStackMemory<T2, 1024> & array2, size_t array_size)
{
auto sort_idx = sortIndexes(array1, offset, array_size);
PODArrayWithStackMemory<T2, 1024> sorted_array2(array_size);
for (size_t i = 0; i < array_size; ++i)
sorted_array2[i] = array2[sort_idx[i]];
Float64 total_sum = std::accumulate(array2.begin(), array2.end(), 0.0);
PODArrayWithStackMemory<Float64, 1024> pred_cumsum_ratio(array_size);
Float64 pred_cumsum = 0;
for (size_t i = 0; i < array_size; ++i)
{
pred_cumsum += sorted_array2[i] / total_sum;
pred_cumsum_ratio[i] = pred_cumsum;
}
pdqsort(array2.begin(), array2.end());
PODArrayWithStackMemory<Float64, 1024> ltv_cumsum_ratio(array_size);
Float64 ltv_cumsum = 0;
for (size_t i = 0; i < array_size; ++i)
{
ltv_cumsum += array2[i] / total_sum;
ltv_cumsum_ratio[i] = ltv_cumsum;
}
Float64 random_gain_cumsum_ratio = 0.5 * (array_size + 1);
Float64 accumulate_pred_ratio = std::accumulate(pred_cumsum_ratio.begin(), pred_cumsum_ratio.end(), 0.0);
Float64 accumulate_ltv_ratio = std::accumulate(ltv_cumsum_ratio.begin(), ltv_cumsum_ratio.end(), 0.0);
Float64 pred_gini = (random_gain_cumsum_ratio - accumulate_pred_ratio) / array_size;
Float64 gini_labels = (random_gain_cumsum_ratio - accumulate_ltv_ratio) / array_size;
return std::make_tuple(pred_gini, gini_labels, pred_gini / gini_labels);
}
template <typename T>
static PODArrayWithStackMemory<size_t, 1024> sortIndexes(const PaddedPODArray<T> & array, size_t offset, size_t array_size)
{
PODArrayWithStackMemory<size_t, 1024> idx(array_size);
std::iota(idx.begin(), idx.end(), 0);
pdqsort(idx.begin(), idx.end(), [&array, offset](size_t i1, size_t i2) { return array[i1 + offset] < array[i2 + offset]; });
return idx;
}
};
/**
  * Calculate the normalized Gini coefficient. See https://arxiv.org/pdf/1912.07753
  *
  * arrayNormalizedGini(predicted, label) accepts two arrays of numbers (Int8..Int64, UInt8..UInt64,
  * Float32, Float64 are dispatched in executeImpl) and returns Tuple(Float64, Float64, Float64).
  * The actual computation is done by the kernels in `Impl`.
  */
class FunctionArrayNormalizedGini : public IFunction
{
public:
    static constexpr auto name = "arrayNormalizedGini";

    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionArrayNormalizedGini>(); }

    String getName() const override { return name; }
    size_t getNumberOfArguments() const override { return 2; }
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
    bool useDefaultImplementationForConstants() const override { return true; }

    /// Validates that both arguments are arrays of numbers and returns Tuple(Float64, Float64, Float64).
    /// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH or ILLEGAL_TYPE_OF_ARGUMENT otherwise.
    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
        if (arguments.size() != 2)
            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                "Number of arguments for function {} doesn't match: passed {}, should be 2",
                getName(), arguments.size());

        const DataTypeArray * arg1_type = checkAndGetDataType<DataTypeArray>(arguments[0].get());
        if (arg1_type == nullptr || !isNumber(arg1_type->getNestedType()))
            throw Exception(
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                "First argument for function {} must be an Array of numeric type, got {}",
                getName(),
                arguments[0]->getName());

        const DataTypeArray * arg2_type = checkAndGetDataType<DataTypeArray>(arguments[1].get());
        if (arg2_type == nullptr || !isNumber(arg2_type->getNestedType()))
            throw Exception(
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                "Second argument for function {} must be an Array of numeric type, got {}",
                getName(),
                arguments[1]->getName());

        return std::make_shared<DataTypeTuple>(
            DataTypes{std::make_shared<DataTypeFloat64>(), std::make_shared<DataTypeFloat64>(), std::make_shared<DataTypeFloat64>()});
    }

    /// Dispatches on (full/const) x (full/const) argument columns and on the nested numeric types,
    /// then runs the matching kernel of `Impl`. Throws ILLEGAL_COLUMN if no combination matches.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
        const auto & col_predicted = arguments[0].column;
        const auto & col_labels = arguments[1].column;

        /// The three tuple elements; replaced by the filled columns once a kernel has run.
        Columns result(3);
        for (size_t i = 0; i < 3; ++i)
            result[i] = DataTypeFloat64().createColumn();

        if (const ColumnArray * array_predicted = checkAndGetColumn<ColumnArray>(col_predicted.get()))
        {
            const auto & array_predicted_offsets = array_predicted->getOffsets();
            const auto & array_predicted_type = typeid_cast<const DataTypeArray *>(arguments[0].type.get())->getNestedType();

            if (const ColumnConst * array_labels_const = checkAndGetColumn<ColumnConst>(col_labels.get()))
            {
                /// predicted: full column, label: constant array.
                const ColumnArray * column_array_const_internal_array = checkAndGetColumn<ColumnArray>(array_labels_const->getDataColumnPtr().get());
                if (!column_array_const_internal_array)
                    throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column of argument of function {}", getName());

                const auto & array_labels_type = typeid_cast<const DataTypeArray *>(arguments[1].type.get())->getNestedType();
                if (castBothTypes(
                        array_predicted_type.get(),
                        array_labels_type.get(),
                        [&](const auto & type_predicted, const auto & type_labels)
                        {
                            using TypePredicted = typename std::decay_t<decltype(type_predicted)>::FieldType;
                            const ColumnVector<TypePredicted> * array_predicted_data = checkAndGetColumn<ColumnVector<TypePredicted>>(array_predicted->getDataPtr().get());
                            using TypeLabels = typename std::decay_t<decltype(type_labels)>::FieldType;
                            const ColumnVector<TypeLabels> * col_labels_data = checkAndGetColumn<ColumnVector<TypeLabels>>(column_array_const_internal_array->getDataPtr().get());

                            auto col_gini_predicted = ColumnFloat64::create(input_rows_count);
                            auto col_gini_labels = ColumnFloat64::create(input_rows_count);
                            auto col_gini_normalized = ColumnFloat64::create(input_rows_count);
                            Impl::vectorConst(
                                array_predicted_data->getData(),
                                array_predicted_offsets,
                                col_labels_data->getData(),
                                col_gini_predicted->getData(),
                                col_gini_labels->getData(),
                                col_gini_normalized->getData());
                            result[0] = std::move(col_gini_predicted);
                            result[1] = std::move(col_gini_labels);
                            result[2] = std::move(col_gini_normalized);
                            return true;
                        }))
                {
                    return ColumnTuple::create(result);
                }
            }
            else
            {
                /// predicted: full column, label: full column.
                const ColumnArray * array_labels = checkAndGetColumn<ColumnArray>(col_labels.get());
                if (!array_labels)
                    throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column of argument of function {}", getName());

                const auto & array_label_offsets = array_labels->getOffsets();
                const auto & array_labels_type = typeid_cast<const DataTypeArray *>(arguments[1].type.get())->getNestedType();
                if (castBothTypes(
                        array_predicted_type.get(),
                        array_labels_type.get(),
                        [&](const auto & type_predicted, const auto & type_labels)
                        {
                            using TypePredicted = typename std::decay_t<decltype(type_predicted)>::FieldType;
                            const ColumnVector<TypePredicted> * array_predicted_data = checkAndGetColumn<ColumnVector<TypePredicted>>(array_predicted->getDataPtr().get());
                            using TypeLabels = typename std::decay_t<decltype(type_labels)>::FieldType;
                            const ColumnVector<TypeLabels> * col_labels_data = checkAndGetColumn<ColumnVector<TypeLabels>>(array_labels->getDataPtr().get());

                            auto col_gini_predicted = ColumnFloat64::create(input_rows_count);
                            auto col_gini_labels = ColumnFloat64::create(input_rows_count);
                            auto col_gini_normalized = ColumnFloat64::create(input_rows_count);
                            Impl::vectorVector(
                                array_predicted_data->getData(),
                                array_predicted_offsets,
                                col_labels_data->getData(),
                                array_label_offsets,
                                col_gini_predicted->getData(),
                                col_gini_labels->getData(),
                                col_gini_normalized->getData());
                            result[0] = std::move(col_gini_predicted);
                            result[1] = std::move(col_gini_labels);
                            result[2] = std::move(col_gini_normalized);
                            return true;
                        }))
                {
                    return ColumnTuple::create(result);
                }
            }
        }
        else if (const ColumnConst * array_predicted_const = checkAndGetColumn<ColumnConst>(col_predicted.get()))
        {
            /// predicted: constant array, label: full column.
            /// Note that const-const case is handled by useDefaultImplementationForConstants.
            const ColumnArray * column_array_const = checkAndGetColumn<ColumnArray>(array_predicted_const->getDataColumnPtr().get());
            const ColumnArray * array_labels = checkAndGetColumn<ColumnArray>(col_labels.get());
            if (!column_array_const || !array_labels)
                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column of argument of function {}", getName());

            const auto & array_predicted_type = typeid_cast<const DataTypeArray *>(arguments[0].type.get())->getNestedType();
            const auto & array_label_offsets = array_labels->getOffsets();
            const auto & array_labels_type = typeid_cast<const DataTypeArray *>(arguments[1].type.get())->getNestedType();
            if (castBothTypes(
                    array_predicted_type.get(),
                    array_labels_type.get(),
                    [&](const auto & type_predicted, const auto & type_labels)
                    {
                        using TypePredicted = typename std::decay_t<decltype(type_predicted)>::FieldType;
                        const ColumnVector<TypePredicted> * array_predicted_data = checkAndGetColumn<ColumnVector<TypePredicted>>(column_array_const->getDataPtr().get());
                        using TypeLabels = typename std::decay_t<decltype(type_labels)>::FieldType;
                        const ColumnVector<TypeLabels> * col_labels_data = checkAndGetColumn<ColumnVector<TypeLabels>>(array_labels->getDataPtr().get());

                        auto col_gini_predicted = ColumnFloat64::create(input_rows_count);
                        auto col_gini_labels = ColumnFloat64::create(input_rows_count);
                        auto col_gini_normalized = ColumnFloat64::create(input_rows_count);
                        Impl::constVector(
                            array_predicted_data->getData(),
                            col_labels_data->getData(),
                            array_label_offsets,
                            col_gini_predicted->getData(),
                            col_gini_labels->getData(),
                            col_gini_normalized->getData());
                        result[0] = std::move(col_gini_predicted);
                        result[1] = std::move(col_gini_labels);
                        result[2] = std::move(col_gini_normalized);
                        return true;
                    }))
            {
                return ColumnTuple::create(result);
            }
        }

        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column of argument of function {}", getName());
    }

private:
    /// Calls `f` with the concrete numeric data type behind `type`; returns false if `type` is none of the listed types.
    template <typename F>
    static bool castType(const IDataType * type, F && f)
    {
        return castTypeToEither<
            DataTypeInt8,
            DataTypeInt16,
            DataTypeInt32,
            DataTypeInt64,
            DataTypeUInt8,
            DataTypeUInt16,
            DataTypeUInt32,
            DataTypeUInt64,
            DataTypeFloat32,
            DataTypeFloat64>(type, std::forward<F>(f));
    }

    /// Double dispatch: resolves the concrete types of `left` and `right` and calls `f(left, right)`.
    template <typename F>
    static bool castBothTypes(const IDataType * left, const IDataType * right, F && f)
    {
        return castType(left, [&](const auto & left_)
        {
            return castType(right, [&](const auto & right_)
            {
                return f(left_, right_);
            });
        });
    }
};
/// Registers arrayNormalizedGini (case-sensitive name) together with its built-in documentation.
REGISTER_FUNCTION(NormalizedGini)
{
    FunctionDocumentation::Description doc_description = "Calculates the normalized Gini coefficient.";
    FunctionDocumentation::Syntax doc_syntax = "arrayNormalizedGini(predicted, label)";
    FunctionDocumentation::Arguments doc_arguments = {{"predicted", "Predicted value (Array(T))."}, {"label", "Actual value (Array(T))."}};
    /// The tuple order matches what Impl::calculateNormalizedGini returns: predicted, labels, ratio.
    FunctionDocumentation::ReturnedValue doc_returned_value = "A tuple containing the Gini coefficient of the predicted values, the Gini coefficient of the label (actual) values, and the normalized Gini coefficient (= the ratio of the former two Gini coefficients).";
    FunctionDocumentation::Examples doc_examples
        = {{"Example",
            "SELECT arrayNormalizedGini([0.9, 0.3, 0.8, 0.7],[6, 1, 0, 2]);",
            "(0.18055555555555558,0.2638888888888889,0.6842105263157896)"}};
    FunctionDocumentation::Categories doc_categories = {"Array"};

    factory.registerFunction<FunctionArrayNormalizedGini>(
        {doc_description, doc_syntax, doc_arguments, doc_returned_value, doc_examples, doc_categories}, FunctionFactory::Case::Sensitive);
}
}

View File

@ -0,0 +1,14 @@
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18888888888888894,0.3,0.6296296296296299)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)
(0.18055555555555558,0.2638888888888889,0.6842105263157896)

View File

@ -0,0 +1,24 @@
-- Both arguments are constant arrays of equal length.
SELECT arrayNormalizedGini([0.9, 0.3, 0.8, 0.7], [6, 1, 0, 2]);
-- Arrays of different lengths must be rejected.
SELECT arrayNormalizedGini([0.9, 0.3, 0.8, 0.7], [6, 1, 0, 2, 1]); -- { serverError ILLEGAL_COLUMN }
-- Longer input with 0/1 labels.
SELECT arrayNormalizedGini([0.9, 0.3, 0.8, 0.75, 0.65, 0.6, 0.78, 0.7, 0.05, 0.4, 0.4, 0.05, 0.5, 0.1, 0.1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
-- Arrays above the size limit (1 << 20 elements) must be rejected.
SELECT arrayNormalizedGini(arrayResize([1], 2000000), arrayResize([1], 2000000)); -- { serverError TOO_LARGE_ARRAY_SIZE }
-- Table with four identical rows to exercise the non-constant argument combinations.
DROP TABLE IF EXISTS t;
CREATE TABLE t
(
`a1` Array(Float32),
`a2` Array(UInt32)
)
ENGINE = MergeTree
ORDER BY tuple();
INSERT INTO t VALUES ([0.9, 0.3, 0.8, 0.7], [6, 1, 0, 2]), ([0.9, 0.3, 0.8, 0.7], [6, 1, 0, 2]), ([0.9, 0.3, 0.8, 0.7], [6, 1, 0, 2]), ([0.9, 0.3, 0.8, 0.7], [6, 1, 0, 2]);
-- Column predictions, column labels.
SELECT arrayNormalizedGini(a1, a2) FROM t;
-- Column predictions, constant labels.
SELECT arrayNormalizedGini(a1, [6, 1, 0, 2]) FROM t;
-- Constant predictions, column labels.
SELECT arrayNormalizedGini([0.9, 0.3, 0.8, 0.7], a2) FROM t;
DROP TABLE t;

View File

@ -50,6 +50,7 @@ Autocompletion
AvroConfluent
AzureQueue
Azurite
arrayNormalizedGini
BFloat
BIGINT
BIGSERIAL
@ -350,6 +351,7 @@ GetResolution
GetUnidirectionalEdge
GetUnidirectionalEdgeBoundary
GetUnidirectionalEdgesFromHexagon
Gini
GitLab
GlobalThread
GlobalThreadActive