Added 'anyHeavy' aggregate function [#METR-21691].

This commit is contained in:
Alexey Milovidov 2016-06-26 15:48:04 +03:00
parent 6ba6d01491
commit b408afa74f
2 changed files with 90 additions and 0 deletions

View File

@ -133,6 +133,16 @@ struct SingleValueDataFixed
else
return false;
}
/// Whether this state holds a value and that value compares equal to the value of 'to'.
bool isEqualTo(const Self & to) const
{
    /// A state without a value is not equal to anything.
    if (!has())
        return false;

    return to.value == value;
}
bool isEqualTo(const IColumn & column, size_t row_num) const
{
return has() && static_cast<const ColumnVector<T> &>(column).getData()[row_num] == value;
}
};
@ -334,6 +344,16 @@ struct __attribute__((__packed__, __aligned__(1))) SingleValueDataString
else
return false;
}
/// Whether this state holds a string and it compares equal (as StringRef) to the string of 'to'.
bool isEqualTo(const Self & to) const
{
    /// A state without a value is not equal to anything.
    if (!has())
        return false;

    return to.getStringRef() == getStringRef();
}
bool isEqualTo(const IColumn & column, size_t row_num) const
{
return has() && static_cast<const ColumnString &>(column).getDataAtWithTerminatingZero(row_num) == getStringRef();
}
};
static_assert(
@ -476,6 +496,16 @@ struct SingleValueDataGeneric
else
return false;
}
/// Whether this state holds a value and it compares equal to the element at 'row_num' of 'column'.
bool isEqualTo(const IColumn & column, size_t row_num) const
{
    /// A state without a value is not equal to anything.
    if (!has())
        return false;

    return value == column[row_num];
}
/// Whether this state holds a value and that value compares equal to the value of 'to'.
bool isEqualTo(const Self & to) const
{
    /// A state without a value is not equal to anything.
    if (!has())
        return false;

    return to.value == value;
}
};
@ -529,6 +559,60 @@ struct AggregateFunctionAnyLastData : Data
};
/** Implements the 'heavy hitters' (majority vote) algorithm.
  * Selects the most frequent value if its frequency is more than 50%.
  * Otherwise, selects some arbitrary value.
  * http://www.cs.umd.edu/~samir/498/karp.pdf
  *
  * 'counter' is the weight of the stored value: the number of votes for it
  *  that have not been cancelled by votes for other values.
  * A freshly adopted value has weight 1; an empty state has weight 0.
  *
  * NOTE(review): 'counter' is declared here, not in Data. If states are serialized
  *  through methods of Data, confirm that the counter is transferred as well.
  */
template <typename Data>
struct AggregateFunctionAnyHeavyData : Data
{
    size_t counter = 0;

    using Self = AggregateFunctionAnyHeavyData<Data>;

    /// Accounts for one more row. Returns true if the stored value was replaced.
    bool changeIfBetter(const IColumn & column, size_t row_num)
    {
        if (this->isEqualTo(column, row_num))
        {
            ++counter;
            return false;
        }

        if (counter == 0)
        {
            /// No candidate yet, or it was completely outvoted:
            ///  adopt the new value, and count its own vote.
            this->change(column, row_num);
            counter = 1;
            return true;
        }

        /// One vote against the current candidate.
        --counter;
        return false;
    }

    /// Merges the state 'to' into this one. Returns true if the stored value was replaced.
    bool changeIfBetter(const Self & to)
    {
        /// Nothing to merge from an empty state.
        if (!to.has())
            return false;

        if (this->isEqualTo(to))
        {
            counter += to.counter;
            return false;
        }

        const bool is_empty = !this->has();
        const size_t own_weight = is_empty ? 0 : counter;

        /// An empty state must adopt any value, even one whose weight has dropped to zero.
        if (is_empty || own_weight < to.counter)
        {
            this->change(to);
            /// Votes for different values cancel each other; only the surplus remains.
            counter = to.counter - own_weight;
            return true;
        }

        counter = own_weight - to.counter;
        return false;
    }

    static const char * name() { return "anyHeavy"; }
};
template <typename Data>
class AggregateFunctionsSingleValue final : public IUnaryAggregateFunction<Data, AggregateFunctionsSingleValue<Data> >
{

View File

@ -17,6 +17,11 @@ AggregateFunctionPtr createAggregateFunctionAnyLast(const std::string & name, co
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyLastData>(name, argument_types));
}
/// Creates the 'anyHeavy' aggregate function for the given argument types.
AggregateFunctionPtr createAggregateFunctionAnyHeavy(const std::string & name, const DataTypes & argument_types)
{
    AggregateFunctionPtr result(
        createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyHeavyData>(name, argument_types));

    return result;
}
AggregateFunctionPtr createAggregateFunctionMin(const std::string & name, const DataTypes & argument_types)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionMinData>(name, argument_types));
@ -43,6 +48,7 @@ void registerAggregateFunctionsMinMaxAny(AggregateFunctionFactory & factory)
{
factory.registerFunction({"any"}, createAggregateFunctionAny);
factory.registerFunction({"anyLast"}, createAggregateFunctionAnyLast);
factory.registerFunction({"anyHeavy"}, createAggregateFunctionAnyHeavy);
factory.registerFunction({"min"}, createAggregateFunctionMin);
factory.registerFunction({"max"}, createAggregateFunctionMax);
factory.registerFunction({"argMin"}, createAggregateFunctionArgMin);