renamed anyHash64 to cityHash64, made them identical in unary case. [#METR-11774]

This commit is contained in:
Michael Kolupaev 2014-07-04 13:42:56 +04:00
parent 2e1e33da7e
commit 9c7ac77624
2 changed files with 89 additions and 49 deletions

View File

@ -36,12 +36,13 @@ namespace DB
* Быстрая некриптографическая хэш функция для строк:
* cityHash64: String -> UInt64
*
* Некриптографический хеш от кортежа значений любых типов (использует cityHash64 для строк и intHash64 для чисел):
* cityHash64: any* -> UInt64
*
* Быстрая некриптографическая хэш функция от любого целого числа:
* intHash32: number -> UInt32
* intHash64: number -> UInt64
*
* Некриптографический хеш от кортежа значений любых типов (использует cityHash64 для строк и intHash64 для чисел):
* anyHash64: any* -> UInt64
*/
struct HalfMD5Impl
@ -71,14 +72,6 @@ struct SipHash64Impl
}
};
struct CityHash64Impl
{
static UInt64 apply(const char * begin, size_t size)
{
return CityHash64(begin, size);
}
};
struct IntHash32Impl
{
typedef UInt32 ReturnType;
@ -258,10 +251,10 @@ UInt64 toInteger<Float64>(Float64 x)
}
class FunctionAnyHash64 : public IFunction
class FunctionCityHash64 : public IFunction
{
private:
template <typename FromType>
template <typename FromType, bool first>
void executeIntType(const IColumn * column, ColumnUInt64::Container_t & vec_to)
{
if (const ColumnVector<FromType> * col_from = typeid_cast<const ColumnVector<FromType> *>(column))
@ -269,14 +262,27 @@ private:
const typename ColumnVector<FromType>::Container_t & vec_from = col_from->getData();
size_t size = vec_from.size();
for (size_t i = 0; i < size; ++i)
vec_to[i] = Hash128to64(uint128(vec_to[i], IntHash64Impl::apply(toInteger(vec_from[i]))));
{
UInt64 h = IntHash64Impl::apply(toInteger(vec_from[i]));
if (first)
vec_to[i] = h;
else
vec_to[i] = Hash128to64(uint128(vec_to[i], h));
}
}
else if (const ColumnConst<FromType> * col_from = typeid_cast<const ColumnConst<FromType> *>(column))
{
UInt64 hash = IntHash64Impl::apply(toInteger(col_from->getData()));
size_t size = vec_to.size();
for (size_t i = 0; i < size; ++i)
vec_to[i] = Hash128to64(uint128(vec_to[i], hash));
if (first)
{
vec_to.assign(size, hash);
}
else
{
for (size_t i = 0; i < size; ++i)
vec_to[i] = Hash128to64(uint128(vec_to[i], hash));
}
}
else
throw Exception("Illegal column " + column->getName()
@ -284,6 +290,7 @@ private:
ErrorCodes::ILLEGAL_COLUMN);
}
template <bool first>
void executeString(const IColumn * column, ColumnUInt64::Container_t & vec_to)
{
if (const ColumnString * col_from = typeid_cast<const ColumnString *>(column))
@ -293,26 +300,45 @@ private:
size_t size = offsets.size();
for (size_t i = 0; i < size; ++i)
vec_to[i] = Hash128to64(uint128(vec_to[i], CityHash64Impl::apply(
{
UInt64 h = CityHash64(
reinterpret_cast<const char *>(&data[i == 0 ? 0 : offsets[i - 1]]),
i == 0 ? offsets[i] - 1 : (offsets[i] - 1 - offsets[i - 1]))));
i == 0 ? offsets[i] - 1 : (offsets[i] - 1 - offsets[i - 1]));
if (first)
vec_to[i] = h;
else
vec_to[i] = Hash128to64(uint128(vec_to[i], h));
}
}
else if (const ColumnFixedString * col_from = typeid_cast<const ColumnFixedString *>(column))
{
const typename ColumnString::Chars_t & data = col_from->getChars();
size_t n = col_from->getN();
size_t size = data.size() / n;
for (size_t i = 0; i < size; ++i)
vec_to[i] = Hash128to64(uint128(vec_to[i], CityHash64Impl::apply(
reinterpret_cast<const char *>(&data[i * n]), n)));
{
UInt64 h = CityHash64(reinterpret_cast<const char *>(&data[i * n]), n);
if (first)
vec_to[i] = h;
else
vec_to[i] = Hash128to64(uint128(vec_to[i], h));
}
}
else if (const ColumnConstString * col_from = typeid_cast<const ColumnConstString *>(column))
{
UInt64 hash = CityHash64Impl::apply(col_from->getData().data(), col_from->getData().size());
UInt64 hash = CityHash64(col_from->getData().data(), col_from->getData().size());
size_t size = vec_to.size();
for (size_t i = 0; i < size; ++i)
vec_to[i] = Hash128to64(uint128(vec_to[i], hash));
if (first)
{
vec_to.assign(size, hash);
}
else
{
for (size_t i = 0; i < size; ++i)
{
vec_to[i] = Hash128to64(uint128(vec_to[i], hash));
}
}
}
else
throw Exception("Illegal column " + column->getName()
@ -320,6 +346,7 @@ private:
ErrorCodes::ILLEGAL_COLUMN);
}
template <bool first>
void executeArray(const IDataType * type, const IColumn * column, ColumnUInt64::Container_t & vec_to)
{
const IDataType * nested_type = &*typeid_cast<const DataTypeArray *>(type)->getNestedType();
@ -330,8 +357,8 @@ private:
const ColumnArray::Offsets_t & offsets = col_from->getOffsets();
size_t nested_size = nested_column->size();
ColumnUInt64::Container_t vec_temp(nested_size, 0);
executeAny(nested_type, nested_column, vec_temp);
ColumnUInt64::Container_t vec_temp(nested_size);
executeAny<true>(nested_type, nested_column, vec_temp);
size_t size = offsets.size();
@ -339,7 +366,13 @@ private:
{
size_t begin = i == 0 ? 0 : offsets[i - 1];
size_t end = offsets[i];
vec_to[i] = Hash128to64(uint128(vec_to[i], IntHash64Impl::apply(end - begin)));
UInt64 h = IntHash64Impl::apply(end - begin);
if (first)
vec_to[i] = h;
else
vec_to[i] = Hash128to64(uint128(vec_to[i], h));
for (size_t j = begin; j < end; ++j)
vec_to[i] = Hash128to64(uint128(vec_to[i], vec_temp[j]));
}
@ -348,7 +381,7 @@ private:
{
/// NOTE: тут, конечно, можно обойтись без материалиации столбца.
ColumnPtr full_column = col_from->convertToFullColumn();
executeArray(type, &*full_column, vec_to);
executeArray<first>(type, &*full_column, vec_to);
}
else
throw Exception("Illegal column " + column->getName()
@ -356,23 +389,24 @@ private:
ErrorCodes::ILLEGAL_COLUMN);
}
template <bool first>
void executeAny(const IDataType * from_type, const IColumn * icolumn, ColumnUInt64::Container_t & vec_to)
{
if (typeid_cast<const DataTypeUInt8 * >(from_type)) executeIntType<UInt8 >(icolumn, vec_to);
else if (typeid_cast<const DataTypeUInt16 * >(from_type)) executeIntType<UInt16>(icolumn, vec_to);
else if (typeid_cast<const DataTypeUInt32 * >(from_type)) executeIntType<UInt32>(icolumn, vec_to);
else if (typeid_cast<const DataTypeUInt64 * >(from_type)) executeIntType<UInt64>(icolumn, vec_to);
else if (typeid_cast<const DataTypeInt8 * >(from_type)) executeIntType<Int8 >(icolumn, vec_to);
else if (typeid_cast<const DataTypeInt16 * >(from_type)) executeIntType<Int16 >(icolumn, vec_to);
else if (typeid_cast<const DataTypeInt32 * >(from_type)) executeIntType<Int32 >(icolumn, vec_to);
else if (typeid_cast<const DataTypeInt64 * >(from_type)) executeIntType<Int64 >(icolumn, vec_to);
else if (typeid_cast<const DataTypeDate * >(from_type)) executeIntType<UInt16>(icolumn, vec_to);
else if (typeid_cast<const DataTypeDateTime * >(from_type)) executeIntType<UInt32>(icolumn, vec_to);
else if (typeid_cast<const DataTypeFloat32 * >(from_type)) executeIntType<Float32>(icolumn, vec_to);
else if (typeid_cast<const DataTypeFloat64 * >(from_type)) executeIntType<Float64>(icolumn, vec_to);
else if (typeid_cast<const DataTypeString * >(from_type)) executeString(icolumn, vec_to);
else if (typeid_cast<const DataTypeFixedString * >(from_type)) executeString(icolumn, vec_to);
else if (typeid_cast<const DataTypeArray * >(from_type)) executeArray(from_type, icolumn, vec_to);
if (typeid_cast<const DataTypeUInt8 * >(from_type)) executeIntType<UInt8, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeUInt16 * >(from_type)) executeIntType<UInt16, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeUInt32 * >(from_type)) executeIntType<UInt32, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeUInt64 * >(from_type)) executeIntType<UInt64, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeInt8 * >(from_type)) executeIntType<Int8, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeInt16 * >(from_type)) executeIntType<Int16, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeInt32 * >(from_type)) executeIntType<Int32, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeInt64 * >(from_type)) executeIntType<Int64, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeDate * >(from_type)) executeIntType<UInt16, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeDateTime * >(from_type)) executeIntType<UInt32, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeFloat32 * >(from_type)) executeIntType<Float32, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeFloat64 * >(from_type)) executeIntType<Float64, first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeString * >(from_type)) executeString < first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeFixedString *>(from_type)) executeString < first>(icolumn, vec_to);
else if (typeid_cast<const DataTypeArray * >(from_type)) executeArray < first>(from_type, icolumn, vec_to);
else
throw Exception("Unexpected type " + from_type->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
@ -382,7 +416,7 @@ public:
/// Получить имя функции.
String getName() const
{
return "anyHash64";
return "cityHash64";
}
/// Получить тип результата по типам аргументов. Если функция неприменима для данных аргументов - кинуть исключение.
@ -399,15 +433,23 @@ public:
block.getByPosition(result).column = col_to;
ColumnUInt64::Container_t & vec_to = col_to->getData();
vec_to.assign(rows, 0ul);
for (size_t index : arguments)
if (arguments.empty())
{
const ColumnWithNameAndType & column = block.getByPosition(index);
/// Случайное число из /dev/urandom используется как хеш пустого кортежа.
vec_to.assign(rows, 0xe28dbde7fe22e41c);
}
for (size_t i = 0; i < arguments.size(); ++i)
{
const ColumnWithNameAndType & column = block.getByPosition(arguments[i]);
const IDataType * from_type = &*column.type;
const IColumn * icolumn = &*column.column;
executeAny(from_type, icolumn, vec_to);
if (i == 0)
executeAny<true>(from_type, icolumn, vec_to);
else
executeAny<false>(from_type, icolumn, vec_to);
}
}
};
@ -421,7 +463,6 @@ struct NameIntHash64 { static const char * get() { return "intHash64"; } };
typedef FunctionStringHash64<HalfMD5Impl, NameHalfMD5> FunctionHalfMD5;
typedef FunctionStringHash64<SipHash64Impl, NameSipHash64> FunctionSipHash64;
typedef FunctionStringHash64<CityHash64Impl, NameCityHash64> FunctionCityHash64;
typedef FunctionIntHash<IntHash32Impl, NameIntHash32> FunctionIntHash32;
typedef FunctionIntHash<IntHash64Impl, NameIntHash64> FunctionIntHash64;

View File

@ -149,7 +149,6 @@ FunctionPtr FunctionFactory::get(
else if (name == "cityHash64") return new FunctionCityHash64;
else if (name == "intHash32") return new FunctionIntHash32;
else if (name == "intHash64") return new FunctionIntHash64;
else if (name == "anyHash64") return new FunctionAnyHash64;
else if (name == "IPv4NumToString") return new FunctionIPv4NumToString;
else if (name == "IPv4StringToNum") return new FunctionIPv4StringToNum;