ClickHouse/src/Functions/fuzzBits.cpp

174 lines
5.9 KiB
C++
Raw Normal View History

2020-05-28 02:51:37 +00:00
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
2021-05-17 07:30:42 +00:00
#include <Functions/IFunction.h>
2020-05-28 02:51:37 +00:00
#include <pcg_random.hpp>
#include <Common/randomSeed.h>
#include <common/arithmeticOverflow.h>
#include <memory>
namespace DB
{
/// Error codes thrown by the function implemented in this file.
/// They are only declared here (`extern`); their definitions live elsewhere.
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
extern const int DECIMAL_OVERFLOW;
2020-05-31 23:40:38 +00:00
extern const int ARGUMENT_OUT_OF_BOUND;
2020-05-28 02:51:37 +00:00
}
namespace
{
/// Builds an 8-bit XOR mask out of one 64-bit random number.
/// Each of the 8 bytes of `rand` decides one bit of the mask: the bit is set
/// when the byte is below `prob * 256`, i.e. with probability of about `prob`
/// (at a granularity of 1/256). The lowest byte of `rand` ends up deciding
/// the highest bit of the result.
inline UInt8 getXorMask(UInt64 rand, double prob)
{
    /// Probability scaled to the value range of a single random byte.
    const double threshold = prob * (1u << 8);

    UInt8 mask = 0;
    for (int bit = 0; bit < 8; ++bit, rand >>= 8)
    {
        const UInt8 low_byte = rand;
        mask <<= 1;
        mask |= (low_byte < threshold);
    }
    return mask;
}
void fuzzBits(const char8_t * ptr_in, char8_t * ptr_out, size_t len, double prob)
{
pcg64_fast rng(randomSeed()); // TODO It is inefficient. We should use SIMD PRNG instead.
for (size_t i = 0; i < len; ++i)
{
UInt64 rand = rng();
auto mask = getXorMask(rand, prob);
ptr_out[i] = ptr_in[i] ^ mask;
}
}
class FunctionFuzzBits : public IFunction
{
public:
static constexpr auto name = "fuzzBits";
static FunctionPtr create(ContextConstPtr) { return std::make_shared<FunctionFuzzBits>(); }
2020-05-28 02:51:37 +00:00
String getName() const override { return name; }
bool isVariadic() const override { return false; }
size_t getNumberOfArguments() const override { return 2; }
2020-05-29 02:06:21 +00:00
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } // indexing from 0
2020-05-28 02:51:37 +00:00
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (!isStringOrFixedString(arguments[0].type))
throw Exception(
"First argument of function " + getName() + " must be String or FixedString", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
2020-05-29 02:06:21 +00:00
if (!arguments[1].column || !isFloat(arguments[1].type))
2020-05-28 02:51:37 +00:00
throw Exception("Second argument of function " + getName() + " must be constant float", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return arguments[0].type;
}
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() const override { return false; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
2020-05-28 02:51:37 +00:00
{
2020-10-18 19:00:13 +00:00
auto col_in_untyped = arguments[0].column;
const double inverse_probability = assert_cast<const ColumnConst &>(*arguments[1].column).getValue<double>();
2020-05-28 02:51:37 +00:00
2020-05-31 23:40:38 +00:00
if (inverse_probability < 0.0 || 1.0 < inverse_probability)
{
throw Exception("Second argument of function " + getName() + " must be from `0.0` to `1.0`", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
}
2020-05-29 02:06:21 +00:00
if (const ColumnConst * col_in_untyped_const = checkAndGetColumnConstStringOrFixedString(col_in_untyped.get()))
{
col_in_untyped = col_in_untyped_const->getDataColumnPtr();
}
2020-05-28 02:51:37 +00:00
if (const ColumnString * col_in = checkAndGetColumn<ColumnString>(col_in_untyped.get()))
{
auto col_to = ColumnString::create();
ColumnString::Chars & chars_to = col_to->getChars();
ColumnString::Offsets & offsets_to = col_to->getOffsets();
2020-11-16 03:46:05 +00:00
size_t col_in_rows = col_in->getOffsets().size();
2020-05-28 02:51:37 +00:00
2020-11-16 03:46:05 +00:00
if (col_in_rows >= input_rows_count)
{
chars_to.resize(col_in->getChars().size());
// TODO: Maybe we can share `col_in->getOffsets()` to `offsets_to.resize` like clever pointers? They are same
offsets_to.resize(input_rows_count);
const auto * ptr_in = col_in->getChars().data();
auto * ptr_to = chars_to.data();
fuzzBits(ptr_in, ptr_to, chars_to.size(), inverse_probability);
for (size_t i = 0; i < input_rows_count; ++i)
{
offsets_to[i] = col_in->getOffsets()[i];
ptr_to[offsets_to[i] - 1] = 0;
}
}
else
2020-05-28 02:51:37 +00:00
{
2020-11-16 03:46:05 +00:00
assert(col_in_rows == 1);
chars_to.resize(col_in->getChars().size() * input_rows_count);
offsets_to.resize(input_rows_count);
size_t offset = col_in->getOffsets()[0];
const auto * ptr_in = col_in->getChars().data();
auto * ptr_to = chars_to.data();
for (size_t i = 0; i < input_rows_count; ++i)
{
fuzzBits(ptr_in, ptr_to + i * offset, offset, inverse_probability);
offsets_to[i] = (i + 1) * offset;
ptr_to[offsets_to[i] - 1] = 0;
}
2020-05-28 02:51:37 +00:00
}
2020-10-18 19:00:13 +00:00
return col_to;
2020-05-28 02:51:37 +00:00
}
2020-05-29 02:06:21 +00:00
else if (const ColumnFixedString * col_in_fixed = checkAndGetColumn<ColumnFixedString>(col_in_untyped.get()))
2020-05-28 02:51:37 +00:00
{
2020-05-29 02:06:21 +00:00
const auto n = col_in_fixed->getN();
2020-05-28 02:51:37 +00:00
auto col_to = ColumnFixedString::create(n);
ColumnFixedString::Chars & chars_to = col_to->getChars();
size_t total_size;
if (common::mulOverflow(input_rows_count, n, total_size))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
chars_to.resize(total_size);
2020-05-29 02:06:21 +00:00
const auto * ptr_in = col_in_fixed->getChars().data();
2020-05-28 02:51:37 +00:00
auto * ptr_to = chars_to.data();
fuzzBits(ptr_in, ptr_to, chars_to.size(), inverse_probability);
2020-05-29 02:06:21 +00:00
2020-10-18 19:00:13 +00:00
return col_to;
2020-05-28 02:51:37 +00:00
}
else
{
throw Exception(
2020-10-18 19:00:13 +00:00
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(),
2020-05-28 02:51:37 +00:00
ErrorCodes::ILLEGAL_COLUMN);
}
}
};
2020-09-07 18:00:37 +00:00
}
2020-05-28 02:51:37 +00:00
/// Registers FunctionFuzzBits in the given function factory.
void registerFunctionFuzzBits(FunctionFactory & factory)
{
factory.registerFunction<FunctionFuzzBits>();
}
}