ClickHouse/src/Functions/fuzzBits.cpp

187 lines
6.5 KiB
C++

#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <pcg_random.hpp>
#include <Common/randomSeed.h>
#include <base/arithmeticOverflow.h>
#include <memory>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
extern const int DECIMAL_OVERFLOW;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int LOGICAL_ERROR;
}
namespace
{
inline UInt8 getXorMask(UInt64 rand, double prob)
{
UInt8 res = 0;
for (int i = 0; i < 8; ++i)
{
UInt8 rand8 = rand;
rand >>= 8;
res <<= 1;
res |= (rand8 < prob * (1u << 8));
}
return res;
}
void fuzzBits(const char8_t * ptr_in, char8_t * ptr_out, size_t len, double prob)
{
pcg64_fast rng(randomSeed()); // TODO It is inefficient. We should use SIMD PRNG instead.
for (size_t i = 0; i < len; ++i)
{
UInt64 rand = rng();
auto mask = getXorMask(rand, prob);
ptr_out[i] = ptr_in[i] ^ mask;
}
}
class FunctionFuzzBits : public IFunction
{
public:
static constexpr auto name = "fuzzBits";
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionFuzzBits>(); }
String getName() const override { return name; }
bool isVariadic() const override { return false; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
size_t getNumberOfArguments() const override { return 2; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } // indexing from 0
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (!isStringOrFixedString(arguments[0].type))
throw Exception(
"First argument of function " + getName() + " must be String or FixedString", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!arguments[1].column || !isFloat(arguments[1].type))
throw Exception("Second argument of function " + getName() + " must be constant float", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return arguments[0].type;
}
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() const override { return false; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
auto col_in_untyped = arguments[0].column;
const double inverse_probability = assert_cast<const ColumnConst &>(*arguments[1].column).getValue<double>();
if (inverse_probability < 0.0 || 1.0 < inverse_probability)
{
throw Exception("Second argument of function " + getName() + " must be from `0.0` to `1.0`", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
}
if (const ColumnConst * col_in_untyped_const = checkAndGetColumnConstStringOrFixedString(col_in_untyped.get()))
{
col_in_untyped = col_in_untyped_const->getDataColumnPtr();
}
if (const ColumnString * col_in = checkAndGetColumn<ColumnString>(col_in_untyped.get()))
{
auto col_to = ColumnString::create();
ColumnString::Chars & chars_to = col_to->getChars();
ColumnString::Offsets & offsets_to = col_to->getOffsets();
size_t col_in_rows = col_in->getOffsets().size();
if (col_in_rows >= input_rows_count)
{
chars_to.resize(col_in->getChars().size());
// TODO: Maybe we can share `col_in->getOffsets()` to `offsets_to.resize` like clever pointers? They are same
offsets_to.resize(input_rows_count);
const auto * ptr_in = col_in->getChars().data();
auto * ptr_to = chars_to.data();
fuzzBits(ptr_in, ptr_to, chars_to.size(), inverse_probability);
for (size_t i = 0; i < input_rows_count; ++i)
{
offsets_to[i] = col_in->getOffsets()[i];
ptr_to[offsets_to[i] - 1] = 0;
}
}
else
{
assert(col_in_rows == 1);
chars_to.resize(col_in->getChars().size() * input_rows_count);
offsets_to.resize(input_rows_count);
size_t offset = col_in->getOffsets()[0];
const auto * ptr_in = col_in->getChars().data();
auto * ptr_to = chars_to.data();
for (size_t i = 0; i < input_rows_count; ++i)
{
fuzzBits(ptr_in, ptr_to + i * offset, offset, inverse_probability);
offsets_to[i] = (i + 1) * offset;
ptr_to[offsets_to[i] - 1] = 0;
}
}
return col_to;
}
else if (const ColumnFixedString * col_in_fixed = checkAndGetColumn<ColumnFixedString>(col_in_untyped.get()))
{
const auto n = col_in_fixed->getN();
const auto col_in_rows = col_in_fixed->size();
auto col_to = ColumnFixedString::create(n);
ColumnFixedString::Chars & chars_to = col_to->getChars();
size_t total_size;
if (common::mulOverflow(input_rows_count, n, total_size))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
chars_to.resize(total_size);
const auto * ptr_in = col_in_fixed->getChars().data();
auto * ptr_to = chars_to.data();
if (col_in_rows >= input_rows_count)
fuzzBits(ptr_in, ptr_to, chars_to.size(), inverse_probability);
else if (col_in_rows != 1)
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"1 != col_in_rows {} < input_rows_count {}", col_in_rows, input_rows_count);
else
for (size_t i = 0; i < input_rows_count; ++i)
fuzzBits(ptr_in, ptr_to + i * n, n, inverse_probability);
return col_to;
}
else
{
throw Exception(
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
}
};
}
void registerFunctionFuzzBits(FunctionFactory & factory)
{
factory.registerFunction<FunctionFuzzBits>();
}
}