#include #include #include #include #include #include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int TOO_LARGE_STRING_SIZE; extern const int ILLEGAL_COLUMN; extern const int DECIMAL_OVERFLOW; } namespace { inline UInt8 getXorMask(UInt64 rand, double prob) { UInt8 res = 0; for (int i = 0; i < 8; ++i) { UInt8 rand8 = rand; rand >>= 8; res <<= 1; res |= (rand8 < prob * (1u << 8)); } return res; } void fuzzBits(const char8_t * ptr_in, char8_t * ptr_out, size_t len, double prob) { pcg64_fast rng(randomSeed()); // TODO It is inefficient. We should use SIMD PRNG instead. for (size_t i = 0; i < len; ++i) { UInt64 rand = rng(); auto mask = getXorMask(rand, prob); ptr_out[i] = ptr_in[i] ^ mask; } } } class FunctionFuzzBits : public IFunction { public: static constexpr auto name = "fuzzBits"; static FunctionPtr create(const Context &) { return std::make_shared(); } String getName() const override { return name; } bool isVariadic() const override { return false; } size_t getNumberOfArguments() const override { return 2; } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { if (!isStringOrFixedString(arguments[0].type)) throw Exception( "First argument of function " + getName() + " must be String or FixedString", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); if (!arguments[1].column || !isFloat(arguments[1].type) || !isColumnConst(*arguments[1].column)) throw Exception("Second argument of function " + getName() + " must be constant float", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); return arguments[0].type; } bool isDeterministic() const override { return false; } bool isDeterministicInScopeOfQuery() const override { return false; } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override { const auto col_in_untyped = block.getByPosition(arguments[0]).column; const double inverse_probability = assert_cast(*block.getByPosition(arguments[1]).column).getValue(); if (const ColumnString * col_in = checkAndGetColumn(col_in_untyped.get())) { auto col_to = ColumnString::create(); ColumnString::Chars & chars_to = col_to->getChars(); ColumnString::Offsets & offsets_to = col_to->getOffsets(); chars_to.resize(col_in->getChars().size()); // TODO: Maybe we can share `col_in->getOffsets()` to `offsets_to.resize` like clever pointers? They are same offsets_to.resize(input_rows_count); const auto * ptr_in = col_in->getChars().data(); auto * ptr_to = chars_to.data(); fuzzBits(ptr_in, ptr_to, chars_to.size(), inverse_probability); for (size_t i = 0; i < input_rows_count; ++i) { offsets_to[i] = col_in->getOffsets()[i]; ptr_to[offsets_to[i] - 1] = 0; } block.getByPosition(result).column = std::move(col_to); } else if (const ColumnFixedString * col_fixed_in = checkAndGetColumn(col_in_untyped.get())) { const auto n = col_fixed_in->getN(); auto col_to = ColumnFixedString::create(n); ColumnFixedString::Chars & chars_to = col_to->getChars(); size_t total_size; if (common::mulOverflow(input_rows_count, n, total_size)) throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); chars_to.resize(total_size); const auto * ptr_in = col_fixed_in->getChars().data(); auto * ptr_to = chars_to.data(); fuzzBits(ptr_in, ptr_to, chars_to.size(), inverse_probability); } else { throw Exception( "Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); } } }; void registerFunctionFuzzBits(FunctionFactory & factory) { factory.registerFunction(); } }