ClickHouse/src/Functions/randomPrintableASCII.cpp

121 lines
4.3 KiB
C++
Raw Normal View History

2019-12-28 23:11:02 +00:00
#include <Functions/IFunctionImpl.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
2020-05-11 00:23:47 +00:00
#include <pcg_random.hpp>
#include <Common/randomSeed.h>
2019-12-28 23:11:02 +00:00
namespace DB
{
/// Error codes thrown by randomPrintableASCII. They are only declared here (`extern`).
namespace ErrorCodes
{
    /// Thrown when the requested length of a single string exceeds the 1 << 30 limit.
    extern const int TOO_LARGE_STRING_SIZE;
    /// Thrown when the first (length) argument is not of a numeric type.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    /// Thrown when the function is called with no arguments or with more than two.
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
2020-09-07 18:00:37 +00:00
namespace
{
2019-12-28 23:11:02 +00:00
/** Generate random string of specified length with printable ASCII characters, almost uniformly distributed.
* First argument is length, other optional arguments are ignored and used to prevent common subexpression elimination to get different values.
*/
class FunctionRandomPrintableASCII : public IFunction
{
public:
static constexpr auto name = "randomPrintableASCII";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionRandomPrintableASCII>(); }
String getName() const override
{
return name;
}
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
2020-03-09 03:38:43 +00:00
if (arguments.empty())
2019-12-29 14:34:23 +00:00
throw Exception("Function " + getName() + " requires at least one argument: the size of resulting string",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (arguments.size() > 2)
throw Exception("Function " + getName() + " requires at most two arguments: the size of resulting string and optional disambiguation tag",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
2019-12-28 23:11:02 +00:00
const IDataType & length_type = *arguments[0];
if (!isNumber(length_type))
throw Exception("First argument of function " + getName() + " must have numeric type", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeString>();
}
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() const override { return false; }
2020-10-19 15:27:41 +00:00
ColumnPtr executeImpl(ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
2019-12-28 23:11:02 +00:00
{
auto col_to = ColumnString::create();
ColumnString::Chars & data_to = col_to->getChars();
ColumnString::Offsets & offsets_to = col_to->getOffsets();
offsets_to.resize(input_rows_count);
2020-05-11 00:23:47 +00:00
pcg64_fast rng(randomSeed());
2020-10-19 15:27:41 +00:00
const IColumn & length_column = *arguments[0].column;
2019-12-28 23:11:02 +00:00
IColumn::Offset offset = 0;
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
{
size_t length = length_column.getUInt(row_num);
2019-12-29 01:12:19 +00:00
if (length > (1 << 30))
throw Exception("Too large string size in function " + getName(), ErrorCodes::TOO_LARGE_STRING_SIZE);
2019-12-28 23:11:02 +00:00
IColumn::Offset next_offset = offset + length + 1;
data_to.resize(next_offset);
offsets_to[row_num] = next_offset;
2020-01-02 21:23:10 +00:00
auto * data_to_ptr = data_to.data(); /// avoid assert on array indexing after end
2019-12-28 23:11:02 +00:00
for (size_t pos = offset, end = offset + length; pos < end; pos += 4) /// We have padding in column buffers that we can overwrite.
{
2020-05-11 00:23:47 +00:00
UInt64 rand = rng();
2019-12-28 23:11:02 +00:00
UInt16 rand1 = rand;
UInt16 rand2 = rand >> 16;
UInt16 rand3 = rand >> 32;
UInt16 rand4 = rand >> 48;
/// Printable characters are from range [32; 126].
/// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
2020-01-02 21:23:10 +00:00
data_to_ptr[pos + 0] = 32 + ((rand1 * 95) >> 16);
data_to_ptr[pos + 1] = 32 + ((rand2 * 95) >> 16);
data_to_ptr[pos + 2] = 32 + ((rand3 * 95) >> 16);
data_to_ptr[pos + 3] = 32 + ((rand4 * 95) >> 16);
2019-12-28 23:11:02 +00:00
2019-12-29 14:32:46 +00:00
/// NOTE gcc failed to vectorize this code (aliasing of char?)
2019-12-28 23:11:02 +00:00
/// TODO Implement SIMD optimizations from Danila Kutenin.
}
data_to[offset + length] = 0;
offset = next_offset;
}
2020-10-19 15:27:41 +00:00
return col_to;
2019-12-28 23:11:02 +00:00
}
};
2020-09-07 18:00:37 +00:00
}
2019-12-28 23:11:02 +00:00
/// Registers FunctionRandomPrintableASCII with the given function factory.
void registerFunctionRandomPrintableASCII(FunctionFactory & factory)
{
    using Function = FunctionRandomPrintableASCII;
    factory.registerFunction<Function>();
}
}