ClickHouse/src/Functions/randomStringUTF8.cpp

154 lines
5.0 KiB
C++
Raw Normal View History

2020-05-17 04:43:53 +00:00
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
2021-05-17 07:30:42 +00:00
#include <Functions/IFunction.h>
2020-05-17 04:43:53 +00:00
#include <pcg_random.hpp>
#include <Common/UTF8Helpers.h>
#include <Common/randomSeed.h>
2021-10-02 07:13:14 +00:00
#include <base/defines.h>
2020-05-17 04:43:53 +00:00
namespace DB
{
/// Error codes used in this file. They are only declared here (`extern`), not defined.
namespace ErrorCodes
{
/// Used when the argument of the function is not of a numeric type.
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
/// Used when the requested number of code points is too large.
extern const int TOO_LARGE_STRING_SIZE;
}
2020-09-07 18:00:37 +00:00
namespace
{
2020-05-17 04:43:53 +00:00
/* Generate string with a UTF-8 encoded text.
* Take a single argument - length of result string in Unicode code points.
2020-05-18 01:42:46 +00:00
* ATTENTION: Method generate only assignable code points (excluded 4-13 planes).
2020-05-17 04:43:53 +00:00
* See https://en.wikipedia.org/wiki/Plane_(Unicode) */
class FunctionRandomStringUTF8 : public IFunction
{
public:
static constexpr auto name = "randomStringUTF8";
2021-06-01 12:20:52 +00:00
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionRandomStringUTF8>(); }
2020-05-17 04:43:53 +00:00
String getName() const override { return name; }
bool isVariadic() const override { return false; }
size_t getNumberOfArguments() const override { return 1; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
2020-05-17 13:07:37 +00:00
if (!isNumber(*arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument of function {} must have numeric type", getName());
2020-05-17 04:43:53 +00:00
return std::make_shared<DataTypeString>();
}
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() const override { return false; }
2021-06-22 16:21:23 +00:00
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
2020-05-17 04:43:53 +00:00
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
2020-05-17 04:43:53 +00:00
{
auto col_to = ColumnString::create();
ColumnString::Chars & data_to = col_to->getChars();
ColumnString::Offsets & offsets_to = col_to->getOffsets();
if (input_rows_count == 0)
2020-10-19 15:27:41 +00:00
return col_to;
2020-05-17 04:43:53 +00:00
offsets_to.resize(input_rows_count);
const IColumn & col_length = *arguments[0].column;
size_t total_codepoints = 0;
2020-05-17 04:43:53 +00:00
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
{
size_t codepoints = col_length.getUInt(row_num);
total_codepoints += codepoints;
2020-05-17 04:43:53 +00:00
}
/* As we generate only assigned planes, the mathematical expectation of the number of bytes
* per generated code point ~= 3.85. So, reserving for coefficient 4 will not be an overhead
2020-05-18 01:42:46 +00:00
*/
2020-05-17 04:43:53 +00:00
if (total_codepoints > (1 << 29))
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large string size in function {}", getName());
2020-05-17 04:43:53 +00:00
size_t max_byte_size = total_codepoints * 4 + input_rows_count;
data_to.resize(max_byte_size);
2020-05-17 04:43:53 +00:00
2023-05-11 09:27:43 +00:00
const auto generate_code_point = [](UInt32 rand)
{
2020-05-17 04:43:53 +00:00
/// We want to generate number in [0x0, 0x70000) and shift it if need
/// Generate highest byte in [0, 6]
/// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
UInt32 code_point = (rand >> 16) * 7u;
2020-05-17 13:07:37 +00:00
code_point &= ~0xFFFF;
2020-05-17 04:43:53 +00:00
code_point |= rand & 0xFFFF; // and other bytes obtaining in a simple way
if (code_point >= 0x40000)
{
code_point += 0xa0000; // shift if it is in 14-16 plane
return code_point;
}
if (0xD7FF < code_point && code_point < 0xE000) // this range will not be valid in isValidUTF8
{
2020-05-17 13:22:52 +00:00
/// The distribution will be slightly non-uniform but we don't care.
2020-05-17 04:43:53 +00:00
return 0u;
}
2020-05-17 13:07:37 +00:00
2020-05-17 04:43:53 +00:00
return code_point;
};
pcg64_fast rng(randomSeed());
2020-05-17 04:43:53 +00:00
IColumn::Offset offset = 0;
2020-05-17 04:43:53 +00:00
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
{
size_t codepoints = col_length.getUInt(row_num);
2020-05-17 04:43:53 +00:00
auto * pos = data_to.data() + offset;
for (size_t i = 0; i < codepoints; i +=2)
2020-05-17 04:43:53 +00:00
{
UInt64 rand = rng(); /// that's the bottleneck
2020-05-17 04:43:53 +00:00
UInt32 code_point1 = generate_code_point(static_cast<UInt32>(rand));
2021-01-28 01:08:48 +00:00
size_t bytes1 = UTF8::convertCodePointToUTF8(code_point1, pos, 4);
chassert(bytes1 <= 4);
pos += bytes1;
if (i + 1 != codepoints)
{
UInt32 code_point2 = generate_code_point(static_cast<UInt32>(rand >> 32u));
size_t bytes2 = UTF8::convertCodePointToUTF8(code_point2, pos, 4);
chassert(bytes2 <= 4);
pos += bytes2;
}
2020-05-17 04:43:53 +00:00
}
*pos = 0;
++pos;
offset = pos - data_to.data();
2020-05-17 04:43:53 +00:00
offsets_to[row_num] = offset;
}
data_to.resize(offset);
2020-05-17 04:43:53 +00:00
2020-10-19 15:27:41 +00:00
return col_to;
2020-05-17 04:43:53 +00:00
}
};
2020-09-07 18:00:37 +00:00
}
/// Makes FunctionRandomStringUTF8 known to the function factory.
REGISTER_FUNCTION(RandomStringUTF8)
{
    factory.registerFunction<FunctionRandomStringUTF8>();
}
}