diff --git a/src/Functions/randomString.cpp b/src/Functions/randomString.cpp
index 2b95825d449..4ea470f0a65 100644
--- a/src/Functions/randomString.cpp
+++ b/src/Functions/randomString.cpp
@@ -4,8 +4,8 @@
 #include
 #include
 #include
-#include
 #include
+#include
 
 
 namespace DB
diff --git a/src/Functions/randomStringUTF8.cpp b/src/Functions/randomStringUTF8.cpp
new file mode 100644
index 00000000000..fdea2d29952
--- /dev/null
+++ b/src/Functions/randomStringUTF8.cpp
@@ -0,0 +1,160 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int TOO_LARGE_STRING_SIZE;
+}
+
+
+/* Generates a string of random UTF-8 encoded text.
+ * Takes a single argument - the length of the result string in Unicode code points.
+ * ATTENTION: only code points from assignable planes are generated (planes 4-13 are excluded).
+ * See https://en.wikipedia.org/wiki/Plane_(Unicode) */
+
+class FunctionRandomStringUTF8 : public IFunction
+{
+public:
+    static constexpr auto name = "randomStringUTF8";
+
+    static FunctionPtr create(const Context &) { return std::make_shared(); }
+
+    String getName() const override { return name; }
+
+    bool isVariadic() const override { return false; }
+
+    size_t getNumberOfArguments() const override { return 1; }
+
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        if (!isNumber(*arguments[0]))
+            throw Exception("First argument of function " + getName() + " must have numeric type", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        return std::make_shared();
+    }
+
+    bool isDeterministic() const override { return false; }
+    bool isDeterministicInScopeOfQuery() const override { return false; }
+
+    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
+    {
+        auto col_to = ColumnString::create();
+        ColumnString::Chars & data_to = col_to->getChars();
+        ColumnString::Offsets & offsets_to = col_to->getOffsets();
+
+        if (input_rows_count == 0)
+        {
+            block.getByPosition(result).column = std::move(col_to);
+            return;
+        }
+
+        offsets_to.resize(input_rows_count);
+
+        /// Upper bound on the total number of code points requested over all rows.
+        constexpr size_t max_total_code_points = 1 << 29;
+
+        const IColumn & length_column = *block.getByPosition(arguments[0]).column;
+        size_t summary_utf8_len = 0;
+        for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
+        {
+            size_t utf8_len = length_column.getUInt(row_num);
+
+            /// Check before adding: a plain sum of huge lengths could wrap around size_t
+            /// and slip past the limit, leaving the buffer far too small for the writes below.
+            if (utf8_len > max_total_code_points - summary_utf8_len)
+                throw Exception("Too large string size in function " + getName(), ErrorCodes::TOO_LARGE_STRING_SIZE);
+
+            summary_utf8_len += utf8_len;
+        }
+
+        /* As we generate only assigned planes, the mathematical expectation of the number of bytes
+         * per generated code point is ~= 3.85, so reserving 4 bytes per code point is not wasteful.
+         * One extra byte per row is reserved for the terminating zero.
+         */
+
+        size_t size_in_bytes_with_margin = summary_utf8_len * 4 + input_rows_count;
+        data_to.resize(size_in_bytes_with_margin);
+        pcg64_fast rng(randomSeed()); // TODO It is inefficient. We should use SIMD PRNG instead.
+
+        const auto generate_code_point = [](UInt32 rand) -> UInt32 {
+            /// We want to generate a number in [0x0, 0x70000) and shift it if needed.
+
+            /// Generate the highest byte in [0, 6]
+            /// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+            UInt32 code_point = (rand >> 16) * 7u;
+            code_point &= ~0xFFFF;
+            code_point |= rand & 0xFFFF; // the other bytes are taken from the random value as is
+
+            if (code_point >= 0x40000)
+            {
+                code_point += 0xa0000; // shift it into planes 14-16
+                return code_point;
+            }
+
+            if (0xD7FF < code_point && code_point < 0xE000) // this range will not be valid in isValidUTF8
+            {
+                /// The distribution will be slightly non-uniform but we don't care.
+                return 0u;
+            }
+
+            return code_point;
+        };
+
+        IColumn::Offset offset = 0;
+        for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
+        {
+            size_t utf8_len = length_column.getUInt(row_num);
+            auto * pos = data_to.data() + offset;
+
+            size_t last_written_bytes = 0;
+            size_t i = 0;
+            /// Code points are generated in pairs: one 64-bit random value yields two of them.
+            for (; i < utf8_len; i += 2)
+            {
+                UInt64 rand = rng();
+
+                UInt32 code_point1 = generate_code_point(rand);
+                UInt32 code_point2 = generate_code_point(rand >> 32);
+
+                /// We have padding in column buffers that we can overwrite.
+                pos += UTF8::convert(code_point1, pos, sizeof(int));
+                last_written_bytes = UTF8::convert(code_point2, pos, sizeof(int));
+                pos += last_written_bytes;
+            }
+            offset = pos - data_to.data() + 1;
+            if (i > utf8_len)
+            {
+                /// Odd length: the second code point of the last pair is surplus, cut it off.
+                offset -= last_written_bytes;
+            }
+            offsets_to[row_num] = offset;
+        }
+
+        /// Drop the unused tail of the worst-case allocation so that the size of chars matches the last offset.
+        data_to.resize(offset);
+
+        /// Put zero bytes in between.
+        auto * pos = data_to.data();
+        for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
+            pos[offsets_to[row_num] - 1] = 0;
+
+        block.getByPosition(result).column = std::move(col_to);
+    }
+};
+
+void registerFunctionRandomStringUTF8(FunctionFactory & factory)
+{
+    factory.registerFunction();
+}
+}
diff --git a/src/Functions/registerFunctionsRandom.cpp b/src/Functions/registerFunctionsRandom.cpp
index 896fee24ae5..3638474c4fe 100644
--- a/src/Functions/registerFunctionsRandom.cpp
+++ b/src/Functions/registerFunctionsRandom.cpp
@@ -9,6 +9,7 @@ void registerFunctionGenerateUUIDv4(FunctionFactory & factory);
 void registerFunctionRandomPrintableASCII(FunctionFactory & factory);
 void registerFunctionRandomString(FunctionFactory & factory);
 void registerFunctionRandomFixedString(FunctionFactory & factory);
+void registerFunctionRandomStringUTF8(FunctionFactory & factory);
 
 void registerFunctionsRandom(FunctionFactory & factory)
 {
@@ -19,6 +20,7 @@ void registerFunctionsRandom(FunctionFactory & factory)
     registerFunctionRandomPrintableASCII(factory);
     registerFunctionRandomString(factory);
     registerFunctionRandomFixedString(factory);
+    registerFunctionRandomStringUTF8(factory);
 }
 
 }
diff --git a/tests/performance/random_string_utf8.xml b/tests/performance/random_string_utf8.xml
new file mode 100644
index 00000000000..60e3b69e79f
--- /dev/null
+++ b/tests/performance/random_string_utf8.xml
@@ -0,0 +1,12 @@
+
+
+
+
+    SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomStringUTF8(10))
+    SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomStringUTF8(100))
+    SELECT count() FROM zeros(100000) WHERE NOT ignore(randomStringUTF8(1000))
+    SELECT count() FROM zeros(10000) WHERE NOT ignore(randomStringUTF8(10000))
+    SELECT count() FROM zeros(10000000) WHERE NOT ignore(randomStringUTF8(rand() % 10))
+    SELECT count() FROM zeros(10000000) WHERE NOT ignore(randomStringUTF8(rand() % 100))
+    SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomStringUTF8(rand() % 1000))
+
diff --git a/tests/queries/0_stateless/01278_random_string_utf8.reference b/tests/queries/0_stateless/01278_random_string_utf8.reference
new file mode 100644
index 00000000000..36ae0ace76a
--- /dev/null
+++ b/tests/queries/0_stateless/01278_random_string_utf8.reference
@@ -0,0 +1,4 @@
+100
+String
+1
+
diff --git a/tests/queries/0_stateless/01278_random_string_utf8.sql b/tests/queries/0_stateless/01278_random_string_utf8.sql
new file mode 100644
index 00000000000..f2c0a6c41c7
--- /dev/null
+++ b/tests/queries/0_stateless/01278_random_string_utf8.sql
@@ -0,0 +1,5 @@
+SELECT randomStringUTF8('string'); -- { serverError 43 }
+SELECT lengthUTF8(randomStringUTF8(100));
+SELECT toTypeName(randomStringUTF8(10));
+SELECT isValidUTF8(randomStringUTF8(100000));
+SELECT randomStringUTF8(0);