mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 07:01:59 +00:00
Added function randomStringUTF8
This commit is contained in:
parent
caced89773
commit
675509ed23
@ -1,15 +1,14 @@
|
||||
#include <Functions/IFunctionImpl.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/IFunctionImpl.h>
|
||||
#include <pcg_random.hpp>
|
||||
#include <Common/randomSeed.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
@ -27,10 +26,7 @@ public:
|
||||
static constexpr auto name = "randomPrintableASCII";
|
||||
static FunctionPtr create(const Context &) { return std::make_shared<FunctionRandomPrintableASCII>(); }
|
||||
|
||||
String getName() const override
|
||||
{
|
||||
return name;
|
||||
}
|
||||
String getName() const override { return name; }
|
||||
|
||||
bool isVariadic() const override { return true; }
|
||||
size_t getNumberOfArguments() const override { return 0; }
|
||||
@ -38,11 +34,13 @@ public:
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (arguments.empty())
|
||||
throw Exception("Function " + getName() + " requires at least one argument: the size of resulting string",
|
||||
throw Exception(
|
||||
"Function " + getName() + " requires at least one argument: the size of resulting string",
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
if (arguments.size() > 2)
|
||||
throw Exception("Function " + getName() + " requires at most two arguments: the size of resulting string and optional disambiguation tag",
|
||||
throw Exception(
|
||||
"Function " + getName() + " requires at most two arguments: the size of resulting string and optional disambiguation tag",
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
const IDataType & length_type = *arguments[0];
|
||||
@ -77,8 +75,9 @@ public:
|
||||
data_to.resize(next_offset);
|
||||
offsets_to[row_num] = next_offset;
|
||||
|
||||
auto * data_to_ptr = data_to.data(); /// avoid assert on array indexing after end
|
||||
for (size_t pos = offset, end = offset + length; pos < end; pos += 4) /// We have padding in column buffers that we can overwrite.
|
||||
auto * data_to_ptr = data_to.data(); /// avoid assert on array indexing after end
|
||||
for (size_t pos = offset, end = offset + length; pos < end;
|
||||
pos += 4) /// We have padding in column buffers that we can overwrite.
|
||||
{
|
||||
UInt64 rand = rng();
|
||||
|
||||
|
@ -4,8 +4,8 @@
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/IFunctionImpl.h>
|
||||
#include <pcg_random.hpp>
|
||||
#include <common/unaligned.h>
|
||||
#include <Common/randomSeed.h>
|
||||
#include <common/unaligned.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -44,9 +44,8 @@ public:
|
||||
"Function " + getName() + " requires at most two arguments: the size of resulting string and optional disambiguation tag",
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
const IDataType & length_type = *arguments[0];
|
||||
if (!isNumber(length_type))
|
||||
throw Exception("First argument of function " + getName() + " must have numeric type", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
if (!isUnsignedInteger(*arguments[0]))
|
||||
throw Exception("First argument for function " + getName() + " must be unsigned integer", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
return std::make_shared<DataTypeString>();
|
||||
}
|
||||
|
153
src/Functions/randomStringUTF8.cpp
Normal file
153
src/Functions/randomStringUTF8.cpp
Normal file
@ -0,0 +1,153 @@
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/IFunctionImpl.h>
|
||||
#include <pcg_random.hpp>
|
||||
#include <Common/UTF8Helpers.h>
|
||||
#include <Common/randomSeed.h>
|
||||
|
||||
#include <common/defines.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
/// Error codes referenced by this file. They are only declared here (`extern`);
/// the definitions live outside of this translation unit.
namespace ErrorCodes
|
||||
{
|
||||
/// Thrown by getReturnTypeImpl when the argument is not an unsigned integer.
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
/// Thrown by executeImpl when the requested total number of code points is too large.
extern const int TOO_LARGE_STRING_SIZE;
|
||||
}
|
||||
|
||||
|
||||
/* Generate string with a UTF-8 encoded text.
|
||||
* Take a single argument - length of result string in Unicode code points.
|
||||
* ATTENTION: Method generate only assignable code points(excluded 4-13 planes).
|
||||
* See https://en.wikipedia.org/wiki/Plane_(Unicode) */
|
||||
|
||||
class FunctionRandomStringUTF8 : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "randomStringUTF8";
|
||||
|
||||
static FunctionPtr create(const Context &) { return std::make_shared<FunctionRandomStringUTF8>(); }
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
bool isVariadic() const override { return false; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return 1; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (!isUnsignedInteger(*arguments[0]))
|
||||
throw Exception("First argument for function " + getName() + " must be unsigned integer", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
return std::make_shared<DataTypeString>();
|
||||
}
|
||||
|
||||
bool isDeterministic() const override { return false; }
|
||||
bool isDeterministicInScopeOfQuery() const override { return false; }
|
||||
|
||||
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
|
||||
{
|
||||
auto col_to = ColumnString::create();
|
||||
ColumnString::Chars & data_to = col_to->getChars();
|
||||
ColumnString::Offsets & offsets_to = col_to->getOffsets();
|
||||
|
||||
if (input_rows_count == 0)
|
||||
{
|
||||
block.getByPosition(result).column = std::move(col_to);
|
||||
return;
|
||||
}
|
||||
|
||||
offsets_to.resize(input_rows_count);
|
||||
|
||||
const IColumn & length_column = *block.getByPosition(arguments[0]).column;
|
||||
size_t summary_utf8_len = 0;
|
||||
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
|
||||
{
|
||||
size_t utf8_len = length_column.getUInt(row_num);
|
||||
summary_utf8_len += utf8_len;
|
||||
}
|
||||
|
||||
/* As we generate only assigned planes, the mathematical expectation of the number of bytes
|
||||
* per generated code point ~= 3.85. So, reserving for coefficient 4 will not be an overhead
|
||||
*/
|
||||
|
||||
if (summary_utf8_len > (1 << 29))
|
||||
throw Exception("Too large string size in function " + getName(), ErrorCodes::TOO_LARGE_STRING_SIZE);
|
||||
|
||||
size_t size_in_bytes_with_margin = summary_utf8_len * 4 + input_rows_count;
|
||||
data_to.resize(size_in_bytes_with_margin);
|
||||
pcg64_fast rng(randomSeed()); /// TODO It is inefficient. We should use SIMD PRNG instead.
|
||||
|
||||
auto generate_code_point = [](UInt32 rand) {
|
||||
/// We want to generate number in [0x0, 0x70000) and shift it if need
|
||||
|
||||
/// Generate highest byte in [0, 6]
|
||||
/// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
|
||||
UInt32 code_point = (rand >> 16) * 7u;
|
||||
code_point &= ((-1) ^ 0xFFFF);
|
||||
code_point |= rand & 0xFFFF; // and other bytes obtaining in a simple way
|
||||
|
||||
if (code_point >= 0x40000)
|
||||
{
|
||||
code_point += 0xa0000; // shift if it is in 14-16 plane
|
||||
return code_point;
|
||||
}
|
||||
|
||||
if (0xD7FF < code_point && code_point < 0xE000) // this range will not be valid in isValidUTF8
|
||||
{
|
||||
/* TODO(reviewer) choose with @axolm variant:
|
||||
* 1. Not to do this if (isValidUTF8 can return 0)
|
||||
* 2. just return 0
|
||||
* 3. capture rng in lambda and do while(code_point is bad) { recalc... }
|
||||
* 4. ...
|
||||
* */
|
||||
return 0u;
|
||||
}
|
||||
return code_point;
|
||||
};
|
||||
|
||||
IColumn::Offset offset = 0;
|
||||
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
|
||||
{
|
||||
size_t utf8_len = length_column.getUInt(row_num);
|
||||
auto * pos = data_to.data() + offset;
|
||||
|
||||
size_t last_writed_bytes = 0;
|
||||
size_t i = 0;
|
||||
for (; i < utf8_len; i += 2)
|
||||
{
|
||||
UInt64 rand = rng();
|
||||
|
||||
UInt32 code_point1 = generate_code_point(rand);
|
||||
UInt32 code_point2 = generate_code_point(rand >> 32);
|
||||
|
||||
/// We have padding in column buffers that we can overwrite.
|
||||
pos += UTF8::convert(*reinterpret_cast<int *>(&code_point1), pos, sizeof(int));
|
||||
last_writed_bytes = UTF8::convert(*reinterpret_cast<int *>(&code_point2), pos, sizeof(int));
|
||||
pos += last_writed_bytes;
|
||||
}
|
||||
offset = pos - data_to.data() + 1;
|
||||
if (i > utf8_len)
|
||||
{
|
||||
offset -= last_writed_bytes;
|
||||
}
|
||||
offsets_to[row_num] = offset;
|
||||
}
|
||||
|
||||
|
||||
/// Put zero bytes in between.
|
||||
auto * pos = data_to.data();
|
||||
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
|
||||
pos[offsets_to[row_num] - 1] = 0;
|
||||
|
||||
block.getByPosition(result).column = std::move(col_to);
|
||||
}
|
||||
};
|
||||
|
||||
/// Registers FunctionRandomStringUTF8 in the given function factory.
void registerFunctionRandomStringUTF8(FunctionFactory & factory)
{
    using Function = FunctionRandomStringUTF8;
    factory.registerFunction<Function>();
}
|
||||
}
|
@ -8,6 +8,7 @@ void registerFunctionRandConstant(FunctionFactory & factory);
|
||||
void registerFunctionGenerateUUIDv4(FunctionFactory & factory);
|
||||
void registerFunctionRandomPrintableASCII(FunctionFactory & factory);
|
||||
void registerFunctionRandomString(FunctionFactory & factory);
|
||||
void registerFunctionRandomStringUTF8(FunctionFactory & factory);
|
||||
|
||||
void registerFunctionsRandom(FunctionFactory & factory)
|
||||
{
|
||||
@ -17,6 +18,7 @@ void registerFunctionsRandom(FunctionFactory & factory)
|
||||
registerFunctionGenerateUUIDv4(factory);
|
||||
registerFunctionRandomPrintableASCII(factory);
|
||||
registerFunctionRandomString(factory);
|
||||
registerFunctionRandomStringUTF8(factory);
|
||||
}
|
||||
|
||||
}
|
||||
|
12
tests/performance/random_string_utf8.xml
Normal file
12
tests/performance/random_string_utf8.xml
Normal file
@ -0,0 +1,12 @@
|
||||
<test>
|
||||
|
||||
|
||||
|
||||
<query>SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomStringUTF8(10))</query>
|
||||
<query>SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomStringUTF8(100))</query>
|
||||
<query>SELECT count() FROM zeros(100000) WHERE NOT ignore(randomStringUTF8(1000))</query>
|
||||
<query>SELECT count() FROM zeros(10000) WHERE NOT ignore(randomStringUTF8(10000))</query>
|
||||
<query>SELECT count() FROM zeros(10000000) WHERE NOT ignore(randomStringUTF8(rand() % 10))</query>
|
||||
<query>SELECT count() FROM zeros(10000000) WHERE NOT ignore(randomStringUTF8(rand() % 100))</query>
|
||||
<query>SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomStringUTF8(rand() % 1000))</query>
|
||||
</test>
|
@ -0,0 +1,4 @@
|
||||
100
|
||||
String
|
||||
1
|
||||
|
8
tests/queries/0_stateless/01278_random_string_utf8.sql
Normal file
8
tests/queries/0_stateless/01278_random_string_utf8.sql
Normal file
@ -0,0 +1,8 @@
|
||||
SELECT randomStringUTF8('string'); -- { serverError 43 }
|
||||
SELECT randomStringUTF8(-10); -- { serverError 43 }
|
||||
SELECT lengthUTF8(randomStringUTF8(100));
|
||||
SELECT toTypeName(randomStringUTF8(10));
|
||||
SELECT isValidUTF8(randomStringUTF8(100000));
|
||||
SELECT randomStringUTF8(0);
|
||||
-- SELECT DISTINCT c > 30000 FROM (SELECT arrayJoin(arrayMap(x -> reinterpretAsUInt8(substring(randomStringUTF8(100), x + 1, 1)), range(100))) AS byte, count() AS c FROM numbers(100000) GROUP BY byte ORDER BY byte);
|
||||
|
Loading…
Reference in New Issue
Block a user