Merge branch 'random_string_utf8_function' of https://github.com/xolm/ClickHouse into xolm-random_string_utf8_function

This commit is contained in:
Alexey Milovidov 2020-05-18 04:22:16 +03:00
commit 4d1dddeee9
6 changed files with 173 additions and 1 deletions

View File

@ -4,8 +4,8 @@
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunctionImpl.h>
#include <pcg_random.hpp>
#include <common/unaligned.h>
#include <Common/randomSeed.h>
#include <common/unaligned.h>
namespace DB

View File

@ -0,0 +1,149 @@
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunctionImpl.h>
#include <pcg_random.hpp>
#include <Common/UTF8Helpers.h>
#include <Common/randomSeed.h>
#include <common/defines.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int TOO_LARGE_STRING_SIZE;
}
/* Generate string with a UTF-8 encoded text.
* Take a single argument - length of result string in Unicode code points.
* ATTENTION: Method generate only assignable code points(excluded 4-13 planes).
* See https://en.wikipedia.org/wiki/Plane_(Unicode) */
class FunctionRandomStringUTF8 : public IFunction
{
public:
static constexpr auto name = "randomStringUTF8";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionRandomStringUTF8>(); }
String getName() const override { return name; }
bool isVariadic() const override { return false; }
size_t getNumberOfArguments() const override { return 1; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isNumber(*arguments[0]))
throw Exception("First argument of function " + getName() + " must have numeric type", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeString>();
}
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
auto col_to = ColumnString::create();
ColumnString::Chars & data_to = col_to->getChars();
ColumnString::Offsets & offsets_to = col_to->getOffsets();
if (input_rows_count == 0)
{
block.getByPosition(result).column = std::move(col_to);
return;
}
offsets_to.resize(input_rows_count);
const IColumn & length_column = *block.getByPosition(arguments[0]).column;
size_t summary_utf8_len = 0;
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
{
size_t utf8_len = length_column.getUInt(row_num);
summary_utf8_len += utf8_len;
}
/* As we generate only assigned planes, the mathematical expectation of the number of bytes
* per generated code point ~= 3.85. So, reserving for coefficient 4 will not be an overhead
*/
if (summary_utf8_len > (1 << 29))
throw Exception("Too large string size in function " + getName(), ErrorCodes::TOO_LARGE_STRING_SIZE);
size_t size_in_bytes_with_margin = summary_utf8_len * 4 + input_rows_count;
data_to.resize(size_in_bytes_with_margin);
pcg64_fast rng(randomSeed()); // TODO It is inefficient. We should use SIMD PRNG instead.
const auto generate_code_point = [](UInt32 rand) -> UInt32 {
/// We want to generate number in [0x0, 0x70000) and shift it if need
/// Generate highest byte in [0, 6]
/// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
UInt32 code_point = (rand >> 16) * 7u;
code_point &= ~0xFFFF;
code_point |= rand & 0xFFFF; // and other bytes obtaining in a simple way
if (code_point >= 0x40000)
{
code_point += 0xa0000; // shift if it is in 14-16 plane
return code_point;
}
if (0xD7FF < code_point && code_point < 0xE000) // this range will not be valid in isValidUTF8
{
/// The distribution will be slightly non-uniform but we don't care.
return 0u;
}
return code_point;
};
IColumn::Offset offset = 0;
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
{
size_t utf8_len = length_column.getUInt(row_num);
auto * pos = data_to.data() + offset;
size_t last_writen_bytes = 0;
size_t i = 0;
for (; i < utf8_len; i += 2)
{
UInt64 rand = rng();
UInt32 code_point1 = generate_code_point(rand);
UInt32 code_point2 = generate_code_point(rand >> 32);
/// We have padding in column buffers that we can overwrite.
pos += UTF8::convert(code_point1, pos, sizeof(int));
last_writen_bytes = UTF8::convert(code_point2, pos, sizeof(int));
pos += last_writen_bytes;
}
offset = pos - data_to.data() + 1;
if (i > utf8_len)
{
offset -= last_writen_bytes;
}
offsets_to[row_num] = offset;
}
/// Put zero bytes in between.
auto * pos = data_to.data();
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
pos[offsets_to[row_num] - 1] = 0;
block.getByPosition(result).column = std::move(col_to);
}
};
void registerFunctionRandomStringUTF8(FunctionFactory & factory)
{
factory.registerFunction<FunctionRandomStringUTF8>();
}
}

View File

@ -9,6 +9,7 @@ void registerFunctionGenerateUUIDv4(FunctionFactory & factory);
void registerFunctionRandomPrintableASCII(FunctionFactory & factory);
void registerFunctionRandomString(FunctionFactory & factory);
void registerFunctionRandomFixedString(FunctionFactory & factory);
void registerFunctionRandomStringUTF8(FunctionFactory & factory);
void registerFunctionsRandom(FunctionFactory & factory)
{
@ -19,6 +20,7 @@ void registerFunctionsRandom(FunctionFactory & factory)
registerFunctionRandomPrintableASCII(factory);
registerFunctionRandomString(factory);
registerFunctionRandomFixedString(factory);
registerFunctionRandomStringUTF8(factory);
}
}

View File

@ -0,0 +1,12 @@
<test>
<query>SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomStringUTF8(10))</query>
<query>SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomStringUTF8(100))</query>
<query>SELECT count() FROM zeros(100000) WHERE NOT ignore(randomStringUTF8(1000))</query>
<query>SELECT count() FROM zeros(10000) WHERE NOT ignore(randomStringUTF8(10000))</query>
<query>SELECT count() FROM zeros(10000000) WHERE NOT ignore(randomStringUTF8(rand() % 10))</query>
<query>SELECT count() FROM zeros(10000000) WHERE NOT ignore(randomStringUTF8(rand() % 100))</query>
<query>SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomStringUTF8(rand() % 1000))</query>
</test>

View File

@ -0,0 +1,4 @@
100
String
1

View File

@ -0,0 +1,5 @@
SELECT randomStringUTF8('string'); -- { serverError 43 }
SELECT lengthUTF8(randomStringUTF8(100));
SELECT toTypeName(randomStringUTF8(10));
SELECT isValidUTF8(randomStringUTF8(100000));
SELECT randomStringUTF8(0);