mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 00:30:49 +00:00
Merging randomPrintableASCII #8401
This commit is contained in:
parent
4faf2f5485
commit
d42b50456f
@ -1,134 +0,0 @@
|
||||
#include <Functions/IFunctionImpl.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnVector.h>
|
||||
#include <Columns/ColumnConst.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <IO/WriteBufferFromVector.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Common/formatReadable.h>
|
||||
#include <Common/typeid_cast.h>
|
||||
#include <type_traits>
|
||||
|
||||
#include <random>
|
||||
#include <iostream>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Error codes referenced in this translation unit; the constants are defined elsewhere.
namespace ErrorCodes
{
    /// Used by executeImpl() when the argument column is not one of the handled integer columns.
    extern const int ILLEGAL_COLUMN;
    /// Used by getReturnTypeImpl() when the argument type is rejected; it was referenced
    /// there without being declared in this file.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
|
||||
|
||||
class FunctionRandomASCII : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "randomASCII";
|
||||
static FunctionPtr create(const Context &) { return std::make_shared<FunctionRandomASCII>(); }
|
||||
|
||||
String getName() const override
|
||||
{
|
||||
return name;
|
||||
}
|
||||
|
||||
size_t getNumberOfArguments() const override { return 1; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
const IDataType & type = *arguments[0];
|
||||
|
||||
if (!isNativeNumber(type))
|
||||
throw Exception("Cannot format " + type.getName() + " as size in bytes", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
return std::make_shared<DataTypeString>();
|
||||
}
|
||||
|
||||
bool isDeterministic() const override { return false; }
|
||||
bool isDeterministicInScopeOfQuery() const override { return false; }
|
||||
|
||||
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
|
||||
{
|
||||
if (!(executeType<UInt8>(block, arguments, result, input_rows_count)
|
||||
|| executeType<UInt16>(block, arguments, result, input_rows_count)
|
||||
|| executeType<UInt32>(block, arguments, result, input_rows_count)
|
||||
|| executeType<UInt64>(block, arguments, result, input_rows_count)
|
||||
|| executeType<Int8>(block, arguments, result, input_rows_count)
|
||||
|| executeType<Int16>(block, arguments, result, input_rows_count)
|
||||
|| executeType<Int32>(block, arguments, result, input_rows_count)
|
||||
|| executeType<Int64>(block, arguments, result, input_rows_count)))
|
||||
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName()
|
||||
+ " of argument of function " + getName(),
|
||||
ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
bool executeType(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count)
|
||||
{
|
||||
bool is_const_column = false;
|
||||
const ColumnVector<T> * col_from = checkAndGetColumn<ColumnVector<T>>(block.getByPosition(arguments[0]).column.get());
|
||||
|
||||
if (!col_from){
|
||||
col_from = checkAndGetColumnConstData<ColumnVector<T>>(block.getByPosition(arguments[0]).column.get());
|
||||
is_const_column = true;
|
||||
}
|
||||
|
||||
if (col_from){
|
||||
|
||||
auto col_to = ColumnString::create();
|
||||
|
||||
const typename ColumnVector<T>::Container & vec_from = col_from->getData();
|
||||
ColumnString::Chars & data_to = col_to->getChars();
|
||||
ColumnString::Offsets & offsets_to = col_to->getOffsets();
|
||||
offsets_to.resize(input_rows_count);
|
||||
|
||||
WriteBufferFromVector<ColumnString::Chars> buf_to(data_to);
|
||||
|
||||
std::default_random_engine generator;
|
||||
std::uniform_int_distribution<int> distribution(32, 127); //Printable ASCII symbols
|
||||
std::random_device rd;
|
||||
char character;
|
||||
size_t str_length = 0;
|
||||
|
||||
if (is_const_column){
|
||||
str_length = static_cast<size_t>(vec_from[0]);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
if (!is_const_column){
|
||||
str_length = static_cast<size_t>(vec_from[i]);
|
||||
}
|
||||
|
||||
generator.seed( rd() );
|
||||
|
||||
if (str_length > 0){
|
||||
for (size_t j = 0; j < str_length; ++j)
|
||||
{
|
||||
character = distribution(generator);
|
||||
writeChar(character, buf_to);
|
||||
}
|
||||
}
|
||||
|
||||
writeChar(0, buf_to);
|
||||
offsets_to[i] = buf_to.count();
|
||||
}
|
||||
|
||||
buf_to.finish();
|
||||
block.getByPosition(result).column = std::move(col_to);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
/// Registers FunctionRandomASCII in the supplied function factory.
void registerFunctionRandomASCII(FunctionFactory & factory)
{
    factory.registerFunction<FunctionRandomASCII>();
}
|
||||
|
||||
}
|
103
dbms/src/Functions/randomPrintableASCII.cpp
Normal file
103
dbms/src/Functions/randomPrintableASCII.cpp
Normal file
@ -0,0 +1,103 @@
|
||||
#include <Functions/IFunctionImpl.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Common/thread_local_rng.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
}
|
||||
|
||||
|
||||
/** Generate random string of specified length with printable ASCII characters, almost uniformly distributed.
|
||||
* First argument is length, other optional arguments are ignored and used to prevent common subexpression elimination to get different values.
|
||||
*/
|
||||
class FunctionRandomPrintableASCII : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "randomPrintableASCII";
|
||||
static FunctionPtr create(const Context &) { return std::make_shared<FunctionRandomPrintableASCII>(); }
|
||||
|
||||
String getName() const override
|
||||
{
|
||||
return name;
|
||||
}
|
||||
|
||||
bool isVariadic() const override { return true; }
|
||||
size_t getNumberOfArguments() const override { return 0; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (arguments.size() < 1)
|
||||
throw Exception("Function " + getName() + " requires at least one argument: the size of resulting string", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
const IDataType & length_type = *arguments[0];
|
||||
if (!isNumber(length_type))
|
||||
throw Exception("First argument of function " + getName() + " must have numeric type", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
return std::make_shared<DataTypeString>();
|
||||
}
|
||||
|
||||
bool isDeterministic() const override { return false; }
|
||||
bool isDeterministicInScopeOfQuery() const override { return false; }
|
||||
|
||||
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
|
||||
{
|
||||
auto col_to = ColumnString::create();
|
||||
ColumnString::Chars & data_to = col_to->getChars();
|
||||
ColumnString::Offsets & offsets_to = col_to->getOffsets();
|
||||
offsets_to.resize(input_rows_count);
|
||||
|
||||
const IColumn & length_column = *block.getByPosition(arguments[0]).column;
|
||||
|
||||
IColumn::Offset offset = 0;
|
||||
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
|
||||
{
|
||||
size_t length = length_column.getUInt(row_num);
|
||||
|
||||
IColumn::Offset next_offset = offset + length + 1;
|
||||
data_to.resize(next_offset);
|
||||
offsets_to[row_num] = next_offset;
|
||||
|
||||
for (size_t pos = offset, end = offset + length; pos < end; pos += 4) /// We have padding in column buffers that we can overwrite.
|
||||
{
|
||||
UInt64 rand = thread_local_rng();
|
||||
|
||||
UInt16 rand1 = rand;
|
||||
UInt16 rand2 = rand >> 16;
|
||||
UInt16 rand3 = rand >> 32;
|
||||
UInt16 rand4 = rand >> 48;
|
||||
|
||||
/// Printable characters are from range [32; 126].
|
||||
/// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
|
||||
|
||||
data_to[pos + 0] = 32 + ((rand1 * 95) >> 16);
|
||||
data_to[pos + 1] = 32 + ((rand2 * 95) >> 16);
|
||||
data_to[pos + 2] = 32 + ((rand3 * 95) >> 16);
|
||||
data_to[pos + 3] = 32 + ((rand4 * 95) >> 16);
|
||||
|
||||
/// TODO Implement SIMD optimizations from Danila Kutenin.
|
||||
}
|
||||
|
||||
data_to[offset + length] = 0;
|
||||
|
||||
offset = next_offset;
|
||||
}
|
||||
|
||||
block.getByPosition(result).column = std::move(col_to);
|
||||
}
|
||||
};
|
||||
|
||||
/// Registers FunctionRandomPrintableASCII in the supplied function factory.
void registerFunctionRandomPrintableASCII(FunctionFactory & factory)
{
    factory.registerFunction<FunctionRandomPrintableASCII>();
}
|
||||
|
||||
}
|
@ -54,7 +54,7 @@ void registerFunctionEvalMLMethod(FunctionFactory &);
|
||||
void registerFunctionBasename(FunctionFactory &);
|
||||
void registerFunctionTransform(FunctionFactory &);
|
||||
void registerFunctionGetMacro(FunctionFactory &);
|
||||
void registerFunctionRandomASCII(FunctionFactory &);
|
||||
void registerFunctionRandomPrintableASCII(FunctionFactory &);
|
||||
void registerFunctionGetScalar(FunctionFactory &);
|
||||
|
||||
#if USE_ICU
|
||||
|
@ -54,7 +54,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory)
|
||||
registerFunctionBasename(factory);
|
||||
registerFunctionTransform(factory);
|
||||
registerFunctionGetMacro(factory);
|
||||
registerFunctionRandomASCII(factory);
|
||||
registerFunctionRandomPrintableASCII(factory);
|
||||
registerFunctionGetScalar(factory);
|
||||
|
||||
#if USE_ICU
|
||||
|
25
dbms/tests/performance/random_printable_ascii.xml
Normal file
25
dbms/tests/performance/random_printable_ascii.xml
Normal file
@ -0,0 +1,25 @@
|
||||
<test>
|
||||
<type>once</type>
|
||||
|
||||
<stop_conditions>
|
||||
<any_of>
|
||||
<average_speed_not_changing_for_ms>4000</average_speed_not_changing_for_ms>
|
||||
<total_time_ms>10000</total_time_ms>
|
||||
</any_of>
|
||||
</stop_conditions>
|
||||
|
||||
<main_metric>
|
||||
<max_rows_per_second />
|
||||
<max_bytes_per_second />
|
||||
<avg_rows_per_second />
|
||||
<avg_bytes_per_second />
|
||||
</main_metric>
|
||||
|
||||
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(10))</query>
|
||||
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(100))</query>
|
||||
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(1000))</query>
|
||||
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(10000))</query>
|
||||
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(rand() % 10))</query>
|
||||
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(rand() % 100))</query>
|
||||
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(rand() % 1000))</query>
|
||||
</test>
|
@ -1,2 +0,0 @@
|
||||
SELECT toTypeName(randomASCII(1000));
|
||||
SELECT length(randomASCII(1000));
|
@ -0,0 +1,2 @@
|
||||
SELECT toTypeName(randomPrintableASCII(1000));
|
||||
SELECT length(randomPrintableASCII(1000));
|
@ -6,7 +6,7 @@ Returns a string with the name of the host that this function was performed on.
|
||||
|
||||
## FQDN {#fqdn}
|
||||
|
||||
Returns the fully qualified domain name.
|
||||
Returns the fully qualified domain name.
|
||||
|
||||
**Syntax**
|
||||
|
||||
@ -392,7 +392,7 @@ neighbor(column, offset[, default_value])
|
||||
The result of the function depends on the affected data blocks and the order of data in the block.
|
||||
If you make a subquery with ORDER BY and call the function from outside the subquery, you can get the expected result.
|
||||
|
||||
**Parameters**
|
||||
**Parameters**
|
||||
|
||||
- `column` — A column name or scalar expression.
|
||||
- `offset` — The number of rows forwards or backwards from the current row of `column`. [Int64](../../data_types/int_uint.md).
|
||||
@ -400,7 +400,7 @@ If you make a subquery with ORDER BY and call the function from outside the subq
|
||||
|
||||
**Returned values**
|
||||
|
||||
- Value for `column` in `offset` distance from current row if `offset` value is not outside block bounds.
|
||||
- Value for `column` in `offset` distance from current row if `offset` value is not outside block bounds.
|
||||
- Default value for `column` if `offset` value is outside block bounds. If `default_value` is given, then it will be used.
|
||||
|
||||
Type: type of data blocks affected or default value type.
|
||||
@ -545,7 +545,7 @@ WHERE diff != 1
|
||||
└────────┴──────┘
|
||||
```
|
||||
```sql
|
||||
set max_block_size=100000 -- default value is 65536!
|
||||
set max_block_size=100000 -- default value is 65536!
|
||||
|
||||
SELECT
|
||||
number,
|
||||
@ -886,7 +886,7 @@ Code: 395. DB::Exception: Received from localhost:9000. DB::Exception: Too many.
|
||||
|
||||
## identity()
|
||||
|
||||
Returns the same value that was used as its argument.
|
||||
Returns the same value that was used as its argument.
|
||||
|
||||
```sql
|
||||
SELECT identity(42)
|
||||
@ -898,14 +898,14 @@ SELECT identity(42)
|
||||
```
|
||||
Used for debugging and testing. It allows you to "break" access by index and get the result and query performance of a full scan.
|
||||
|
||||
## randomASCII {#randomascii}
|
||||
## randomPrintableASCII {#randomascii}
|
||||
|
||||
Generates a string with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) printable characters.
|
||||
|
||||
**Syntax**
|
||||
|
||||
```sql
|
||||
randomASKII(length)
|
||||
randomPrintableASCII(length)
|
||||
```
|
||||
|
||||
**Parameters**
|
||||
@ -923,14 +923,14 @@ Type: [String](../../data_types/string.md)
|
||||
**Example**
|
||||
|
||||
```sql
|
||||
SELECT number, randomASCII(30) as str, length(str) FROM system.numbers LIMIT 3
|
||||
SELECT number, randomPrintableASCII(30) as str, length(str) FROM system.numbers LIMIT 3
|
||||
```
|
||||
```text
|
||||
┌─number─┬─str────────────────────────────┬─length(randomASCII(30))─┐
|
||||
│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │
|
||||
│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │
|
||||
│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │
|
||||
└────────┴────────────────────────────────┴─────────────────────────┘
|
||||
┌─number─┬─str────────────────────────────┬─length(randomPrintableASCII(30))─┐
|
||||
│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │
|
||||
│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │
|
||||
│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │
|
||||
└────────┴────────────────────────────────┴──────────────────────────────────┘
|
||||
```
|
||||
|
||||
[Original article](https://clickhouse.yandex/docs/en/query_language/functions/other_functions/) <!--hide-->
|
||||
|
@ -6,7 +6,7 @@
|
||||
|
||||
## FQDN {#fqdn}
|
||||
|
||||
Возвращает полное имя домена.
|
||||
Возвращает полное имя домена.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
@ -377,7 +377,7 @@ neighbor(column, offset[, default_value])
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Значение `column` в смещении от текущей строки, если значение `offset` не выходит за пределы блока.
|
||||
- Значение `column` в смещении от текущей строки, если значение `offset` не выходит за пределы блока.
|
||||
- Значение по умолчанию для `column`, если значение `offset` выходит за пределы блока данных. Если передан параметр `default_value`, то значение берется из него.
|
||||
|
||||
Тип: зависит от данных в `column` или переданного значения по умолчанию в `default_value`.
|
||||
@ -885,14 +885,14 @@ SELECT identity(42)
|
||||
```
|
||||
Используется для отладки и тестирования, позволяет "сломать" доступ по индексу, и получить результат и производительность запроса для полного сканирования.
|
||||
|
||||
## randomASCII {#randomascii}
|
||||
## randomPrintableASCII {#randomascii}
|
||||
|
||||
Генерирует строку со случайным набором печатных символов [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters).
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
```sql
|
||||
randomASKII(length)
|
||||
randomPrintableASCII(length)
|
||||
```
|
||||
|
||||
**Параметры**
|
||||
@ -910,14 +910,14 @@ randomASKII(length)
|
||||
**Пример**
|
||||
|
||||
```sql
|
||||
SELECT number, randomASCII(30) as str, length(str) FROM system.numbers LIMIT 3
|
||||
SELECT number, randomPrintableASCII(30) as str, length(str) FROM system.numbers LIMIT 3
|
||||
```
|
||||
```text
|
||||
┌─number─┬─str────────────────────────────┬─length(randomASCII(30))─┐
|
||||
│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │
|
||||
│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │
|
||||
│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │
|
||||
└────────┴────────────────────────────────┴─────────────────────────┘
|
||||
┌─number─┬─str────────────────────────────┬─length(randomPrintableASCII(30))─┐
|
||||
│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │
|
||||
│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │
|
||||
│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │
|
||||
└────────┴────────────────────────────────┴──────────────────────────────────┘
|
||||
```
|
||||
|
||||
[Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/other_functions/) <!--hide-->
|
||||
|
Loading…
Reference in New Issue
Block a user