Merge pull request #66257 from bigo-sg/ch_support_printf

Add function printf for spark compatiability
This commit is contained in:
vdimir 2024-08-05 09:53:22 +00:00 committed by GitHub
commit 908f3fc937
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 451 additions and 0 deletions

View File

@ -223,3 +223,28 @@ SELECT translateUTF8('Münchener Straße', 'üß', 'us') AS res;
│ Munchener Strase │
└──────────────────┘
```
## printf
The `printf` function formats the given string with the values (strings, integers, floating-points etc.) listed in the arguments, similar to printf function in C++. The format string can contain format specifiers starting with `%` character. Anything not contained in `%` and the following format specifier is considered literal text and copied verbatim into the output. Literal `%` character can be escaped by `%%`.
**Syntax**
``` sql
printf(format, arg1, arg2, ...)
```
**Example**
Query:
``` sql
select printf('%%%s %s %d', 'Hello', 'World', 2024);
```
``` response
┌─printf('%%%s %s %d', 'Hello', 'World', 2024)─┐
│ %Hello World 2024 │
└──────────────────────────────────────────────┘
```

364
src/Functions/printf.cpp Normal file
View File

@ -0,0 +1,364 @@
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnStringHelpers.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Functions/formatString.h>
#include <IO/Operators.h>
#include <IO/WriteHelpers.h>
#include <memory>
#include <vector>
#include <fmt/format.h>
#include <fmt/printf.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int BAD_ARGUMENTS;
}
namespace
{
class FunctionPrintf : public IFunction
{
private:
ContextPtr context;
FunctionOverloadResolverPtr function_concat;
struct Instruction
{
std::string_view format;
size_t rows;
bool is_literal; /// format is literal string without any argument
ColumnWithTypeAndName input; /// Only used when is_literal is false
ColumnWithTypeAndName execute() const
{
if (is_literal)
return executeLiteral(format);
else if (isColumnConst(*input.column))
return executeConstant(input);
else
return executeNonconstant(input);
}
[[maybe_unused]] String toString() const
{
WriteBufferFromOwnString buf;
buf << "format:" << format << ", rows:" << rows << ", is_literal:" << is_literal << ", input:" << input.dumpStructure() << "\n";
return buf.str();
}
private:
ColumnWithTypeAndName executeLiteral(std::string_view literal) const
{
ColumnWithTypeAndName res;
auto str_col = ColumnString::create();
str_col->insert(fmt::sprintf(literal));
res.column = ColumnConst::create(std::move(str_col), rows);
res.type = std::make_shared<DataTypeString>();
return res;
}
ColumnWithTypeAndName executeConstant(const ColumnWithTypeAndName & arg) const
{
ColumnWithTypeAndName tmp_arg = arg;
const auto & const_col = static_cast<const ColumnConst &>(*arg.column);
tmp_arg.column = const_col.getDataColumnPtr();
ColumnWithTypeAndName tmp_res = executeNonconstant(tmp_arg);
return ColumnWithTypeAndName{ColumnConst::create(tmp_res.column, arg.column->size()), tmp_res.type, tmp_res.name};
}
template <typename T>
bool executeNumber(const IColumn & column, ColumnString::Chars & res_chars, ColumnString::Offsets & res_offsets) const
{
const ColumnVector<T> * concrete_column = checkAndGetColumn<ColumnVector<T>>(&column);
if (!concrete_column)
return false;
String s;
size_t curr_offset = 0;
const auto & data = concrete_column->getData();
for (size_t i = 0; i < data.size(); ++i)
{
T a = data[i];
s = fmt::sprintf(format, static_cast<NearestFieldType<T>>(a));
res_chars.resize(curr_offset + s.size() + 1);
memcpy(&res_chars[curr_offset], s.data(), s.size());
res_chars[curr_offset + s.size()] = 0;
curr_offset += s.size() + 1;
res_offsets[i] = curr_offset;
}
return true;
}
template <typename COLUMN>
bool executeString(const IColumn & column, ColumnString::Chars & res_chars, ColumnString::Offsets & res_offsets) const
{
const COLUMN * concrete_column = checkAndGetColumn<COLUMN>(&column);
if (!concrete_column)
return false;
String s;
size_t curr_offset = 0;
for (size_t i = 0; i < concrete_column->size(); ++i)
{
auto a = concrete_column->getDataAt(i).toView();
s = fmt::sprintf(format, a);
res_chars.resize(curr_offset + s.size() + 1);
memcpy(&res_chars[curr_offset], s.data(), s.size());
res_chars[curr_offset + s.size()] = 0;
curr_offset += s.size() + 1;
res_offsets[i] = curr_offset;
}
return true;
}
ColumnWithTypeAndName executeNonconstant(const ColumnWithTypeAndName & arg) const
{
size_t size = arg.column->size();
auto res_col = ColumnString::create();
auto & res_str = static_cast<ColumnString &>(*res_col);
auto & res_offsets = res_str.getOffsets();
auto & res_chars = res_str.getChars();
res_offsets.resize_exact(size);
res_chars.reserve(format.size() * size);
WhichDataType which(arg.type);
if (which.isNativeNumber()
&& (executeNumber<UInt8>(*arg.column, res_chars, res_offsets) || executeNumber<UInt16>(*arg.column, res_chars, res_offsets)
|| executeNumber<UInt32>(*arg.column, res_chars, res_offsets)
|| executeNumber<UInt64>(*arg.column, res_chars, res_offsets)
|| executeNumber<Int8>(*arg.column, res_chars, res_offsets) || executeNumber<Int16>(*arg.column, res_chars, res_offsets)
|| executeNumber<Int32>(*arg.column, res_chars, res_offsets)
|| executeNumber<Int64>(*arg.column, res_chars, res_offsets)
|| executeNumber<Float32>(*arg.column, res_chars, res_offsets)
|| executeNumber<Float64>(*arg.column, res_chars, res_offsets)))
{
return {std::move(res_col), std::make_shared<DataTypeString>(), arg.name};
}
else if (
which.isStringOrFixedString()
&& (executeString<ColumnString>(*arg.column, res_chars, res_offsets)
|| executeString<ColumnFixedString>(*arg.column, res_chars, res_offsets)))
{
return {std::move(res_col), std::make_shared<DataTypeString>(), arg.name};
}
else
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"The argument type of function {} is {}, but native numeric or string type is expected",
FunctionPrintf::name,
arg.type->getName());
}
};
public:
static constexpr auto name = "printf";
static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionPrintf>(context); }
explicit FunctionPrintf(ContextPtr context_)
: context(context_), function_concat(FunctionFactory::instance().get("concat", context)) { }
String getName() const override { return name; }
bool isVariadic() const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
bool useDefaultImplementationForConstants() const override { return false; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.empty())
throw Exception(
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, should be at least 1",
getName(),
arguments.size());
/// First pattern argument must have string type
if (!isString(arguments[0]))
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"The first argument type of function {} is {}, but String type is expected",
getName(),
arguments[0]->getName());
for (size_t i = 1; i < arguments.size(); ++i)
{
if (!isNativeNumber(arguments[i]) && !isStringOrFixedString(arguments[i]))
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"The {}-th argument type of function {} is {}, but native numeric or string type is expected",
i + 1,
getName(),
arguments[i]->getName());
}
return std::make_shared<DataTypeString>();
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
const ColumnPtr & c0 = arguments[0].column;
const ColumnConst * c0_const_string = typeid_cast<const ColumnConst *>(&*c0);
if (!c0_const_string)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument of function {} must be constant string", getName());
String format = c0_const_string->getValue<String>();
auto instructions = buildInstructions(format, arguments, input_rows_count);
ColumnsWithTypeAndName concat_args(instructions.size());
for (size_t i = 0; i < instructions.size(); ++i)
{
const auto & instruction = instructions[i];
try
{
// std::cout << "instruction[" << i << "]:" << instructions[i].toString() << std::endl;
concat_args[i] = instruction.execute();
// std::cout << "concat_args[" << i << "]:" << concat_args[i].dumpStructure() << std::endl;
}
catch (const fmt::v9::format_error & e)
{
if (instruction.is_literal)
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Bad format {} in function {} without input argument, reason: {}",
instruction.format,
getName(),
e.what());
else
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Bad format {} in function {} with {} as input argument, reason: {}",
instructions[i].format,
getName(),
instruction.input.dumpStructure(),
e.what());
}
}
auto res = function_concat->build(concat_args)->execute(concat_args, std::make_shared<DataTypeString>(), input_rows_count);
return res;
}
private:
std::vector<Instruction>
buildInstructions(const String & format, const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const
{
std::vector<Instruction> instructions;
instructions.reserve(arguments.size());
auto append_instruction = [&](const char * begin, const char * end, const ColumnWithTypeAndName & arg)
{
Instruction instr;
instr.rows = input_rows_count;
instr.format = std::string_view(begin, end - begin);
size_t size = end - begin;
if (size > 1 && begin[0] == '%' and begin[1] != '%')
{
instr.is_literal = false;
instr.input = arg;
}
else
{
instr.is_literal = true;
}
instructions.emplace_back(std::move(instr));
};
auto check_index_range = [&](size_t idx)
{
if (idx >= arguments.size())
throw Exception(
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, but format is {}",
getName(),
arguments.size(),
format);
};
const char * begin = format.data();
const char * end = format.data() + format.size();
const char * curr = begin;
size_t idx = 0;
while (curr < end)
{
const char * tmp = curr;
bool is_first = curr == begin; /// If current instruction is the first one
bool is_literal = false; /// If current instruction is literal string without any argument
if (is_first)
{
if (*curr != '%')
is_literal = true;
else if (curr + 1 < end && *(curr + 1) == '%')
is_literal = true;
else
++idx; /// Skip first argument if first instruction is not literal
}
if (!is_literal)
++curr;
while (curr < end)
{
if (*curr != '%')
++curr;
else if (curr + 1 < end && *(curr + 1) == '%')
curr += 2;
else
{
check_index_range(idx);
append_instruction(tmp, curr, arguments[idx]);
++idx;
break;
}
}
if (curr == end)
{
check_index_range(idx);
append_instruction(tmp, curr, arguments[idx]);
++idx;
}
}
/// Check if all arguments are used
if (idx != arguments.size())
throw Exception(
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, but format is {}",
getName(),
arguments.size(),
format);
return instructions;
}
};
}
REGISTER_FUNCTION(Printf)
{
factory.registerFunction<FunctionPrintf>();
}
}

View File

@ -562,6 +562,7 @@ positionCaseInsensitive
positionCaseInsensitiveUTF8
positionUTF8
pow
printf
proportionsZTest
protocol
queryID

View File

@ -0,0 +1,21 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

View File

@ -0,0 +1,39 @@
-- Testing integer formats
select printf('%%d: %d', 123) = '%d: 123';
select printf('%%i: %i', 123) = '%i: 123';
select printf('%%u: %u', 123) = '%u: 123';
select printf('%%o: %o', 123) = '%o: 173';
select printf('%%x: %x', 123) = '%x: 7b';
select printf('%%X: %X', 123) = '%X: 7B';
-- Testing floating point formats
select printf('%%f: %f', 123.456) = '%f: 123.456000';
select printf('%%F: %F', 123.456) = '%F: 123.456000';
select printf('%%e: %e', 123.456) = '%e: 1.234560e+02';
select printf('%%E: %E', 123.456) = '%E: 1.234560E+02';
select printf('%%g: %g', 123.456) = '%g: 123.456';
select printf('%%G: %G', 123.456) = '%G: 123.456';
select printf('%%a: %a', 123.456) = '%a: 0x1.edd2f1a9fbe77p+6';
select printf('%%A: %A', 123.456) = '%A: 0X1.EDD2F1A9FBE77P+6';
-- Testing character formats
select printf('%%s: %s', 'abc') = '%s: abc';
-- Testing the %% specifier
select printf('%%%%: %%') = '%%: %';
-- Testing integer formats with precision
select printf('%%.5d: %.5d', 123) = '%.5d: 00123';
-- Testing floating point formats with precision
select printf('%%.2f: %.2f', 123.456) = '%.2f: 123.46';
select printf('%%.2e: %.2e', 123.456) = '%.2e: 1.23e+02';
select printf('%%.2g: %.2g', 123.456) = '%.2g: 1.2e+02';
-- Testing character formats with precision
select printf('%%.2s: %.2s', 'abc') = '%.2s: ab';
select printf('%%X: %X', 123.123); -- { serverError BAD_ARGUMENTS }
select printf('%%A: %A', 'abc'); -- { serverError BAD_ARGUMENTS }
select printf('%%s: %s', 100); -- { serverError BAD_ARGUMENTS }
select printf('%%n: %n', 100); -- { serverError BAD_ARGUMENTS }

View File

@ -2282,6 +2282,7 @@ prettyspacemonoblock
prettyspacenoescapes
prettyspacenoescapesmonoblock
prewhere
printf
privateKeyFile
privateKeyPassphraseHandler
prlimit