mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-03 13:02:00 +00:00
Merge pull request #35651 from amosbird/columntransformerrename
This commit is contained in:
commit
31c367d3cd
178
src/Common/format.h
Normal file
178
src/Common/format.h
Normal file
@ -0,0 +1,178 @@
|
||||
#pragma once
|
||||
|
||||
#include <base/types.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/PODArray.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
namespace Format
|
||||
{
|
||||
using IndexPositions = PODArrayWithStackMemory<UInt64, 64>;
|
||||
|
||||
static inline void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res, UInt64 argument_number)
|
||||
{
|
||||
res = 0;
|
||||
for (UInt64 pos = l; pos < r; ++pos)
|
||||
{
|
||||
if (!isNumericASCII(description[pos]))
|
||||
throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::BAD_ARGUMENTS);
|
||||
res = res * 10 + description[pos] - '0';
|
||||
if (res >= argument_number)
|
||||
throw Exception(
|
||||
"Too big number for arguments, must be at most " + std::to_string(argument_number - 1), ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void init(
|
||||
const String & pattern,
|
||||
size_t argument_number,
|
||||
const std::vector<std::optional<String>> & constant_strings,
|
||||
IndexPositions & index_positions,
|
||||
std::vector<String> & substrings)
|
||||
{
|
||||
/// Is current position after open curly brace.
|
||||
bool is_open_curly = false;
|
||||
/// The position of last open token.
|
||||
size_t last_open = -1;
|
||||
|
||||
/// Is formatting in a plain {} token.
|
||||
std::optional<bool> is_plain_numbering;
|
||||
UInt64 index_if_plain = 0;
|
||||
|
||||
/// Left position of adding substrings, just to the closed brace position or the start of the string.
|
||||
/// Invariant --- the start of substring is in this position.
|
||||
size_t start_pos = 0;
|
||||
|
||||
/// A flag to decide whether we should glue the constant strings.
|
||||
bool glue_to_next = false;
|
||||
|
||||
/// Handling double braces (escaping).
|
||||
auto double_brace_removal = [](String & str)
|
||||
{
|
||||
size_t i = 0;
|
||||
bool should_delete = true;
|
||||
str.erase(
|
||||
std::remove_if(
|
||||
str.begin(),
|
||||
str.end(),
|
||||
[&i, &should_delete, &str](char)
|
||||
{
|
||||
bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}');
|
||||
++i;
|
||||
if (is_double_brace && should_delete)
|
||||
{
|
||||
should_delete = false;
|
||||
return true;
|
||||
}
|
||||
should_delete = true;
|
||||
return false;
|
||||
}),
|
||||
str.end());
|
||||
};
|
||||
|
||||
index_positions.emplace_back();
|
||||
|
||||
for (size_t i = 0; i < pattern.size(); ++i)
|
||||
{
|
||||
if (pattern[i] == '{')
|
||||
{
|
||||
/// Escaping handling
|
||||
/// It is safe to access because of null termination
|
||||
if (pattern[i + 1] == '{')
|
||||
{
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (is_open_curly)
|
||||
throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
String to_add = String(pattern.data() + start_pos, i - start_pos);
|
||||
double_brace_removal(to_add);
|
||||
if (!glue_to_next)
|
||||
substrings.emplace_back(to_add);
|
||||
else
|
||||
substrings.back() += to_add;
|
||||
|
||||
glue_to_next = false;
|
||||
|
||||
is_open_curly = true;
|
||||
last_open = i + 1;
|
||||
}
|
||||
else if (pattern[i] == '}')
|
||||
{
|
||||
if (pattern[i + 1] == '}')
|
||||
{
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!is_open_curly)
|
||||
throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
is_open_curly = false;
|
||||
|
||||
if (last_open == i)
|
||||
{
|
||||
if (is_plain_numbering && !*is_plain_numbering)
|
||||
throw Exception(
|
||||
"Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS);
|
||||
is_plain_numbering = true;
|
||||
if (index_if_plain >= argument_number)
|
||||
throw Exception("Argument is too big for formatting", ErrorCodes::BAD_ARGUMENTS);
|
||||
index_positions.back() = index_if_plain++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (is_plain_numbering && *is_plain_numbering)
|
||||
throw Exception(
|
||||
"Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS);
|
||||
is_plain_numbering = false;
|
||||
|
||||
UInt64 arg;
|
||||
parseNumber(pattern, last_open, i, arg, argument_number);
|
||||
|
||||
if (arg >= argument_number)
|
||||
throw Exception(
|
||||
"Argument is too big for formatting. Note that indexing starts from zero", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
index_positions.back() = arg;
|
||||
}
|
||||
|
||||
if (!constant_strings.empty() && constant_strings[index_positions.back()])
|
||||
{
|
||||
/// The next string should be glued to last `A {} C`.format('B') -> `A B C`.
|
||||
glue_to_next = true;
|
||||
substrings.back() += *constant_strings[index_positions.back()];
|
||||
}
|
||||
else
|
||||
index_positions.emplace_back(); /// Otherwise we commit arg number and proceed.
|
||||
|
||||
start_pos = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_open_curly)
|
||||
throw Exception("Last open curly brace is not closed", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos);
|
||||
double_brace_removal(to_add);
|
||||
|
||||
if (!glue_to_next)
|
||||
substrings.emplace_back(to_add);
|
||||
else
|
||||
substrings.back() += to_add;
|
||||
|
||||
index_positions.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -52,23 +52,21 @@ public:
|
||||
{
|
||||
if (arguments.size() < 2)
|
||||
throw Exception(
|
||||
"Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size())
|
||||
+ ", should be at least 2.",
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
if (arguments.size() > FormatImpl::argument_threshold)
|
||||
throw Exception(
|
||||
"Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size())
|
||||
+ ", should be at most " + std::to_string(FormatImpl::argument_threshold),
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
|
||||
"Number of arguments for function {} doesn't match: passed {}, should be at least 2",
|
||||
getName(),
|
||||
arguments.size());
|
||||
|
||||
for (const auto arg_idx : collections::range(0, arguments.size()))
|
||||
{
|
||||
const auto * arg = arguments[arg_idx].get();
|
||||
if (!isStringOrFixedString(arg))
|
||||
throw Exception{"Illegal type " + arg->getName() + " of argument " + std::to_string(arg_idx + 1) + " of function "
|
||||
+ getName(),
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type {} of argument {} of function {}",
|
||||
arg->getName(),
|
||||
arg_idx + 1,
|
||||
getName());
|
||||
}
|
||||
|
||||
return std::make_shared<DataTypeString>();
|
||||
@ -125,7 +123,7 @@ private:
|
||||
std::vector<const ColumnString::Chars *> data(num_arguments);
|
||||
std::vector<const ColumnString::Offsets *> offsets(num_arguments);
|
||||
std::vector<size_t> fixed_string_sizes(num_arguments);
|
||||
std::vector<String> constant_strings(num_arguments);
|
||||
std::vector<std::optional<String>> constant_strings(num_arguments);
|
||||
bool has_column_string = false;
|
||||
bool has_column_fixed_string = false;
|
||||
for (size_t i = 0; i < num_arguments; ++i)
|
||||
|
@ -45,25 +45,23 @@ public:
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (arguments.empty())
|
||||
if (arguments.size() < 2)
|
||||
throw Exception(
|
||||
"Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size())
|
||||
+ ", should be at least 1",
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
if (arguments.size() > FormatImpl::argument_threshold)
|
||||
throw Exception(
|
||||
"Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size())
|
||||
+ ", should be at most " + std::to_string(FormatImpl::argument_threshold),
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
|
||||
"Number of arguments for function {} doesn't match: passed {}, should be at least 2",
|
||||
getName(),
|
||||
arguments.size());
|
||||
|
||||
for (const auto arg_idx : collections::range(0, arguments.size()))
|
||||
{
|
||||
const auto * arg = arguments[arg_idx].get();
|
||||
if (!isStringOrFixedString(arg))
|
||||
throw Exception(
|
||||
"Illegal type " + arg->getName() + " of argument " + std::to_string(arg_idx + 1) + " of function " + getName(),
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type {} of argument {} of function {}",
|
||||
arg->getName(),
|
||||
arg_idx + 1,
|
||||
getName());
|
||||
}
|
||||
|
||||
return std::make_shared<DataTypeString>();
|
||||
@ -84,7 +82,7 @@ public:
|
||||
std::vector<const ColumnString::Chars *> data(arguments.size() - 1);
|
||||
std::vector<const ColumnString::Offsets *> offsets(arguments.size() - 1);
|
||||
std::vector<size_t> fixed_string_sizes(arguments.size() - 1);
|
||||
std::vector<String> constant_strings(arguments.size() - 1);
|
||||
std::vector<std::optional<String>> constant_strings(arguments.size() - 1);
|
||||
|
||||
bool has_column_string = false;
|
||||
bool has_column_fixed_string = false;
|
||||
|
@ -4,8 +4,10 @@
|
||||
#include <base/types.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
#include <Common/format.h>
|
||||
#include <Common/memcpySmall.h>
|
||||
|
||||
|
||||
#include <algorithm>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
@ -15,15 +17,9 @@
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
struct FormatImpl
|
||||
{
|
||||
static constexpr size_t small_argument_threshold = 1024;
|
||||
static constexpr size_t argument_threshold = std::numeric_limits<UInt32>::max();
|
||||
static constexpr size_t right_padding = 15;
|
||||
|
||||
template <typename... Args>
|
||||
@ -39,165 +35,10 @@ struct FormatImpl
|
||||
format<false, false>(std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
static void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res)
|
||||
{
|
||||
res = 0;
|
||||
for (UInt64 pos = l; pos < r; ++pos)
|
||||
{
|
||||
if (!isNumericASCII(description[pos]))
|
||||
throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::BAD_ARGUMENTS);
|
||||
res = res * 10 + description[pos] - '0';
|
||||
if (res >= argument_threshold)
|
||||
throw Exception(
|
||||
"Too big number for arguments, must be at most " + std::to_string(argument_threshold), ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void init(
|
||||
const String & pattern,
|
||||
const std::vector<const ColumnString::Chars *> & data,
|
||||
size_t argument_number,
|
||||
const std::vector<String> & constant_strings,
|
||||
UInt64 * index_positions_ptr,
|
||||
std::vector<String> & substrings)
|
||||
{
|
||||
/// Is current position after open curly brace.
|
||||
bool is_open_curly = false;
|
||||
/// The position of last open token.
|
||||
size_t last_open = -1;
|
||||
|
||||
/// Is formatting in a plain {} token.
|
||||
std::optional<bool> is_plain_numbering;
|
||||
UInt64 index_if_plain = 0;
|
||||
|
||||
/// Left position of adding substrings, just to the closed brace position or the start of the string.
|
||||
/// Invariant --- the start of substring is in this position.
|
||||
size_t start_pos = 0;
|
||||
|
||||
/// A flag to decide whether we should glue the constant strings.
|
||||
bool glue_to_next = false;
|
||||
|
||||
/// Handling double braces (escaping).
|
||||
auto double_brace_removal = [](String & str)
|
||||
{
|
||||
size_t i = 0;
|
||||
bool should_delete = true;
|
||||
str.erase(
|
||||
std::remove_if(
|
||||
str.begin(),
|
||||
str.end(),
|
||||
[&i, &should_delete, &str](char)
|
||||
{
|
||||
bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}');
|
||||
++i;
|
||||
if (is_double_brace && should_delete)
|
||||
{
|
||||
should_delete = false;
|
||||
return true;
|
||||
}
|
||||
should_delete = true;
|
||||
return false;
|
||||
}),
|
||||
str.end());
|
||||
};
|
||||
|
||||
for (size_t i = 0; i < pattern.size(); ++i)
|
||||
{
|
||||
if (pattern[i] == '{')
|
||||
{
|
||||
/// Escaping handling
|
||||
/// It is safe to access because of null termination
|
||||
if (pattern[i + 1] == '{')
|
||||
{
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (is_open_curly)
|
||||
throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
String to_add = String(pattern.data() + start_pos, i - start_pos);
|
||||
double_brace_removal(to_add);
|
||||
if (!glue_to_next)
|
||||
substrings.emplace_back(to_add);
|
||||
else
|
||||
substrings.back() += to_add;
|
||||
|
||||
glue_to_next = false;
|
||||
|
||||
is_open_curly = true;
|
||||
last_open = i + 1;
|
||||
}
|
||||
else if (pattern[i] == '}')
|
||||
{
|
||||
if (pattern[i + 1] == '}')
|
||||
{
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!is_open_curly)
|
||||
throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
is_open_curly = false;
|
||||
|
||||
if (last_open == i)
|
||||
{
|
||||
if (is_plain_numbering && !*is_plain_numbering)
|
||||
throw Exception(
|
||||
"Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS);
|
||||
is_plain_numbering = true;
|
||||
if (index_if_plain >= argument_number)
|
||||
throw Exception("Argument is too big for formatting", ErrorCodes::BAD_ARGUMENTS);
|
||||
*index_positions_ptr = index_if_plain++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (is_plain_numbering && *is_plain_numbering)
|
||||
throw Exception(
|
||||
"Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS);
|
||||
is_plain_numbering = false;
|
||||
|
||||
UInt64 arg;
|
||||
parseNumber(pattern, last_open, i, arg);
|
||||
|
||||
if (arg >= argument_number)
|
||||
throw Exception(
|
||||
"Argument is too big for formatting. Note that indexing starts from zero", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
*index_positions_ptr = arg;
|
||||
}
|
||||
|
||||
/// Constant string.
|
||||
if (!data[*index_positions_ptr])
|
||||
{
|
||||
/// The next string should be glued to last `A {} C`.format('B') -> `A B C`.
|
||||
glue_to_next = true;
|
||||
substrings.back() += constant_strings[*index_positions_ptr];
|
||||
}
|
||||
else
|
||||
++index_positions_ptr; /// Otherwise we commit arg number and proceed.
|
||||
|
||||
start_pos = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_open_curly)
|
||||
throw Exception("Last open curly brace is not closed", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos);
|
||||
double_brace_removal(to_add);
|
||||
|
||||
if (!glue_to_next)
|
||||
substrings.emplace_back(to_add);
|
||||
else
|
||||
substrings.back() += to_add;
|
||||
}
|
||||
|
||||
/// data for ColumnString and ColumnFixed. Nullptr means no data, it is const string.
|
||||
/// offsets for ColumnString, nullptr is an indicator that there is a fixed string rather than ColumnString.
|
||||
/// fixed_string_N for savings N to fixed strings.
|
||||
/// constant_strings for constant strings. If data[i] is nullptr, than it is constant string.
|
||||
/// constant_strings for constant strings. If data[i] is nullptr, it is constant string.
|
||||
/// res_data is result_data, res_offsets is offset result.
|
||||
/// input_rows_count is the number of rows processed.
|
||||
/// Precondition: data.size() == offsets.size() == fixed_string_N.size() == constant_strings.size().
|
||||
@ -207,29 +48,22 @@ struct FormatImpl
|
||||
const std::vector<const ColumnString::Chars *> & data,
|
||||
const std::vector<const ColumnString::Offsets *> & offsets,
|
||||
[[maybe_unused]] /* Because sometimes !has_column_fixed_string */ const std::vector<size_t> & fixed_string_N,
|
||||
const std::vector<String> & constant_strings,
|
||||
const std::vector<std::optional<String>> & constant_strings,
|
||||
ColumnString::Chars & res_data,
|
||||
ColumnString::Offsets & res_offsets,
|
||||
size_t input_rows_count)
|
||||
{
|
||||
const size_t argument_number = offsets.size();
|
||||
|
||||
UInt64 small_index_positions_buffer[small_argument_threshold];
|
||||
/// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this array will be filled with [1, 3, 1, 0, ... (garbage)] but without constant string indices.
|
||||
UInt64 * index_positions = small_index_positions_buffer;
|
||||
|
||||
std::unique_ptr<UInt64[]> big_index_positions_buffer;
|
||||
if (argument_number > small_argument_threshold)
|
||||
{
|
||||
big_index_positions_buffer.reset(new UInt64[argument_number]);
|
||||
index_positions = big_index_positions_buffer.get();
|
||||
}
|
||||
/// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this
|
||||
/// array will be filled with [1, 3, 1, 0] but without constant string indices.
|
||||
Format::IndexPositions index_positions;
|
||||
|
||||
/// Vector of substrings of pattern that will be copied to the answer, not string view because of escaping and iterators invalidation.
|
||||
/// These are exactly what is between {} tokens, for `Hello {} world {}` we will have [`Hello `, ` world `, ``].
|
||||
std::vector<String> substrings;
|
||||
|
||||
init(pattern, data, argument_number, constant_strings, index_positions, substrings);
|
||||
Format::init(pattern, argument_number, constant_strings, index_positions, substrings);
|
||||
|
||||
UInt64 final_size = 0;
|
||||
|
||||
@ -271,7 +105,7 @@ struct FormatImpl
|
||||
for (size_t j = 1; j < substrings.size(); ++j)
|
||||
{
|
||||
UInt64 arg = index_positions[j - 1];
|
||||
auto offset_ptr = offsets[arg];
|
||||
const auto * offset_ptr = offsets[arg];
|
||||
UInt64 arg_offset = 0;
|
||||
UInt64 size = 0;
|
||||
|
||||
|
@ -0,0 +1 @@
|
||||
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user