#pragma once #include #include #include #include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int LOGICAL_ERROR; } struct FormatImpl { static constexpr size_t small_argument_threshold = 1024; static constexpr size_t argument_threshold = std::numeric_limits::max(); static constexpr size_t right_padding = 15; template static inline void formatExecute(bool possibly_has_column_string, bool possibly_has_column_fixed_string, Args &&... args) { if (possibly_has_column_string && possibly_has_column_fixed_string) format(std::forward(args)...); else if (!possibly_has_column_string && possibly_has_column_fixed_string) format(std::forward(args)...); else if (possibly_has_column_string && !possibly_has_column_fixed_string) format(std::forward(args)...); else format(std::forward(args)...); } static void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res) { res = 0; for (UInt64 pos = l; pos < r; pos++) { if (!isNumericASCII(description[pos])) throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::LOGICAL_ERROR); res = res * 10 + description[pos] - '0'; if (res >= argument_threshold) throw Exception( "Too big number for arguments, must be at most " + std::to_string(argument_threshold), ErrorCodes::LOGICAL_ERROR); } } static inline void init( const String & pattern, const std::vector & data, size_t argument_number, const std::vector & constant_strings, UInt64 * index_positions_ptr, std::vector & substrings) { /// Is current position is after open curly brace. bool is_open_curly = false; /// The position of last open token. size_t last_open = -1; /// Is formatting in a plain {} token. std::optional is_plain_numbering; UInt64 index_if_plain = 0; /// Left position of adding substrings, just to the closed brace position or the start of the string. /// Invariant --- the start of substring is in this position. size_t start_pos = 0; /// A flag to decide whether we should glue the constant strings. bool glue_to_next = false; /// Handling double braces (escaping). auto double_brace_removal = [](String & str) { size_t i = 0; bool should_delete = true; str.erase( std::remove_if( str.begin(), str.end(), [&i, &should_delete, &str](char) { bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}'); ++i; if (is_double_brace && should_delete) { should_delete = false; return true; } should_delete = true; return false; }), str.end()); }; for (size_t i = 0; i < pattern.size(); ++i) { if (pattern[i] == '{') { /// Escaping handling /// It is safe to access because of null termination if (pattern[i + 1] == '{') { ++i; continue; } if (is_open_curly) throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::LOGICAL_ERROR); String to_add = String(pattern.data() + start_pos, i - start_pos); double_brace_removal(to_add); if (!glue_to_next) substrings.emplace_back(to_add); else substrings.back() += to_add; glue_to_next = false; is_open_curly = true; last_open = i + 1; } else if (pattern[i] == '}') { if (pattern[i + 1] == '}') { ++i; continue; } if (!is_open_curly) throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::LOGICAL_ERROR); is_open_curly = false; if (last_open == i) { if (is_plain_numbering && !*is_plain_numbering) throw Exception( "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::LOGICAL_ERROR); is_plain_numbering = true; if (index_if_plain >= argument_number) throw Exception("Argument is too big for formatting", ErrorCodes::LOGICAL_ERROR); *index_positions_ptr = index_if_plain++; } else { if (is_plain_numbering && *is_plain_numbering) throw Exception( "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::LOGICAL_ERROR); is_plain_numbering = false; UInt64 arg; parseNumber(pattern, last_open, i, arg); if (arg >= argument_number) throw Exception( "Argument is too big for formatting. Note that indexing starts from zero", ErrorCodes::LOGICAL_ERROR); *index_positions_ptr = arg; } /// Constant string. if (!data[*index_positions_ptr]) { /// The next string should be glued to last `A {} C`.format('B') -> `A B C`. glue_to_next = true; substrings.back() += constant_strings[*index_positions_ptr]; } else ++index_positions_ptr; /// Otherwise we commit arg number and proceed. start_pos = i + 1; } } if (is_open_curly) throw Exception("Last open curly brace is not closed", ErrorCodes::LOGICAL_ERROR); String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos); double_brace_removal(to_add); if (!glue_to_next) substrings.emplace_back(to_add); else substrings.back() += to_add; } /// data for ColumnString and ColumnFixed. Nullptr means no data, it is const string. /// offsets for ColumnString, nullptr is an indicator that there is a fixed string rather than ColumnString. /// fixed_string_N for savings N to fixed strings. /// constant_strings for constant strings. If data[i] is nullptr, than it is constant string. /// res_data is result_data, res_offsets is offset result. /// input_rows_count is the number of rows processed. /// Precondition: data.size() == offsets.size() == fixed_string_N.size() == constant_strings.size(). template static inline void format( String pattern, const std::vector & data, const std::vector & offsets, [[maybe_unused]] /* Because sometimes !HasColumnFixedString */ const std::vector & fixed_string_N, const std::vector & constant_strings, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets, size_t input_rows_count) { const size_t argument_number = offsets.size(); UInt64 small_index_positions_buffer[small_argument_threshold]; /// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this array will be filled with [1, 3, 1, 0, ... (garbage)] but without constant string indices. UInt64 * index_positions = small_index_positions_buffer; std::unique_ptr big_index_positions_buffer; if (argument_number > small_argument_threshold) { big_index_positions_buffer.reset(new UInt64[argument_number]); index_positions = big_index_positions_buffer.get(); } /// Vector of substrings of pattern that will be copied to the ans, not string view because of escaping and iterators invalidation. /// These are exactly what is between {} tokens, for `Hello {} world {}` we will have [`Hello `, ` world `, ``]. std::vector substrings; init(pattern, data, argument_number, constant_strings, index_positions, substrings); UInt64 final_size = 0; for (String & str : substrings) { /// To use memcpySmallAllowReadWriteOverflow15 for substrings we should allocate a bit more to each string. /// That was chosen due to perfomance issues. if (!str.empty()) str.reserve(str.size() + right_padding); final_size += str.size(); } /// The substring number is repeated input_rows_times. final_size *= input_rows_count; /// Strings without null termination. for (size_t i = 1; i < substrings.size(); ++i) { final_size += data[index_positions[i - 1]]->size(); /// Fixed strings do not have zero terminating character. if (offsets[index_positions[i - 1]]) final_size -= input_rows_count; } /// Null termination characters. final_size += input_rows_count; res_data.resize(final_size); res_offsets.resize(input_rows_count); UInt64 offset = 0; for (UInt64 i = 0; i < input_rows_count; ++i) { memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, substrings[0].data(), substrings[0].size()); offset += substrings[0].size(); /// All strings are constant, we should have substrings.size() == 1. if constexpr (HasColumnString || HasColumnFixedString) { for (size_t j = 1; j < substrings.size(); ++j) { UInt64 arg = index_positions[j - 1]; auto offset_ptr = offsets[arg]; UInt64 arg_offset = 0; UInt64 size = 0; if constexpr (HasColumnString) { if (!HasColumnFixedString || offset_ptr) { arg_offset = (*offset_ptr)[i - 1]; size = (*offset_ptr)[i] - arg_offset - 1; } } if constexpr (HasColumnFixedString) { if (!HasColumnString || !offset_ptr) { arg_offset = fixed_string_N[arg] * i; size = fixed_string_N[arg]; } } memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, data[arg]->data() + arg_offset, size); offset += size; memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, substrings[j].data(), substrings[j].size()); offset += substrings[j].size(); } } res_data[offset] = '\0'; ++offset; res_offsets[i] = offset; } /* * Invariant of `offset == final_size` must be held. * * if (offset != final_size) * abort(); */ } }; }