2019-05-20 21:06:43 +00:00
#pragma once

#include <Columns/ColumnString.h>
#include <Core/Types.h>
#include <Common/Exception.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/memcpySmall.h>

#include <algorithm>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
2019-12-30 20:03:22 +00:00
2019-05-20 21:06:43 +00:00
namespace DB
{
/// Error codes used by this header. Only declared here; the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
}
struct FormatImpl
{
static constexpr size_t small_argument_threshold = 1024 ;
static constexpr size_t argument_threshold = std : : numeric_limits < UInt32 > : : max ( ) ;
static constexpr size_t right_padding = 15 ;
template < typename . . . Args >
static inline void formatExecute ( bool possibly_has_column_string , bool possibly_has_column_fixed_string , Args & & . . . args )
{
if ( possibly_has_column_string & & possibly_has_column_fixed_string )
format < true , true > ( std : : forward < Args > ( args ) . . . ) ;
else if ( ! possibly_has_column_string & & possibly_has_column_fixed_string )
format < false , true > ( std : : forward < Args > ( args ) . . . ) ;
else if ( possibly_has_column_string & & ! possibly_has_column_fixed_string )
format < true , false > ( std : : forward < Args > ( args ) . . . ) ;
else
format < false , false > ( std : : forward < Args > ( args ) . . . ) ;
}
static void parseNumber ( const String & description , UInt64 l , UInt64 r , UInt64 & res )
{
res = 0 ;
for ( UInt64 pos = l ; pos < r ; pos + + )
{
if ( ! isNumericASCII ( description [ pos ] ) )
2019-12-30 14:46:02 +00:00
throw Exception ( " Not a number in curly braces at position " + std : : to_string ( pos ) , ErrorCodes : : BAD_ARGUMENTS ) ;
2019-05-20 21:06:43 +00:00
res = res * 10 + description [ pos ] - ' 0 ' ;
if ( res > = argument_threshold )
throw Exception (
2019-12-30 14:46:02 +00:00
" Too big number for arguments, must be at most " + std : : to_string ( argument_threshold ) , ErrorCodes : : BAD_ARGUMENTS ) ;
2019-05-20 21:06:43 +00:00
}
}
static inline void init (
const String & pattern ,
const std : : vector < const ColumnString : : Chars * > & data ,
size_t argument_number ,
const std : : vector < String > & constant_strings ,
UInt64 * index_positions_ptr ,
std : : vector < String > & substrings )
{
2019-12-30 20:03:22 +00:00
/// Is current position after open curly brace.
2019-05-20 21:06:43 +00:00
bool is_open_curly = false ;
/// The position of last open token.
size_t last_open = - 1 ;
/// Is formatting in a plain {} token.
std : : optional < bool > is_plain_numbering ;
UInt64 index_if_plain = 0 ;
/// Left position of adding substrings, just to the closed brace position or the start of the string.
/// Invariant --- the start of substring is in this position.
size_t start_pos = 0 ;
/// A flag to decide whether we should glue the constant strings.
bool glue_to_next = false ;
/// Handling double braces (escaping).
auto double_brace_removal = [ ] ( String & str )
{
size_t i = 0 ;
bool should_delete = true ;
str . erase (
std : : remove_if (
str . begin ( ) ,
str . end ( ) ,
[ & i , & should_delete , & str ] ( char )
{
bool is_double_brace = ( str [ i ] = = ' { ' & & str [ i + 1 ] = = ' { ' ) | | ( str [ i ] = = ' } ' & & str [ i + 1 ] = = ' } ' ) ;
+ + i ;
if ( is_double_brace & & should_delete )
{
should_delete = false ;
return true ;
}
should_delete = true ;
return false ;
} ) ,
str . end ( ) ) ;
} ;
for ( size_t i = 0 ; i < pattern . size ( ) ; + + i )
{
if ( pattern [ i ] = = ' { ' )
{
/// Escaping handling
/// It is safe to access because of null termination
if ( pattern [ i + 1 ] = = ' { ' )
{
+ + i ;
continue ;
}
if ( is_open_curly )
2019-12-30 14:46:02 +00:00
throw Exception ( " Two open curly braces without close one at position " + std : : to_string ( i ) , ErrorCodes : : BAD_ARGUMENTS ) ;
2019-05-20 21:06:43 +00:00
String to_add = String ( pattern . data ( ) + start_pos , i - start_pos ) ;
double_brace_removal ( to_add ) ;
if ( ! glue_to_next )
substrings . emplace_back ( to_add ) ;
else
substrings . back ( ) + = to_add ;
glue_to_next = false ;
is_open_curly = true ;
last_open = i + 1 ;
}
else if ( pattern [ i ] = = ' } ' )
{
if ( pattern [ i + 1 ] = = ' } ' )
{
+ + i ;
continue ;
}
if ( ! is_open_curly )
2019-12-30 14:46:02 +00:00
throw Exception ( " Closed curly brace without open one at position " + std : : to_string ( i ) , ErrorCodes : : BAD_ARGUMENTS ) ;
2019-05-20 21:06:43 +00:00
is_open_curly = false ;
if ( last_open = = i )
{
if ( is_plain_numbering & & ! * is_plain_numbering )
throw Exception (
2019-12-30 14:46:02 +00:00
" Cannot switch from automatic field numbering to manual field specification " , ErrorCodes : : BAD_ARGUMENTS ) ;
2019-05-20 21:06:43 +00:00
is_plain_numbering = true ;
if ( index_if_plain > = argument_number )
2019-12-30 14:46:02 +00:00
throw Exception ( " Argument is too big for formatting " , ErrorCodes : : BAD_ARGUMENTS ) ;
2019-05-20 21:06:43 +00:00
* index_positions_ptr = index_if_plain + + ;
}
else
{
if ( is_plain_numbering & & * is_plain_numbering )
throw Exception (
2019-12-30 14:46:02 +00:00
" Cannot switch from automatic field numbering to manual field specification " , ErrorCodes : : BAD_ARGUMENTS ) ;
2019-05-20 21:06:43 +00:00
is_plain_numbering = false ;
UInt64 arg ;
parseNumber ( pattern , last_open , i , arg ) ;
if ( arg > = argument_number )
throw Exception (
2019-12-30 14:46:02 +00:00
" Argument is too big for formatting. Note that indexing starts from zero " , ErrorCodes : : BAD_ARGUMENTS ) ;
2019-05-20 21:06:43 +00:00
* index_positions_ptr = arg ;
}
/// Constant string.
if ( ! data [ * index_positions_ptr ] )
{
/// The next string should be glued to last `A {} C`.format('B') -> `A B C`.
glue_to_next = true ;
substrings . back ( ) + = constant_strings [ * index_positions_ptr ] ;
}
else
+ + index_positions_ptr ; /// Otherwise we commit arg number and proceed.
start_pos = i + 1 ;
}
}
if ( is_open_curly )
2019-12-30 14:46:02 +00:00
throw Exception ( " Last open curly brace is not closed " , ErrorCodes : : BAD_ARGUMENTS ) ;
2019-05-20 21:06:43 +00:00
String to_add = String ( pattern . data ( ) + start_pos , pattern . size ( ) - start_pos ) ;
double_brace_removal ( to_add ) ;
if ( ! glue_to_next )
substrings . emplace_back ( to_add ) ;
else
substrings . back ( ) + = to_add ;
}
/// data for ColumnString and ColumnFixed. Nullptr means no data, it is const string.
/// offsets for ColumnString, nullptr is an indicator that there is a fixed string rather than ColumnString.
/// fixed_string_N for savings N to fixed strings.
/// constant_strings for constant strings. If data[i] is nullptr, than it is constant string.
/// res_data is result_data, res_offsets is offset result.
/// input_rows_count is the number of rows processed.
/// Precondition: data.size() == offsets.size() == fixed_string_N.size() == constant_strings.size().
2019-10-25 03:25:02 +00:00
template < bool has_column_string , bool has_column_fixed_string >
2019-05-20 21:06:43 +00:00
static inline void format (
String pattern ,
const std : : vector < const ColumnString : : Chars * > & data ,
const std : : vector < const ColumnString : : Offsets * > & offsets ,
2019-10-25 03:25:02 +00:00
[[maybe_unused]] /* Because sometimes !has_column_fixed_string */ const std : : vector < size_t > & fixed_string_N ,
2019-05-20 21:06:43 +00:00
const std : : vector < String > & constant_strings ,
ColumnString : : Chars & res_data ,
ColumnString : : Offsets & res_offsets ,
size_t input_rows_count )
{
const size_t argument_number = offsets . size ( ) ;
UInt64 small_index_positions_buffer [ small_argument_threshold ] ;
/// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this array will be filled with [1, 3, 1, 0, ... (garbage)] but without constant string indices.
UInt64 * index_positions = small_index_positions_buffer ;
std : : unique_ptr < UInt64 [ ] > big_index_positions_buffer ;
if ( argument_number > small_argument_threshold )
{
big_index_positions_buffer . reset ( new UInt64 [ argument_number ] ) ;
index_positions = big_index_positions_buffer . get ( ) ;
}
/// Vector of substrings of pattern that will be copied to the ans, not string view because of escaping and iterators invalidation.
/// These are exactly what is between {} tokens, for `Hello {} world {}` we will have [`Hello `, ` world `, ``].
std : : vector < String > substrings ;
init ( pattern , data , argument_number , constant_strings , index_positions , substrings ) ;
UInt64 final_size = 0 ;
for ( String & str : substrings )
{
/// To use memcpySmallAllowReadWriteOverflow15 for substrings we should allocate a bit more to each string.
/// That was chosen due to perfomance issues.
if ( ! str . empty ( ) )
str . reserve ( str . size ( ) + right_padding ) ;
final_size + = str . size ( ) ;
}
/// The substring number is repeated input_rows_times.
final_size * = input_rows_count ;
/// Strings without null termination.
for ( size_t i = 1 ; i < substrings . size ( ) ; + + i )
{
final_size + = data [ index_positions [ i - 1 ] ] - > size ( ) ;
/// Fixed strings do not have zero terminating character.
if ( offsets [ index_positions [ i - 1 ] ] )
final_size - = input_rows_count ;
}
/// Null termination characters.
final_size + = input_rows_count ;
res_data . resize ( final_size ) ;
res_offsets . resize ( input_rows_count ) ;
UInt64 offset = 0 ;
for ( UInt64 i = 0 ; i < input_rows_count ; + + i )
{
memcpySmallAllowReadWriteOverflow15 ( res_data . data ( ) + offset , substrings [ 0 ] . data ( ) , substrings [ 0 ] . size ( ) ) ;
offset + = substrings [ 0 ] . size ( ) ;
/// All strings are constant, we should have substrings.size() == 1.
2019-10-25 03:25:02 +00:00
if constexpr ( has_column_string | | has_column_fixed_string )
2019-05-20 21:06:43 +00:00
{
for ( size_t j = 1 ; j < substrings . size ( ) ; + + j )
{
UInt64 arg = index_positions [ j - 1 ] ;
auto offset_ptr = offsets [ arg ] ;
UInt64 arg_offset = 0 ;
UInt64 size = 0 ;
2019-10-25 03:25:02 +00:00
if constexpr ( has_column_string )
2019-05-20 21:06:43 +00:00
{
2019-10-25 03:25:02 +00:00
if ( ! has_column_fixed_string | | offset_ptr )
2019-05-20 21:06:43 +00:00
{
arg_offset = ( * offset_ptr ) [ i - 1 ] ;
size = ( * offset_ptr ) [ i ] - arg_offset - 1 ;
}
}
2019-10-25 03:25:02 +00:00
if constexpr ( has_column_fixed_string )
2019-05-20 21:06:43 +00:00
{
2019-10-25 03:25:02 +00:00
if ( ! has_column_string | | ! offset_ptr )
2019-05-20 21:06:43 +00:00
{
arg_offset = fixed_string_N [ arg ] * i ;
size = fixed_string_N [ arg ] ;
}
}
memcpySmallAllowReadWriteOverflow15 ( res_data . data ( ) + offset , data [ arg ] - > data ( ) + arg_offset , size ) ;
offset + = size ;
memcpySmallAllowReadWriteOverflow15 ( res_data . data ( ) + offset , substrings [ j ] . data ( ) , substrings [ j ] . size ( ) ) ;
offset + = substrings [ j ] . size ( ) ;
}
}
res_data [ offset ] = ' \0 ' ;
+ + offset ;
res_offsets [ i ] = offset ;
}
/*
* Invariant of ` offset = = final_size ` must be held .
*
* if ( offset ! = final_size )
* abort ( ) ;
*/
}
} ;
}