2020-05-06 23:21:13 +00:00
# pragma once
2021-10-02 07:13:14 +00:00
# include <base/types.h>
2022-06-26 16:12:17 +00:00
# include <Columns/ColumnArray.h>
2020-05-06 23:21:13 +00:00
# include <Columns/ColumnString.h>
# include <DataTypes/DataTypesNumber.h>
2022-03-12 18:05:50 +00:00
# include <DataTypes/DataTypeArray.h>
2022-06-24 14:12:38 +00:00
# include <Functions/checkHyperscanRegexp.h>
2023-02-08 13:07:27 +00:00
# include <Functions/Regexps.h>
2020-05-06 23:21:13 +00:00
2022-09-28 08:45:15 +00:00
# include "config.h"
2020-05-06 23:21:13 +00:00
2022-06-17 10:15:19 +00:00
# if USE_VECTORSCAN
2020-05-06 23:21:13 +00:00
# include <hs.h>
# endif
namespace DB
{
/// Error codes thrown by the functions in this header.
/// They are only declared here (`extern`); the definitions live elsewhere.
namespace ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY ;
2022-06-25 15:53:11 +00:00
extern const int FUNCTION_NOT_ALLOWED ;
extern const int HYPERSCAN_CANNOT_SCAN_TEXT ;
2020-05-06 23:51:41 +00:00
extern const int NOT_IMPLEMENTED ;
2020-05-06 23:21:13 +00:00
extern const int TOO_MANY_BYTES ;
}
2022-06-25 15:28:15 +00:00
template < typename Name , typename ResultType_ , bool WithEditDistance >
2020-05-06 23:21:13 +00:00
struct MultiMatchAllIndicesImpl
{
2022-06-24 13:34:40 +00:00
using ResultType = ResultType_ ;
2020-05-06 23:21:13 +00:00
/// Flag telling whether offsets are used for the output; most likely it is used
/// to determine whether the function returns a ColumnVector or a ColumnArray.
static constexpr bool is_column_array = true ;
2021-09-21 16:43:46 +00:00
static constexpr auto name = Name : : name ;
2020-05-06 23:21:13 +00:00
static auto getReturnType ( )
{
return std : : make_shared < DataTypeArray > ( std : : make_shared < DataTypeUInt64 > ( ) ) ;
}
static void vectorConstant (
const ColumnString : : Chars & haystack_data ,
const ColumnString : : Offsets & haystack_offsets ,
2022-06-26 16:12:17 +00:00
const Array & needles_arr ,
2022-06-24 13:34:40 +00:00
PaddedPODArray < ResultType > & res ,
2022-06-24 14:12:38 +00:00
PaddedPODArray < UInt64 > & offsets ,
2022-06-25 15:53:11 +00:00
bool allow_hyperscan ,
2022-06-24 14:12:38 +00:00
size_t max_hyperscan_regexp_length ,
2023-02-08 13:07:27 +00:00
size_t max_hyperscan_regexp_total_length ,
bool reject_expensive_hyperscan_regexps )
2020-05-06 23:21:13 +00:00
{
2023-02-08 13:07:27 +00:00
vectorConstant ( haystack_data , haystack_offsets , needles_arr , res , offsets , std : : nullopt , allow_hyperscan , max_hyperscan_regexp_length , max_hyperscan_regexp_total_length , reject_expensive_hyperscan_regexps ) ;
2020-05-06 23:21:13 +00:00
}
static void vectorConstant (
2022-06-28 07:51:36 +00:00
const ColumnString : : Chars & haystack_data ,
const ColumnString : : Offsets & haystack_offsets ,
const Array & needles_arr ,
PaddedPODArray < ResultType > & res ,
PaddedPODArray < UInt64 > & offsets ,
std : : optional < UInt32 > edit_distance ,
2022-06-25 15:53:11 +00:00
bool allow_hyperscan ,
2022-06-28 07:51:36 +00:00
size_t max_hyperscan_regexp_length ,
2023-02-08 13:07:27 +00:00
size_t max_hyperscan_regexp_total_length ,
bool reject_expensive_hyperscan_regexps )
2020-05-06 23:21:13 +00:00
{
2022-06-25 15:53:11 +00:00
if ( ! allow_hyperscan )
throw Exception ( ErrorCodes : : FUNCTION_NOT_ALLOWED , " Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0 " ) ;
2022-06-17 10:15:19 +00:00
# if USE_VECTORSCAN
2022-06-26 16:12:17 +00:00
std : : vector < std : : string_view > needles ;
needles . reserve ( needles_arr . size ( ) ) ;
for ( const auto & needle : needles_arr )
needles . emplace_back ( needle . get < String > ( ) ) ;
2022-06-24 14:12:38 +00:00
checkHyperscanRegexp ( needles , max_hyperscan_regexp_length , max_hyperscan_regexp_total_length ) ;
2023-02-08 13:07:27 +00:00
if ( reject_expensive_hyperscan_regexps )
{
SlowWithHyperscanChecker checker ;
for ( auto needle : needles )
if ( checker . isSlow ( needle ) )
throw Exception ( ErrorCodes : : HYPERSCAN_CANNOT_SCAN_TEXT , " Regular expression evaluation in vectorscan will be too slow. To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'. " ) ;
}
2022-06-24 14:12:38 +00:00
offsets . resize ( haystack_offsets . size ( ) ) ;
2022-06-26 16:45:16 +00:00
2022-07-08 11:18:53 +00:00
if ( needles_arr . empty ( ) )
{
std : : fill ( offsets . begin ( ) , offsets . end ( ) , 0 ) ;
return ;
}
2022-08-16 09:56:53 +00:00
MultiRegexps : : DeferredConstructedRegexpsPtr deferred_constructed_regexps = MultiRegexps : : getOrSet < /*SaveIndices*/ true , WithEditDistance > ( needles , edit_distance ) ;
MultiRegexps : : Regexps * regexps = deferred_constructed_regexps - > get ( ) ;
2020-05-06 23:21:13 +00:00
hs_scratch_t * scratch = nullptr ;
2022-08-16 09:56:53 +00:00
hs_error_t err = hs_clone_scratch ( regexps - > getScratch ( ) , & scratch ) ;
2020-05-06 23:21:13 +00:00
if ( err ! = HS_SUCCESS )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : CANNOT_ALLOCATE_MEMORY , " Could not clone scratch space for vectorscan " ) ;
2020-05-06 23:21:13 +00:00
MultiRegexps : : ScratchPtr smart_scratch ( scratch ) ;
auto on_match = [ ] ( unsigned int id ,
unsigned long long /* from */ , // NOLINT
unsigned long long /* to */ , // NOLINT
unsigned int /* flags */ ,
void * context ) - > int
{
2022-06-24 13:34:40 +00:00
static_cast < PaddedPODArray < ResultType > * > ( context ) - > push_back ( id ) ;
2020-05-06 23:21:13 +00:00
return 0 ;
} ;
const size_t haystack_offsets_size = haystack_offsets . size ( ) ;
UInt64 offset = 0 ;
for ( size_t i = 0 ; i < haystack_offsets_size ; + + i )
{
UInt64 length = haystack_offsets [ i ] - offset - 1 ;
2022-06-28 07:51:36 +00:00
/// vectorscan restriction.
2020-05-06 23:21:13 +00:00
if ( length > std : : numeric_limits < UInt32 > : : max ( ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : TOO_MANY_BYTES , " Too long string to search " ) ;
2022-06-28 07:51:36 +00:00
/// scan, check, update the offsets array and the offset of haystack.
2020-05-06 23:21:13 +00:00
err = hs_scan (
2022-08-16 09:56:53 +00:00
regexps - > getDB ( ) ,
2020-05-06 23:21:13 +00:00
reinterpret_cast < const char * > ( haystack_data . data ( ) ) + offset ,
2022-10-07 10:46:45 +00:00
static_cast < unsigned > ( length ) ,
2020-05-06 23:21:13 +00:00
0 ,
smart_scratch . get ( ) ,
on_match ,
& res ) ;
if ( err ! = HS_SUCCESS )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : HYPERSCAN_CANNOT_SCAN_TEXT , " Failed to scan with vectorscan " ) ;
2020-05-06 23:21:13 +00:00
offsets [ i ] = res . size ( ) ;
offset = haystack_offsets [ i ] ;
}
# else
2022-06-28 07:51:36 +00:00
( void ) haystack_data ;
( void ) haystack_offsets ;
( void ) needles_arr ;
( void ) res ;
( void ) offsets ;
( void ) edit_distance ;
( void ) max_hyperscan_regexp_length ;
( void ) max_hyperscan_regexp_total_length ;
2023-02-08 13:07:27 +00:00
( void ) reject_expensive_hyperscan_regexps ;
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NOT_IMPLEMENTED , " multi-search all indices is not implemented when vectorscan is off " ) ;
2022-06-26 16:45:16 +00:00
# endif // USE_VECTORSCAN
}
static void vectorVector (
const ColumnString : : Chars & haystack_data ,
const ColumnString : : Offsets & haystack_offsets ,
2022-07-06 21:36:14 +00:00
const IColumn & needles_data ,
const ColumnArray : : Offsets & needles_offsets ,
2022-06-26 16:45:16 +00:00
PaddedPODArray < ResultType > & res ,
PaddedPODArray < UInt64 > & offsets ,
bool allow_hyperscan ,
size_t max_hyperscan_regexp_length ,
2023-02-08 13:07:27 +00:00
size_t max_hyperscan_regexp_total_length ,
bool reject_expensive_hyperscan_regexps )
2022-06-26 16:45:16 +00:00
{
2023-02-08 13:07:27 +00:00
vectorVector ( haystack_data , haystack_offsets , needles_data , needles_offsets , res , offsets , std : : nullopt , allow_hyperscan , max_hyperscan_regexp_length , max_hyperscan_regexp_total_length , reject_expensive_hyperscan_regexps ) ;
2022-06-26 16:45:16 +00:00
}
static void vectorVector (
2022-06-28 07:51:36 +00:00
const ColumnString : : Chars & haystack_data ,
const ColumnString : : Offsets & haystack_offsets ,
2022-07-06 21:36:14 +00:00
const IColumn & needles_data ,
const ColumnArray : : Offsets & needles_offsets ,
2022-06-28 07:51:36 +00:00
PaddedPODArray < ResultType > & res ,
PaddedPODArray < UInt64 > & offsets ,
std : : optional < UInt32 > edit_distance ,
2022-06-26 16:45:16 +00:00
bool allow_hyperscan ,
2022-06-28 07:51:36 +00:00
size_t max_hyperscan_regexp_length ,
2023-02-08 13:07:27 +00:00
size_t max_hyperscan_regexp_total_length ,
bool reject_expensive_hyperscan_regexps )
2022-06-26 16:45:16 +00:00
{
if ( ! allow_hyperscan )
throw Exception ( ErrorCodes : : FUNCTION_NOT_ALLOWED , " Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0 " ) ;
# if USE_VECTORSCAN
2022-07-06 14:45:22 +00:00
offsets . resize ( haystack_offsets . size ( ) ) ;
2022-06-26 16:45:16 +00:00
size_t prev_haystack_offset = 0 ;
2022-07-07 20:25:26 +00:00
size_t prev_needles_offset = 0 ;
2022-07-06 21:36:14 +00:00
const ColumnString * needles_data_string = checkAndGetColumn < ColumnString > ( & needles_data ) ;
std : : vector < std : : string_view > needles ;
2022-06-26 16:45:16 +00:00
for ( size_t i = 0 ; i < haystack_offsets . size ( ) ; + + i )
{
2022-07-07 20:25:26 +00:00
needles . reserve ( needles_offsets [ i ] - prev_needles_offset ) ;
2022-06-26 16:45:16 +00:00
2022-07-07 20:25:26 +00:00
for ( size_t j = prev_needles_offset ; j < needles_offsets [ i ] ; + + j )
2022-07-06 21:36:14 +00:00
{
2022-07-07 20:25:26 +00:00
needles . emplace_back ( needles_data_string - > getDataAt ( j ) . toView ( ) ) ;
2022-07-06 21:36:14 +00:00
}
2022-06-26 16:45:16 +00:00
2022-07-08 11:18:53 +00:00
if ( needles . empty ( ) )
{
offsets [ i ] = ( i = = 0 ) ? 0 : offsets [ i - 1 ] ;
prev_haystack_offset = haystack_offsets [ i ] ;
prev_needles_offset = needles_offsets [ i ] ;
continue ;
}
2022-06-26 16:45:16 +00:00
checkHyperscanRegexp ( needles , max_hyperscan_regexp_length , max_hyperscan_regexp_total_length ) ;
2023-02-08 13:07:27 +00:00
if ( reject_expensive_hyperscan_regexps )
{
SlowWithHyperscanChecker checker ;
for ( auto needle : needles )
if ( checker . isSlow ( needle ) )
throw Exception ( ErrorCodes : : HYPERSCAN_CANNOT_SCAN_TEXT , " Regular expression evaluation in vectorscan will be too slow. To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'. " ) ;
}
2022-08-16 09:56:53 +00:00
MultiRegexps : : DeferredConstructedRegexpsPtr deferred_constructed_regexps = MultiRegexps : : getOrSet < /*SaveIndices*/ true , WithEditDistance > ( needles , edit_distance ) ;
MultiRegexps : : Regexps * regexps = deferred_constructed_regexps - > get ( ) ;
2022-06-26 16:45:16 +00:00
hs_scratch_t * scratch = nullptr ;
2022-08-16 09:56:53 +00:00
hs_error_t err = hs_clone_scratch ( regexps - > getScratch ( ) , & scratch ) ;
2022-06-26 16:45:16 +00:00
if ( err ! = HS_SUCCESS )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : CANNOT_ALLOCATE_MEMORY , " Could not clone scratch space for vectorscan " ) ;
2022-06-26 16:45:16 +00:00
MultiRegexps : : ScratchPtr smart_scratch ( scratch ) ;
auto on_match = [ ] ( unsigned int id ,
unsigned long long /* from */ , // NOLINT
unsigned long long /* to */ , // NOLINT
unsigned int /* flags */ ,
void * context ) - > int
{
static_cast < PaddedPODArray < ResultType > * > ( context ) - > push_back ( id ) ;
return 0 ;
} ;
const size_t cur_haystack_length = haystack_offsets [ i ] - prev_haystack_offset - 1 ;
2022-06-28 07:51:36 +00:00
/// vectorscan restriction.
2022-06-26 16:45:16 +00:00
if ( cur_haystack_length > std : : numeric_limits < UInt32 > : : max ( ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : TOO_MANY_BYTES , " Too long string to search " ) ;
2022-06-26 16:45:16 +00:00
2022-06-28 07:51:36 +00:00
/// scan, check, update the offsets array and the offset of haystack.
2022-06-26 16:45:16 +00:00
err = hs_scan (
2022-08-16 09:56:53 +00:00
regexps - > getDB ( ) ,
2022-06-26 16:45:16 +00:00
reinterpret_cast < const char * > ( haystack_data . data ( ) ) + prev_haystack_offset ,
2022-10-07 10:46:45 +00:00
static_cast < unsigned > ( cur_haystack_length ) ,
2022-06-26 16:45:16 +00:00
0 ,
smart_scratch . get ( ) ,
on_match ,
& res ) ;
if ( err ! = HS_SUCCESS )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : HYPERSCAN_CANNOT_SCAN_TEXT , " Failed to scan with vectorscan " ) ;
2022-06-26 16:45:16 +00:00
offsets [ i ] = res . size ( ) ;
prev_haystack_offset = haystack_offsets [ i ] ;
2022-07-07 20:25:26 +00:00
prev_needles_offset = needles_offsets [ i ] ;
2022-07-06 21:36:14 +00:00
needles . clear ( ) ;
2022-06-26 16:45:16 +00:00
}
# else
2022-06-28 07:51:36 +00:00
( void ) haystack_data ;
( void ) haystack_offsets ;
2022-07-06 21:36:14 +00:00
( void ) needles_data ;
( void ) needles_offsets ;
2022-06-28 07:51:36 +00:00
( void ) res ;
( void ) offsets ;
( void ) edit_distance ;
( void ) max_hyperscan_regexp_length ;
( void ) max_hyperscan_regexp_total_length ;
2023-02-08 13:07:27 +00:00
( void ) reject_expensive_hyperscan_regexps ;
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NOT_IMPLEMENTED , " multi-search all indices is not implemented when vectorscan is off " ) ;
2022-06-17 10:15:19 +00:00
# endif // USE_VECTORSCAN
2020-05-06 23:21:13 +00:00
}
} ;
}