2019-05-10 03:42:28 +00:00
# include <Storages/MergeTree/MergeTreeIndexFullText.h>
2019-02-20 11:22:07 +00:00
2019-02-25 17:12:09 +00:00
# include <Common/StringUtils/StringUtils.h>
2019-02-20 12:48:50 +00:00
# include <Common/UTF8Helpers.h>
2019-02-20 11:22:07 +00:00
# include <DataTypes/DataTypesNumber.h>
2021-09-22 23:06:54 +00:00
# include <DataTypes/DataTypeArray.h>
2019-02-20 11:22:07 +00:00
# include <IO/WriteHelpers.h>
# include <IO/ReadHelpers.h>
2019-02-20 12:12:41 +00:00
# include <Interpreters/ExpressionActions.h>
# include <Interpreters/ExpressionAnalyzer.h>
2020-07-22 17:13:05 +00:00
# include <Interpreters/TreeRewriter.h>
2019-10-23 13:59:03 +00:00
# include <Interpreters/misc.h>
2019-02-20 12:12:41 +00:00
# include <Storages/MergeTree/MergeTreeData.h>
2019-03-07 09:15:58 +00:00
# include <Storages/MergeTree/RPNBuilder.h>
2019-02-24 21:17:52 +00:00
# include <Parsers/ASTIdentifier.h>
2019-02-20 12:12:41 +00:00
# include <Parsers/ASTLiteral.h>
2019-02-24 21:17:52 +00:00
# include <Parsers/ASTSubquery.h>
2020-05-11 05:04:59 +00:00
# include <Core/Defines.h>
2019-02-20 12:12:41 +00:00
# include <Poco/Logger.h>
2019-02-20 11:22:07 +00:00
2019-02-21 20:32:36 +00:00
# include <boost/algorithm/string.hpp>
2020-04-06 10:27:31 +00:00
# if defined(__SSE2__)
2020-04-01 21:28:02 +00:00
# include <immintrin.h>
2020-04-06 10:27:31 +00:00
# if defined(__SSE4_2__)
2020-04-01 21:28:02 +00:00
# include <nmmintrin.h>
2020-04-06 10:27:31 +00:00
# endif
# endif
2020-04-01 21:28:02 +00:00
2019-02-20 11:22:07 +00:00
namespace DB
{
/// Error codes referenced by this file. They are only declared here (`extern`); the definitions live elsewhere.
namespace ErrorCodes
{
2020-02-25 18:02:41 +00:00
extern const int LOGICAL_ERROR ;
2019-02-20 11:22:07 +00:00
extern const int INCORRECT_QUERY ;
2020-07-10 17:53:58 +00:00
extern const int BAD_ARGUMENTS ;
2019-02-20 11:22:07 +00:00
}
2019-02-21 20:32:36 +00:00
/// Adds all tokens from string to bloom filter.
2019-02-20 16:24:46 +00:00
static void stringToBloomFilter (
2021-04-01 02:37:19 +00:00
const String & string , TokenExtractorPtr token_extractor , BloomFilter & bloom_filter )
{
const char * data = string . data ( ) ;
size_t size = string . size ( ) ;
size_t cur = 0 ;
size_t token_start = 0 ;
size_t token_len = 0 ;
while ( cur < size & & token_extractor - > nextInField ( data , size , & cur , & token_start , & token_len ) )
bloom_filter . add ( data + token_start , token_len ) ;
}
static void columnToBloomFilter (
2020-05-28 12:37:05 +00:00
const char * data , size_t size , TokenExtractorPtr token_extractor , BloomFilter & bloom_filter )
2019-02-20 16:24:46 +00:00
{
2019-02-20 13:18:15 +00:00
size_t cur = 0 ;
size_t token_start = 0 ;
size_t token_len = 0 ;
2021-04-01 02:37:19 +00:00
while ( cur < size & & token_extractor - > nextInColumn ( data , size , & cur , & token_start , & token_len ) )
2019-02-20 16:24:46 +00:00
bloom_filter . add ( data + token_start , token_len ) ;
2019-02-20 13:18:15 +00:00
}
2021-04-01 02:37:19 +00:00
2019-02-21 20:32:36 +00:00
/// Adds all tokens from like pattern string to bloom filter. (Because like pattern can contain `\%` and `\_`.)
static void likeStringToBloomFilter (
2020-05-28 12:37:05 +00:00
const String & data , TokenExtractorPtr token_extractor , BloomFilter & bloom_filter )
2019-02-21 20:32:36 +00:00
{
size_t cur = 0 ;
String token ;
2019-02-21 21:29:24 +00:00
while ( cur < data . size ( ) & & token_extractor - > nextLike ( data , & cur , token ) )
2019-02-21 20:32:36 +00:00
bloom_filter . add ( token . c_str ( ) , token . size ( ) ) ;
}
2021-04-01 02:37:19 +00:00
2019-07-16 11:40:11 +00:00
/// Unified condition for equals, startsWith and endsWith
2020-07-10 08:13:21 +00:00
bool MergeTreeConditionFullText : : createFunctionEqualsCondition (
RPNElement & out , const Field & value , const BloomFilterParameters & params , TokenExtractorPtr token_extractor )
2019-07-16 11:40:11 +00:00
{
out . function = RPNElement : : FUNCTION_EQUALS ;
2020-05-28 12:37:05 +00:00
out . bloom_filter = std : : make_unique < BloomFilter > ( params ) ;
2021-04-01 02:37:19 +00:00
stringToBloomFilter ( value . get < String > ( ) , token_extractor , * out . bloom_filter ) ;
2019-07-16 11:40:11 +00:00
return true ;
}
2019-02-20 13:18:15 +00:00
2020-05-28 12:37:05 +00:00
/// Creates an empty granule holding `columns_number` bloom filters, each built from `params_`.
/// `has_elems` starts out false.
MergeTreeIndexGranuleFullText::MergeTreeIndexGranuleFullText(
    const String & index_name_,
    size_t columns_number,
    const BloomFilterParameters & params_)
    : index_name(index_name_)
    , params(params_)
    , bloom_filters(columns_number, BloomFilter(params_))
    , has_elems(false)
{
}
2019-02-20 11:22:07 +00:00
2019-05-10 03:42:28 +00:00
void MergeTreeIndexGranuleFullText : : serializeBinary ( WriteBuffer & ostr ) const
2019-02-20 11:22:07 +00:00
{
if ( empty ( ) )
2021-08-05 18:09:17 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Attempt to write empty fulltext index {}. " , backQuote ( index_name ) ) ;
2019-02-20 11:22:07 +00:00
2019-02-23 09:26:32 +00:00
for ( const auto & bloom_filter : bloom_filters )
2020-05-28 12:37:05 +00:00
ostr . write ( reinterpret_cast < const char * > ( bloom_filter . getFilter ( ) . data ( ) ) , params . filter_size ) ;
2019-02-20 11:22:07 +00:00
}
2021-08-05 18:09:17 +00:00
void MergeTreeIndexGranuleFullText : : deserializeBinary ( ReadBuffer & istr , MergeTreeIndexVersion version )
2019-02-20 11:22:07 +00:00
{
2021-08-05 18:09:17 +00:00
if ( version ! = 1 )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Unknown index version {}. " , version ) ;
2019-02-23 09:26:32 +00:00
for ( auto & bloom_filter : bloom_filters )
2019-02-22 19:59:40 +00:00
{
2020-05-28 12:37:05 +00:00
istr . read ( reinterpret_cast < char * > (
bloom_filter . getFilter ( ) . data ( ) ) , params . filter_size ) ;
2019-02-22 19:59:40 +00:00
}
2019-02-20 12:48:50 +00:00
has_elems = true ;
2019-02-20 11:22:07 +00:00
}
2019-03-11 17:59:36 +00:00
2020-05-28 12:37:05 +00:00
/// Stores the index description and creates the first, empty granule to aggregate into
/// (one bloom filter per index column).
MergeTreeIndexAggregatorFullText::MergeTreeIndexAggregatorFullText(
    const Names & index_columns_,
    const String & index_name_,
    const BloomFilterParameters & params_,
    TokenExtractorPtr token_extractor_)
    : index_columns(index_columns_)
    , index_name(index_name_)
    , params(params_)
    , token_extractor(token_extractor_)
    , granule(std::make_shared<MergeTreeIndexGranuleFullText>(index_name_, index_columns_.size(), params_))
{
}
2019-03-11 17:59:36 +00:00
2019-05-10 03:42:28 +00:00
/// Returns the granule accumulated so far and replaces it with a fresh empty one.
MergeTreeIndexGranulePtr MergeTreeIndexAggregatorFullText::getGranuleAndReset()
{
    /// Build the replacement first, so the current granule stays in place if allocation throws.
    auto fresh = std::make_shared<MergeTreeIndexGranuleFullText>(index_name, index_columns.size(), params);

    auto finished = std::move(granule);
    granule = std::move(fresh);
    return finished;
}
2019-05-10 03:42:28 +00:00
void MergeTreeIndexAggregatorFullText : : update ( const Block & block , size_t * pos , size_t limit )
2019-02-20 11:22:07 +00:00
{
if ( * pos > = block . rows ( ) )
throw Exception (
" The provided position is not less than the number of block rows. Position: "
+ toString ( * pos ) + " , Block rows: " + toString ( block . rows ( ) ) + " . " , ErrorCodes : : LOGICAL_ERROR ) ;
size_t rows_read = std : : min ( limit , block . rows ( ) - * pos ) ;
2020-05-28 12:37:05 +00:00
for ( size_t col = 0 ; col < index_columns . size ( ) ; + + col )
2019-02-20 11:22:07 +00:00
{
2021-09-22 23:06:54 +00:00
const auto & column_with_type = block . getByName ( index_columns [ col ] ) ;
const auto & column = column_with_type . column ;
2021-09-21 18:28:52 +00:00
size_t current_position = * pos ;
2021-09-22 23:06:54 +00:00
if ( isArray ( column_with_type . type ) )
2019-02-22 19:59:40 +00:00
{
2021-09-22 23:06:54 +00:00
const auto & column_array = assert_cast < const ColumnArray & > ( * column ) ;
const auto & column_offsets = column_array . getOffsets ( ) ;
const auto & column_key = column_array . getData ( ) ;
2021-09-02 06:17:47 +00:00
for ( size_t i = 0 ; i < rows_read ; + + i )
{
2021-09-22 23:06:54 +00:00
size_t element_start_row = column_offsets [ current_position - 1 ] ;
size_t elements_size = column_offsets [ current_position ] - element_start_row ;
2021-09-02 06:17:47 +00:00
for ( size_t row_num = 0 ; row_num < elements_size ; row_num + + )
{
2021-09-21 13:43:33 +00:00
auto ref = column_key . getDataAt ( element_start_row + row_num ) ;
2021-09-02 06:17:47 +00:00
columnToBloomFilter ( ref . data , ref . size , token_extractor , granule - > bloom_filters [ col ] ) ;
}
2021-09-21 18:28:52 +00:00
current_position + = 1 ;
2021-09-02 06:17:47 +00:00
}
}
else
{
for ( size_t i = 0 ; i < rows_read ; + + i )
{
2021-09-21 18:28:52 +00:00
auto ref = column - > getDataAt ( current_position + i ) ;
2021-09-02 06:17:47 +00:00
columnToBloomFilter ( ref . data , ref . size , token_extractor , granule - > bloom_filters [ col ] ) ;
}
2019-02-22 19:59:40 +00:00
}
2019-02-20 11:22:07 +00:00
}
2021-09-21 18:28:52 +00:00
2019-03-11 17:59:36 +00:00
granule - > has_elems = true ;
2021-09-21 18:28:52 +00:00
* pos + = rows_read ;
2019-02-20 11:22:07 +00:00
}
2019-02-20 12:12:41 +00:00
2019-05-10 03:42:28 +00:00
/// Builds the condition for a query: remembers the index columns/types taken from
/// `index_sample_block` and converts the query filter into RPN, delegating every atom
/// to traverseAtomAST.
MergeTreeConditionFullText::MergeTreeConditionFullText(
    const SelectQueryInfo & query_info,
    ContextPtr context,
    const Block & index_sample_block,
    const BloomFilterParameters & params_,
    TokenExtractorPtr token_extactor_)
    : index_columns(index_sample_block.getNames())
    , index_data_types(index_sample_block.getNamesAndTypesList().getTypes())
    , params(params_)
    , token_extractor(token_extactor_)
    , prepared_sets(query_info.sets)
{
    /// Callback invoked by RPNBuilder for each atom of the filter expression.
    auto atom_handler
        = [this](const ASTPtr & node, ContextPtr /* context */, Block & block_with_constants, RPNElement & out) -> bool
    {
        return traverseAtomAST(node, block_with_constants, out);
    };

    rpn = std::move(RPNBuilder<RPNElement>(query_info, context, atom_handler).extractRPN());
}
2019-05-10 03:42:28 +00:00
bool MergeTreeConditionFullText : : alwaysUnknownOrTrue ( ) const
2019-02-20 16:24:46 +00:00
{
/// Check like in KeyCondition.
std : : vector < bool > rpn_stack ;
for ( const auto & element : rpn )
{
if ( element . function = = RPNElement : : FUNCTION_UNKNOWN
| | element . function = = RPNElement : : ALWAYS_TRUE )
{
rpn_stack . push_back ( true ) ;
}
else if ( element . function = = RPNElement : : FUNCTION_EQUALS
2019-02-20 20:17:44 +00:00
| | element . function = = RPNElement : : FUNCTION_NOT_EQUALS
2021-09-22 23:06:54 +00:00
| | element . function = = RPNElement : : FUNCTION_HAS
2019-02-24 21:17:52 +00:00
| | element . function = = RPNElement : : FUNCTION_IN
| | element . function = = RPNElement : : FUNCTION_NOT_IN
2019-07-12 11:35:17 +00:00
| | element . function = = RPNElement : : FUNCTION_MULTI_SEARCH
2019-02-20 20:17:44 +00:00
| | element . function = = RPNElement : : ALWAYS_FALSE )
2019-02-20 16:24:46 +00:00
{
rpn_stack . push_back ( false ) ;
}
else if ( element . function = = RPNElement : : FUNCTION_NOT )
{
// do nothing
}
else if ( element . function = = RPNElement : : FUNCTION_AND )
{
auto arg1 = rpn_stack . back ( ) ;
rpn_stack . pop_back ( ) ;
auto arg2 = rpn_stack . back ( ) ;
rpn_stack . back ( ) = arg1 & & arg2 ;
}
else if ( element . function = = RPNElement : : FUNCTION_OR )
{
auto arg1 = rpn_stack . back ( ) ;
rpn_stack . pop_back ( ) ;
auto arg2 = rpn_stack . back ( ) ;
rpn_stack . back ( ) = arg1 | | arg2 ;
}
else
throw Exception ( " Unexpected function type in KeyCondition::RPNElement " , ErrorCodes : : LOGICAL_ERROR ) ;
}
return rpn_stack [ 0 ] ;
}
2019-05-10 03:42:28 +00:00
bool MergeTreeConditionFullText : : mayBeTrueOnGranule ( MergeTreeIndexGranulePtr idx_granule ) const
2019-02-20 16:24:46 +00:00
{
2019-05-10 03:42:28 +00:00
std : : shared_ptr < MergeTreeIndexGranuleFullText > granule
= std : : dynamic_pointer_cast < MergeTreeIndexGranuleFullText > ( idx_granule ) ;
2019-02-20 16:24:46 +00:00
if ( ! granule )
throw Exception (
" BloomFilter index condition got a granule with the wrong type. " , ErrorCodes : : LOGICAL_ERROR ) ;
/// Check like in KeyCondition.
std : : vector < BoolMask > rpn_stack ;
2019-02-22 07:59:07 +00:00
for ( const auto & element : rpn )
2019-02-20 16:24:46 +00:00
{
if ( element . function = = RPNElement : : FUNCTION_UNKNOWN )
{
rpn_stack . emplace_back ( true , true ) ;
}
else if ( element . function = = RPNElement : : FUNCTION_EQUALS
2021-09-22 23:06:54 +00:00
| | element . function = = RPNElement : : FUNCTION_NOT_EQUALS
| | element . function = = RPNElement : : FUNCTION_HAS )
2019-02-20 16:24:46 +00:00
{
2020-04-16 12:31:57 +00:00
rpn_stack . emplace_back ( granule - > bloom_filters [ element . key_column ] . contains ( * element . bloom_filter ) , true ) ;
2019-02-20 16:24:46 +00:00
if ( element . function = = RPNElement : : FUNCTION_NOT_EQUALS )
rpn_stack . back ( ) = ! rpn_stack . back ( ) ;
}
2019-02-24 21:17:52 +00:00
else if ( element . function = = RPNElement : : FUNCTION_IN
2019-07-12 11:35:17 +00:00
| | element . function = = RPNElement : : FUNCTION_NOT_IN )
2019-02-24 21:17:52 +00:00
{
2019-02-25 08:43:19 +00:00
std : : vector < bool > result ( element . set_bloom_filters . back ( ) . size ( ) , true ) ;
2019-02-24 21:17:52 +00:00
2019-02-25 18:38:57 +00:00
for ( size_t column = 0 ; column < element . set_key_position . size ( ) ; + + column )
2019-02-24 21:17:52 +00:00
{
2019-02-25 18:38:57 +00:00
const size_t key_idx = element . set_key_position [ column ] ;
2019-02-24 21:17:52 +00:00
2019-02-25 08:43:19 +00:00
const auto & bloom_filters = element . set_bloom_filters [ column ] ;
for ( size_t row = 0 ; row < bloom_filters . size ( ) ; + + row )
result [ row ] = result [ row ] & & granule - > bloom_filters [ key_idx ] . contains ( bloom_filters [ row ] ) ;
2019-02-24 21:17:52 +00:00
}
2019-02-25 08:43:19 +00:00
rpn_stack . emplace_back (
std : : find ( std : : cbegin ( result ) , std : : cend ( result ) , true ) ! = std : : end ( result ) , true ) ;
2019-02-24 21:17:52 +00:00
if ( element . function = = RPNElement : : FUNCTION_NOT_IN )
rpn_stack . back ( ) = ! rpn_stack . back ( ) ;
}
2019-07-12 11:35:17 +00:00
else if ( element . function = = RPNElement : : FUNCTION_MULTI_SEARCH )
{
std : : vector < bool > result ( element . set_bloom_filters . back ( ) . size ( ) , true ) ;
const auto & bloom_filters = element . set_bloom_filters [ 0 ] ;
for ( size_t row = 0 ; row < bloom_filters . size ( ) ; + + row )
result [ row ] = result [ row ] & & granule - > bloom_filters [ element . key_column ] . contains ( bloom_filters [ row ] ) ;
rpn_stack . emplace_back (
std : : find ( std : : cbegin ( result ) , std : : cend ( result ) , true ) ! = std : : end ( result ) , true ) ;
}
2019-02-20 16:24:46 +00:00
else if ( element . function = = RPNElement : : FUNCTION_NOT )
{
rpn_stack . back ( ) = ! rpn_stack . back ( ) ;
}
else if ( element . function = = RPNElement : : FUNCTION_AND )
{
auto arg1 = rpn_stack . back ( ) ;
rpn_stack . pop_back ( ) ;
auto arg2 = rpn_stack . back ( ) ;
rpn_stack . back ( ) = arg1 & arg2 ;
}
else if ( element . function = = RPNElement : : FUNCTION_OR )
{
auto arg1 = rpn_stack . back ( ) ;
rpn_stack . pop_back ( ) ;
auto arg2 = rpn_stack . back ( ) ;
rpn_stack . back ( ) = arg1 | arg2 ;
}
else if ( element . function = = RPNElement : : ALWAYS_FALSE )
{
rpn_stack . emplace_back ( false , true ) ;
}
else if ( element . function = = RPNElement : : ALWAYS_TRUE )
{
rpn_stack . emplace_back ( true , false ) ;
}
else
2020-01-29 21:40:22 +00:00
throw Exception ( " Unexpected function type in BloomFilterCondition::RPNElement " , ErrorCodes : : LOGICAL_ERROR ) ;
2019-02-20 16:24:46 +00:00
}
if ( rpn_stack . size ( ) ! = 1 )
2020-01-29 21:40:22 +00:00
throw Exception ( " Unexpected stack size in BloomFilterCondition::mayBeTrueOnGranule " , ErrorCodes : : LOGICAL_ERROR ) ;
2019-02-20 16:24:46 +00:00
return rpn_stack [ 0 ] . can_be_true ;
}
2021-09-21 13:43:33 +00:00
bool MergeTreeConditionFullText : : getKey ( const std : : string & key_column_name , size_t & key_column_num )
2019-02-20 16:24:46 +00:00
{
2021-09-21 13:43:33 +00:00
auto it = std : : find ( index_columns . begin ( ) , index_columns . end ( ) , key_column_name ) ;
2020-05-28 12:37:05 +00:00
if ( it = = index_columns . end ( ) )
2019-02-20 16:24:46 +00:00
return false ;
2020-05-28 12:37:05 +00:00
key_column_num = static_cast < size_t > ( it - index_columns . begin ( ) ) ;
2019-02-20 16:24:46 +00:00
return true ;
}
2021-09-27 14:26:25 +00:00
bool MergeTreeConditionFullText : : traverseAtomAST ( const ASTPtr & node , Block & block_with_constants , RPNElement & out )
2019-02-20 16:24:46 +00:00
{
{
2021-09-27 14:26:25 +00:00
Field const_value ;
DataTypePtr const_type ;
2019-02-20 16:24:46 +00:00
2021-09-27 14:26:25 +00:00
if ( KeyCondition : : getConstant ( node , block_with_constants , const_value , const_type ) )
{
/// Check constant like in KeyCondition
if ( const_value . getType ( ) = = Field : : Types : : UInt64
| | const_value . getType ( ) = = Field : : Types : : Int64
| | const_value . getType ( ) = = Field : : Types : : Float64 )
{
/// Zero in all types is represented in memory the same way as in UInt64.
out . function = const_value . get < UInt64 > ( )
? RPNElement : : ALWAYS_TRUE
: RPNElement : : ALWAYS_FALSE ;
return true ;
}
}
}
if ( const auto * function = node - > as < ASTFunction > ( ) )
{
if ( ! function - > arguments )
2019-07-11 13:44:44 +00:00
return false ;
2021-09-27 14:26:25 +00:00
const ASTs & arguments = function - > arguments - > children ;
2019-02-20 16:24:46 +00:00
2021-09-27 14:26:25 +00:00
if ( arguments . size ( ) ! = 2 )
return false ;
if ( functionIsInOrGlobalInOperator ( function - > name ) )
2019-07-11 13:44:44 +00:00
{
2021-09-27 14:26:25 +00:00
if ( tryPrepareSetBloomFilter ( arguments , out ) )
{
if ( function - > name = = " notIn " )
{
out . function = RPNElement : : FUNCTION_NOT_IN ;
return true ;
}
else if ( function - > name = = " in " )
{
out . function = RPNElement : : FUNCTION_IN ;
return true ;
}
}
2019-02-20 16:24:46 +00:00
}
2021-09-27 14:26:25 +00:00
else if ( function - > name = = " equals " | |
function - > name = = " notEquals " | |
function - > name = = " has " | |
function - > name = = " mapContains " | |
function - > name = = " like " | |
function - > name = = " notLike " | |
function - > name = = " hasToken " | |
function - > name = = " startsWith " | |
function - > name = = " endsWith " | |
function - > name = = " multiSearchAny " )
{
Field const_value ;
DataTypePtr const_type ;
if ( KeyCondition : : getConstant ( arguments [ 1 ] , block_with_constants , const_value , const_type ) )
2021-09-21 13:43:33 +00:00
{
2021-09-27 14:26:25 +00:00
if ( traverseASTEquals ( function - > name , arguments [ 0 ] , const_type , const_value , out ) )
return true ;
}
else if ( KeyCondition : : getConstant ( arguments [ 0 ] , block_with_constants , const_value , const_type ) & & ( function - > name = = " equals " | | function - > name = = " notEquals " ) )
{
if ( traverseASTEquals ( function - > name , arguments [ 1 ] , const_type , const_value , out ) )
return true ;
}
}
}
2021-09-21 13:43:33 +00:00
2021-09-27 14:26:25 +00:00
return false ;
}
2021-09-21 13:43:33 +00:00
2021-09-27 14:26:25 +00:00
bool MergeTreeConditionFullText : : traverseASTEquals (
const String & function_name ,
const ASTPtr & key_ast ,
const DataTypePtr & value_type ,
const Field & value_field ,
RPNElement & out )
{
auto value_data_type = WhichDataType ( value_type ) ;
if ( ! value_data_type . isStringOrFixedString ( ) & & ! value_data_type . isArray ( ) )
return false ;
2021-09-21 13:43:33 +00:00
2021-09-27 14:26:25 +00:00
if ( ! token_extractor - > supportLike ( ) & & ( function_name = = " like " | | function_name = = " notLike " ) )
return false ;
2021-09-21 13:43:33 +00:00
2021-09-27 14:26:25 +00:00
Field const_value = value_field ;
size_t key_column_num = 0 ;
bool key_exists = getKey ( key_ast - > getColumnName ( ) , key_column_num ) ;
bool map_key_exists = getKey ( fmt : : format ( " mapKeys({}) " , key_ast - > getColumnName ( ) ) , key_column_num ) ;
if ( const auto * function = key_ast - > as < ASTFunction > ( ) )
{
if ( function - > name = = " arrayElement " )
{
/** Try to parse arrayElement for mapKeys index.
* It is important to ignore keys like column_map [ ' Key ' ] = ' ' because if key does not exists in map
* we return default value for arrayElement .
*
* We cannot skip keys that does not exist in map if comparison is with default type value because
* that way we skip necessary granules where map key does not exists .
*/
if ( value_field = = value_type - > getDefault ( ) )
return false ;
const auto & map_column_name = assert_cast < ASTIdentifier * > ( function - > arguments . get ( ) - > children [ 0 ] . get ( ) ) - > name ( ) ;
2021-09-21 13:43:33 +00:00
2021-09-27 14:26:25 +00:00
size_t map_keys_key_column_num = 0 ;
auto map_keys_index_column_name = fmt : : format ( " mapKeys({}) " , map_column_name ) ;
bool map_keys_exists = getKey ( map_keys_index_column_name , map_keys_key_column_num ) ;
2021-09-21 13:43:33 +00:00
2021-09-27 14:26:25 +00:00
size_t map_values_key_column_num = 0 ;
auto map_values_index_column_name = fmt : : format ( " mapValues({}) " , map_column_name ) ;
bool map_values_exists = getKey ( map_values_index_column_name , map_values_key_column_num ) ;
if ( map_keys_exists )
{
auto & argument = function - > arguments . get ( ) - > children [ 1 ] ;
if ( const auto * literal = argument - > as < ASTLiteral > ( ) )
{
auto element_key = literal - > value ;
const_value = element_key ;
key_column_num = map_keys_key_column_num ;
key_exists = true ;
2021-09-21 13:43:33 +00:00
}
else
{
return false ;
}
}
2021-09-27 14:26:25 +00:00
else if ( map_values_exists )
{
key_column_num = map_values_key_column_num ;
key_exists = true ;
}
2021-09-21 13:43:33 +00:00
else
{
return false ;
}
}
2021-09-27 14:26:25 +00:00
}
2019-07-11 13:44:44 +00:00
2021-09-27 14:26:25 +00:00
if ( ! key_exists & & ! map_key_exists )
return false ;
2021-09-22 23:06:54 +00:00
2021-09-27 14:26:25 +00:00
if ( map_key_exists & & ( function_name = = " has " | | function_name = = " mapContains " ) )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_HAS ;
out . bloom_filter = std : : make_unique < BloomFilter > ( params ) ;
stringToBloomFilter ( const_value . get < String > ( ) , token_extractor , * out . bloom_filter ) ;
2019-02-20 16:24:46 +00:00
2021-09-27 14:26:25 +00:00
return true ;
}
else if ( function_name = = " has " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_HAS ;
out . bloom_filter = std : : make_unique < BloomFilter > ( params ) ;
stringToBloomFilter ( const_value . get < String > ( ) , token_extractor , * out . bloom_filter ) ;
2020-03-13 18:42:19 +00:00
2021-09-27 14:26:25 +00:00
return true ;
}
2020-03-13 18:42:19 +00:00
2021-09-27 14:26:25 +00:00
if ( function_name = = " notEquals " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_NOT_EQUALS ;
out . bloom_filter = std : : make_unique < BloomFilter > ( params ) ;
stringToBloomFilter ( const_value . get < String > ( ) , token_extractor , * out . bloom_filter ) ;
return true ;
}
else if ( function_name = = " equals " )
{
out . key_column = key_column_num ;
2021-09-28 09:52:07 +00:00
return createFunctionEqualsCondition ( out , const_value , params , token_extractor ) ;
2021-09-27 14:26:25 +00:00
}
else if ( function_name = = " like " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_EQUALS ;
out . bloom_filter = std : : make_unique < BloomFilter > ( params ) ;
likeStringToBloomFilter ( const_value . get < String > ( ) , token_extractor , * out . bloom_filter ) ;
return true ;
}
else if ( function_name = = " notLike " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_NOT_EQUALS ;
out . bloom_filter = std : : make_unique < BloomFilter > ( params ) ;
likeStringToBloomFilter ( const_value . get < String > ( ) , token_extractor , * out . bloom_filter ) ;
return true ;
}
else if ( function_name = = " hasToken " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_EQUALS ;
out . bloom_filter = std : : make_unique < BloomFilter > ( params ) ;
stringToBloomFilter ( const_value . get < String > ( ) , token_extractor , * out . bloom_filter ) ;
return true ;
2019-02-20 16:24:46 +00:00
}
2021-09-27 14:26:25 +00:00
else if ( function_name = = " startsWith " )
2019-02-20 16:24:46 +00:00
{
2021-09-27 14:26:25 +00:00
out . key_column = key_column_num ;
return createFunctionEqualsCondition ( out , const_value , params , token_extractor ) ;
}
else if ( function_name = = " endsWith " )
{
out . key_column = key_column_num ;
return createFunctionEqualsCondition ( out , const_value , params , token_extractor ) ;
}
else if ( function_name = = " multiSearchAny " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_MULTI_SEARCH ;
/// 2d vector is not needed here but is used because already exists for FUNCTION_IN
std : : vector < std : : vector < BloomFilter > > bloom_filters ;
bloom_filters . emplace_back ( ) ;
for ( const auto & element : const_value . get < Array > ( ) )
2019-02-20 19:27:23 +00:00
{
2021-09-27 14:26:25 +00:00
if ( element . getType ( ) ! = Field : : Types : : String )
return false ;
2019-02-20 16:24:46 +00:00
2021-09-27 14:26:25 +00:00
bloom_filters . back ( ) . emplace_back ( params ) ;
stringToBloomFilter ( element . get < String > ( ) , token_extractor , bloom_filters . back ( ) . back ( ) ) ;
2019-02-20 19:27:23 +00:00
}
2021-09-27 14:26:25 +00:00
out . set_bloom_filters = std : : move ( bloom_filters ) ;
return true ;
2019-02-20 16:24:46 +00:00
}
return false ;
}
2019-05-10 03:42:28 +00:00
bool MergeTreeConditionFullText : : tryPrepareSetBloomFilter (
2019-02-22 10:51:19 +00:00
const ASTs & args ,
2019-02-24 21:17:52 +00:00
RPNElement & out )
2019-02-22 10:51:19 +00:00
{
2019-02-24 21:17:52 +00:00
const ASTPtr & left_arg = args [ 0 ] ;
const ASTPtr & right_arg = args [ 1 ] ;
std : : vector < KeyTuplePositionMapping > key_tuple_mapping ;
DataTypes data_types ;
const auto * left_arg_tuple = typeid_cast < const ASTFunction * > ( left_arg . get ( ) ) ;
if ( left_arg_tuple & & left_arg_tuple - > name = = " tuple " )
{
const auto & tuple_elements = left_arg_tuple - > arguments - > children ;
for ( size_t i = 0 ; i < tuple_elements . size ( ) ; + + i )
{
size_t key = 0 ;
2021-09-21 13:43:33 +00:00
if ( getKey ( tuple_elements [ i ] - > getColumnName ( ) , key ) )
2019-02-24 21:17:52 +00:00
{
key_tuple_mapping . emplace_back ( i , key ) ;
2020-05-28 12:37:05 +00:00
data_types . push_back ( index_data_types [ key ] ) ;
2019-02-24 21:17:52 +00:00
}
}
}
else
{
size_t key = 0 ;
2021-09-21 13:43:33 +00:00
if ( getKey ( left_arg - > getColumnName ( ) , key ) )
2019-02-24 21:17:52 +00:00
{
2019-02-25 08:43:19 +00:00
key_tuple_mapping . emplace_back ( 0 , key ) ;
2020-05-28 12:37:05 +00:00
data_types . push_back ( index_data_types [ key ] ) ;
2019-02-24 21:17:52 +00:00
}
}
if ( key_tuple_mapping . empty ( ) )
return false ;
PreparedSetKey set_key ;
if ( typeid_cast < const ASTSubquery * > ( right_arg . get ( ) ) | | typeid_cast < const ASTIdentifier * > ( right_arg . get ( ) ) )
set_key = PreparedSetKey : : forSubquery ( * right_arg ) ;
else
set_key = PreparedSetKey : : forLiteral ( * right_arg , data_types ) ;
auto set_it = prepared_sets . find ( set_key ) ;
if ( set_it = = prepared_sets . end ( ) )
return false ;
const SetPtr & prepared_set = set_it - > second ;
if ( ! prepared_set - > hasExplicitSetElements ( ) )
return false ;
2019-02-25 08:43:19 +00:00
for ( const auto & data_type : prepared_set - > getDataTypes ( ) )
if ( data_type - > getTypeId ( ) ! = TypeIndex : : String & & data_type - > getTypeId ( ) ! = TypeIndex : : FixedString )
return false ;
2019-05-10 03:42:28 +00:00
std : : vector < std : : vector < BloomFilter > > bloom_filters ;
2019-02-25 18:38:57 +00:00
std : : vector < size_t > key_position ;
2019-02-24 21:17:52 +00:00
2019-03-29 14:17:29 +00:00
Columns columns = prepared_set - > getSetElements ( ) ;
2020-03-09 02:05:04 +00:00
for ( const auto & elem : key_tuple_mapping )
2019-02-24 21:17:52 +00:00
{
2019-02-25 08:43:19 +00:00
bloom_filters . emplace_back ( ) ;
2020-03-09 02:05:04 +00:00
key_position . push_back ( elem . key_index ) ;
2019-02-25 18:38:57 +00:00
2020-03-09 02:05:04 +00:00
size_t tuple_idx = elem . tuple_index ;
2019-02-24 21:17:52 +00:00
const auto & column = columns [ tuple_idx ] ;
for ( size_t row = 0 ; row < prepared_set - > getTotalRowCount ( ) ; + + row )
{
2020-05-28 12:37:05 +00:00
bloom_filters . back ( ) . emplace_back ( params ) ;
2019-02-24 21:17:52 +00:00
auto ref = column - > getDataAt ( row ) ;
2021-04-01 02:37:19 +00:00
columnToBloomFilter ( ref . data , ref . size , token_extractor , bloom_filters . back ( ) . back ( ) ) ;
2019-02-24 21:17:52 +00:00
}
}
2019-02-25 18:38:57 +00:00
out . set_key_position = std : : move ( key_position ) ;
2019-02-24 21:17:52 +00:00
out . set_bloom_filters = std : : move ( bloom_filters ) ;
return true ;
}
2019-02-22 10:51:19 +00:00
2019-05-10 03:42:28 +00:00
/// Creates an empty granule with one bloom filter per index column.
MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const
{
    const size_t columns_count = index.column_names.size();
    return std::make_shared<MergeTreeIndexGranuleFullText>(index.name, columns_count, params);
}
2019-05-10 03:42:28 +00:00
/// Creates an aggregator that tokenizes the index columns with this index's token extractor.
/// The extractor is handed over via `token_extractor.get()`.
MergeTreeIndexAggregatorPtr MergeTreeIndexFullText::createIndexAggregator() const
{
    TokenExtractorPtr extractor = token_extractor.get();
    return std::make_shared<MergeTreeIndexAggregatorFullText>(index.column_names, index.name, params, extractor);
}
2019-06-19 15:30:48 +00:00
/// Creates the condition object that evaluates `query` against granules of this index.
/// The extractor is handed over via `token_extractor.get()`.
MergeTreeIndexConditionPtr MergeTreeIndexFullText::createIndexCondition(
    const SelectQueryInfo & query, ContextPtr context) const
{
    TokenExtractorPtr extractor = token_extractor.get();
    return std::make_shared<MergeTreeConditionFullText>(query, context, index.sample_block, params, extractor);
}
2019-05-10 03:42:28 +00:00
bool MergeTreeIndexFullText : : mayBenefitFromIndexForIn ( const ASTPtr & node ) const
2019-02-25 08:43:19 +00:00
{
2020-05-28 12:37:05 +00:00
return std : : find ( std : : cbegin ( index . column_names ) , std : : cend ( index . column_names ) , node - > getColumnName ( ) ) ! = std : : cend ( index . column_names ) ;
2019-02-25 08:43:19 +00:00
}
2019-02-20 12:12:41 +00:00
2021-04-01 02:37:19 +00:00
bool NgramTokenExtractor : : nextInField ( const char * data , size_t len , size_t * pos , size_t * token_start , size_t * token_len ) const
2019-02-21 20:32:36 +00:00
{
2019-02-21 21:29:24 +00:00
* token_start = * pos ;
* token_len = 0 ;
2019-02-24 18:55:56 +00:00
size_t code_points = 0 ;
for ( ; code_points < n & & * token_start + * token_len < len ; + + code_points )
2019-02-21 21:29:24 +00:00
{
size_t sz = UTF8 : : seqLength ( static_cast < UInt8 > ( data [ * token_start + * token_len ] ) ) ;
* token_len + = sz ;
}
2019-02-22 07:59:07 +00:00
* pos + = UTF8 : : seqLength ( static_cast < UInt8 > ( data [ * pos ] ) ) ;
2019-02-24 18:55:56 +00:00
return code_points = = n ;
2019-02-21 20:32:36 +00:00
}
2019-02-21 21:29:24 +00:00
bool NgramTokenExtractor : : nextLike ( const String & str , size_t * pos , String & token ) const
2019-02-20 12:12:41 +00:00
{
2019-02-21 21:29:24 +00:00
token . clear ( ) ;
2019-02-20 12:12:41 +00:00
2019-02-22 07:59:07 +00:00
size_t code_points = 0 ;
2019-02-21 21:29:24 +00:00
bool escaped = false ;
2019-02-22 07:59:07 +00:00
for ( size_t i = * pos ; i < str . size ( ) ; )
2019-02-20 12:12:41 +00:00
{
2019-02-22 07:59:07 +00:00
if ( escaped & & ( str [ i ] = = ' % ' | | str [ i ] = = ' _ ' | | str [ i ] = = ' \\ ' ) )
2019-02-20 12:48:50 +00:00
{
2019-02-22 07:59:07 +00:00
token + = str [ i ] ;
+ + code_points ;
2019-02-21 21:29:24 +00:00
escaped = false ;
2019-02-22 07:59:07 +00:00
+ + i ;
2019-02-20 12:48:50 +00:00
}
2019-02-22 07:59:07 +00:00
else if ( ! escaped & & ( str [ i ] = = ' % ' | | str [ i ] = = ' _ ' ) )
2019-02-21 20:32:36 +00:00
{
2019-02-21 21:29:24 +00:00
/// This token is too small, go to the next.
token . clear ( ) ;
2019-02-22 07:59:07 +00:00
code_points = 0 ;
2019-02-21 21:29:24 +00:00
escaped = false ;
2019-02-22 07:59:07 +00:00
* pos = + + i ;
2019-02-21 21:29:24 +00:00
}
2019-02-22 07:59:07 +00:00
else if ( ! escaped & & str [ i ] = = ' \\ ' )
2019-02-21 21:29:24 +00:00
{
escaped = true ;
2019-02-22 07:59:07 +00:00
+ + i ;
2019-02-21 21:29:24 +00:00
}
else
{
2019-02-22 07:59:07 +00:00
const size_t sz = UTF8 : : seqLength ( static_cast < UInt8 > ( str [ i ] ) ) ;
for ( size_t j = 0 ; j < sz ; + + j )
token + = str [ i + j ] ;
i + = sz ;
+ + code_points ;
2019-02-21 21:29:24 +00:00
escaped = false ;
2019-02-21 20:32:36 +00:00
}
2019-02-25 18:46:54 +00:00
if ( code_points = = n )
{
2019-02-22 07:59:07 +00:00
* pos + = UTF8 : : seqLength ( static_cast < UInt8 > ( str [ * pos ] ) ) ;
2019-02-21 21:29:24 +00:00
return true ;
}
2019-02-21 20:32:36 +00:00
}
2019-02-21 21:29:24 +00:00
return false ;
}
2019-02-20 12:12:41 +00:00
2021-04-01 02:37:19 +00:00
bool SplitTokenExtractor : : nextInField ( const char * data , size_t len , size_t * pos , size_t * token_start , size_t * token_len ) const
{
* token_start = * pos ;
* token_len = 0 ;
while ( * pos < len )
{
if ( isASCII ( data [ * pos ] ) & & ! isAlphaNumericASCII ( data [ * pos ] ) )
{
/// Finish current token if any
if ( * token_len > 0 )
return true ;
* token_start = + + * pos ;
}
else
{
/// Note that UTF-8 sequence is completely consisted of non-ASCII bytes.
+ + * pos ;
+ + * token_len ;
}
}
return * token_len > 0 ;
}
/// Finds the next token in [data, data + len) starting at byte offset `*pos`.
///
/// A token is a maximal run of bytes that are ASCII alphanumeric ('0'-'9', 'A'-'Z', 'a'-'z')
/// or non-ASCII (0x80-0xFF); every other byte is a separator.
/// On success `*token_start` / `*token_len` describe the token and true is returned.
/// Returns false when no token bytes are found.
///
/// With SSE2 (and no MemorySanitizer) the buffer is scanned 16 bytes at a time, which may read
/// beyond `len`: the code relies on the buffer being padded on the right (see the NOTE below).
/// After the loop, this branch clamps the result back to `len`.
/// Without SSE2 (or under MemorySanitizer) the buffer is scanned byte by byte.
bool SplitTokenExtractor : : nextInColumn ( const char * data , size_t len , size_t * pos , size_t * token_start , size_t * token_len ) const
2019-02-25 14:23:19 +00:00
{
* token_start = * pos ;
* token_len = 0 ;
2020-03-31 13:14:59 +00:00
2019-02-25 14:23:19 +00:00
while ( * pos < len )
{
2020-08-08 01:01:47 +00:00
# if defined(__SSE2__) && !defined(MEMORY_SANITIZER) /// We read uninitialized bytes and decide on the calculated mask
2020-04-02 10:53:13 +00:00
// NOTE: we assume that `data` string is padded from the right with 15 bytes.
2020-03-31 13:14:59 +00:00
/// Unaligned load of the next 16 bytes starting at the current position.
const __m128i haystack = _mm_loadu_si128 ( reinterpret_cast < const __m128i * > ( data + * pos ) ) ;
const size_t haystack_length = 16 ;
2020-04-01 21:28:02 +00:00
# if defined(__SSE4_2__)
2020-03-31 13:14:59 +00:00
// With the help of https://www.strchr.com/strcmp_and_strlen_using_sse_4.2
2020-04-06 10:27:31 +00:00
/// Four (low, high) byte ranges of token characters: '0'-'9', 'A'-'Z', 'a'-'z', 0x80-0xFF.
/// Only the first 8 bytes are used (length 8 is passed to _mm_cmpestrm below).
const auto alnum_chars_ranges = _mm_set_epi8 ( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
2020-04-01 21:28:02 +00:00
' \xFF ' , ' \x80 ' , ' z ' , ' a ' , ' Z ' , ' A ' , ' 9 ' , ' 0 ' ) ;
2020-04-11 21:37:19 +00:00
// Every bit represents if `haystack` character is in the ranges (1) or not (0)
2020-04-01 21:28:02 +00:00
const int result_bitmask = _mm_cvtsi128_si32 ( _mm_cmpestrm ( alnum_chars_ranges , 8 , haystack , haystack_length , _SIDD_CMP_RANGES ) ) ;
2020-03-31 13:14:59 +00:00
# else
// NOTE: -1 and +1 required since SSE2 has no `>=` and `<=` instructions on packed 8-bit integers (epi8).
2020-04-06 10:27:31 +00:00
const auto number_begin = _mm_set1_epi8 ( ' 0 ' - 1 ) ;
const auto number_end = _mm_set1_epi8 ( ' 9 ' + 1 ) ;
const auto alpha_lower_begin = _mm_set1_epi8 ( ' a ' - 1 ) ;
const auto alpha_lower_end = _mm_set1_epi8 ( ' z ' + 1 ) ;
const auto alpha_upper_begin = _mm_set1_epi8 ( ' A ' - 1 ) ;
const auto alpha_upper_end = _mm_set1_epi8 ( ' Z ' + 1 ) ;
2021-01-22 23:57:35 +00:00
const auto zero = _mm_set1_epi8 ( 0 ) ;
2020-03-31 13:14:59 +00:00
2020-08-08 01:01:47 +00:00
// every bit represents if `haystack` character `c` satisfies condition:
2020-04-01 21:28:02 +00:00
// (c < 0) || (c > '0' - 1 && c < '9' + 1) || (c > 'a' - 1 && c < 'z' + 1) || (c > 'A' - 1 && c < 'Z' + 1)
2020-04-02 10:53:13 +00:00
// < 0 since _mm_cmplt_epi8 treats chars as SIGNED, and so all chars >= 0x80 are negative.
2020-04-01 21:28:02 +00:00
const int result_bitmask = _mm_movemask_epi8 ( _mm_or_si128 ( _mm_or_si128 ( _mm_or_si128 (
_mm_cmplt_epi8 ( haystack , zero ) ,
_mm_and_si128 ( _mm_cmpgt_epi8 ( haystack , number_begin ) , _mm_cmplt_epi8 ( haystack , number_end ) ) ) ,
_mm_and_si128 ( _mm_cmpgt_epi8 ( haystack , alpha_lower_begin ) , _mm_cmplt_epi8 ( haystack , alpha_lower_end ) ) ) ,
_mm_and_si128 ( _mm_cmpgt_epi8 ( haystack , alpha_upper_begin ) , _mm_cmplt_epi8 ( haystack , alpha_upper_end ) ) ) ) ;
2020-03-31 13:14:59 +00:00
# endif
/// No token characters at all in these 16 bytes.
if ( result_bitmask = = 0 )
{
if ( * token_len ! = 0 )
2020-04-02 10:53:13 +00:00
// end of token started on previous haystack
2020-03-31 13:14:59 +00:00
return true ;
* pos + = haystack_length ;
continue ;
}
2020-04-02 10:53:13 +00:00
/// Presumably the index of the lowest set bit, i.e. the offset of the first token byte
/// in this haystack (the mask is known to be non-zero here).
const auto token_start_pos_in_current_haystack = getTrailingZeroBitsUnsafe ( result_bitmask ) ;
2020-03-31 13:14:59 +00:00
if ( * token_len = = 0 )
2020-04-02 10:53:13 +00:00
// new token
* token_start = * pos + token_start_pos_in_current_haystack ;
else if ( token_start_pos_in_current_haystack ! = 0 )
// end of token starting in one of previous haystacks
2020-04-01 21:28:02 +00:00
return true ;
2020-03-31 13:14:59 +00:00
2020-04-02 10:53:13 +00:00
/// Number of consecutive token bytes from the token start: the mask is shifted so the token
/// start becomes bit 0 and inverted, so the trailing zeros count the run of token bytes.
const auto token_bytes_in_current_haystack = getTrailingZeroBitsUnsafe ( ~ ( result_bitmask > > token_start_pos_in_current_haystack ) ) ;
* token_len + = token_bytes_in_current_haystack ;
2020-03-31 13:14:59 +00:00
2020-04-02 10:53:13 +00:00
* pos + = token_start_pos_in_current_haystack + token_bytes_in_current_haystack ;
if ( token_start_pos_in_current_haystack + token_bytes_in_current_haystack = = haystack_length )
2020-03-31 13:14:59 +00:00
// check if there are leftovers in next `haystack`
continue ;
2020-04-06 10:27:31 +00:00
/// The token ended inside this haystack; `*pos` now points right after its last byte.
break ;
2020-03-31 13:14:59 +00:00
# else
2019-03-12 15:20:54 +00:00
/// Scalar fallback: an ASCII byte that is not alphanumeric is a separator.
if ( isASCII ( data [ * pos ] ) & & ! isAlphaNumericASCII ( data [ * pos ] ) )
2019-02-25 14:23:19 +00:00
{
2020-01-08 10:20:55 +00:00
/// Finish current token if any
2019-02-25 14:23:19 +00:00
if ( * token_len > 0 )
return true ;
* token_start = + + * pos ;
}
else
2019-02-25 17:12:09 +00:00
{
2020-01-08 10:20:55 +00:00
/// Note that UTF-8 sequence is completely consisted of non-ASCII bytes.
+ + * pos ;
+ + * token_len ;
2019-02-25 17:12:09 +00:00
}
2020-03-31 13:14:59 +00:00
# endif
2019-02-25 14:23:19 +00:00
}
2020-04-02 10:53:13 +00:00
2020-05-11 05:04:59 +00:00
# if defined(__SSE2__) && !defined(MEMORY_SANITIZER)
2021-01-22 23:57:35 +00:00
// Could happen only if string is not padded with zeros, and we accidentally hopped over the end of data.
2020-04-02 10:53:13 +00:00
if ( * token_start > len )
return false ;
2020-04-06 10:27:31 +00:00
/// Cut off any bytes of the token that lie beyond the real end of the data.
* token_len = std : : min ( len - * token_start , * token_len ) ;
2020-04-02 10:53:13 +00:00
# endif
2019-02-25 14:23:19 +00:00
return * token_len > 0 ;
}
2019-03-06 15:30:27 +00:00
bool SplitTokenExtractor : : nextLike ( const String & str , size_t * pos , String & token ) const
2019-02-25 14:23:19 +00:00
{
2019-03-06 15:30:27 +00:00
token . clear ( ) ;
bool bad_token = false ; // % or _ before token
2019-03-12 13:17:22 +00:00
bool escaped = false ;
2019-03-12 15:20:54 +00:00
while ( * pos < str . size ( ) )
2019-03-06 15:30:27 +00:00
{
2019-03-12 13:17:22 +00:00
if ( ! escaped & & ( str [ * pos ] = = ' % ' | | str [ * pos ] = = ' _ ' ) )
2019-03-06 15:30:27 +00:00
{
token . clear ( ) ;
bad_token = true ;
2019-03-12 15:20:54 +00:00
+ + * pos ;
2019-03-06 15:30:27 +00:00
}
2019-03-12 13:17:22 +00:00
else if ( ! escaped & & str [ * pos ] = = ' \\ ' )
{
escaped = true ;
2019-03-12 15:20:54 +00:00
+ + * pos ;
2019-03-12 13:17:22 +00:00
}
else if ( isASCII ( str [ * pos ] ) & & ! isAlphaNumericASCII ( str [ * pos ] ) )
2019-03-06 15:30:27 +00:00
{
if ( ! bad_token & & ! token . empty ( ) )
return true ;
token . clear ( ) ;
bad_token = false ;
2019-03-12 13:17:22 +00:00
escaped = false ;
2019-03-12 15:20:54 +00:00
+ + * pos ;
2019-03-06 15:30:27 +00:00
}
else
{
2019-03-12 15:20:54 +00:00
const size_t sz = UTF8 : : seqLength ( static_cast < UInt8 > ( str [ * pos ] ) ) ;
for ( size_t j = 0 ; j < sz ; + + j )
{
token + = str [ * pos ] ;
+ + * pos ;
}
2019-03-12 13:17:22 +00:00
escaped = false ;
2019-03-06 15:30:27 +00:00
}
}
return ! bad_token & & ! token . empty ( ) ;
2019-02-25 14:23:19 +00:00
}
2020-05-28 13:45:08 +00:00
/// Creates a full-text bloom filter index from its description.
///
/// For the n-gram index type the arguments are (n, followed by the three bloom filter parameters);
/// for the token index type the arguments are just the three bloom filter parameters.
/// The number of arguments is not checked here.
/// NOTE(review): presumably `bloomFilterIndexValidator` has been run on the description
/// beforehand - confirm.
/// Throws LOGICAL_ERROR for any other index type.
MergeTreeIndexPtr bloomFilterIndexCreator(
    const IndexDescription & index)
{
    if (index.type == NgramTokenExtractor::getName())
    {
        size_t n = index.arguments[0].get<size_t>();
        BloomFilterParameters params(
            index.arguments[1].get<size_t>(),
            index.arguments[2].get<size_t>(),
            index.arguments[3].get<size_t>());

        auto tokenizer = std::make_unique<NgramTokenExtractor>(n);

        return std::make_shared<MergeTreeIndexFullText>(index, params, std::move(tokenizer));
    }
    else if (index.type == SplitTokenExtractor::getName())
    {
        BloomFilterParameters params(
            index.arguments[0].get<size_t>(),
            index.arguments[1].get<size_t>(),
            index.arguments[2].get<size_t>());

        auto tokenizer = std::make_unique<SplitTokenExtractor>();

        return std::make_shared<MergeTreeIndexFullText>(index, params, std::move(tokenizer));
    }
    else
    {
        /// Report the offending type, not the name of the index: the type is what was not recognized.
        throw Exception("Unknown index type: " + backQuote(index.type), ErrorCodes::LOGICAL_ERROR);
    }
}
2019-02-20 12:12:41 +00:00
2020-05-28 13:09:03 +00:00
void bloomFilterIndexValidator ( const IndexDescription & index , bool /*attach*/ )
2020-05-28 12:37:05 +00:00
{
2021-09-22 23:06:54 +00:00
for ( const auto & index_data_type : index . data_types )
2021-09-06 10:22:06 +00:00
{
2021-09-22 23:06:54 +00:00
WhichDataType data_type ( index_data_type ) ;
2021-09-02 07:27:54 +00:00
2021-09-22 23:06:54 +00:00
if ( data_type . isArray ( ) )
2021-09-21 13:43:33 +00:00
{
2021-09-22 23:06:54 +00:00
const auto & array_type = assert_cast < const DataTypeArray & > ( * index_data_type ) ;
data_type = WhichDataType ( array_type . getNestedType ( ) ) ;
2021-09-21 13:43:33 +00:00
}
2021-09-02 06:17:47 +00:00
2021-09-22 23:06:54 +00:00
if ( ! data_type . isString ( ) & & ! data_type . isFixedString ( ) )
throw Exception ( " Bloom filter index can be used only with `String`, `FixedString` column or Array with `String` or `FixedString` values column. " , ErrorCodes : : INCORRECT_QUERY ) ;
2019-02-20 12:12:41 +00:00
}
2020-05-28 12:37:05 +00:00
if ( index . type = = NgramTokenExtractor : : getName ( ) )
2019-02-25 18:46:54 +00:00
{
2020-05-28 12:37:05 +00:00
if ( index . arguments . size ( ) ! = 4 )
2019-02-24 18:55:56 +00:00
throw Exception ( " `ngrambf` index must have exactly 4 arguments. " , ErrorCodes : : INCORRECT_QUERY ) ;
2019-02-25 18:46:54 +00:00
}
2020-05-28 12:37:05 +00:00
else if ( index . type = = SplitTokenExtractor : : getName ( ) )
2019-02-25 18:46:54 +00:00
{
2020-05-28 12:37:05 +00:00
if ( index . arguments . size ( ) ! = 3 )
2019-02-25 14:23:19 +00:00
throw Exception ( " `tokenbf` index must have exactly 3 arguments. " , ErrorCodes : : INCORRECT_QUERY ) ;
2019-02-25 18:46:54 +00:00
}
else
{
2020-05-28 12:37:05 +00:00
throw Exception ( " Unknown index type: " + backQuote ( index . name ) , ErrorCodes : : LOGICAL_ERROR ) ;
2019-02-20 12:12:41 +00:00
}
2020-07-10 08:13:21 +00:00
assert ( index . arguments . size ( ) > = 3 ) ;
2020-07-10 08:21:40 +00:00
for ( const auto & arg : index . arguments )
if ( arg . getType ( ) ! = Field : : Types : : UInt64 )
throw Exception ( " All parameters to *bf_v1 index must be unsigned integers " , ErrorCodes : : BAD_ARGUMENTS ) ;
2020-07-10 08:13:21 +00:00
/// Just validate
BloomFilterParameters params (
index . arguments [ 0 ] . get < size_t > ( ) ,
index . arguments [ 1 ] . get < size_t > ( ) ,
index . arguments [ 2 ] . get < size_t > ( ) ) ;
2019-02-20 12:12:41 +00:00
}
2019-02-20 11:22:07 +00:00
}