2022-06-24 01:56:15 +00:00
# include <algorithm>
# include <Columns/ColumnArray.h>
# include <DataTypes/DataTypesNumber.h>
# include <DataTypes/DataTypeArray.h>
# include <IO/WriteHelpers.h>
# include <IO/ReadHelpers.h>
# include <Interpreters/GinFilter.h>
# include <Interpreters/ExpressionActions.h>
# include <Interpreters/ExpressionAnalyzer.h>
# include <Interpreters/TreeRewriter.h>
# include <Interpreters/misc.h>
# include <Storages/MergeTree/MergeTreeData.h>
# include <Storages/MergeTree/RPNBuilder.h>
# include <Storages/MergeTree/MergeTreeIndexGin.h>
2022-12-29 16:00:17 +00:00
# include <Storages/MergeTree/MergeTreeIndexUtils.h>
2022-06-24 01:56:15 +00:00
# include <Parsers/ASTIdentifier.h>
# include <Parsers/ASTLiteral.h>
# include <Parsers/ASTSubquery.h>
# include <Core/Defines.h>
# include <Poco/Logger.h>
# include <Columns/ColumnNullable.h>
# include <Columns/ColumnLowCardinality.h>
# include <DataTypes/DataTypeNullable.h>
# include <DataTypes/DataTypeLowCardinality.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR ;
extern const int INCORRECT_QUERY ;
}
/// Builds an empty granule that holds one GinFilter per indexed column.
///
/// @param index_name_     Name of the index.
/// @param columns_number  Number of indexed columns, i.e. number of filters to create.
/// @param params_         Parameters every filter is constructed with.
MergeTreeIndexGranuleGinFilter::MergeTreeIndexGranuleGinFilter(
    const String & index_name_,
    size_t columns_number,
    const GinFilterParameters & params_)
    : index_name(index_name_)
    , params(params_)
    /// The filters are created from the member `params`, not from the constructor argument.
    , gin_filters(columns_number, GinFilter(params))
    , has_elems(false)
{
}
void MergeTreeIndexGranuleGinFilter : : serializeBinary ( WriteBuffer & ostr ) const
{
if ( empty ( ) )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Attempt to write empty fulltext index {}. " , backQuote ( index_name ) ) ;
2022-09-25 23:29:30 +00:00
const auto & size_type = std : : make_shared < DataTypeUInt32 > ( ) ;
2022-06-24 01:56:15 +00:00
auto size_serialization = size_type - > getDefaultSerialization ( ) ;
for ( const auto & gin_filter : gin_filters )
{
2022-07-19 03:07:57 +00:00
size_t filter_size = gin_filter . getFilter ( ) . size ( ) ;
2022-12-29 16:00:17 +00:00
size_serialization - > serializeBinary ( filter_size , ostr , { } ) ;
2022-09-25 23:29:30 +00:00
ostr . write ( reinterpret_cast < const char * > ( gin_filter . getFilter ( ) . data ( ) ) , filter_size * sizeof ( GinFilter : : RowIDRangeContainer : : value_type ) ) ;
2022-06-24 01:56:15 +00:00
}
}
void MergeTreeIndexGranuleGinFilter : : deserializeBinary ( ReadBuffer & istr , MergeTreeIndexVersion version )
{
if ( version ! = 1 )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Unknown index version {}. " , version ) ;
Field field_rows ;
2022-09-25 23:29:30 +00:00
const auto & size_type = std : : make_shared < DataTypeUInt32 > ( ) ;
2022-06-24 01:56:15 +00:00
2022-09-25 23:29:30 +00:00
auto size_serialization = size_type - > getDefaultSerialization ( ) ;
2022-06-24 01:56:15 +00:00
for ( auto & gin_filter : gin_filters )
{
2022-12-29 16:00:17 +00:00
size_serialization - > deserializeBinary ( field_rows , istr , { } ) ;
2022-07-19 20:15:59 +00:00
size_t filter_size = field_rows . get < size_t > ( ) ;
2022-06-24 01:56:15 +00:00
2022-07-19 20:15:59 +00:00
if ( filter_size = = 0 )
2022-06-24 01:56:15 +00:00
continue ;
2022-07-19 20:15:59 +00:00
gin_filter . getFilter ( ) . assign ( filter_size , { } ) ;
2022-12-29 16:00:17 +00:00
istr . readStrict ( reinterpret_cast < char * > ( gin_filter . getFilter ( ) . data ( ) ) , filter_size * sizeof ( GinFilter : : RowIDRangeContainer : : value_type ) ) ;
2022-06-24 01:56:15 +00:00
}
has_elems = true ;
}
/// Creates an aggregator that tokenizes column values and accumulates them into a granule.
///
/// @param store_            Index store that receives the tokens.
/// @param index_columns_    Names of the indexed columns.
/// @param index_name_       Name of the index.
/// @param params_           Parameters passed on to every granule that is created.
/// @param token_extractor_  Tokenizer used to split values into tokens.
MergeTreeIndexAggregatorGinFilter::MergeTreeIndexAggregatorGinFilter(
    GinIndexStorePtr store_,
    const Names & index_columns_,
    const String & index_name_,
    const GinFilterParameters & params_,
    TokenExtractorPtr token_extractor_)
    : store(store_)
    , index_columns(index_columns_)
    , index_name(index_name_)
    , params(params_)
    , token_extractor(token_extractor_)
    /// The first granule is built from the members initialized above.
    , granule(std::make_shared<MergeTreeIndexGranuleGinFilter>(index_name, index_columns.size(), params))
{
}
/// Returns the granule accumulated so far and replaces it with a fresh, empty one.
MergeTreeIndexGranulePtr MergeTreeIndexAggregatorGinFilter::getGranuleAndReset()
{
    auto handed_over = std::make_shared<MergeTreeIndexGranuleGinFilter>(index_name, index_columns.size(), params);

    /// After the swap `granule` is the empty granule and `handed_over` holds the accumulated one.
    granule.swap(handed_over);
    return handed_over;
}
2023-01-05 03:42:45 +00:00
void MergeTreeIndexAggregatorGinFilter : : addToGinFilter ( UInt32 rowID , const char * data , size_t length , GinFilter & gin_filter , UInt64 limit )
2022-06-24 01:56:15 +00:00
{
2022-07-03 12:18:51 +00:00
size_t cur = 0 ;
size_t token_start = 0 ;
size_t token_len = 0 ;
while ( cur < length & & token_extractor - > nextInStringPadded ( data , length , & cur , & token_start , & token_len ) )
{
2023-01-05 03:42:45 +00:00
gin_filter . add ( data + token_start , token_len , rowID , store , limit ) ;
2022-07-03 12:18:51 +00:00
}
2022-06-24 01:56:15 +00:00
}
void MergeTreeIndexAggregatorGinFilter : : update ( const Block & block , size_t * pos , size_t limit )
{
if ( * pos > = block . rows ( ) )
throw Exception (
" The provided position is not less than the number of block rows. Position: "
+ toString ( * pos ) + " , Block rows: " + toString ( block . rows ( ) ) + " . " , ErrorCodes : : LOGICAL_ERROR ) ;
size_t rows_read = std : : min ( limit , block . rows ( ) - * pos ) ;
2022-07-19 03:07:57 +00:00
auto row_id = store - > getNextRowIDRange ( rows_read ) ;
auto start_row_id = row_id ;
2022-06-24 01:56:15 +00:00
for ( size_t col = 0 ; col < index_columns . size ( ) ; + + col )
{
const auto & column_with_type = block . getByName ( index_columns [ col ] ) ;
const auto & column = column_with_type . column ;
size_t current_position = * pos ;
bool need_to_write = false ;
if ( isArray ( column_with_type . type ) )
{
const auto & column_array = assert_cast < const ColumnArray & > ( * column ) ;
const auto & column_offsets = column_array . getOffsets ( ) ;
const auto & column_key = column_array . getData ( ) ;
for ( size_t i = 0 ; i < rows_read ; + + i )
{
size_t element_start_row = column_offsets [ current_position - 1 ] ;
size_t elements_size = column_offsets [ current_position ] - element_start_row ;
for ( size_t row_num = 0 ; row_num < elements_size ; + + row_num )
{
auto ref = column_key . getDataAt ( element_start_row + row_num ) ;
2023-01-05 03:42:45 +00:00
addToGinFilter ( row_id , ref . data , ref . size , granule - > gin_filters [ col ] , rows_read ) ;
2022-06-24 01:56:15 +00:00
store - > addSize ( ref . size ) ;
}
current_position + = 1 ;
2022-07-19 03:07:57 +00:00
row_id + + ;
2022-06-24 01:56:15 +00:00
if ( store - > needToWrite ( ) )
need_to_write = true ;
}
}
else
{
for ( size_t i = 0 ; i < rows_read ; + + i )
{
auto ref = column - > getDataAt ( current_position + i ) ;
2023-01-05 03:42:45 +00:00
addToGinFilter ( row_id , ref . data , ref . size , granule - > gin_filters [ col ] , rows_read ) ;
2022-06-24 01:56:15 +00:00
store - > addSize ( ref . size ) ;
2022-07-19 03:07:57 +00:00
row_id + + ;
2022-06-24 01:56:15 +00:00
if ( store - > needToWrite ( ) )
need_to_write = true ;
}
}
2022-12-29 16:00:17 +00:00
granule - > gin_filters [ col ] . addRowRangeToGinFilter ( store - > getCurrentSegmentID ( ) , start_row_id , static_cast < UInt32 > ( start_row_id + rows_read - 1 ) ) ;
2022-06-24 01:56:15 +00:00
if ( need_to_write )
{
store - > writeSegment ( ) ;
}
}
granule - > has_elems = true ;
* pos + = rows_read ;
}
/// Builds the RPN form of the query filter that is later evaluated against index granules.
///
/// With `allow_experimental_analyzer` enabled the RPN is built from the first output of
/// `query_info.filter_actions_dag`; otherwise it is built from the filter AST of the query.
/// When no filter is available the RPN consists of a single FUNCTION_UNKNOWN element.
///
/// @param query_info          Query being executed.
/// @param context_            Query context.
/// @param index_sample_block  Header describing the indexed columns.
/// @param params_             Filter parameters used for the filters built from constants.
/// @param token_extactor_     Tokenizer used to split constants into tokens.
MergeTreeConditionGinFilter::MergeTreeConditionGinFilter(
    const SelectQueryInfo & query_info,
    ContextPtr context_,
    const Block & index_sample_block,
    const GinFilterParameters & params_,
    TokenExtractorPtr token_extactor_)
    : WithContext(context_), header(index_sample_block)
    , params(params_)
    , token_extractor(token_extactor_)
    , prepared_sets(query_info.prepared_sets)
{
    if (context_->getSettingsRef().allow_experimental_analyzer)
    {
        if (!query_info.filter_actions_dag)
        {
            rpn.push_back(RPNElement::FUNCTION_UNKNOWN);
            return;
        }

        RPNBuilder<RPNElement> builder(
            query_info.filter_actions_dag->getOutputs().at(0),
            context_,
            [&](const RPNBuilderTreeNode & node, RPNElement & out) { return traverseAtomAST(node, out); });
        rpn = std::move(builder).extractRPN();

        /// Stop here: falling through would overwrite the RPN that was just built from the
        /// actions DAG with one derived from the query AST.
        return;
    }

    ASTPtr filter_node = buildFilterNode(query_info.query);
    if (!filter_node)
    {
        rpn.push_back(RPNElement::FUNCTION_UNKNOWN);
        return;
    }

    auto block_with_constants = KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context_);
    RPNBuilder<RPNElement> builder(
        filter_node,
        context_,
        std::move(block_with_constants),
        query_info.prepared_sets,
        [&](const RPNBuilderTreeNode & node, RPNElement & out) { return traverseAtomAST(node, out); });
    rpn = std::move(builder).extractRPN();
}
bool MergeTreeConditionGinFilter : : alwaysUnknownOrTrue ( ) const
{
/// Check like in KeyCondition.
std : : vector < bool > rpn_stack ;
for ( const auto & element : rpn )
{
if ( element . function = = RPNElement : : FUNCTION_UNKNOWN
| | element . function = = RPNElement : : ALWAYS_TRUE )
{
rpn_stack . push_back ( true ) ;
}
else if ( element . function = = RPNElement : : FUNCTION_EQUALS
| | element . function = = RPNElement : : FUNCTION_NOT_EQUALS
| | element . function = = RPNElement : : FUNCTION_HAS
| | element . function = = RPNElement : : FUNCTION_IN
| | element . function = = RPNElement : : FUNCTION_NOT_IN
| | element . function = = RPNElement : : FUNCTION_MULTI_SEARCH
2022-06-25 18:20:07 +00:00
| | element . function = = RPNElement : : ALWAYS_FALSE )
2022-06-24 01:56:15 +00:00
{
rpn_stack . push_back ( false ) ;
}
else if ( element . function = = RPNElement : : FUNCTION_NOT )
{
// do nothing
}
else if ( element . function = = RPNElement : : FUNCTION_AND )
{
auto arg1 = rpn_stack . back ( ) ;
rpn_stack . pop_back ( ) ;
auto arg2 = rpn_stack . back ( ) ;
rpn_stack . back ( ) = arg1 & & arg2 ;
}
else if ( element . function = = RPNElement : : FUNCTION_OR )
{
auto arg1 = rpn_stack . back ( ) ;
rpn_stack . pop_back ( ) ;
auto arg2 = rpn_stack . back ( ) ;
rpn_stack . back ( ) = arg1 | | arg2 ;
}
else
throw Exception ( " Unexpected function type in KeyCondition::RPNElement " , ErrorCodes : : LOGICAL_ERROR ) ;
}
return rpn_stack [ 0 ] ;
}
2022-07-19 03:07:57 +00:00
bool MergeTreeConditionGinFilter : : mayBeTrueOnGranuleInPart ( MergeTreeIndexGranulePtr idx_granule , [[maybe_unused]] PostingsCacheForStore & cache_store ) const
2022-06-24 01:56:15 +00:00
{
std : : shared_ptr < MergeTreeIndexGranuleGinFilter > granule
= std : : dynamic_pointer_cast < MergeTreeIndexGranuleGinFilter > ( idx_granule ) ;
if ( ! granule )
throw Exception (
" GinFilter index condition got a granule with the wrong type. " , ErrorCodes : : LOGICAL_ERROR ) ;
/// Check like in KeyCondition.
std : : vector < BoolMask > rpn_stack ;
for ( const auto & element : rpn )
{
if ( element . function = = RPNElement : : FUNCTION_UNKNOWN )
{
rpn_stack . emplace_back ( true , true ) ;
}
else if ( element . function = = RPNElement : : FUNCTION_EQUALS
| | element . function = = RPNElement : : FUNCTION_NOT_EQUALS
2022-06-25 18:20:07 +00:00
| | element . function = = RPNElement : : FUNCTION_HAS )
2022-06-24 01:56:15 +00:00
{
2022-07-19 03:07:57 +00:00
rpn_stack . emplace_back ( granule - > gin_filters [ element . key_column ] . contains ( * element . gin_filter , cache_store ) , true ) ;
2022-06-24 01:56:15 +00:00
if ( element . function = = RPNElement : : FUNCTION_NOT_EQUALS )
rpn_stack . back ( ) = ! rpn_stack . back ( ) ;
}
else if ( element . function = = RPNElement : : FUNCTION_IN
| | element . function = = RPNElement : : FUNCTION_NOT_IN )
{
std : : vector < bool > result ( element . set_gin_filters . back ( ) . size ( ) , true ) ;
for ( size_t column = 0 ; column < element . set_key_position . size ( ) ; + + column )
{
const size_t key_idx = element . set_key_position [ column ] ;
const auto & gin_filters = element . set_gin_filters [ column ] ;
for ( size_t row = 0 ; row < gin_filters . size ( ) ; + + row )
2022-07-19 03:07:57 +00:00
result [ row ] = result [ row ] & & granule - > gin_filters [ key_idx ] . contains ( gin_filters [ row ] , cache_store ) ;
2022-06-24 01:56:15 +00:00
}
rpn_stack . emplace_back (
std : : find ( std : : cbegin ( result ) , std : : cend ( result ) , true ) ! = std : : end ( result ) , true ) ;
if ( element . function = = RPNElement : : FUNCTION_NOT_IN )
rpn_stack . back ( ) = ! rpn_stack . back ( ) ;
}
else if ( element . function = = RPNElement : : FUNCTION_MULTI_SEARCH )
{
std : : vector < bool > result ( element . set_gin_filters . back ( ) . size ( ) , true ) ;
const auto & gin_filters = element . set_gin_filters [ 0 ] ;
for ( size_t row = 0 ; row < gin_filters . size ( ) ; + + row )
2022-07-19 03:07:57 +00:00
result [ row ] = result [ row ] & & granule - > gin_filters [ element . key_column ] . contains ( gin_filters [ row ] , cache_store ) ;
2022-06-24 01:56:15 +00:00
rpn_stack . emplace_back (
std : : find ( std : : cbegin ( result ) , std : : cend ( result ) , true ) ! = std : : end ( result ) , true ) ;
}
else if ( element . function = = RPNElement : : FUNCTION_NOT )
{
rpn_stack . back ( ) = ! rpn_stack . back ( ) ;
}
else if ( element . function = = RPNElement : : FUNCTION_AND )
{
auto arg1 = rpn_stack . back ( ) ;
rpn_stack . pop_back ( ) ;
auto arg2 = rpn_stack . back ( ) ;
rpn_stack . back ( ) = arg1 & arg2 ;
}
else if ( element . function = = RPNElement : : FUNCTION_OR )
{
auto arg1 = rpn_stack . back ( ) ;
rpn_stack . pop_back ( ) ;
auto arg2 = rpn_stack . back ( ) ;
rpn_stack . back ( ) = arg1 | arg2 ;
}
else if ( element . function = = RPNElement : : ALWAYS_FALSE )
{
rpn_stack . emplace_back ( false , true ) ;
}
else if ( element . function = = RPNElement : : ALWAYS_TRUE )
{
rpn_stack . emplace_back ( true , false ) ;
}
else
throw Exception ( " Unexpected function type in GinFilterCondition::RPNElement " , ErrorCodes : : LOGICAL_ERROR ) ;
}
if ( rpn_stack . size ( ) ! = 1 )
throw Exception ( " Unexpected stack size in GinFilterCondition::mayBeTrueOnGranule " , ErrorCodes : : LOGICAL_ERROR ) ;
return rpn_stack [ 0 ] . can_be_true ;
}
2022-12-29 16:00:17 +00:00
bool MergeTreeConditionGinFilter : : traverseAtomAST ( const RPNBuilderTreeNode & node , RPNElement & out )
2022-06-24 01:56:15 +00:00
{
{
Field const_value ;
DataTypePtr const_type ;
2022-12-29 16:00:17 +00:00
if ( node . tryGetConstant ( const_value , const_type ) )
2022-06-24 01:56:15 +00:00
{
/// Check constant like in KeyCondition
if ( const_value . getType ( ) = = Field : : Types : : UInt64
| | const_value . getType ( ) = = Field : : Types : : Int64
| | const_value . getType ( ) = = Field : : Types : : Float64 )
{
/// Zero in all types is represented in memory the same way as in UInt64.
out . function = const_value . get < UInt64 > ( )
? RPNElement : : ALWAYS_TRUE
: RPNElement : : ALWAYS_FALSE ;
return true ;
}
}
}
2022-12-29 16:00:17 +00:00
if ( node . isFunction ( ) )
2022-06-24 01:56:15 +00:00
{
2022-12-29 16:00:17 +00:00
const auto function = node . toFunctionNode ( ) ;
// auto arguments_size = function.getArgumentsSize();
auto function_name = function . getFunctionName ( ) ;
2022-06-24 01:56:15 +00:00
2022-12-29 16:00:17 +00:00
size_t function_arguments_size = function . getArgumentsSize ( ) ;
if ( function_arguments_size ! = 2 )
2022-06-24 01:56:15 +00:00
return false ;
2022-12-29 16:00:17 +00:00
auto lhs_argument = function . getArgumentAt ( 0 ) ;
auto rhs_argument = function . getArgumentAt ( 1 ) ;
2022-06-24 01:56:15 +00:00
2022-12-29 16:00:17 +00:00
if ( functionIsInOrGlobalInOperator ( function_name ) )
2022-06-24 01:56:15 +00:00
{
2022-12-29 16:00:17 +00:00
if ( tryPrepareSetGinFilter ( lhs_argument , rhs_argument , out ) )
2022-06-24 01:56:15 +00:00
{
2022-12-29 16:00:17 +00:00
if ( function_name = = " notIn " )
2022-06-24 01:56:15 +00:00
{
out . function = RPNElement : : FUNCTION_NOT_IN ;
return true ;
}
2022-12-29 16:00:17 +00:00
else if ( function_name = = " in " )
2022-06-24 01:56:15 +00:00
{
out . function = RPNElement : : FUNCTION_IN ;
return true ;
}
}
}
2022-12-29 16:00:17 +00:00
else if ( function_name = = " equals " | |
function_name = = " notEquals " | |
function_name = = " has " | |
function_name = = " mapContains " | |
function_name = = " like " | |
function_name = = " notLike " | |
function_name = = " hasToken " | |
function_name = = " startsWith " | |
function_name = = " endsWith " | |
function_name = = " multiSearchAny " )
2022-06-24 01:56:15 +00:00
{
Field const_value ;
DataTypePtr const_type ;
2022-12-29 16:00:17 +00:00
if ( rhs_argument . tryGetConstant ( const_value , const_type ) )
2022-06-24 01:56:15 +00:00
{
2022-12-29 16:00:17 +00:00
if ( traverseASTEquals ( function_name , lhs_argument , const_type , const_value , out ) )
2022-06-24 01:56:15 +00:00
return true ;
}
2022-12-29 16:00:17 +00:00
else if ( lhs_argument . tryGetConstant ( const_value , const_type ) & & ( function_name = = " equals " | | function_name = = " notEquals " ) )
2022-06-24 01:56:15 +00:00
{
2022-12-29 16:00:17 +00:00
if ( traverseASTEquals ( function_name , rhs_argument , const_type , const_value , out ) )
2022-06-24 01:56:15 +00:00
return true ;
}
}
}
return false ;
}
bool MergeTreeConditionGinFilter : : traverseASTEquals (
const String & function_name ,
2022-12-29 16:00:17 +00:00
const RPNBuilderTreeNode & key_ast ,
2022-06-24 01:56:15 +00:00
const DataTypePtr & value_type ,
const Field & value_field ,
RPNElement & out )
{
auto value_data_type = WhichDataType ( value_type ) ;
if ( ! value_data_type . isStringOrFixedString ( ) & & ! value_data_type . isArray ( ) )
return false ;
Field const_value = value_field ;
size_t key_column_num = 0 ;
2022-12-31 17:06:56 +00:00
bool key_exists = header . has ( key_ast . getColumnName ( ) ) ;
bool map_key_exists = header . has ( fmt : : format ( " mapKeys({}) " , key_ast . getColumnName ( ) ) ) ;
2022-06-24 01:56:15 +00:00
2022-12-29 16:00:17 +00:00
if ( key_ast . isFunction ( ) )
2022-06-24 01:56:15 +00:00
{
2022-12-29 16:00:17 +00:00
const auto function = key_ast . toFunctionNode ( ) ;
if ( function . getFunctionName ( ) = = " arrayElement " )
2022-06-24 01:56:15 +00:00
{
/** Try to parse arrayElement for mapKeys index.
* It is important to ignore keys like column_map [ ' Key ' ] = ' ' because if key does not exists in map
* we return default value for arrayElement .
*
* We cannot skip keys that does not exist in map if comparison is with default type value because
* that way we skip necessary granules where map key does not exists .
*/
if ( value_field = = value_type - > getDefault ( ) )
return false ;
2022-12-29 16:00:17 +00:00
auto first_argument = function . getArgumentAt ( 0 ) ;
const auto map_column_name = first_argument . getColumnName ( ) ;
2022-06-24 01:56:15 +00:00
auto map_keys_index_column_name = fmt : : format ( " mapKeys({}) " , map_column_name ) ;
auto map_values_index_column_name = fmt : : format ( " mapValues({}) " , map_column_name ) ;
2022-12-31 17:06:56 +00:00
if ( header . has ( map_keys_index_column_name ) )
2022-06-24 01:56:15 +00:00
{
2022-12-29 16:00:17 +00:00
auto argument = function . getArgumentAt ( 1 ) ;
DataTypePtr const_type ;
if ( argument . tryGetConstant ( const_value , const_type ) )
2022-06-24 01:56:15 +00:00
{
2022-12-31 17:06:56 +00:00
key_column_num = header . getPositionByName ( map_keys_index_column_name ) ;
2022-06-24 01:56:15 +00:00
key_exists = true ;
}
else
{
return false ;
}
}
2022-12-31 17:06:56 +00:00
else if ( header . has ( map_values_index_column_name ) )
2022-06-24 01:56:15 +00:00
{
2022-12-31 17:06:56 +00:00
key_column_num = header . getPositionByName ( map_values_index_column_name ) ;
2022-06-24 01:56:15 +00:00
key_exists = true ;
}
else
{
return false ;
}
}
}
if ( ! key_exists & & ! map_key_exists )
return false ;
if ( map_key_exists & & ( function_name = = " has " | | function_name = = " mapContains " ) )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_HAS ;
out . gin_filter = std : : make_unique < GinFilter > ( params ) ;
auto & value = const_value . get < String > ( ) ;
token_extractor - > stringToGinFilter ( value . data ( ) , value . size ( ) , * out . gin_filter ) ;
return true ;
}
else if ( function_name = = " has " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_HAS ;
out . gin_filter = std : : make_unique < GinFilter > ( params ) ;
auto & value = const_value . get < String > ( ) ;
token_extractor - > stringToGinFilter ( value . data ( ) , value . size ( ) , * out . gin_filter ) ;
return true ;
}
if ( function_name = = " notEquals " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_NOT_EQUALS ;
out . gin_filter = std : : make_unique < GinFilter > ( params ) ;
const auto & value = const_value . get < String > ( ) ;
token_extractor - > stringToGinFilter ( value . data ( ) , value . size ( ) , * out . gin_filter ) ;
return true ;
}
else if ( function_name = = " equals " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_EQUALS ;
out . gin_filter = std : : make_unique < GinFilter > ( params ) ;
const auto & value = const_value . get < String > ( ) ;
token_extractor - > stringToGinFilter ( value . data ( ) , value . size ( ) , * out . gin_filter ) ;
return true ;
}
else if ( function_name = = " like " )
{
out . key_column = key_column_num ;
2022-06-25 18:20:07 +00:00
out . function = RPNElement : : FUNCTION_EQUALS ;
2022-06-24 01:56:15 +00:00
out . gin_filter = std : : make_unique < GinFilter > ( params ) ;
const auto & value = const_value . get < String > ( ) ;
token_extractor - > stringLikeToGinFilter ( value . data ( ) , value . size ( ) , * out . gin_filter ) ;
return true ;
}
else if ( function_name = = " notLike " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_NOT_EQUALS ;
out . gin_filter = std : : make_unique < GinFilter > ( params ) ;
const auto & value = const_value . get < String > ( ) ;
token_extractor - > stringLikeToGinFilter ( value . data ( ) , value . size ( ) , * out . gin_filter ) ;
return true ;
}
else if ( function_name = = " hasToken " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_EQUALS ;
out . gin_filter = std : : make_unique < GinFilter > ( params ) ;
const auto & value = const_value . get < String > ( ) ;
token_extractor - > stringToGinFilter ( value . data ( ) , value . size ( ) , * out . gin_filter ) ;
return true ;
}
else if ( function_name = = " startsWith " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_EQUALS ;
out . gin_filter = std : : make_unique < GinFilter > ( params ) ;
const auto & value = const_value . get < String > ( ) ;
token_extractor - > stringToGinFilter ( value . data ( ) , value . size ( ) , * out . gin_filter ) ;
return true ;
}
else if ( function_name = = " endsWith " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_EQUALS ;
out . gin_filter = std : : make_unique < GinFilter > ( params ) ;
const auto & value = const_value . get < String > ( ) ;
token_extractor - > stringToGinFilter ( value . data ( ) , value . size ( ) , * out . gin_filter ) ;
return true ;
}
else if ( function_name = = " multiSearchAny " )
{
out . key_column = key_column_num ;
out . function = RPNElement : : FUNCTION_MULTI_SEARCH ;
/// 2d vector is not needed here but is used because already exists for FUNCTION_IN
std : : vector < std : : vector < GinFilter > > gin_filters ;
gin_filters . emplace_back ( ) ;
for ( const auto & element : const_value . get < Array > ( ) )
{
if ( element . getType ( ) ! = Field : : Types : : String )
return false ;
gin_filters . back ( ) . emplace_back ( params ) ;
const auto & value = element . get < String > ( ) ;
token_extractor - > stringToGinFilter ( value . data ( ) , value . size ( ) , gin_filters . back ( ) . back ( ) ) ;
}
out . set_gin_filters = std : : move ( gin_filters ) ;
return true ;
}
return false ;
}
bool MergeTreeConditionGinFilter : : tryPrepareSetGinFilter (
2022-12-29 16:00:17 +00:00
const RPNBuilderTreeNode & lhs ,
const RPNBuilderTreeNode & rhs ,
2022-06-24 01:56:15 +00:00
RPNElement & out )
{
std : : vector < KeyTuplePositionMapping > key_tuple_mapping ;
DataTypes data_types ;
2022-12-29 16:00:17 +00:00
if ( lhs . isFunction ( ) & & lhs . toFunctionNode ( ) . getFunctionName ( ) = = " tuple " )
2022-06-24 01:56:15 +00:00
{
2022-12-29 16:00:17 +00:00
const auto function = lhs . toFunctionNode ( ) ;
auto arguments_size = function . getArgumentsSize ( ) ;
for ( size_t i = 0 ; i < arguments_size ; + + i )
2022-06-24 01:56:15 +00:00
{
2022-12-31 17:06:56 +00:00
if ( header . has ( function . getArgumentAt ( i ) . getColumnName ( ) ) )
2022-06-24 01:56:15 +00:00
{
2022-12-31 17:06:56 +00:00
auto key = header . getPositionByName ( function . getArgumentAt ( i ) . getColumnName ( ) ) ;
2022-06-24 01:56:15 +00:00
key_tuple_mapping . emplace_back ( i , key ) ;
2022-12-31 17:06:56 +00:00
data_types . push_back ( header . getByPosition ( key ) . type ) ;
2022-06-24 01:56:15 +00:00
}
}
}
else
{
2022-12-31 17:06:56 +00:00
if ( header . has ( lhs . getColumnName ( ) ) )
2022-06-24 01:56:15 +00:00
{
2022-12-31 17:06:56 +00:00
auto key = header . getPositionByName ( lhs . getColumnName ( ) ) ;
2022-06-24 01:56:15 +00:00
key_tuple_mapping . emplace_back ( 0 , key ) ;
2023-01-05 04:08:28 +00:00
data_types . push_back ( header . getByPosition ( key ) . type ) ;
2022-06-24 01:56:15 +00:00
}
}
if ( key_tuple_mapping . empty ( ) )
return false ;
2022-12-29 16:00:17 +00:00
ConstSetPtr prepared_set = rhs . tryGetPreparedSet ( ) ;
if ( ! prepared_set & & ! prepared_set - > hasExplicitSetElements ( ) )
2022-06-24 01:56:15 +00:00
return false ;
for ( const auto & data_type : prepared_set - > getDataTypes ( ) )
if ( data_type - > getTypeId ( ) ! = TypeIndex : : String & & data_type - > getTypeId ( ) ! = TypeIndex : : FixedString )
return false ;
std : : vector < std : : vector < GinFilter > > gin_filters ;
std : : vector < size_t > key_position ;
Columns columns = prepared_set - > getSetElements ( ) ;
for ( const auto & elem : key_tuple_mapping )
{
gin_filters . emplace_back ( ) ;
2022-09-25 23:29:30 +00:00
gin_filters . back ( ) . reserve ( prepared_set - > getTotalRowCount ( ) ) ;
2022-06-24 01:56:15 +00:00
key_position . push_back ( elem . key_index ) ;
size_t tuple_idx = elem . tuple_index ;
const auto & column = columns [ tuple_idx ] ;
for ( size_t row = 0 ; row < prepared_set - > getTotalRowCount ( ) ; + + row )
{
gin_filters . back ( ) . emplace_back ( params ) ;
auto ref = column - > getDataAt ( row ) ;
2022-12-31 17:06:56 +00:00
token_extractor - > stringToGinFilter ( ref . data , ref . size , gin_filters . back ( ) . back ( ) ) ;
2022-06-24 01:56:15 +00:00
}
}
out . set_key_position = std : : move ( key_position ) ;
out . set_gin_filters = std : : move ( gin_filters ) ;
return true ;
}
/// Creates an empty granule with one filter per indexed column.
MergeTreeIndexGranulePtr MergeTreeIndexGinFilter::createIndexGranule() const
{
    const size_t columns_count = index.column_names.size();
    return std::make_shared<MergeTreeIndexGranuleGinFilter>(index.name, columns_count, params);
}
/// Must not be called for this index type: createIndexAggregatorForPart, which receives the
/// index store, is the function to use. Asserts, and returns an empty pointer when assertions
/// are disabled.
MergeTreeIndexAggregatorPtr MergeTreeIndexGinFilter::createIndexAggregator() const
{
    assert(false);
    return {};
}
/// Creates an aggregator that writes into `store`.
/// The aggregator receives a non-owning pointer to this index's token extractor.
MergeTreeIndexAggregatorPtr MergeTreeIndexGinFilter::createIndexAggregatorForPart(const GinIndexStorePtr & store) const
{
    return std::make_shared<MergeTreeIndexAggregatorGinFilter>(
        store,
        index.column_names,
        index.name,
        params,
        token_extractor.get());
}
/// Creates the condition object that evaluates the filter of `query` against this index.
/// The condition receives a non-owning pointer to this index's token extractor.
MergeTreeIndexConditionPtr MergeTreeIndexGinFilter::createIndexCondition(
    const SelectQueryInfo & query, ContextPtr context) const
{
    return std::make_shared<MergeTreeConditionGinFilter>(
        query,
        context,
        index.sample_block,
        params,
        token_extractor.get());
}
bool MergeTreeIndexGinFilter : : mayBenefitFromIndexForIn ( const ASTPtr & node ) const
{
return std : : find ( std : : cbegin ( index . column_names ) , std : : cend ( index . column_names ) , node - > getColumnName ( ) ) ! = std : : cend ( index . column_names ) ;
}
/// Creates the index object described by `index`.
///
/// Index arguments: the first (optional) one is the n-gram size `n`, defaulting to 0; the second
/// (optional) one is the density, defaulting to 1.0.
/// Throws LOGICAL_ERROR when the description is not of the GinFilter index type.
MergeTreeIndexPtr ginIndexCreator(
    const IndexDescription & index)
{
    if (!(index.type == GinFilter::getName()))
        throw Exception(" Unknown index type: " + backQuote(index.name), ErrorCodes::LOGICAL_ERROR);

    const auto & arguments = index.arguments;

    size_t n = 0;
    if (!arguments.empty())
        n = arguments[0].get<size_t>();

    Float64 density = 1.0f;
    if (!(arguments.size() < 2))
        density = arguments[1].get<Float64>();

    GinFilterParameters params(n, density);

    /// n == 0 selects SplitTokenExtractor, any other n selects NgramTokenExtractor.
    if (n == 0)
    {
        auto tokenizer = std::make_unique<SplitTokenExtractor>();
        return std::make_shared<MergeTreeIndexGinFilter>(index, params, std::move(tokenizer));
    }

    auto tokenizer = std::make_unique<NgramTokenExtractor>(n);
    return std::make_shared<MergeTreeIndexGinFilter>(index, params, std::move(tokenizer));
}
void ginIndexValidator ( const IndexDescription & index , bool /*attach*/ )
{
for ( const auto & index_data_type : index . data_types )
{
WhichDataType data_type ( index_data_type ) ;
if ( data_type . isArray ( ) )
{
const auto & gin_type = assert_cast < const DataTypeArray & > ( * index_data_type ) ;
data_type = WhichDataType ( gin_type . getNestedType ( ) ) ;
}
else if ( data_type . isLowCarnality ( ) )
{
const auto & low_cardinality = assert_cast < const DataTypeLowCardinality & > ( * index_data_type ) ;
data_type = WhichDataType ( low_cardinality . getDictionaryType ( ) ) ;
}
if ( ! data_type . isString ( ) & & ! data_type . isFixedString ( ) )
2023-01-05 03:42:45 +00:00
throw Exception ( " Inverted index can be used only with `String`, `FixedString`, `LowCardinality(String) ` , ` LowCardinality ( FixedString ) ` column or Array with ` String ` or ` FixedString ` values column . " , ErrorCodes::INCORRECT_QUERY) ;
2022-06-24 01:56:15 +00:00
}
2022-07-03 12:18:51 +00:00
if ( index . type ! = GinFilter : : getName ( ) )
2022-06-24 01:56:15 +00:00
throw Exception ( " Unknown index type: " + backQuote ( index . name ) , ErrorCodes : : LOGICAL_ERROR ) ;
2023-01-05 03:42:45 +00:00
if ( index . arguments . size ( ) > 2 )
throw Exception ( " Inverted index must have less than two arguments. " , ErrorCodes : : INCORRECT_QUERY ) ;
2022-06-24 01:56:15 +00:00
2023-01-05 03:42:45 +00:00
if ( index . arguments . size ( ) > = 1 & & index . arguments [ 0 ] . getType ( ) ! = Field : : Types : : UInt64 )
throw Exception ( " The first Inverted index argument must be positive integer. " , ErrorCodes : : INCORRECT_QUERY ) ;
if ( index . arguments . size ( ) = = 2 & & ( index . arguments [ 1 ] . getType ( ) ! = Field : : Types : : Float64 | | index . arguments [ 1 ] . get < Float64 > ( ) < = 0 | | index . arguments [ 1 ] . get < Float64 > ( ) > 1 ) )
throw Exception ( " The second Inverted index argument must be a float between 0 and 1. " , ErrorCodes : : INCORRECT_QUERY ) ;
2022-06-24 01:56:15 +00:00
2022-07-19 03:07:57 +00:00
size_t ngrams = index . arguments . empty ( ) ? 0 : index . arguments [ 0 ] . get < size_t > ( ) ;
2023-01-05 03:42:45 +00:00
Float64 density = index . arguments . size ( ) < 2 ? 1.0f : index . arguments [ 1 ] . get < Float64 > ( ) ;
2022-06-24 01:56:15 +00:00
/// Just validate
2023-01-05 03:42:45 +00:00
GinFilterParameters params ( ngrams , density ) ;
2022-06-24 01:56:15 +00:00
}
}