2021-07-30 13:30:30 +00:00
# if !defined(ARCADIA_BUILD)
# include "config_core.h"
# endif
# if USE_NLP
2021-05-04 12:47:34 +00:00
# include <Columns/ColumnString.h>
# include <DataTypes/DataTypeString.h>
# include <Functions/FunctionFactory.h>
# include <Functions/FunctionHelpers.h>
2021-05-28 15:30:33 +00:00
# include <Functions/IFunction.h>
2021-07-30 15:25:51 +00:00
# include <Interpreters/Context.h>
2021-05-04 12:47:34 +00:00
2021-05-10 09:34:29 +00:00
# include <libstemmer.h>
2021-05-04 12:47:34 +00:00
namespace DB
{
namespace ErrorCodes
{
    /// Error codes referenced in this translation unit; they are only declared here.

    /// Used when an argument column is not of the expected column class.
    extern const int ILLEGAL_COLUMN;

    /// Used when an argument has a non-String type or the language is not supported.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;

    /// Used when the experimental NLP functions setting is not enabled.
    extern const int SUPPORT_IS_DISABLED;
}
namespace
{
struct StemImpl
{
static void vector (
const ColumnString : : Chars & data ,
const ColumnString : : Offsets & offsets ,
ColumnString : : Chars & res_data ,
ColumnString : : Offsets & res_offsets ,
2021-05-13 09:12:57 +00:00
const String & language )
2021-05-04 12:47:34 +00:00
{
2021-06-19 18:52:09 +00:00
sb_stemmer * stemmer = sb_stemmer_new ( language . data ( ) , " UTF_8 " ) ;
2021-05-13 09:12:57 +00:00
if ( stemmer = = nullptr )
{
throw Exception (
" Language " + language + " is not supported for function stem " ,
ErrorCodes : : ILLEGAL_TYPE_OF_ARGUMENT ) ;
}
2021-05-10 09:34:29 +00:00
res_data . resize ( data . size ( ) ) ;
2021-05-04 12:47:34 +00:00
res_offsets . assign ( offsets ) ;
UInt64 data_size = 0 ;
for ( UInt64 i = 0 ; i < offsets . size ( ) ; + + i )
{
/// Note that accessing -1th element is valid for PaddedPODArray.
size_t original_size = offsets [ i ] - offsets [ i - 1 ] ;
2021-06-04 21:52:44 +00:00
const sb_symbol * result = sb_stemmer_stem ( stemmer ,
2021-06-19 18:52:09 +00:00
reinterpret_cast < const uint8_t * > ( data . data ( ) + offsets [ i - 1 ] ) ,
2021-06-04 21:52:44 +00:00
original_size - 1 ) ;
2021-05-10 09:34:29 +00:00
size_t new_size = sb_stemmer_length ( stemmer ) + 1 ;
2021-05-04 12:47:34 +00:00
2021-05-10 09:34:29 +00:00
memcpy ( res_data . data ( ) + data_size , result , new_size ) ;
2021-06-19 18:52:09 +00:00
2021-05-10 09:34:29 +00:00
data_size + = new_size ;
2021-05-04 12:47:34 +00:00
res_offsets [ i ] = data_size ;
}
res_data . resize ( data_size ) ;
2021-05-10 09:34:29 +00:00
sb_stemmer_delete ( stemmer ) ;
2021-05-04 12:47:34 +00:00
}
} ;
class FunctionStem : public IFunction
{
public :
static constexpr auto name = " stem " ;
2021-07-30 15:25:51 +00:00
static FunctionPtr create ( ContextPtr context )
{
if ( ! context - > getSettingsRef ( ) . allow_experimental_nlp_functions )
throw Exception ( ErrorCodes : : SUPPORT_IS_DISABLED , " Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it " , name ) ;
return std : : make_shared < FunctionStem > ( ) ;
}
2021-05-04 12:47:34 +00:00
String getName ( ) const override { return name ; }
size_t getNumberOfArguments ( ) const override { return 2 ; }
DataTypePtr getReturnTypeImpl ( const DataTypes & arguments ) const override
{
if ( ! isString ( arguments [ 0 ] ) )
throw Exception (
" Illegal type " + arguments [ 0 ] - > getName ( ) + " of argument of function " + getName ( ) , ErrorCodes : : ILLEGAL_TYPE_OF_ARGUMENT ) ;
if ( ! isString ( arguments [ 1 ] ) )
throw Exception (
" Illegal type " + arguments [ 1 ] - > getName ( ) + " of argument of function " + getName ( ) , ErrorCodes : : ILLEGAL_TYPE_OF_ARGUMENT ) ;
return arguments [ 1 ] ;
}
bool useDefaultImplementationForConstants ( ) const override { return true ; }
2021-05-11 10:55:24 +00:00
ColumnNumbers getArgumentsThatAreAlwaysConstant ( ) const override { return { 0 } ; }
2021-05-04 12:47:34 +00:00
ColumnPtr executeImpl ( const ColumnsWithTypeAndName & arguments , const DataTypePtr & , size_t ) const override
{
const auto & langcolumn = arguments [ 0 ] . column ;
const auto & strcolumn = arguments [ 1 ] . column ;
2021-05-13 09:12:57 +00:00
const ColumnConst * lang_col = checkAndGetColumn < ColumnConst > ( langcolumn . get ( ) ) ;
const ColumnString * words_col = checkAndGetColumn < ColumnString > ( strcolumn . get ( ) ) ;
2021-06-19 18:52:09 +00:00
if ( ! lang_col )
2021-05-13 09:12:57 +00:00
throw Exception (
" Illegal column " + arguments [ 0 ] . column - > getName ( ) + " of argument of function " + getName ( ) , ErrorCodes : : ILLEGAL_COLUMN ) ;
if ( ! words_col )
throw Exception (
" Illegal column " + arguments [ 1 ] . column - > getName ( ) + " of argument of function " + getName ( ) , ErrorCodes : : ILLEGAL_COLUMN ) ;
String language = lang_col - > getValue < String > ( ) ;
2021-05-04 12:47:34 +00:00
2021-05-13 09:12:57 +00:00
auto col_res = ColumnString : : create ( ) ;
StemImpl : : vector ( words_col - > getChars ( ) , words_col - > getOffsets ( ) , col_res - > getChars ( ) , col_res - > getOffsets ( ) , language ) ;
return col_res ;
2021-05-04 12:47:34 +00:00
}
} ;
}
/// Registers FunctionStem in the given factory with the CaseInsensitive flag.
void registerFunctionStem(FunctionFactory & factory)
{
    constexpr auto case_sensitiveness = FunctionFactory::CaseInsensitive;
    factory.registerFunction<FunctionStem>(case_sensitiveness);
}
}
2021-07-30 13:30:30 +00:00
# endif