#include "config.h"

#if USE_NLP

#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Interpreters/Context.h>
#include <Interpreters/Lemmatizers.h>

namespace DB
{
namespace ErrorCodes
{
    extern const int ILLEGAL_COLUMN;
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int SUPPORT_IS_DISABLED;
}

namespace
{

/// Row-wise lemmatization of a string column given as a (chars, offsets) pair.
struct LemmatizeImpl
{
    /** Lemmatizes every row of the input and writes the results as a new (chars, offsets) pair.
      *
      * @param data         Concatenated bytes of all input rows; each row is expected to end with '\0'.
      * @param offsets      For each input row, the position in `data` right after that row.
      * @param res_data     Output bytes; each lemma is stored together with its terminating '\0'.
      * @param res_offsets  Output offsets; res_offsets[i] is the position right after the i-th lemma.
      * @param lemmatizer   Lemmatizer of the requested language; called once per row.
      */
    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets,
        Lemmatizers::LemmPtr & lemmatizer)
    {
        /// Initial guess: the output is about as large as the input. It grows on demand below.
        res_data.resize(data.size());
        /// Every element is overwritten in the loop, so there is no need to copy the input offsets.
        res_offsets.resize(offsets.size());

        UInt64 data_size = 0;
        /// Start of the current row. Row i starts where row i - 1 ends and row 0 starts at 0.
        /// Tracking it explicitly avoids evaluating `offsets[i - 1]` with an unsigned `i == 0`.
        UInt64 prev_offset = 0;
        for (UInt64 i = 0; i < offsets.size(); ++i)
        {
            /// lemmatize() relies on the fact that each string ends with '\0'.
            auto result = lemmatizer->lemmatize(reinterpret_cast<const char *>(data.data() + prev_offset));
            /// +1 to copy the terminating zero byte as well.
            size_t new_size = strlen(result.get()) + 1;

            /// A lemma may be longer than the source word, so the initial guess may be too small.
            if (data_size + new_size > res_data.size())
                res_data.resize(data_size + new_size);

            memcpy(res_data.data() + data_size, result.get(), new_size);

            data_size += new_size;
            res_offsets[i] = data_size;
            prev_offset = offsets[i];
        }

        /// Drop the unused tail of the buffer.
        res_data.resize(data_size);
    }
};


/// SQL function lemmatize(language, word): applies the lemmatizer of the given language to each word.
class FunctionLemmatize : public IFunction
{
public:
    static constexpr auto name = "lemmatize";

    /// Creates the function.
    /// Throws SUPPORT_IS_DISABLED unless the `allow_experimental_nlp_functions` setting is enabled.
    static FunctionPtr create(ContextPtr context)
    {
        if (!context->getSettingsRef().allow_experimental_nlp_functions)
            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
                            "Natural language processing function '{}' is experimental. "
                            "Set `allow_experimental_nlp_functions` setting to enable it", name);

        return std::make_shared<FunctionLemmatize>(context->getLemmatizers());
    }

private:
    /// Source of per-language lemmatizers; obtained from the context in create().
    Lemmatizers & lemmatizers;

public:
    explicit FunctionLemmatize(Lemmatizers & lemmatizers_)
        : lemmatizers(lemmatizers_) {}

    String getName() const override { return name; }

    /// Two arguments: the language and the words to lemmatize.
    size_t getNumberOfArguments() const override { return 2; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    /// Both arguments must be strings; the result has the type of the second argument.
    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
        if (!isString(arguments[0]))
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
                arguments[0]->getName(), getName());
        if (!isString(arguments[1]))
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
                arguments[1]->getName(), getName());
        return arguments[1];
    }

    bool useDefaultImplementationForConstants() const override { return true; }

    /// The language (argument 0) must be a constant: executeImpl() reads it once for the whole block.
    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; }

    /// Lemmatizes every row of the second argument with the lemmatizer of the language given by the first one.
    /// Throws ILLEGAL_COLUMN if the first argument is not a constant column
    /// or the second one is not a string column.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
    {
        const auto & langcolumn = arguments[0].column;
        const auto & strcolumn = arguments[1].column;

        const ColumnConst * lang_col = checkAndGetColumn<ColumnConst>(langcolumn.get());
        const ColumnString * words_col = checkAndGetColumn<ColumnString>(strcolumn.get());

        if (!lang_col)
            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}",
                arguments[0].column->getName(), getName());
        if (!words_col)
            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}",
                arguments[1].column->getName(), getName());

        String language = lang_col->getValue<String>();
        auto lemmatizer = lemmatizers.getLemmatizer(language);

        auto col_res = ColumnString::create();
        LemmatizeImpl::vector(words_col->getChars(), words_col->getOffsets(), col_res->getChars(), col_res->getOffsets(), lemmatizer);
        return col_res;
    }
};

}

REGISTER_FUNCTION(Lemmatize)
{
    factory.registerFunction<FunctionLemmatize>();
}

}

#endif