diff --git a/CMakeLists.txt b/CMakeLists.txt index 875a6d1ab61..24022c256ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -542,6 +542,7 @@ include (cmake/find/libpqxx.cmake) include (cmake/find/nuraft.cmake) include (cmake/find/yaml-cpp.cmake) include (cmake/find/s2geometry.cmake) +include (cmake/find/nlp.cmake) if(NOT USE_INTERNAL_PARQUET_LIBRARY) set (ENABLE_ORC OFF CACHE INTERNAL "") diff --git a/cmake/find/nlp.cmake b/cmake/find/nlp.cmake new file mode 100644 index 00000000000..f1204a85dea --- /dev/null +++ b/cmake/find/nlp.cmake @@ -0,0 +1,32 @@ +option(ENABLE_NLP "Enable NLP functions support" ${ENABLE_LIBRARIES}) + +if (NOT ENABLE_NLP) + + message (STATUS "NLP functions disabled") + return() +endif() + +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libstemmer_c/Makefile") + message (WARNING "submodule contrib/libstemmer_c is missing. to fix try run: \n git submodule update --init --recursive") + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal libstemmer_c library, NLP functions will be disabled") + set (USE_NLP 0) + return() +endif () + +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast/CMakeLists.txt") + message (WARNING "submodule contrib/wordnet-blast is missing. to fix try run: \n git submodule update --init --recursive") + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal wordnet-blast library, NLP functions will be disabled") + set (USE_NLP 0) + return() +endif () + +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/lemmagen-c/README.md") + message (WARNING "submodule contrib/lemmagen-c is missing. to fix try run: \n git submodule update --init --recursive") + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal lemmagen-c library, NLP functions will be disabled") + set (USE_NLP 0) + return() +endif () + +set (USE_NLP 1) + +message (STATUS "Using Libraries for NLP functions: contrib/wordnet-blast, contrib/libstemmer_c, contrib/lemmagen-c") diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 83651fbc6d6..82cddb0ace0 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -327,9 +327,12 @@ if (USE_NURAFT) endif() add_subdirectory(fast_float) -add_subdirectory(libstemmer-c-cmake) -add_subdirectory(wordnet-blast-cmake) -add_subdirectory(lemmagen-c-cmake) + +if (USE_NLP) + add_subdirectory(libstemmer-c-cmake) + add_subdirectory(wordnet-blast-cmake) + add_subdirectory(lemmagen-c-cmake) +endif() if (USE_SQLITE) add_subdirectory(sqlite-cmake) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e70f4fe4b5a..a99201e4aaa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -473,9 +473,11 @@ endif () dbms_target_link_libraries(PRIVATE _boost_context) -dbms_target_link_libraries (PUBLIC stemmer) -dbms_target_link_libraries (PUBLIC wnb) -dbms_target_link_libraries (PUBLIC lemmagen) +if (USE_NLP) + dbms_target_link_libraries (PUBLIC stemmer) + dbms_target_link_libraries (PUBLIC wnb) + dbms_target_link_libraries (PUBLIC lemmagen) +endif() include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake") diff --git a/src/Core/config_core.h.in b/src/Core/config_core.h.in index 45cbc6efe19..cc9c993b205 100644 --- a/src/Core/config_core.h.in +++ b/src/Core/config_core.h.in @@ -15,4 +15,5 @@ #cmakedefine01 USE_LIBPQXX #cmakedefine01 USE_SQLITE #cmakedefine01 USE_NURAFT +#cmakedefine01 USE_NLP #cmakedefine01 USE_KRB5 diff --git a/src/Functions/lemmatize.cpp b/src/Functions/lemmatize.cpp index 704d5f87575..25e3dcbeee2 100644 --- a/src/Functions/lemmatize.cpp +++ b/src/Functions/lemmatize.cpp @@ -1,3 +1,9 @@ +#if !defined(ARCADIA_BUILD) +# include "config_core.h" +#endif + +#if USE_NLP + #include #include #include @@ -116,3 +122,5 @@ void registerFunctionLemmatize(FunctionFactory & factory) } } + +#endif diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp index 214ddfe1143..c364d7e0bf0 100644 --- a/src/Functions/registerFunctionsString.cpp +++ b/src/Functions/registerFunctionsString.cpp @@ -1,5 +1,6 @@ #if !defined(ARCADIA_BUILD) # include "config_functions.h" +# include "config_core.h" #endif namespace DB @@ -37,18 +38,20 @@ void registerFunctionCountMatches(FunctionFactory &); void registerFunctionEncodeXMLComponent(FunctionFactory &); void registerFunctionDecodeXMLComponent(FunctionFactory &); void registerFunctionExtractTextFromHTML(FunctionFactory &); -void registerFunctionStem(FunctionFactory &); -void registerFunctionSynonyms(FunctionFactory &); -void registerFunctionLemmatize(FunctionFactory &); void registerFunctionToStringCutToZero(FunctionFactory &); - #if USE_BASE64 void registerFunctionBase64Encode(FunctionFactory &); void registerFunctionBase64Decode(FunctionFactory &); void registerFunctionTryBase64Decode(FunctionFactory &); #endif +#if USE_NLP +void registerFunctionStem(FunctionFactory &); +void registerFunctionSynonyms(FunctionFactory &); +void registerFunctionLemmatize(FunctionFactory &); +#endif + void registerFunctionsString(FunctionFactory & factory) { registerFunctionRepeat(factory); @@ -81,15 +84,19 @@ void registerFunctionsString(FunctionFactory & factory) registerFunctionEncodeXMLComponent(factory); registerFunctionDecodeXMLComponent(factory); registerFunctionExtractTextFromHTML(factory); - registerFunctionStem(factory); - registerFunctionSynonyms(factory); - registerFunctionLemmatize(factory); + registerFunctionToStringCutToZero(factory); #if USE_BASE64 registerFunctionBase64Encode(factory); registerFunctionBase64Decode(factory); registerFunctionTryBase64Decode(factory); #endif + +#if USE_NLP + registerFunctionStem(factory); + registerFunctionSynonyms(factory); + registerFunctionLemmatize(factory); +#endif } } diff --git a/src/Functions/stem.cpp b/src/Functions/stem.cpp index f1b666caa92..03386c1c7a2 100644 --- a/src/Functions/stem.cpp +++ b/src/Functions/stem.cpp @@ -1,3 +1,9 @@ +#if !defined(ARCADIA_BUILD) +# include "config_core.h" +#endif + +#if USE_NLP + #include #include #include @@ -116,3 +122,5 @@ void registerFunctionStem(FunctionFactory & factory) } } + +#endif diff --git a/src/Functions/synonyms.cpp b/src/Functions/synonyms.cpp index ea8b36dd2e2..d4b67a17e2e 100644 --- a/src/Functions/synonyms.cpp +++ b/src/Functions/synonyms.cpp @@ -1,3 +1,9 @@ +#if !defined(ARCADIA_BUILD) +# include "config_core.h" +#endif + +#if USE_NLP + #include #include #include @@ -114,3 +120,5 @@ void registerFunctionSynonyms(FunctionFactory & factory) } } + +#endif diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index f5d2094cbf3..d241ee18d41 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -351,8 +351,10 @@ struct ContextSharedPart scope_guard dictionaries_xmls; +#if USE_NLP mutable std::optional synonyms_extensions; mutable std::optional lemmatizers; +#endif String default_profile_name; /// Default profile name used for default values. String system_profile_name; /// Profile used by system processes @@ -1507,6 +1509,8 @@ void Context::loadDictionaries(const Poco::Util::AbstractConfiguration & config) std::make_unique(config, "dictionaries_config")); } +#if USE_NLP + SynonymsExtensions & Context::getSynonymsExtensions() const { auto lock = getLock(); @@ -1526,6 +1530,7 @@ Lemmatizers & Context::getLemmatizers() const return *shared->lemmatizers; } +#endif void Context::setProgressCallback(ProgressCallback callback) { diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 2b099e2b5b7..ed2b718bc61 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -52,8 +52,6 @@ class AccessRightsElements; class EmbeddedDictionaries; class ExternalDictionariesLoader; class ExternalModelsLoader; -class SynonymsExtensions; -class Lemmatizers; class InterserverCredentials; using InterserverCredentialsPtr = std::shared_ptr; class InterserverIOHandler; @@ -116,6 +114,11 @@ using VolumePtr = std::shared_ptr; struct NamedSession; struct BackgroundTaskSchedulingSettings; +#if USE_NLP + class SynonymsExtensions; + class Lemmatizers; +#endif + class Throttler; using ThrottlerPtr = std::shared_ptr; @@ -536,8 +539,10 @@ public: void tryCreateEmbeddedDictionaries() const; void loadDictionaries(const Poco::Util::AbstractConfiguration & config); +#if USE_NLP SynonymsExtensions & getSynonymsExtensions() const; Lemmatizers & getLemmatizers() const; +#endif void setExternalModelsConfig(const ConfigurationPtr & config, const std::string & config_name = "models_config"); diff --git a/src/Interpreters/Lemmatizers.cpp b/src/Interpreters/Lemmatizers.cpp index 0d4ad8b0c95..38cd4c33678 100644 --- a/src/Interpreters/Lemmatizers.cpp +++ b/src/Interpreters/Lemmatizers.cpp @@ -1,3 +1,10 @@ + +#if !defined(ARCADIA_BUILD) +# include "config_core.h" +#endif + +#if USE_NLP + #include #include #include @@ -89,3 +96,5 @@ Lemmatizers::LemmPtr Lemmatizers::getLemmatizer(const String & name) } } + +#endif diff --git a/src/Interpreters/Lemmatizers.h b/src/Interpreters/Lemmatizers.h index dd3050aed45..3d7fafc8699 100644 --- a/src/Interpreters/Lemmatizers.h +++ b/src/Interpreters/Lemmatizers.h @@ -1,11 +1,19 @@ #pragma once +#if !defined(ARCADIA_BUILD) +# include "config_core.h" +#endif + +#if USE_NLP + #include #include #include #include + + namespace DB { @@ -37,3 +45,5 @@ public: }; } + +#endif diff --git a/src/Interpreters/SynonymsExtensions.cpp b/src/Interpreters/SynonymsExtensions.cpp index 6991d8897be..22fa91a4349 100644 --- a/src/Interpreters/SynonymsExtensions.cpp +++ b/src/Interpreters/SynonymsExtensions.cpp @@ -1,3 +1,9 @@ +#if !defined(ARCADIA_BUILD) +# include "config_core.h" +#endif + +#if USE_NLP + #include #include @@ -49,7 +55,7 @@ public: } } - const Synset * getSynonyms(const std::string_view & token) const override + const Synset * getSynonyms(std::string_view token) const override { auto it = table.find(token); @@ -68,7 +74,7 @@ private: public: explicit WordnetSynonymsExtension(const String & path) : wn(path) {} - const Synset * getSynonyms(const std::string_view & token) const override + const Synset * getSynonyms(std::string_view token) const override { return wn.get_synset(std::string(token)); } @@ -147,3 +153,5 @@ SynonymsExtensions::ExtPtr SynonymsExtensions::getExtension(const String & name) } } + +#endif diff --git a/src/Interpreters/SynonymsExtensions.h b/src/Interpreters/SynonymsExtensions.h index 3b00122a419..fd2bf03e162 100644 --- a/src/Interpreters/SynonymsExtensions.h +++ b/src/Interpreters/SynonymsExtensions.h @@ -1,5 +1,11 @@ #pragma once +#if !defined(ARCADIA_BUILD) +# include "config_core.h" +#endif + +#if USE_NLP + #include #include @@ -17,7 +23,7 @@ class ISynonymsExtension public: using Synset = std::vector; - virtual const Synset * getSynonyms(const std::string_view & token) const = 0; + virtual const Synset * getSynonyms(std::string_view token) const = 0; virtual ~ISynonymsExtension() = default; }; @@ -47,3 +53,5 @@ private: }; } + +#endif