Add an ability to build ClickHouse without NLP functions

This commit is contained in:
alesapin 2021-07-30 16:30:30 +03:00
parent 916594fe23
commit 4746002776
15 changed files with 133 additions and 18 deletions

View File

@ -542,6 +542,7 @@ include (cmake/find/libpqxx.cmake)
include (cmake/find/nuraft.cmake)
include (cmake/find/yaml-cpp.cmake)
include (cmake/find/s2geometry.cmake)
include (cmake/find/nlp.cmake)
if(NOT USE_INTERNAL_PARQUET_LIBRARY)
set (ENABLE_ORC OFF CACHE INTERNAL "")

32
cmake/find/nlp.cmake Normal file
View File

@ -0,0 +1,32 @@
# Decides whether the NLP functions can be built and publishes the result in USE_NLP.
#
# Inputs:
#   ENABLE_NLP                - user option; defaults to the value of ENABLE_LIBRARIES.
#   ClickHouse_SOURCE_DIR     - root under which the contrib/ submodules are looked up.
#   RECONFIGURE_MESSAGE_LEVEL - message mode used to report a missing submodule.
# Output:
#   USE_NLP                   - 1 when all three submodules are present, 0 otherwise.
option(ENABLE_NLP "Enable NLP functions support" ${ENABLE_LIBRARIES})

if (NOT ENABLE_NLP)
    message (STATUS "NLP functions disabled")
    # Set the result explicitly, like every other exit path below, so that a
    # USE_NLP value left in the cache or passed with -D cannot keep the
    # feature half-enabled after the option was switched off.
    set (USE_NLP 0)
    return()
endif()

# One marker file per required submodule, checked in this order. The first
# missing one disables NLP support and stops processing this file.
foreach (nlp_marker_file
        "libstemmer_c/Makefile"
        "wordnet-blast/CMakeLists.txt"
        "lemmagen-c/README.md")
    if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/${nlp_marker_file}")
        # The submodule name is the directory part of the marker path.
        get_filename_component (nlp_submodule "${nlp_marker_file}" DIRECTORY)
        message (WARNING "submodule contrib/${nlp_submodule} is missing. to fix try run: \n git submodule update --init --recursive")
        message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal ${nlp_submodule} library, NLP functions will be disabled")
        set (USE_NLP 0)
        return()
    endif ()
endforeach ()

set (USE_NLP 1)
message (STATUS "Using Libraries for NLP functions: contrib/wordnet-blast, contrib/libstemmer_c, contrib/lemmagen-c")

View File

@ -327,9 +327,12 @@ if (USE_NURAFT)
endif()
add_subdirectory(fast_float)
add_subdirectory(libstemmer-c-cmake)
add_subdirectory(wordnet-blast-cmake)
add_subdirectory(lemmagen-c-cmake)
# Configure the contrib wrappers guarded by USE_NLP, in the listed order.
if (USE_NLP)
    foreach (nlp_contrib_dir libstemmer-c-cmake wordnet-blast-cmake lemmagen-c-cmake)
        add_subdirectory(${nlp_contrib_dir})
    endforeach ()
endif()
if (USE_SQLITE)
add_subdirectory(sqlite-cmake)

View File

@ -473,9 +473,11 @@ endif ()
dbms_target_link_libraries(PRIVATE _boost_context)
dbms_target_link_libraries (PUBLIC stemmer)
dbms_target_link_libraries (PUBLIC wnb)
dbms_target_link_libraries (PUBLIC lemmagen)
# Link the libraries guarded by USE_NLP; one call per library, same order as before.
if (USE_NLP)
    foreach (nlp_library stemmer wnb lemmagen)
        dbms_target_link_libraries (PUBLIC ${nlp_library})
    endforeach ()
endif()
include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake")

View File

@ -15,4 +15,5 @@
#cmakedefine01 USE_LIBPQXX
#cmakedefine01 USE_SQLITE
#cmakedefine01 USE_NURAFT
#cmakedefine01 USE_NLP
#cmakedefine01 USE_KRB5

View File

@ -1,3 +1,9 @@
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
@ -116,3 +122,5 @@ void registerFunctionLemmatize(FunctionFactory & factory)
}
}
#endif

View File

@ -1,5 +1,6 @@
#if !defined(ARCADIA_BUILD)
# include "config_functions.h"
# include "config_core.h"
#endif
namespace DB
@ -37,18 +38,20 @@ void registerFunctionCountMatches(FunctionFactory &);
void registerFunctionEncodeXMLComponent(FunctionFactory &);
void registerFunctionDecodeXMLComponent(FunctionFactory &);
void registerFunctionExtractTextFromHTML(FunctionFactory &);
void registerFunctionStem(FunctionFactory &);
void registerFunctionSynonyms(FunctionFactory &);
void registerFunctionLemmatize(FunctionFactory &);
void registerFunctionToStringCutToZero(FunctionFactory &);
#if USE_BASE64
void registerFunctionBase64Encode(FunctionFactory &);
void registerFunctionBase64Decode(FunctionFactory &);
void registerFunctionTryBase64Decode(FunctionFactory &);
#endif
#if USE_NLP
void registerFunctionStem(FunctionFactory &);
void registerFunctionSynonyms(FunctionFactory &);
void registerFunctionLemmatize(FunctionFactory &);
#endif
void registerFunctionsString(FunctionFactory & factory)
{
registerFunctionRepeat(factory);
@ -81,15 +84,19 @@ void registerFunctionsString(FunctionFactory & factory)
registerFunctionEncodeXMLComponent(factory);
registerFunctionDecodeXMLComponent(factory);
registerFunctionExtractTextFromHTML(factory);
registerFunctionStem(factory);
registerFunctionSynonyms(factory);
registerFunctionLemmatize(factory);
registerFunctionToStringCutToZero(factory);
#if USE_BASE64
registerFunctionBase64Encode(factory);
registerFunctionBase64Decode(factory);
registerFunctionTryBase64Decode(factory);
#endif
#if USE_NLP
registerFunctionStem(factory);
registerFunctionSynonyms(factory);
registerFunctionLemmatize(factory);
#endif
}
}

View File

@ -1,3 +1,9 @@
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
@ -116,3 +122,5 @@ void registerFunctionStem(FunctionFactory & factory)
}
}
#endif

View File

@ -1,3 +1,9 @@
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <Columns/ColumnArray.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
@ -114,3 +120,5 @@ void registerFunctionSynonyms(FunctionFactory & factory)
}
}
#endif

View File

@ -351,8 +351,10 @@ struct ContextSharedPart
scope_guard dictionaries_xmls;
#if USE_NLP
mutable std::optional<SynonymsExtensions> synonyms_extensions;
mutable std::optional<Lemmatizers> lemmatizers;
#endif
String default_profile_name; /// Default profile name used for default values.
String system_profile_name; /// Profile used by system processes
@ -1507,6 +1509,8 @@ void Context::loadDictionaries(const Poco::Util::AbstractConfiguration & config)
std::make_unique<ExternalLoaderXMLConfigRepository>(config, "dictionaries_config"));
}
#if USE_NLP
SynonymsExtensions & Context::getSynonymsExtensions() const
{
auto lock = getLock();
@ -1526,6 +1530,7 @@ Lemmatizers & Context::getLemmatizers() const
return *shared->lemmatizers;
}
#endif
void Context::setProgressCallback(ProgressCallback callback)
{

View File

@ -52,8 +52,6 @@ class AccessRightsElements;
class EmbeddedDictionaries;
class ExternalDictionariesLoader;
class ExternalModelsLoader;
class SynonymsExtensions;
class Lemmatizers;
class InterserverCredentials;
using InterserverCredentialsPtr = std::shared_ptr<const InterserverCredentials>;
class InterserverIOHandler;
@ -116,6 +114,11 @@ using VolumePtr = std::shared_ptr<IVolume>;
struct NamedSession;
struct BackgroundTaskSchedulingSettings;
#if USE_NLP
class SynonymsExtensions;
class Lemmatizers;
#endif
class Throttler;
using ThrottlerPtr = std::shared_ptr<Throttler>;
@ -536,8 +539,10 @@ public:
void tryCreateEmbeddedDictionaries() const;
void loadDictionaries(const Poco::Util::AbstractConfiguration & config);
#if USE_NLP
SynonymsExtensions & getSynonymsExtensions() const;
Lemmatizers & getLemmatizers() const;
#endif
void setExternalModelsConfig(const ConfigurationPtr & config, const std::string & config_name = "models_config");

View File

@ -1,3 +1,10 @@
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <Common/Exception.h>
#include <Interpreters/Lemmatizers.h>
#include <RdrLemmatizer.h>
@ -89,3 +96,5 @@ Lemmatizers::LemmPtr Lemmatizers::getLemmatizer(const String & name)
}
}
#endif

View File

@ -1,11 +1,19 @@
#pragma once
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <common/types.h>
#include <Poco/Util/Application.h>
#include <mutex>
#include <unordered_map>
namespace DB
{
@ -37,3 +45,5 @@ public:
};
}
#endif

View File

@ -1,3 +1,9 @@
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <Common/Exception.h>
#include <Interpreters/SynonymsExtensions.h>
@ -49,7 +55,7 @@ public:
}
}
const Synset * getSynonyms(const std::string_view & token) const override
const Synset * getSynonyms(std::string_view token) const override
{
auto it = table.find(token);
@ -68,7 +74,7 @@ private:
public:
explicit WordnetSynonymsExtension(const String & path) : wn(path) {}
const Synset * getSynonyms(const std::string_view & token) const override
const Synset * getSynonyms(std::string_view token) const override
{
return wn.get_synset(std::string(token));
}
@ -147,3 +153,5 @@ SynonymsExtensions::ExtPtr SynonymsExtensions::getExtension(const String & name)
}
}
#endif

View File

@ -1,5 +1,11 @@
#pragma once
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <common/types.h>
#include <Poco/Util/Application.h>
@ -17,7 +23,7 @@ class ISynonymsExtension
public:
using Synset = std::vector<String>;
virtual const Synset * getSynonyms(const std::string_view & token) const = 0;
virtual const Synset * getSynonyms(std::string_view token) const = 0;
virtual ~ISynonymsExtension() = default;
};
@ -47,3 +53,5 @@ private:
};
}
#endif