diff --git a/.gitmodules b/.gitmodules index 4df7798e1e7..43c878427ec 100644 --- a/.gitmodules +++ b/.gitmodules @@ -225,6 +225,15 @@ [submodule "contrib/yaml-cpp"] path = contrib/yaml-cpp url = https://github.com/ClickHouse-Extras/yaml-cpp.git +[submodule "contrib/libstemmer_c"] + path = contrib/libstemmer_c + url = https://github.com/ClickHouse-Extras/libstemmer_c.git +[submodule "contrib/wordnet-blast"] + path = contrib/wordnet-blast + url = https://github.com/ClickHouse-Extras/wordnet-blast.git +[submodule "contrib/lemmagen-c"] + path = contrib/lemmagen-c + url = https://github.com/ClickHouse-Extras/lemmagen-c.git [submodule "contrib/libpqxx"] path = contrib/libpqxx url = https://github.com/ClickHouse-Extras/libpqxx.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 875a6d1ab61..24022c256ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -542,6 +542,7 @@ include (cmake/find/libpqxx.cmake) include (cmake/find/nuraft.cmake) include (cmake/find/yaml-cpp.cmake) include (cmake/find/s2geometry.cmake) +include (cmake/find/nlp.cmake) if(NOT USE_INTERNAL_PARQUET_LIBRARY) set (ENABLE_ORC OFF CACHE INTERNAL "") diff --git a/cmake/find/nlp.cmake b/cmake/find/nlp.cmake new file mode 100644 index 00000000000..f1204a85dea --- /dev/null +++ b/cmake/find/nlp.cmake @@ -0,0 +1,32 @@ +option(ENABLE_NLP "Enable NLP functions support" ${ENABLE_LIBRARIES}) + +if (NOT ENABLE_NLP) + + message (STATUS "NLP functions disabled") + return() +endif() + +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libstemmer_c/Makefile") + message (WARNING "submodule contrib/libstemmer_c is missing. to fix try run: \n git submodule update --init --recursive") + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal libstemmer_c library, NLP functions will be disabled") + set (USE_NLP 0) + return() +endif () + +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast/CMakeLists.txt") + message (WARNING "submodule contrib/wordnet-blast is missing. to fix try run: \n git submodule update --init --recursive") + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal wordnet-blast library, NLP functions will be disabled") + set (USE_NLP 0) + return() +endif () + +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/lemmagen-c/README.md") + message (WARNING "submodule contrib/lemmagen-c is missing. 
to fix try run: \n git submodule update --init --recursive") + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal lemmagen-c library, NLP functions will be disabled") + set (USE_NLP 0) + return() +endif () + +set (USE_NLP 1) + +message (STATUS "Using Libraries for NLP functions: contrib/wordnet-blast, contrib/libstemmer_c, contrib/lemmagen-c") diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 2b6629d0817..82cddb0ace0 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -328,6 +328,12 @@ endif() add_subdirectory(fast_float) +if (USE_NLP) + add_subdirectory(libstemmer-c-cmake) + add_subdirectory(wordnet-blast-cmake) + add_subdirectory(lemmagen-c-cmake) +endif() + if (USE_SQLITE) add_subdirectory(sqlite-cmake) endif() diff --git a/contrib/boost b/contrib/boost index 1ccbb5a522a..9cf09dbfd55 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 1ccbb5a522a571ce83b606dbc2e1011c42ecccfb +Subproject commit 9cf09dbfd55a5c6202dedbdf40781a51b02c2675 diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index 9f6c5b1255d..675931d319f 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -13,11 +13,12 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) regex context coroutine + graph ) if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND - Boost_COROUTINE_LIBRARY) + Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY) set(EXTERNAL_BOOST_FOUND 1) @@ -32,6 +33,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (_boost_system INTERFACE) add_library (_boost_context INTERFACE) add_library (_boost_coroutine INTERFACE) + add_library (_boost_graph INTERFACE) target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY}) target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY}) @@ -40,6 +42,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY}) target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY}) target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY}) + target_link_libraries (_boost_graph INTERFACE ${Boost_GRAPH_LIBRARY}) add_library (boost::filesystem ALIAS _boost_filesystem) add_library (boost::iostreams ALIAS _boost_iostreams) @@ -48,6 +51,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (boost::system ALIAS _boost_system) add_library (boost::context ALIAS _boost_context) add_library (boost::coroutine ALIAS _boost_coroutine) + add_library (boost::graph ALIAS _boost_graph) else() set(EXTERNAL_BOOST_FOUND 0) message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost") @@ -221,4 +225,17 @@ if (NOT EXTERNAL_BOOST_FOUND) add_library (boost::coroutine ALIAS _boost_coroutine) target_include_directories (_boost_coroutine PRIVATE ${LIBRARY_DIR}) target_link_libraries(_boost_coroutine PRIVATE _boost_context) + + # graph + + set (SRCS_GRAPH + "${LIBRARY_DIR}/libs/graph/src/graphml.cpp" + "${LIBRARY_DIR}/libs/graph/src/read_graphviz_new.cpp" + ) + + add_library (_boost_graph ${SRCS_GRAPH}) + add_library (boost::graph ALIAS _boost_graph) + target_include_directories (_boost_graph PRIVATE ${LIBRARY_DIR}) + target_link_libraries(_boost_graph PRIVATE _boost_regex) + endif () diff --git a/contrib/lemmagen-c b/contrib/lemmagen-c new file mode 160000 index 00000000000..59537bdcf57 --- /dev/null +++ b/contrib/lemmagen-c @@ -0,0 +1 
@@ +Subproject commit 59537bdcf57bbed17913292cb4502d15657231f1 diff --git a/contrib/lemmagen-c-cmake/CMakeLists.txt b/contrib/lemmagen-c-cmake/CMakeLists.txt new file mode 100644 index 00000000000..b5b92b774e1 --- /dev/null +++ b/contrib/lemmagen-c-cmake/CMakeLists.txt @@ -0,0 +1,9 @@ +set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/lemmagen-c") +set(LEMMAGEN_INCLUDE_DIR "${LIBRARY_DIR}/include") + +set(SRCS + "${LIBRARY_DIR}/src/RdrLemmatizer.cpp" +) + +add_library(lemmagen STATIC ${SRCS}) +target_include_directories(lemmagen PUBLIC "${LEMMAGEN_INCLUDE_DIR}") diff --git a/contrib/libstemmer-c-cmake/CMakeLists.txt b/contrib/libstemmer-c-cmake/CMakeLists.txt new file mode 100644 index 00000000000..2d38e5f3612 --- /dev/null +++ b/contrib/libstemmer-c-cmake/CMakeLists.txt @@ -0,0 +1,31 @@ +set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/libstemmer_c") +set(STEMMER_INCLUDE_DIR "${LIBRARY_DIR}/include") + +FILE ( READ "${LIBRARY_DIR}/mkinc.mak" _CONTENT ) +# replace '\ ' into one big line +STRING ( REGEX REPLACE "\\\\\n " " ${LIBRARY_DIR}/" _CONTENT "${_CONTENT}" ) +# escape ';' (if any) +STRING ( REGEX REPLACE ";" "\\\\;" _CONTENT "${_CONTENT}" ) +# now replace lf into ';' (it makes list from the line) +STRING ( REGEX REPLACE "\n" ";" _CONTENT "${_CONTENT}" ) +FOREACH ( LINE ${_CONTENT} ) + # skip comments (beginning with #) + IF ( NOT "${LINE}" MATCHES "^#.*" ) + # parse 'name=value1 value2..." - extract the 'name' part + STRING ( REGEX REPLACE "=.*$" "" _NAME "${LINE}" ) + # extract the list of values part + STRING ( REGEX REPLACE "^.*=" "" _LIST "${LINE}" ) + # replace (multi)spaces into ';' (it makes list from the line) + STRING ( REGEX REPLACE " +" ";" _LIST "${_LIST}" ) + # finally get our two variables + IF ( "${_NAME}" MATCHES "snowball_sources" ) + SET ( _SOURCES "${_LIST}" ) + ELSEIF ( "${_NAME}" MATCHES "snowball_headers" ) + SET ( _HEADERS "${_LIST}" ) + ENDIF () + endif () +endforeach () + +# all the sources parsed. 
Now just add the lib
+add_library ( stemmer STATIC ${_SOURCES} ${_HEADERS} )
+target_include_directories (stemmer PUBLIC "${STEMMER_INCLUDE_DIR}")
diff --git a/contrib/libstemmer_c b/contrib/libstemmer_c
new file mode 160000
index 00000000000..c753054304d
--- /dev/null
+++ b/contrib/libstemmer_c
@@ -0,0 +1 @@
+Subproject commit c753054304d87daf460057c1a649c482aa094835
diff --git a/contrib/wordnet-blast b/contrib/wordnet-blast
new file mode 160000
index 00000000000..1d16ac28036
--- /dev/null
+++ b/contrib/wordnet-blast
@@ -0,0 +1 @@
+Subproject commit 1d16ac28036e19fe8da7ba72c16a307fbdf8c87e
diff --git a/contrib/wordnet-blast-cmake/CMakeLists.txt b/contrib/wordnet-blast-cmake/CMakeLists.txt
new file mode 100644
index 00000000000..8d59c312664
--- /dev/null
+++ b/contrib/wordnet-blast-cmake/CMakeLists.txt
@@ -0,0 +1,13 @@
+set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast")
+
+set(SRCS
+    "${LIBRARY_DIR}/wnb/core/info_helper.cc"
+    "${LIBRARY_DIR}/wnb/core/load_wordnet.cc"
+    "${LIBRARY_DIR}/wnb/core/wordnet.cc"
+)
+
+add_library(wnb ${SRCS})
+
+target_link_libraries(wnb PRIVATE boost::headers_only boost::graph)
+
+target_include_directories(wnb PUBLIC "${LIBRARY_DIR}")
\ No newline at end of file
diff --git a/docker/packager/unbundled/Dockerfile b/docker/packager/unbundled/Dockerfile
index d2bda7db833..07031aa2d1b 100644
--- a/docker/packager/unbundled/Dockerfile
+++ b/docker/packager/unbundled/Dockerfile
@@ -23,6 +23,7 @@ RUN apt-get update \
     libboost-regex-dev \
     libboost-context-dev \
     libboost-coroutine-dev \
+    libboost-graph-dev \
     zlib1g-dev \
     liblz4-dev \
     libdouble-conversion-dev \
diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh
index 3c7d019833b..6419ea3659c 100755
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@@ -311,6 +311,7 @@ function run_tests
     01411_bayesian_ab_testing
     01798_uniq_theta_sketch
     01799_long_uniq_theta_sketch
+    01890_stem # depends on libstemmer_c
     collate
     collation
     _orc_
diff --git a/docs/en/sql-reference/functions/nlp-functions.md b/docs/en/sql-reference/functions/nlp-functions.md
new file mode 100644
index 00000000000..2d5a09c0897
--- /dev/null
+++ b/docs/en/sql-reference/functions/nlp-functions.md
@@ -0,0 +1,125 @@
+---
+toc_priority: 67
+toc_title: NLP
+---
+
+# Natural Language Processing functions {#nlp-functions}
+
+## stem {#stem}
+
+Performs stemming on a previously tokenized text.
+
+**Syntax**
+
+``` sql
+stem('language', word)
+```
+
+**Arguments**
+
+- `language` — Language whose rules will be applied. Must be in lowercase. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Word to be stemmed. Must be in lowercase. [String](../../sql-reference/data-types/string.md#string).
+
+**Examples**
+
+Query:
+
+``` sql
+SELECT arrayMap(x -> stem('en', x), ['I', 'think', 'it', 'is', 'a', 'blessing', 'in', 'disguise']) as res;
+```
+
+Result:
+
+``` text
+┌─res────────────────────────────────────────────────┐
+│ ['I','think','it','is','a','bless','in','disguis'] │
+└────────────────────────────────────────────────────┘
+```
+
+## lemmatize {#lemmatize}
+
+Performs lemmatization on a given word.
+
+**Syntax**
+
+``` sql
+lemmatize('language', word)
+```
+
+**Arguments**
+
+- `language` — Language whose rules will be applied. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Word that needs to be lemmatized. Must be in lowercase. [String](../../sql-reference/data-types/string.md#string).
+
+**Examples**
+
+Query:
+
+``` sql
+SELECT lemmatize('en', 'wolves');
+```
+
+Result:
+
+``` text
+┌─lemmatize("wolves")─┐
+│ "wolf"              │
+└─────────────────────┘
+```
+
+Configuration:
+
+``` xml
+<lemmatizers>
+    <lemmatizer>
+        <lang>en</lang>
+        <path>en.bin</path>
+    </lemmatizer>
+</lemmatizers>
+```
+
+## synonyms {#synonyms}
+
+Finds synonyms for a given word.
+
+**Syntax**
+
+``` sql
+synonyms('extension_name', word)
+```
+
+**Arguments**
+
+- `extension_name` — Name of the extension in which the search will be performed. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Word that will be searched in the extension. [String](../../sql-reference/data-types/string.md#string).
+
+**Examples**
+
+Query:
+
+``` sql
+SELECT synonyms('list', 'important');
+```
+
+Result:
+
+``` text
+┌─synonyms('list', 'important')────────────┐
+│ ['important','big','critical','crucial'] │
+└──────────────────────────────────────────┘
+```
+
+Configuration:
+
+``` xml
+<synonyms_extensions>
+    <extension>
+        <name>en</name>
+        <type>plain</type>
+        <path>en.txt</path>
+    </extension>
+    <extension>
+        <name>en</name>
+        <type>wordnet</type>
+        <path>en/</path>
+    </extension>
+</synonyms_extensions>
+```
\ No newline at end of file
diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md
index ffb1029cc36..718d5a977b9 100644
--- a/docs/en/sql-reference/functions/splitting-merging-functions.md
+++ b/docs/en/sql-reference/functions/splitting-merging-functions.md
@@ -145,6 +145,72 @@ Result:
 └────────────────────────────┘
 ```
 
+## splitByWhitespace(s) {#splitbywhitespaceseparator-s}
+
+Splits a string into substrings separated by whitespace characters.
+Returns an array of selected substrings.
+
+**Syntax**
+
+``` sql
+splitByWhitespace(s)
+```
+
+**Arguments**
+
+- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+
+**Returned value(s)**
+
+Returns an array of selected substrings.
+
+Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
+
+**Example**
+
+``` sql
+SELECT splitByWhitespace(' 1! a, b. ');
+```
+
+``` text
+┌─splitByWhitespace(' 1! a, b. ')─┐
+│ ['1!','a,','b.']                │
+└─────────────────────────────────┘
+```
+
+## splitByNonAlpha(s) {#splitbynonalphaseparator-s}
+
+Splits a string into substrings separated by whitespace and punctuation characters.
+Returns an array of selected substrings.
+
+**Syntax**
+
+``` sql
+splitByNonAlpha(s)
+```
+
+**Arguments**
+
+- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+
+**Returned value(s)**
+
+Returns an array of selected substrings.
+
+Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
+
+**Example**
+
+``` sql
+SELECT splitByNonAlpha(' 1! a, b. ');
+```
+
+``` text
+┌─splitByNonAlpha(' 1! a, b. ')─┐
+│ ['1','a','b']                 │
+└───────────────────────────────┘
+```
+
 ## arrayStringConcat(arr\[, separator\]) {#arraystringconcatarr-separator}
 
 Concatenates the strings listed in the array with the separator. ’separator’ is an optional parameter: a constant string, set to an empty string by default.
diff --git a/docs/ru/sql-reference/functions/nlp-functions.md b/docs/ru/sql-reference/functions/nlp-functions.md
new file mode 100644
index 00000000000..582b5c93b93
--- /dev/null
+++ b/docs/ru/sql-reference/functions/nlp-functions.md
@@ -0,0 +1,125 @@
+---
+toc_priority: 67
+toc_title: NLP
+---
+
+# Функции для работы с естественным языком {#nlp-functions}
+
+## stem {#stem}
+
+Данная функция проводит стемминг заданного слова.
+
+**Синтаксис**
+
+``` sql
+stem('language', word)
+```
+
+**Аргументы**
+
+- `language` — Язык, правила которого будут применены для стемминга. Допускается только нижний регистр. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Слово, подлежащее стеммингу. Допускается только нижний регистр. [String](../../sql-reference/data-types/string.md#string).
+
+**Примеры**
+
+Запрос:
+
+``` sql
+SELECT arrayMap(x -> stem('en', x), ['I', 'think', 'it', 'is', 'a', 'blessing', 'in', 'disguise']) as res;
+```
+
+Результат:
+
+``` text
+┌─res────────────────────────────────────────────────┐
+│ ['I','think','it','is','a','bless','in','disguis'] │
+└────────────────────────────────────────────────────┘
+```
+
+## lemmatize {#lemmatize}
+
+Данная функция проводит лемматизацию для заданного слова.
+
+**Синтаксис**
+
+``` sql
+lemmatize('language', word)
+```
+
+**Аргументы**
+
+- `language` — Язык, правила которого будут применены для лемматизации. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Слово, подлежащее лемматизации. Допускается только нижний регистр. [String](../../sql-reference/data-types/string.md#string).
+
+**Примеры**
+
+Запрос:
+
+``` sql
+SELECT lemmatize('en', 'wolves');
+```
+
+Результат:
+
+``` text
+┌─lemmatize("wolves")─┐
+│ "wolf"              │
+└─────────────────────┘
+```
+
+Конфигурация:
+
+``` xml
+<lemmatizers>
+    <lemmatizer>
+        <lang>en</lang>
+        <path>en.bin</path>
+    </lemmatizer>
+</lemmatizers>
+```
+
+## synonyms {#synonyms}
+
+Находит синонимы к заданному слову.
+
+**Синтаксис**
+
+``` sql
+synonyms('extension_name', word)
+```
+
+**Аргументы**
+
+- `extension_name` — Название расширения, в котором будет проводиться поиск. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Слово, которое будет искаться в расширении. [String](../../sql-reference/data-types/string.md#string).
+
+**Примеры**
+
+Запрос:
+
+``` sql
+SELECT synonyms('list', 'important');
+```
+
+Результат:
+
+``` text
+┌─synonyms('list', 'important')────────────┐
+│ ['important','big','critical','crucial'] │
+└──────────────────────────────────────────┘
+```
+
+Конфигурация:
+
+``` xml
+<synonyms_extensions>
+    <extension>
+        <name>en</name>
+        <type>plain</type>
+        <path>en.txt</path>
+    </extension>
+    <extension>
+        <name>en</name>
+        <type>wordnet</type>
+        <path>en/</path>
+    </extension>
+</synonyms_extensions>
+```
\ No newline at end of file
diff --git a/docs/ru/sql-reference/functions/splitting-merging-functions.md b/docs/ru/sql-reference/functions/splitting-merging-functions.md
index 5a0c540cf3a..efe74dba043 100644
--- a/docs/ru/sql-reference/functions/splitting-merging-functions.md
+++ b/docs/ru/sql-reference/functions/splitting-merging-functions.md
@@ -146,6 +146,70 @@ SELECT splitByRegexp('', 'abcde');
 └────────────────────────────┘
 ```
 
+## splitByWhitespace(s) {#splitbywhitespaceseparator-s}
+
+Разбивает строку на подстроки, используя в качестве разделителей пробельные символы.
+
+**Синтаксис**
+
+``` sql
+splitByWhitespace(s)
+```
+
+**Аргументы**
+
+- `s` — разбиваемая строка. [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемые значения**
+
+Возвращает массив подстрок.
+
+Тип: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
+
+**Пример**
+
+``` sql
+SELECT splitByWhitespace(' 1! a, b. ');
+```
+
+``` text
+┌─splitByWhitespace(' 1! a, b. ')─┐
+│ ['1!','a,','b.']                │
+└─────────────────────────────────┘
+```
+
+## splitByNonAlpha(s) {#splitbynonalphaseparator-s}
+
+Разбивает строку на подстроки, используя в качестве разделителей пробельные символы и символы пунктуации.
+
+**Синтаксис**
+
+``` sql
+splitByNonAlpha(s)
+```
+
+**Аргументы**
+
+- `s` — разбиваемая строка. [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемые значения**
+
+Возвращает массив подстрок.
+
+Тип: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
+
+**Пример**
+
+``` sql
+SELECT splitByNonAlpha(' 1! a, b. ');
+```
+
+``` text
+┌─splitByNonAlpha(' 1! a, b. ')─┐
+│ ['1','a','b']                 │
+└───────────────────────────────┘
+```
+
 ## arrayStringConcat(arr\[, separator\]) {#arraystringconcatarr-separator}
 
 Склеивает строки, перечисленные в массиве, с разделителем separator.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 31286c740d4..a99201e4aaa 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -473,6 +473,12 @@ endif ()
 
 dbms_target_link_libraries(PRIVATE _boost_context)
 
+if (USE_NLP)
+    dbms_target_link_libraries (PUBLIC stemmer)
+    dbms_target_link_libraries (PUBLIC wnb)
+    dbms_target_link_libraries (PUBLIC lemmagen)
+endif()
+
 include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake")
 
 if (ENABLE_TESTS AND USE_GTEST)
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index dccc2898de3..64c4c5621b2 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -490,6 +490,7 @@ class IColumn;
     \
     /** Experimental functions */ \
     M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \
+    M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \
     \
     \
     /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \
diff --git a/src/Core/config_core.h.in b/src/Core/config_core.h.in
index 45cbc6efe19..cc9c993b205 100644
--- a/src/Core/config_core.h.in
+++ b/src/Core/config_core.h.in
@@ -15,4 +15,5 @@
 #cmakedefine01 USE_LIBPQXX
 #cmakedefine01 USE_SQLITE
 #cmakedefine01 USE_NURAFT
+#cmakedefine01 USE_NLP
 #cmakedefine01 USE_KRB5
diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp
index 14092d7dd3d..765317093c1 100644
--- a/src/Functions/FunctionsStringArray.cpp
+++ b/src/Functions/FunctionsStringArray.cpp
@@ -9,6 +9,8 @@ void registerFunctionsStringArray(FunctionFactory & factory)
 {
     factory.registerFunction<FunctionExtractAll>();
     factory.registerFunction<FunctionAlphaTokens>();
+    factory.registerFunction<FunctionSplitByNonAlpha>();
+    factory.registerFunction<FunctionSplitByWhitespace>();
     factory.registerFunction<FunctionSplitByChar>();
     factory.registerFunction<FunctionSplitByString>();
     factory.registerFunction<FunctionSplitByRegexp>();
diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h
index 27f10797651..4d2312f207c 100644
--- a/src/Functions/FunctionsStringArray.h
+++ b/src/Functions/FunctionsStringArray.h
@@ -33,6 +33,9 @@ namespace ErrorCodes
  * splitByString(sep, s)
  * splitByRegexp(regexp, s)
  *
+ * splitByWhitespace(s) - split the string by whitespace characters
+ * splitByNonAlpha(s) - split the string by whitespace and punctuation characters
+ *
  * extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp.
  * - first subpattern, if regexp has subpattern;
  * - zero subpattern (the match part, otherwise);
@@ -111,6 +114,121 @@ public:
     }
 };
 
+class SplitByNonAlphaImpl
+{
+private:
+    Pos pos;
+    Pos end;
+
+public:
+    /// Get the name of the function.
+    static constexpr auto name = "splitByNonAlpha";
+    static String getName() { return name; }
+
+    static size_t getNumberOfArguments() { return 1; }
+
+    /// Check the type of the function's arguments.
+ static void checkArguments(const DataTypes & arguments) + { + if (!isString(arguments[0])) + throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.", + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + + /// Initialize by the function arguments. + void init(const ColumnsWithTypeAndName & /*arguments*/) {} + + /// Called for each next string. + void set(Pos pos_, Pos end_) + { + pos = pos_; + end = end_; + } + + /// Returns the position of the argument, that is the column of strings + size_t getStringsArgumentPosition() + { + return 0; + } + + /// Get the next token, if any, or return false. + bool get(Pos & token_begin, Pos & token_end) + { + /// Skip garbage + while (pos < end && (isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) + ++pos; + + if (pos == end) + return false; + + token_begin = pos; + + while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) + ++pos; + + token_end = pos; + + return true; + } +}; + +class SplitByWhitespaceImpl +{ +private: + Pos pos; + Pos end; + +public: + /// Get the name of the function. + static constexpr auto name = "splitByWhitespace"; + static String getName() { return name; } + + static size_t getNumberOfArguments() { return 1; } + + /// Check the type of the function's arguments. + static void checkArguments(const DataTypes & arguments) + { + if (!isString(arguments[0])) + throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.", + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + + /// Initialize by the function arguments. + void init(const ColumnsWithTypeAndName & /*arguments*/) {} + + /// Called for each next string. + void set(Pos pos_, Pos end_) + { + pos = pos_; + end = end_; + } + + /// Returns the position of the argument, that is the column of strings + size_t getStringsArgumentPosition() + { + return 0; + } + + /// Get the next token, if any, or return false. 
+    bool get(Pos & token_begin, Pos & token_end)
+    {
+        /// Skip garbage
+        while (pos < end && isWhitespaceASCII(*pos))
+            ++pos;
+
+        if (pos == end)
+            return false;
+
+        token_begin = pos;
+
+        while (pos < end && !isWhitespaceASCII(*pos))
+            ++pos;
+
+        token_end = pos;
+
+        return true;
+    }
+};
 
 class SplitByCharImpl
 {
@@ -662,6 +780,8 @@ public:
 
 using FunctionAlphaTokens = FunctionTokens<AlphaTokensImpl>;
+using FunctionSplitByNonAlpha = FunctionTokens<SplitByNonAlphaImpl>;
+using FunctionSplitByWhitespace = FunctionTokens<SplitByWhitespaceImpl>;
 using FunctionSplitByChar = FunctionTokens<SplitByCharImpl>;
 using FunctionSplitByString = FunctionTokens<SplitByStringImpl>;
 using FunctionSplitByRegexp = FunctionTokens<SplitByRegexpImpl>;
diff --git a/src/Functions/lemmatize.cpp b/src/Functions/lemmatize.cpp
new file mode 100644
index 00000000000..35d2bfebe08
--- /dev/null
+++ b/src/Functions/lemmatize.cpp
@@ -0,0 +1,130 @@
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <Columns/ColumnString.h>
+#include <DataTypes/DataTypeString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/IFunction.h>
+#include <Interpreters/Context.h>
+#include <Interpreters/Lemmatizers.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_COLUMN;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int SUPPORT_IS_DISABLED;
+}
+
+namespace
+{
+
+struct LemmatizeImpl
+{
+    static void vector(
+        const ColumnString::Chars & data,
+        const ColumnString::Offsets & offsets,
+        ColumnString::Chars & res_data,
+        ColumnString::Offsets & res_offsets,
+        Lemmatizers::LemmPtr & lemmatizer)
+    {
+        res_data.resize(data.size());
+        res_offsets.assign(offsets);
+
+        UInt64 data_size = 0;
+        for (UInt64 i = 0; i < offsets.size(); ++i)
+        {
+            /// lemmatize() uses the fact that each string ends with '\0'
+            auto result = lemmatizer->lemmatize(reinterpret_cast<const char *>(data.data() + offsets[i - 1]));
+            size_t new_size = strlen(result.get()) + 1;
+
+            if (data_size + new_size > res_data.size())
+                res_data.resize(data_size + new_size);
+
+            memcpy(res_data.data() + data_size, reinterpret_cast<const char *>(result.get()), new_size);
+
+            data_size += new_size;
+            res_offsets[i] = data_size;
+        }
+        res_data.resize(data_size);
+    }
+};
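+
+/// Note on the layout used above: ColumnString keeps all rows concatenated in one
+/// `chars` buffer, each row terminated by '\0', and `offsets[i]` points one past
+/// the terminating zero of row i. `offsets` is a PaddedPODArray, so `offsets[-1]`
+/// reads as 0, which is why the loop can start every row at `offsets[i - 1]`
+/// without a special case for the first row.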
+
+class FunctionLemmatize : public IFunction
+{
+public:
+    static constexpr auto name = "lemmatize";
+    static FunctionPtr create(ContextPtr context)
+    {
+        if (!context->getSettingsRef().allow_experimental_nlp_functions)
+            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+                "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
+
+        return std::make_shared<FunctionLemmatize>(context->getLemmatizers());
+    }
+
+private:
+    Lemmatizers & lemmatizers;
+
+public:
+    explicit FunctionLemmatize(Lemmatizers & lemmatizers_)
+        : lemmatizers(lemmatizers_) {}
+
+    String getName() const override { return name; }
+
+    size_t getNumberOfArguments() const override { return 2; }
+
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        if (!isString(arguments[0]))
+            throw Exception(
+                "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        if (!isString(arguments[1]))
+            throw Exception(
+                "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        return arguments[1];
+    }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+
+    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
+    {
+        const auto & langcolumn = arguments[0].column;
+        const auto & strcolumn = arguments[1].column;
+
+        const ColumnConst * lang_col = checkAndGetColumn<ColumnConst>(langcolumn.get());
+        const ColumnString * words_col = checkAndGetColumn<ColumnString>(strcolumn.get());
+
+        if (!lang_col)
+            throw Exception(
+                "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
+        if (!words_col)
+            throw Exception(
+                "Illegal column " + arguments[1].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
+
+        String language = lang_col->getValue<String>();
+        auto lemmatizer = lemmatizers.getLemmatizer(language);
+
+        auto col_res = ColumnString::create();
+        LemmatizeImpl::vector(words_col->getChars(), words_col->getOffsets(), col_res->getChars(), col_res->getOffsets(), lemmatizer);
+        return col_res;
+    }
+};
+
+}
+
+void registerFunctionLemmatize(FunctionFactory & factory)
+{
+    factory.registerFunction<FunctionLemmatize>(FunctionFactory::CaseInsensitive);
+}
+
+}
+
+#endif
diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp
index b0b0e4434bc..ba6a294abba 100644
--- a/src/Functions/registerFunctionsString.cpp
+++ b/src/Functions/registerFunctionsString.cpp
@@ -1,5 +1,6 @@
 #if !defined(ARCADIA_BUILD)
 # include "config_functions.h"
+# include "config_core.h"
 #endif
 
 namespace DB
@@ -39,13 +40,18 @@ void registerFunctionDecodeXMLComponent(FunctionFactory &);
 void registerFunctionExtractTextFromHTML(FunctionFactory &);
 void registerFunctionToStringCutToZero(FunctionFactory &);
 
-
 #if USE_BASE64
 void registerFunctionBase64Encode(FunctionFactory &);
 void registerFunctionBase64Decode(FunctionFactory &);
 void registerFunctionTryBase64Decode(FunctionFactory &);
 #endif
 
+#if USE_NLP
+void registerFunctionStem(FunctionFactory &);
+void registerFunctionSynonyms(FunctionFactory &);
+void registerFunctionLemmatize(FunctionFactory &);
+#endif
+
 void registerFunctionsString(FunctionFactory & factory)
 {
     registerFunctionRepeat(factory);
@@ -79,11 +85,18 @@ void registerFunctionsString(FunctionFactory & factory)
     registerFunctionDecodeXMLComponent(factory);
     registerFunctionExtractTextFromHTML(factory);
     registerFunctionToStringCutToZero(factory);
+
 #if USE_BASE64
     registerFunctionBase64Encode(factory);
     registerFunctionBase64Decode(factory);
     registerFunctionTryBase64Decode(factory);
 #endif
+
+#if USE_NLP
+    registerFunctionStem(factory);
+    registerFunctionSynonyms(factory);
+    registerFunctionLemmatize(factory);
+#endif
 }
 
 }
diff --git a/src/Functions/stem.cpp b/src/Functions/stem.cpp
new file mode 100644
index 00000000000..98dcbccd005
--- /dev/null
+++ b/src/Functions/stem.cpp
@@ -0,0 +1,135 @@
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <Columns/ColumnString.h>
+#include <DataTypes/DataTypeString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/IFunction.h>
+#include <Interpreters/Context.h>
+
+#include <libstemmer.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_COLUMN;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int SUPPORT_IS_DISABLED;
+}
+
+namespace
+{
+
+struct StemImpl
+{
+    static void vector(
+        const ColumnString::Chars & data,
+        const ColumnString::Offsets & offsets,
+        ColumnString::Chars & res_data,
+        ColumnString::Offsets & res_offsets,
+        const String & language)
+    {
+        sb_stemmer * stemmer = sb_stemmer_new(language.data(), "UTF_8");
+
+        if (stemmer == nullptr)
+        {
+            throw Exception(
+                "Language " + language + " is not supported for function stem",
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        }
+
+        res_data.resize(data.size());
+        res_offsets.assign(offsets);
+
+        UInt64 data_size = 0;
+        for (UInt64 i = 0; i < offsets.size(); ++i)
+        {
+            /// Note that accessing -1th element is valid for PaddedPODArray.
+            size_t original_size = offsets[i] - offsets[i - 1];
+            const sb_symbol * result = sb_stemmer_stem(stemmer,
+                reinterpret_cast<const sb_symbol *>(data.data() + offsets[i - 1]),
+                original_size - 1);
+            size_t new_size = sb_stemmer_length(stemmer) + 1;
+
+            if (data_size + new_size > res_data.size())
+                res_data.resize(data_size + new_size);
+
+            memcpy(res_data.data() + data_size, result, new_size);
+
+            data_size += new_size;
+            res_offsets[i] = data_size;
+        }
+        res_data.resize(data_size);
+        sb_stemmer_delete(stemmer);
+    }
+};
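+
+/// sb_stemmer handles are stateful and not safe for concurrent use, so a fresh
+/// stemmer is created for each processed column above and released with
+/// sb_stemmer_delete() once the whole column has been stemmed.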
+
+class FunctionStem : public IFunction
+{
+public:
+    static constexpr auto name = "stem";
+
+    static FunctionPtr create(ContextPtr context)
+    {
+        if (!context->getSettingsRef().allow_experimental_nlp_functions)
+            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+                "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
+
+        return std::make_shared<FunctionStem>();
+    }
+
+    String getName() const override { return name; }
+
+    size_t getNumberOfArguments() const override { return 2; }
+
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        if (!isString(arguments[0]))
+            throw Exception(
+                "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        if (!isString(arguments[1]))
+            throw Exception(
+                "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        return arguments[1];
+    }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+
+    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
+    {
+        const auto & langcolumn = arguments[0].column;
+        const auto & strcolumn = arguments[1].column;
+
+        const ColumnConst * lang_col = checkAndGetColumn<ColumnConst>(langcolumn.get());
+        const ColumnString * words_col = checkAndGetColumn<ColumnString>(strcolumn.get());
+
+        if (!lang_col)
+            throw Exception(
+                "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
+        if (!words_col)
+            throw Exception(
+                "Illegal column " + arguments[1].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
+
+        String language = lang_col->getValue<String>();
+
+        auto col_res = ColumnString::create();
+        StemImpl::vector(words_col->getChars(), words_col->getOffsets(), col_res->getChars(), col_res->getOffsets(), language);
+        return col_res;
+    }
+};
+
+}
+
+void registerFunctionStem(FunctionFactory & factory)
+{
+    factory.registerFunction<FunctionStem>(FunctionFactory::CaseInsensitive);
+}
+
+}
+
+#endif
diff --git a/src/Functions/synonyms.cpp b/src/Functions/synonyms.cpp
new file mode 100644
index 00000000000..4201fbfa677
--- /dev/null
+++ b/src/Functions/synonyms.cpp
@@ -0,0 +1,128 @@
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <Columns/ColumnArray.h>
+#include <Columns/ColumnConst.h>
+#include <Columns/ColumnString.h>
+#include <Core/Field.h>
+#include <DataTypes/DataTypeArray.h>
+#include <DataTypes/DataTypeString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/IFunction.h>
+#include <Interpreters/Context.h>
+
+#include <Interpreters/SynonymsExtensions.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_COLUMN;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int SUPPORT_IS_DISABLED;
+}
+
+class FunctionSynonyms : public IFunction
+{
+public:
+    static constexpr auto name = "synonyms";
+    static FunctionPtr create(ContextPtr context)
+    {
+        if (!context->getSettingsRef().allow_experimental_nlp_functions)
+            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+                "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
+
+        return std::make_shared<FunctionSynonyms>(context->getSynonymsExtensions());
+    }
+
+private:
+    SynonymsExtensions & extensions;
+
+public:
+    explicit FunctionSynonyms(SynonymsExtensions & extensions_)
+        : extensions(extensions_) {}
+
+    String getName() const override { return name; }
+
+    size_t getNumberOfArguments() const override { return 2; }
+
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        if (!isString(arguments[0]))
+            throw Exception(
+                "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        if (!isString(arguments[1]))
+            throw Exception(
+                "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
+    }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+
+    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
+    {
+        const auto & extcolumn = arguments[0].column;
+        const auto & strcolumn = arguments[1].column;
+
+        const ColumnConst * ext_col = checkAndGetColumn<ColumnConst>(extcolumn.get());
+        const ColumnString * word_col = checkAndGetColumn<ColumnString>(strcolumn.get());
+
+        if (!ext_col)
+            throw Exception(
+                "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(),
+                ErrorCodes::ILLEGAL_COLUMN);
+        if (!word_col)
+            throw Exception(
+                "Illegal column " + arguments[1].column->getName() + " of argument of function " + getName(),
+                ErrorCodes::ILLEGAL_COLUMN);
+
+        String ext_name = ext_col->getValue<String>();
+        auto extension = extensions.getExtension(ext_name);
+
+        /// Create and fill the result array.
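+        /// The result is a ColumnArray: `out_data` holds every synonym of every row
+        /// back to back, while `out_offsets[i]` is the cumulative number of elements
+        /// written up to and including row i. A row whose word has no synset keeps
+        /// the previous offset, i.e. yields an empty array.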
+        const DataTypePtr & elem_type = static_cast<const DataTypeArray &>(*result_type).getNestedType();
+
+        auto out = ColumnArray::create(elem_type->createColumn());
+        IColumn & out_data = out->getData();
+        IColumn::Offsets & out_offsets = out->getOffsets();
+
+        const ColumnString::Chars & data = word_col->getChars();
+        const ColumnString::Offsets & offsets = word_col->getOffsets();
+        out_data.reserve(input_rows_count);
+        out_offsets.resize(input_rows_count);
+
+        IColumn::Offset current_offset = 0;
+        for (size_t i = 0; i < offsets.size(); ++i)
+        {
+            std::string_view word(reinterpret_cast<const char *>(data.data() + offsets[i - 1]), offsets[i] - offsets[i - 1] - 1);
+
+            const auto * synset = extension->getSynonyms(word);
+
+            if (synset)
+            {
+                for (const auto & token : *synset)
+                    out_data.insert(Field(token.data(), token.size()));
+
+                current_offset += synset->size();
+            }
+            out_offsets[i] = current_offset;
+        }
+
+        return out;
+    }
+};
+
+void registerFunctionSynonyms(FunctionFactory & factory)
+{
+    factory.registerFunction<FunctionSynonyms>(FunctionFactory::CaseInsensitive);
+}
+
+}
+
+#endif
diff --git a/src/Functions/ya.make b/src/Functions/ya.make
index d1def6ad90e..2b9b3d94313 100644
--- a/src/Functions/ya.make
+++ b/src/Functions/ya.make
@@ -340,6 +340,7 @@ SRCS(
     jumpConsistentHash.cpp
     lcm.cpp
     least.cpp
+    lemmatize.cpp
     lengthUTF8.cpp
     less.cpp
     lessOrEquals.cpp
@@ -481,6 +482,7 @@ SRCS(
     sleepEachRow.cpp
     sqrt.cpp
     startsWith.cpp
+    stem.cpp
     stringCutToZero.cpp
     stringToH3.cpp
     substring.cpp
@@ -493,6 +495,7 @@ SRCS(
     subtractWeeks.cpp
     subtractYears.cpp
     svg.cpp
+    synonyms.cpp
     tan.cpp
     tanh.cpp
     tcpPort.cpp
diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp
index effb1245644..4f1fbd7755b 100644
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@@ -77,6 +77,8 @@
 #include
 #include
 #include
+#include <Interpreters/SynonymsExtensions.h>
+#include <Interpreters/Lemmatizers.h>
 #include
@@ -349,6 +351,11 @@ struct ContextSharedPart
 
     scope_guard dictionaries_xmls;
 
+#if USE_NLP
+    mutable std::optional<SynonymsExtensions> synonyms_extensions;
+    mutable std::optional<Lemmatizers> lemmatizers;
+#endif
+
     String default_profile_name;    /// Default profile name used for default values.
     String system_profile_name;    /// Profile used by system processes
     String buffer_profile_name;    /// Profile used by Buffer engine for flushing to the underlying
@@ -1505,6 +1512,29 @@ void Context::loadDictionaries(const Poco::Util::AbstractConfiguration & config)
         std::make_unique<ExternalLoaderXMLConfigRepository>(config, "dictionaries_config"));
 }
 
+#if USE_NLP
+
+SynonymsExtensions & Context::getSynonymsExtensions() const
+{
+    auto lock = getLock();
+
+    if (!shared->synonyms_extensions)
+        shared->synonyms_extensions.emplace(getConfigRef());
+
+    return *shared->synonyms_extensions;
+}
+
+Lemmatizers & Context::getLemmatizers() const
+{
+    auto lock = getLock();
+
+    if (!shared->lemmatizers)
+        shared->lemmatizers.emplace(getConfigRef());
+
+    return *shared->lemmatizers;
+}
+#endif
+
 void Context::setProgressCallback(ProgressCallback callback)
 {
     /// Callback is set to a session or to a query. In the session, only one query is processed at a time. Therefore, the lock is not needed.
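The two accessors above use the standard ContextSharedPart idiom: the component is a `mutable std::optional` that is constructed on first use under the shared lock. A minimal standalone sketch of the same pattern (the names here are stand-ins, not the real Context API):

``` cpp
#include <mutex>
#include <optional>

struct HeavyComponent   // stand-in for SynonymsExtensions / Lemmatizers
{
    explicit HeavyComponent(int config) : config(config) {}
    int config;
};

class SharedPart
{
    mutable std::mutex mutex;
    mutable std::optional<HeavyComponent> component;   // not built until first use

public:
    HeavyComponent & getComponent(int config) const
    {
        std::lock_guard lock(mutex);     // plays the role of Context::getLock()
        if (!component)
            component.emplace(config);   // constructed exactly once, lazily
        return *component;
    }
};
```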
diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h
index e792fe07ec8..af696d48c6b 100644
--- a/src/Interpreters/Context.h
+++ b/src/Interpreters/Context.h
@@ -114,6 +114,11 @@ using VolumePtr = std::shared_ptr<IVolume>;
 struct NamedSession;
 struct BackgroundTaskSchedulingSettings;
 
+#if USE_NLP
+    class SynonymsExtensions;
+    class Lemmatizers;
+#endif
+
 class Throttler;
 using ThrottlerPtr = std::shared_ptr<Throttler>;
 
@@ -534,6 +539,11 @@ public:
     void tryCreateEmbeddedDictionaries() const;
     void loadDictionaries(const Poco::Util::AbstractConfiguration & config);
 
+#if USE_NLP
+    SynonymsExtensions & getSynonymsExtensions() const;
+    Lemmatizers & getLemmatizers() const;
+#endif
+
     void setExternalModelsConfig(const ConfigurationPtr & config, const std::string & config_name = "models_config");
 
     /// I/O formats.
diff --git a/src/Interpreters/Lemmatizers.cpp b/src/Interpreters/Lemmatizers.cpp
new file mode 100644
index 00000000000..38cd4c33678
--- /dev/null
+++ b/src/Interpreters/Lemmatizers.cpp
@@ -0,0 +1,100 @@
+
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <Interpreters/Lemmatizers.h>
+#include <RdrLemmatizer.h>
+#include <Poco/Util/AbstractConfiguration.h>
+
+#include <filesystem>
+#include <cstring>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int UNKNOWN_ELEMENT_IN_CONFIG;
+    extern const int INVALID_CONFIG_PARAMETER;
+}
+
+
+class Lemmatizer : public ILemmatizer
+{
+private:
+    RdrLemmatizer lemmatizer;
+
+public:
+    explicit Lemmatizer(const String & path) : lemmatizer(path.data()) {}
+
+    TokenPtr lemmatize(const char * token) override
+    {
+        return TokenPtr(lemmatizer.Lemmatize(token));
+    }
+};
+
+/// Duplicate of code from StringUtils.h. Copied here for less dependencies.
+static bool startsWith(const std::string & s, const char * prefix)
+{
+    return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix));
+}
+
+Lemmatizers::Lemmatizers(const Poco::Util::AbstractConfiguration & config)
+{
+    String prefix = "lemmatizers";
+    Poco::Util::AbstractConfiguration::Keys keys;
+
+    if (!config.has(prefix))
+        throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "No lemmatizers specified in server config on prefix '{}'", prefix);
+
+    config.keys(prefix, keys);
+
+    for (const auto & key : keys)
+    {
+        if (startsWith(key, "lemmatizer"))
+        {
+            const auto & lemm_name = config.getString(prefix + "." + key + ".lang", "");
+            const auto & lemm_path = config.getString(prefix + "." + key + ".path", "");
+
+            if (lemm_name.empty())
+                throw Exception("Lemmatizer language in config is not specified here: " + prefix + "." + key + ".lang",
+                    ErrorCodes::INVALID_CONFIG_PARAMETER);
+            if (lemm_path.empty())
+                throw Exception("Path to lemmatizer in config is not specified here: " + prefix + "." + key + ".path",
+                    ErrorCodes::INVALID_CONFIG_PARAMETER);
+
+            paths[lemm_name] = lemm_path;
+        }
+        else
+            throw Exception("Unknown element in config: " + prefix + "."
+                            + key + ", must be 'lemmatizer'",
+                ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
+    }
+}
+
+Lemmatizers::LemmPtr Lemmatizers::getLemmatizer(const String & name)
+{
+    std::lock_guard guard(mutex);
+
+    if (lemmatizers.find(name) != lemmatizers.end())
+        return lemmatizers[name];
+
+    if (paths.find(name) != paths.end())
+    {
+        if (!std::filesystem::exists(paths[name]))
+            throw Exception("Incorrect path to lemmatizer: " + paths[name],
+                ErrorCodes::INVALID_CONFIG_PARAMETER);
+
+        lemmatizers[name] = std::make_shared<Lemmatizer>(paths[name]);
+        return lemmatizers[name];
+    }
+
+    throw Exception("Lemmatizer named '" + name + "' is not found",
+        ErrorCodes::INVALID_CONFIG_PARAMETER);
+}
+
+}
+
+#endif
diff --git a/src/Interpreters/Lemmatizers.h b/src/Interpreters/Lemmatizers.h
new file mode 100644
index 00000000000..6682afaa415
--- /dev/null
+++ b/src/Interpreters/Lemmatizers.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <memory>
+#include <mutex>
+
+#include <unordered_map>
+#include <Poco/Util/AbstractConfiguration.h>
+
+
+namespace DB
+{
+
+class ILemmatizer
+{
+public:
+    using TokenPtr = std::shared_ptr<char>;
+
+    virtual TokenPtr lemmatize(const char * token) = 0;
+
+    virtual ~ILemmatizer() = default;
+};
+
+
+class Lemmatizers
+{
+public:
+    using LemmPtr = std::shared_ptr<ILemmatizer>;
+
+private:
+    std::mutex mutex;
+    std::unordered_map<String, LemmPtr> lemmatizers;
+    std::unordered_map<String, String> paths;
+
+public:
+    explicit Lemmatizers(const Poco::Util::AbstractConfiguration & config);
+
+    LemmPtr getLemmatizer(const String & name);
+};
+
+}
+
+#endif
diff --git a/src/Interpreters/SynonymsExtensions.cpp b/src/Interpreters/SynonymsExtensions.cpp
new file mode 100644
index 00000000000..22fa91a4349
--- /dev/null
+++ b/src/Interpreters/SynonymsExtensions.cpp
@@ -0,0 +1,157 @@
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <Interpreters/SynonymsExtensions.h>
+#include <boost/algorithm/string.hpp>
+
+#include <fstream>
+#include <list>
+
+#include <wnb/core/wordnet.hh>
+#include <Poco/Util/AbstractConfiguration.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+    extern const int UNKNOWN_ELEMENT_IN_CONFIG;
+    extern const int INVALID_CONFIG_PARAMETER;
+}
+
+class PlainSynonymsExtension : public ISynonymsExtension
+{
+private:
+    using Container = std::list<Synset>;
+    using LookupTable = std::unordered_map<std::string_view, Synset *>;
+
+    Container synsets;
+    LookupTable table;
+
+public:
+    explicit PlainSynonymsExtension(const String & path)
+    {
+        std::ifstream file(path);
+        if (!file.is_open())
+            throw Exception("Cannot find synonyms extension at: " + path,
+                ErrorCodes::INVALID_CONFIG_PARAMETER);
+
+        String line;
+        while (std::getline(file, line))
+        {
+            Synset synset;
+            boost::split(synset, line, boost::is_any_of("\t "));
+            if (!synset.empty())
+            {
+                synsets.emplace_back(std::move(synset));
+
+                for (const auto & word : synsets.back())
+                    table[word] = &synsets.back();
+            }
+        }
+    }
+
+    const Synset * getSynonyms(std::string_view token) const override
+    {
+        auto it = table.find(token);
+
+        if (it != table.end())
+            return (*it).second;
+
+        return nullptr;
+    }
+};
+
+class WordnetSynonymsExtension : public ISynonymsExtension
+{
+private:
+    wnb::wordnet wn;
+
+public:
+    explicit WordnetSynonymsExtension(const String & path) : wn(path) {}
+
+    const Synset * getSynonyms(std::string_view token) const override
+    {
+        return wn.get_synset(std::string(token));
+    }
+};
+
+/// Duplicate of code from StringUtils.h. Copied here for less dependencies.
+static bool startsWith(const std::string & s, const char * prefix)
+{
+    return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix));
+}
+
+SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration & config)
+{
+    String prefix = "synonyms_extensions";
+    Poco::Util::AbstractConfiguration::Keys keys;
+
+    if (!config.has(prefix))
+        throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER,
+            "No synonyms extensions specified in server config on prefix '{}'", prefix);
+
+    config.keys(prefix, keys);
+
+    for (const auto & key : keys)
+    {
+        if (startsWith(key, "extension"))
+        {
+            const auto & ext_name = config.getString(prefix + "." + key + ".name", "");
+            const auto & ext_path = config.getString(prefix + "." + key + ".path", "");
+            const auto & ext_type = config.getString(prefix + "." + key + ".type", "");
+
+            if (ext_name.empty())
+                throw Exception("Extension name in config is not specified here: " + prefix + "." + key + ".name",
+                    ErrorCodes::INVALID_CONFIG_PARAMETER);
+            if (ext_path.empty())
+                throw Exception("Extension path in config is not specified here: " + prefix + "." + key + ".path",
+                    ErrorCodes::INVALID_CONFIG_PARAMETER);
+            if (ext_type.empty())
+                throw Exception("Extension type in config is not specified here: " + prefix + "." + key + ".type",
+                    ErrorCodes::INVALID_CONFIG_PARAMETER);
+            if (ext_type != "plain" && ext_type != "wordnet")
+                throw Exception("Unknown extension type in config: " + prefix + "." + key + ".type, must be 'plain' or 'wordnet'",
+                    ErrorCodes::INVALID_CONFIG_PARAMETER);
+
+            info[ext_name].path = ext_path;
+            info[ext_name].type = ext_type;
+        }
+        else
+            throw Exception("Unknown element in config: " + prefix + "." + key + ", must be 'extension'",
+                ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
+    }
+}
+
+SynonymsExtensions::ExtPtr SynonymsExtensions::getExtension(const String & name)
+{
+    std::lock_guard guard(mutex);
+
+    if (extensions.find(name) != extensions.end())
+        return extensions[name];
+
+    if (info.find(name) != info.end())
+    {
+        const Info & ext_info = info[name];
+
+        if (ext_info.type == "plain")
+            extensions[name] = std::make_shared<PlainSynonymsExtension>(ext_info.path);
+        else if (ext_info.type == "wordnet")
+            extensions[name] = std::make_shared<WordnetSynonymsExtension>(ext_info.path);
+        else
+            throw Exception("Unknown extension type: " + ext_info.type, ErrorCodes::LOGICAL_ERROR);
+
+        return extensions[name];
+    }
+
+    throw Exception("Extension named '" + name + "' is not found",
+        ErrorCodes::INVALID_CONFIG_PARAMETER);
+}
+
+}
+
+#endif
diff --git a/src/Interpreters/SynonymsExtensions.h b/src/Interpreters/SynonymsExtensions.h
new file mode 100644
index 00000000000..fd2bf03e162
--- /dev/null
+++ b/src/Interpreters/SynonymsExtensions.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <memory>
+#include <mutex>
+
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+#include <common/types.h>
+#include <Poco/Util/AbstractConfiguration.h>
+
+namespace DB
+{
+
+class ISynonymsExtension
+{
+public:
+    using Synset = std::vector<String>;
+
+    virtual const Synset * getSynonyms(std::string_view token) const = 0;
+
+    virtual ~ISynonymsExtension() = default;
+};
+
+class SynonymsExtensions
+{
+public:
+    using ExtPtr = std::shared_ptr<ISynonymsExtension>;
+
+    explicit SynonymsExtensions(const Poco::Util::AbstractConfiguration & config);
+
+    ExtPtr getExtension(const String & name);
+
+private:
+    struct Info
+    {
+        String path;
+        String type;
+    };
+
+    using ExtContainer = std::unordered_map<String, ExtPtr>;
+    using InfoContainer = std::unordered_map<String, Info>;
+
+    std::mutex mutex;
+    ExtContainer extensions;
+    InfoContainer info;
+};
+
+}
+
+#endif
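For reference, the plain extension format consumed by PlainSynonymsExtension is one synset per line, tokens separated by tabs or spaces, and every token of a line resolves to the same synset (exactly the layout of the test dictionaries below). A minimal standalone sketch of that lookup contract, simplified from the class above (`std::istringstream` splitting stands in for `boost::split`):

``` cpp
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

using Synset = std::vector<std::string>;

int main()
{
    std::list<Synset> synsets;                                // std::list keeps element addresses stable
    std::unordered_map<std::string, const Synset *> table;

    std::istringstream file("important big critical crucial essential\n"
                            "happy cheerful delighted ecstatic\n");
    std::string line;
    while (std::getline(file, line))
    {
        Synset synset;
        std::istringstream words(line);
        for (std::string word; words >> word;)
            synset.push_back(word);
        synsets.push_back(std::move(synset));
        for (const auto & word : synsets.back())
            table[word] = &synsets.back();                    // every token maps to the same synset
    }

    for (const auto & word : *table.at("crucial"))            // prints the whole synset for 'crucial'
        std::cout << word << '\n';
}
```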
diff --git a/src/Interpreters/ya.make b/src/Interpreters/ya.make
index 29fc69bc33d..462c778bf3d 100644
--- a/src/Interpreters/ya.make
+++ b/src/Interpreters/ya.make
@@ -108,6 +108,7 @@ SRCS(
     JoinSwitcher.cpp
     JoinToSubqueryTransformVisitor.cpp
     JoinedTables.cpp
+    Lemmatizers.cpp
     LogicalExpressionsOptimizer.cpp
     MarkTableIdentifiersVisitor.cpp
     MergeJoin.cpp
@@ -145,6 +146,7 @@ SRCS(
     SortedBlocksWriter.cpp
     StorageID.cpp
     SubqueryForSet.cpp
+    SynonymsExtensions.cpp
     SystemLog.cpp
     TableJoin.cpp
     TablesStatus.cpp
diff --git a/tests/integration/test_nlp/__init__.py b/tests/integration/test_nlp/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/integration/test_nlp/configs/dicts_config.xml b/tests/integration/test_nlp/configs/dicts_config.xml
new file mode 100644
index 00000000000..435507ce1d8
--- /dev/null
+++ b/tests/integration/test_nlp/configs/dicts_config.xml
@@ -0,0 +1,22 @@
+<yandex>
+
+    <synonyms_extensions>
+        <extension>
+            <name>en</name>
+            <type>plain</type>
+            <path>/etc/clickhouse-server/dictionaries/ext-en.txt</path>
+        </extension>
+        <extension>
+            <name>ru</name>
+            <type>plain</type>
+            <path>/etc/clickhouse-server/dictionaries/ext-ru.txt</path>
+        </extension>
+    </synonyms_extensions>
+
+    <lemmatizers>
+        <lemmatizer>
+            <lang>en</lang>
+            <path>/etc/clickhouse-server/dictionaries/lem-en.bin</path>
+        </lemmatizer>
+    </lemmatizers>
+</yandex>
diff --git a/tests/integration/test_nlp/dictionaries/ext-en.txt b/tests/integration/test_nlp/dictionaries/ext-en.txt
new file mode 100644
index 00000000000..beb508e437d
--- /dev/null
+++ b/tests/integration/test_nlp/dictionaries/ext-en.txt
@@ -0,0 +1,4 @@
+important big critical crucial essential
+happy cheerful delighted ecstatic
+however nonetheless but yet
+quiz query check exam
diff --git a/tests/integration/test_nlp/dictionaries/ext-ru.txt b/tests/integration/test_nlp/dictionaries/ext-ru.txt
new file mode 100644
index 00000000000..5466354b264
--- /dev/null
+++ b/tests/integration/test_nlp/dictionaries/ext-ru.txt
@@ -0,0 +1,4 @@
+важный большой высокий хороший главный
+веселый счастливый живой яркий смешной
+хотя однако но правда
+экзамен испытание проверка
\ No newline at end of file
diff --git a/tests/integration/test_nlp/dictionaries/lem-en.bin b/tests/integration/test_nlp/dictionaries/lem-en.bin
new file mode 100644
index 00000000000..8981bc1ead0
Binary files /dev/null and b/tests/integration/test_nlp/dictionaries/lem-en.bin differ
diff --git a/tests/integration/test_nlp/test.py b/tests/integration/test_nlp/test.py
new file mode 100644
index 00000000000..24935153608
--- /dev/null
+++ b/tests/integration/test_nlp/test.py
@@ -0,0 +1,47 @@
+import os
+import sys
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+
+from helpers.cluster import ClickHouseCluster
+
+
+cluster = ClickHouseCluster(__file__)
+instance = cluster.add_instance('instance', main_configs=['configs/dicts_config.xml'])
+
+def copy_file_to_container(local_path, dist_path, container_id):
+    os.system("docker cp {local} {cont_id}:{dist}".format(local=local_path, cont_id=container_id, dist=dist_path))
+
+@pytest.fixture(scope="module")
+def start_cluster():
+    try:
+        cluster.start()
+
+        copy_file_to_container(os.path.join(SCRIPT_DIR, 'dictionaries/.'), '/etc/clickhouse-server/dictionaries', instance.docker_id)
+
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+def test_lemmatize(start_cluster):
+    assert instance.query("SELECT lemmatize('en', 'wolves')", settings={"allow_experimental_nlp_functions": 1}) == "wolf\n"
+    assert instance.query("SELECT lemmatize('en', 'dogs')", settings={"allow_experimental_nlp_functions": 1}) == "dog\n"
+    assert instance.query("SELECT lemmatize('en', 'looking')", settings={"allow_experimental_nlp_functions": 1}) == "look\n"
settings={"allow_experimental_nlp_functions": 1}) == "look\n" + assert instance.query("SELECT lemmatize('en', 'took')", settings={"allow_experimental_nlp_functions": 1}) == "take\n" + assert instance.query("SELECT lemmatize('en', 'imported')", settings={"allow_experimental_nlp_functions": 1}) == "import\n" + assert instance.query("SELECT lemmatize('en', 'tokenized')", settings={"allow_experimental_nlp_functions": 1}) == "tokenize\n" + assert instance.query("SELECT lemmatize('en', 'flown')", settings={"allow_experimental_nlp_functions": 1}) == "fly\n" + +def test_synonyms_extensions(start_cluster): + assert instance.query("SELECT synonyms('en', 'crucial')", settings={"allow_experimental_nlp_functions": 1}) == "['important','big','critical','crucial','essential']\n" + assert instance.query("SELECT synonyms('en', 'cheerful')", settings={"allow_experimental_nlp_functions": 1}) == "['happy','cheerful','delighted','ecstatic']\n" + assert instance.query("SELECT synonyms('en', 'yet')", settings={"allow_experimental_nlp_functions": 1}) == "['however','nonetheless','but','yet']\n" + assert instance.query("SELECT synonyms('en', 'quiz')", settings={"allow_experimental_nlp_functions": 1}) == "['quiz','query','check','exam']\n" + + assert instance.query("SELECT synonyms('ru', 'главный')", settings={"allow_experimental_nlp_functions": 1}) == "['важный','большой','высокий','хороший','главный']\n" + assert instance.query("SELECT synonyms('ru', 'веселый')", settings={"allow_experimental_nlp_functions": 1}) == "['веселый','счастливый','живой','яркий','смешной']\n" + assert instance.query("SELECT synonyms('ru', 'правда')", settings={"allow_experimental_nlp_functions": 1}) == "['хотя','однако','но','правда']\n" + assert instance.query("SELECT synonyms('ru', 'экзамен')", settings={"allow_experimental_nlp_functions": 1}) == "['экзамен','испытание','проверка']\n" diff --git a/tests/performance/nlp.xml b/tests/performance/nlp.xml new file mode 100644 index 00000000000..e5006027c59 --- /dev/null +++ b/tests/performance/nlp.xml @@ -0,0 +1,20 @@ + + + 1 + + + + hits_100m_single + + + CREATE TABLE hits_100m_words (words Array(String), UserID UInt64) ENGINE Memory + CREATE TABLE hits_100m_words_ws (words Array(String), UserID UInt64) ENGINE Memory + + INSERT INTO hits_100m_words SELECT splitByNonAlpha(SearchPhrase) AS words, UserID FROM hits_100m_single WHERE length(words) > 0 + INSERT INTO hits_100m_words_ws SELECT splitByWhitespace(SearchPhrase) AS words, UserID FROM hits_100m_single WHERE length(words) > 0 + + SELECT arrayMap(x -> stem('ru', x), words) FROM hits_100m_words FORMAT Null + + DROP TABLE IF EXISTS hits_100m_words + DROP TABLE IF EXISTS hits_100m_words_ws + diff --git a/tests/queries/0_stateless/01889_tokenize.reference b/tests/queries/0_stateless/01889_tokenize.reference new file mode 100644 index 00000000000..4dd6f323929 --- /dev/null +++ b/tests/queries/0_stateless/01889_tokenize.reference @@ -0,0 +1,8 @@ +['It','is','quite','a','wonderful','day','isn','t','it'] +['There','is','so','much','to','learn'] +['22','00','email','yandex','ru'] +['Токенизация','каких','либо','других','языков'] +['It','is','quite','a','wonderful','day,','isn\'t','it?'] +['There','is....','so','much','to','learn!'] +['22:00','email@yandex.ru'] +['Токенизация','каких-либо','других','языков?'] diff --git a/tests/queries/0_stateless/01889_tokenize.sql b/tests/queries/0_stateless/01889_tokenize.sql new file mode 100644 index 00000000000..c9d29a8632b --- /dev/null +++ b/tests/queries/0_stateless/01889_tokenize.sql @@ -0,0 +1,11 @@ 
+SET allow_experimental_nlp_functions = 1; + +SELECT splitByNonAlpha('It is quite a wonderful day, isn\'t it?'); +SELECT splitByNonAlpha('There is.... so much to learn!'); +SELECT splitByNonAlpha('22:00 email@yandex.ru'); +SELECT splitByNonAlpha('Токенизация каких-либо других языков?'); + +SELECT splitByWhitespace('It is quite a wonderful day, isn\'t it?'); +SELECT splitByWhitespace('There is.... so much to learn!'); +SELECT splitByWhitespace('22:00 email@yandex.ru'); +SELECT splitByWhitespace('Токенизация каких-либо других языков?'); diff --git a/tests/queries/0_stateless/01890_stem.reference b/tests/queries/0_stateless/01890_stem.reference new file mode 100644 index 00000000000..33e18cd6775 --- /dev/null +++ b/tests/queries/0_stateless/01890_stem.reference @@ -0,0 +1,21 @@ +given +combinatori +collect +possibl +studi +commonplac +pack +комбинаторн +получ +огранич +конечн +максимальн +суммарн +стоимост +remplissag +valeur +maximis +dépass +intens +étudi +peuvent diff --git a/tests/queries/0_stateless/01890_stem.sql b/tests/queries/0_stateless/01890_stem.sql new file mode 100644 index 00000000000..472cfb54251 --- /dev/null +++ b/tests/queries/0_stateless/01890_stem.sql @@ -0,0 +1,25 @@ +SET allow_experimental_nlp_functions = 1; + +SELECT stem('en', 'given'); +SELECT stem('en', 'combinatorial'); +SELECT stem('en', 'collection'); +SELECT stem('en', 'possibility'); +SELECT stem('en', 'studied'); +SELECT stem('en', 'commonplace'); +SELECT stem('en', 'packing'); + +SELECT stem('ru', 'комбинаторной'); +SELECT stem('ru', 'получила'); +SELECT stem('ru', 'ограничена'); +SELECT stem('ru', 'конечной'); +SELECT stem('ru', 'максимальной'); +SELECT stem('ru', 'суммарный'); +SELECT stem('ru', 'стоимостью'); + +SELECT stem('fr', 'remplissage'); +SELECT stem('fr', 'valeur'); +SELECT stem('fr', 'maximiser'); +SELECT stem('fr', 'dépasser'); +SELECT stem('fr', 'intensivement'); +SELECT stem('fr', 'étudié'); +SELECT stem('fr', 'peuvent');