diff --git a/.gitmodules b/.gitmodules
index 4df7798e1e7..43c878427ec 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -225,6 +225,15 @@
[submodule "contrib/yaml-cpp"]
path = contrib/yaml-cpp
url = https://github.com/ClickHouse-Extras/yaml-cpp.git
+[submodule "contrib/libstemmer_c"]
+ path = contrib/libstemmer_c
+ url = https://github.com/ClickHouse-Extras/libstemmer_c.git
+[submodule "contrib/wordnet-blast"]
+ path = contrib/wordnet-blast
+ url = https://github.com/ClickHouse-Extras/wordnet-blast.git
+[submodule "contrib/lemmagen-c"]
+ path = contrib/lemmagen-c
+ url = https://github.com/ClickHouse-Extras/lemmagen-c.git
[submodule "contrib/libpqxx"]
path = contrib/libpqxx
url = https://github.com/ClickHouse-Extras/libpqxx.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 875a6d1ab61..24022c256ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -542,6 +542,7 @@ include (cmake/find/libpqxx.cmake)
include (cmake/find/nuraft.cmake)
include (cmake/find/yaml-cpp.cmake)
include (cmake/find/s2geometry.cmake)
+include (cmake/find/nlp.cmake)
if(NOT USE_INTERNAL_PARQUET_LIBRARY)
set (ENABLE_ORC OFF CACHE INTERNAL "")
diff --git a/cmake/find/nlp.cmake b/cmake/find/nlp.cmake
new file mode 100644
index 00000000000..f1204a85dea
--- /dev/null
+++ b/cmake/find/nlp.cmake
@@ -0,0 +1,32 @@
+option(ENABLE_NLP "Enable NLP functions support" ${ENABLE_LIBRARIES})
+
+if (NOT ENABLE_NLP)
+    message (STATUS "NLP functions disabled")
+    return()
+endif()
+
+if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libstemmer_c/Makefile")
+ message (WARNING "submodule contrib/libstemmer_c is missing. to fix try run: \n git submodule update --init --recursive")
+ message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal libstemmer_c library, NLP functions will be disabled")
+ set (USE_NLP 0)
+ return()
+endif ()
+
+if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast/CMakeLists.txt")
+ message (WARNING "submodule contrib/wordnet-blast is missing. to fix try run: \n git submodule update --init --recursive")
+ message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal wordnet-blast library, NLP functions will be disabled")
+ set (USE_NLP 0)
+ return()
+endif ()
+
+if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/lemmagen-c/README.md")
+ message (WARNING "submodule contrib/lemmagen-c is missing. to fix try run: \n git submodule update --init --recursive")
+ message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal lemmagen-c library, NLP functions will be disabled")
+ set (USE_NLP 0)
+ return()
+endif ()
+
+set (USE_NLP 1)
+
+message (STATUS "Using Libraries for NLP functions: contrib/wordnet-blast, contrib/libstemmer_c, contrib/lemmagen-c")
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index 2b6629d0817..82cddb0ace0 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -328,6 +328,12 @@ endif()
add_subdirectory(fast_float)
+if (USE_NLP)
+ add_subdirectory(libstemmer-c-cmake)
+ add_subdirectory(wordnet-blast-cmake)
+ add_subdirectory(lemmagen-c-cmake)
+endif()
+
if (USE_SQLITE)
add_subdirectory(sqlite-cmake)
endif()
diff --git a/contrib/boost b/contrib/boost
index 1ccbb5a522a..9cf09dbfd55 160000
--- a/contrib/boost
+++ b/contrib/boost
@@ -1 +1 @@
-Subproject commit 1ccbb5a522a571ce83b606dbc2e1011c42ecccfb
+Subproject commit 9cf09dbfd55a5c6202dedbdf40781a51b02c2675
diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt
index 9f6c5b1255d..675931d319f 100644
--- a/contrib/boost-cmake/CMakeLists.txt
+++ b/contrib/boost-cmake/CMakeLists.txt
@@ -13,11 +13,12 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
regex
context
coroutine
+ graph
)
if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_IOSTREAMS_LIBRARY AND
Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND
- Boost_COROUTINE_LIBRARY)
+ Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY)
set(EXTERNAL_BOOST_FOUND 1)
@@ -32,6 +33,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
add_library (_boost_system INTERFACE)
add_library (_boost_context INTERFACE)
add_library (_boost_coroutine INTERFACE)
+ add_library (_boost_graph INTERFACE)
target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY})
target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY})
@@ -40,6 +42,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY})
target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY})
target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY})
+ target_link_libraries (_boost_graph INTERFACE ${Boost_GRAPH_LIBRARY})
add_library (boost::filesystem ALIAS _boost_filesystem)
add_library (boost::iostreams ALIAS _boost_iostreams)
@@ -48,6 +51,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
add_library (boost::system ALIAS _boost_system)
add_library (boost::context ALIAS _boost_context)
add_library (boost::coroutine ALIAS _boost_coroutine)
+ add_library (boost::graph ALIAS _boost_graph)
else()
set(EXTERNAL_BOOST_FOUND 0)
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost")
@@ -221,4 +225,17 @@ if (NOT EXTERNAL_BOOST_FOUND)
add_library (boost::coroutine ALIAS _boost_coroutine)
target_include_directories (_boost_coroutine PRIVATE ${LIBRARY_DIR})
target_link_libraries(_boost_coroutine PRIVATE _boost_context)
+
+ # graph
+
+ set (SRCS_GRAPH
+ "${LIBRARY_DIR}/libs/graph/src/graphml.cpp"
+ "${LIBRARY_DIR}/libs/graph/src/read_graphviz_new.cpp"
+ )
+
+ add_library (_boost_graph ${SRCS_GRAPH})
+ add_library (boost::graph ALIAS _boost_graph)
+ target_include_directories (_boost_graph PRIVATE ${LIBRARY_DIR})
+ target_link_libraries(_boost_graph PRIVATE _boost_regex)
+
endif ()
diff --git a/contrib/lemmagen-c b/contrib/lemmagen-c
new file mode 160000
index 00000000000..59537bdcf57
--- /dev/null
+++ b/contrib/lemmagen-c
@@ -0,0 +1 @@
+Subproject commit 59537bdcf57bbed17913292cb4502d15657231f1
diff --git a/contrib/lemmagen-c-cmake/CMakeLists.txt b/contrib/lemmagen-c-cmake/CMakeLists.txt
new file mode 100644
index 00000000000..b5b92b774e1
--- /dev/null
+++ b/contrib/lemmagen-c-cmake/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/lemmagen-c")
+set(LEMMAGEN_INCLUDE_DIR "${LIBRARY_DIR}/include")
+
+set(SRCS
+ "${LIBRARY_DIR}/src/RdrLemmatizer.cpp"
+)
+
+add_library(lemmagen STATIC ${SRCS})
+target_include_directories(lemmagen PUBLIC "${LEMMAGEN_INCLUDE_DIR}")
diff --git a/contrib/libstemmer-c-cmake/CMakeLists.txt b/contrib/libstemmer-c-cmake/CMakeLists.txt
new file mode 100644
index 00000000000..2d38e5f3612
--- /dev/null
+++ b/contrib/libstemmer-c-cmake/CMakeLists.txt
@@ -0,0 +1,31 @@
+set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/libstemmer_c")
+set(STEMMER_INCLUDE_DIR "${LIBRARY_DIR}/include")
+
+FILE ( READ "${LIBRARY_DIR}/mkinc.mak" _CONTENT )
+# join lines continued with '\' into one long line, prefixing each file name with LIBRARY_DIR
+STRING ( REGEX REPLACE "\\\\\n " " ${LIBRARY_DIR}/" _CONTENT "${_CONTENT}" )
+# escape ';' (if any)
+STRING ( REGEX REPLACE ";" "\\\\;" _CONTENT "${_CONTENT}" )
+# now replace lf into ';' (it makes list from the line)
+STRING ( REGEX REPLACE "\n" ";" _CONTENT "${_CONTENT}" )
+FOREACH ( LINE ${_CONTENT} )
+ # skip comments (beginning with #)
+ IF ( NOT "${LINE}" MATCHES "^#.*" )
+ # parse 'name=value1 value2..." - extract the 'name' part
+ STRING ( REGEX REPLACE "=.*$" "" _NAME "${LINE}" )
+ # extract the list of values part
+ STRING ( REGEX REPLACE "^.*=" "" _LIST "${LINE}" )
+ # replace (multi)spaces into ';' (it makes list from the line)
+ STRING ( REGEX REPLACE " +" ";" _LIST "${_LIST}" )
+ # finally get our two variables
+ IF ( "${_NAME}" MATCHES "snowball_sources" )
+ SET ( _SOURCES "${_LIST}" )
+ ELSEIF ( "${_NAME}" MATCHES "snowball_headers" )
+ SET ( _HEADERS "${_LIST}" )
+ ENDIF ()
+    ENDIF ()
+ENDFOREACH ()
+
+# all the sources parsed. Now just add the lib
+add_library ( stemmer STATIC ${_SOURCES} ${_HEADERS} )
+target_include_directories (stemmer PUBLIC "${STEMMER_INCLUDE_DIR}")
diff --git a/contrib/libstemmer_c b/contrib/libstemmer_c
new file mode 160000
index 00000000000..c753054304d
--- /dev/null
+++ b/contrib/libstemmer_c
@@ -0,0 +1 @@
+Subproject commit c753054304d87daf460057c1a649c482aa094835
diff --git a/contrib/wordnet-blast b/contrib/wordnet-blast
new file mode 160000
index 00000000000..1d16ac28036
--- /dev/null
+++ b/contrib/wordnet-blast
@@ -0,0 +1 @@
+Subproject commit 1d16ac28036e19fe8da7ba72c16a307fbdf8c87e
diff --git a/contrib/wordnet-blast-cmake/CMakeLists.txt b/contrib/wordnet-blast-cmake/CMakeLists.txt
new file mode 100644
index 00000000000..8d59c312664
--- /dev/null
+++ b/contrib/wordnet-blast-cmake/CMakeLists.txt
@@ -0,0 +1,13 @@
+set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast")
+
+set(SRCS
+ "${LIBRARY_DIR}/wnb/core/info_helper.cc"
+ "${LIBRARY_DIR}/wnb/core/load_wordnet.cc"
+ "${LIBRARY_DIR}/wnb/core/wordnet.cc"
+)
+
+add_library(wnb ${SRCS})
+
+target_link_libraries(wnb PRIVATE boost::headers_only boost::graph)
+
+target_include_directories(wnb PUBLIC "${LIBRARY_DIR}")
\ No newline at end of file
diff --git a/docker/packager/unbundled/Dockerfile b/docker/packager/unbundled/Dockerfile
index d2bda7db833..07031aa2d1b 100644
--- a/docker/packager/unbundled/Dockerfile
+++ b/docker/packager/unbundled/Dockerfile
@@ -23,6 +23,7 @@ RUN apt-get update \
libboost-regex-dev \
libboost-context-dev \
libboost-coroutine-dev \
+ libboost-graph-dev \
zlib1g-dev \
liblz4-dev \
libdouble-conversion-dev \
diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh
index 3c7d019833b..6419ea3659c 100755
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@@ -311,6 +311,7 @@ function run_tests
01411_bayesian_ab_testing
01798_uniq_theta_sketch
01799_long_uniq_theta_sketch
+ 01890_stem # depends on libstemmer_c
collate
collation
_orc_
diff --git a/docs/en/sql-reference/functions/nlp-functions.md b/docs/en/sql-reference/functions/nlp-functions.md
new file mode 100644
index 00000000000..2d5a09c0897
--- /dev/null
+++ b/docs/en/sql-reference/functions/nlp-functions.md
@@ -0,0 +1,125 @@
+---
+toc_priority: 67
+toc_title: NLP
+---
+
+# Natural Language Processing functions {#nlp-functions}
+
+!!! warning "Warning"
+    These functions are experimental. Enable them with `SET allow_experimental_nlp_functions = 1`.
+
+## stem {#stem}
+
+Performs stemming on a previously tokenized text.
+
+**Syntax**
+
+``` sql
+stem('language', word)
+```
+
+**Arguments**
+
+- `language` — Language whose rules will be applied. Must be in lowercase. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Word that needs to be stemmed. Must be in lowercase. [String](../../sql-reference/data-types/string.md#string).
+
+**Examples**
+
+Query:
+
+``` sql
+SELECT arrayMap(x -> stem('en', x), ['I', 'think', 'it', 'is', 'a', 'blessing', 'in', 'disguise']) as res;
+```
+
+Result:
+
+``` text
+┌─res────────────────────────────────────────────────┐
+│ ['I','think','it','is','a','bless','in','disguis'] │
+└────────────────────────────────────────────────────┘
+```
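+
+The language is not limited to English: `stem` accepts any language shipped with the bundled Snowball library (`libstemmer_c`). For example, with the expected results as in the `01890_stem` test:
+
+``` sql
+SELECT stem('ru', 'стоимостью'), stem('fr', 'intensivement'); -- 'стоимост', 'intens'
+```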
+
+## lemmatize {#lemmatize}
+
+Performs lemmatization on a given word.
+
+**Syntax**
+
+``` sql
+lemmatize('language', word)
+```
+
+**Arguments**
+
+- `language` — Language whose rules will be applied. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Word that needs to be lemmatized. Must be lowercase. [String](../../sql-reference/data-types/string.md#string).
+
+**Examples**
+
+Query:
+
+``` sql
+SELECT lemmatize('en', 'wolves');
+```
+
+Result:
+
+``` text
+┌─lemmatize('en', 'wolves')─┐
+│ wolf                      │
+└───────────────────────────┘
+```
+
+Configuration:
+``` xml
+<lemmatizers>
+    <lemmatizer>
+        <lang>en</lang>
+        <path>en.bin</path>
+    </lemmatizer>
+</lemmatizers>
+```
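+
+With a lemmatizer configured as above, a whole pre-tokenized text can be processed in one pass with [arrayMap](../../sql-reference/functions/array-functions.md). A sketch:
+
+``` sql
+SELECT arrayMap(x -> lemmatize('en', x), splitByNonAlpha('the wolves were looking at the dogs'));
+```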
+
+## synonyms {#synonyms}
+
+Finds synonyms to a given word.
+
+**Syntax**
+
+``` sql
+synonyms('extension_name', word)
+```
+
+**Arguments**
+
+- `extension_name` — Name of the extension in which search will be performed. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Word that will be searched in extension. [String](../../sql-reference/data-types/string.md#string).
+
+**Examples**
+
+Query:
+
+``` sql
+SELECT synonyms('list', 'important');
+```
+
+Result:
+
+``` text
+┌─synonyms('list', 'important')────────────┐
+│ ['important','big','critical','crucial'] │
+└──────────────────────────────────────────┘
+```
+
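+To get one synonym per row, the returned array can be unfolded with [arrayJoin](../../sql-reference/functions/array-join.md). A sketch using the same `list` extension:
+
+``` sql
+SELECT arrayJoin(synonyms('list', 'important')) AS synonym;
+```
+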
+Configuration:
+``` xml
+<synonyms_extensions>
+    <extension>
+        <name>en</name>
+        <type>plain</type>
+        <path>en.txt</path>
+    </extension>
+    <extension>
+        <name>en</name>
+        <type>wordnet</type>
+        <path>en/</path>
+    </extension>
+</synonyms_extensions>
+```
\ No newline at end of file
diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md
index ffb1029cc36..718d5a977b9 100644
--- a/docs/en/sql-reference/functions/splitting-merging-functions.md
+++ b/docs/en/sql-reference/functions/splitting-merging-functions.md
@@ -145,6 +145,72 @@ Result:
└────────────────────────────┘
```
+## splitByWhitespace(s) {#splitbywhitespaceseparator-s}
+
+Splits a string into substrings separated by whitespace characters.
+Returns an array of selected substrings.
+
+**Syntax**
+
+``` sql
+splitByWhitespace(s)
+```
+
+**Arguments**
+
+- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+
+**Returned value(s)**
+
+Returns an array of selected substrings.
+
+Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
+
+**Example**
+
+``` sql
+SELECT splitByWhitespace('  1!  a,  b.  ');
+```
+
+``` text
+┌─splitByWhitespace('  1!  a,  b.  ')─┐
+│ ['1!','a,','b.']                    │
+└─────────────────────────────────────┘
+```
+
+## splitByNonAlpha(s) {#splitbynonalphaseparator-s}
+
+Splits a string into substrings separated by whitespace and punctuation characters.
+Returns an array of selected substrings.
+
+**Syntax**
+
+``` sql
+splitByNonAlpha(s)
+```
+
+**Arguments**
+
+- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+
+**Returned value(s)**
+
+Returns an array of selected substrings.
+
+Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
+
+**Example**
+
+``` sql
+SELECT splitByNonAlpha('  1!  a,  b.  ');
+```
+
+``` text
+┌─splitByNonAlpha('  1!  a,  b.  ')─┐
+│ ['1','a','b']                     │
+└───────────────────────────────────┘
+```
+
## arrayStringConcat(arr\[, separator\]) {#arraystringconcatarr-separator}
Concatenates the strings listed in the array with the separator. 'separator' is an optional parameter: a constant string, set to an empty string by default.
diff --git a/docs/ru/sql-reference/functions/nlp-functions.md b/docs/ru/sql-reference/functions/nlp-functions.md
new file mode 100644
index 00000000000..582b5c93b93
--- /dev/null
+++ b/docs/ru/sql-reference/functions/nlp-functions.md
@@ -0,0 +1,125 @@
+---
+toc_priority: 67
+toc_title: NLP
+---
+
+# Функции для работы с естественным языком {#nlp-functions}
+
+!!! warning "Предупреждение"
+    Эти функции экспериментальные. Чтобы использовать их, установите настройку `allow_experimental_nlp_functions = 1`.
+
+## stem {#stem}
+
+Данная функция проводит стемминг заданного слова.
+
+**Синтаксис**
+
+``` sql
+stem('language', word)
+```
+
+**Аргументы**
+
+- `language` — Язык, правила которого будут применены для стемминга. Допускается только нижний регистр. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Слово, подлежащее стеммингу. Допускается только нижний регистр. [String](../../sql-reference/data-types/string.md#string).
+
+**Примеры**
+
+Запрос:
+
+``` sql
+SELECT arrayMap(x -> stem('en', x), ['I', 'think', 'it', 'is', 'a', 'blessing', 'in', 'disguise']) as res;
+```
+
+Результат:
+
+``` text
+┌─res────────────────────────────────────────────────┐
+│ ['I','think','it','is','a','bless','in','disguis'] │
+└────────────────────────────────────────────────────┘
+```
+
+## lemmatize {#lemmatize}
+
+Данная функция проводит лемматизацию для заданного слова.
+
+**Синтаксис**
+
+``` sql
+lemmatize('language', word)
+```
+
+**Аргументы**
+
+- `language` — Язык, правила которого будут применены для лемматизации. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Слово, подлежащее лемматизации. Допускается только нижний регистр. [String](../../sql-reference/data-types/string.md#string).
+
+**Примеры**
+
+Запрос:
+
+``` sql
+SELECT lemmatize('en', 'wolves');
+```
+
+Результат:
+
+``` text
+┌─lemmatize('en', 'wolves')─┐
+│ wolf                      │
+└───────────────────────────┘
+```
+
+Конфигурация:
+``` xml
+<lemmatizers>
+    <lemmatizer>
+        <lang>en</lang>
+        <path>en.bin</path>
+    </lemmatizer>
+</lemmatizers>
+```
+
+## synonyms {#synonyms}
+
+Находит синонимы к заданному слову.
+
+**Синтаксис**
+
+``` sql
+synonyms('extension_name', word)
+```
+
+**Аргументы**
+
+- `extension_name` — Название расширения, в котором будет проводиться поиск. [String](../../sql-reference/data-types/string.md#string).
+- `word` — Слово, которое будет искаться в расширении. [String](../../sql-reference/data-types/string.md#string).
+
+**Примеры**
+
+Запрос:
+
+``` sql
+SELECT synonyms('list', 'important');
+```
+
+Результат:
+
+``` text
+┌─synonyms('list', 'important')────────────┐
+│ ['important','big','critical','crucial'] │
+└──────────────────────────────────────────┘
+```
+
+Конфигурация:
+``` xml
+<synonyms_extensions>
+    <extension>
+        <name>en</name>
+        <type>plain</type>
+        <path>en.txt</path>
+    </extension>
+    <extension>
+        <name>en</name>
+        <type>wordnet</type>
+        <path>en/</path>
+    </extension>
+</synonyms_extensions>
+```
\ No newline at end of file
diff --git a/docs/ru/sql-reference/functions/splitting-merging-functions.md b/docs/ru/sql-reference/functions/splitting-merging-functions.md
index 5a0c540cf3a..efe74dba043 100644
--- a/docs/ru/sql-reference/functions/splitting-merging-functions.md
+++ b/docs/ru/sql-reference/functions/splitting-merging-functions.md
@@ -146,6 +146,70 @@ SELECT splitByRegexp('', 'abcde');
└────────────────────────────┘
```
+## splitByWhitespace(s) {#splitbywhitespaceseparator-s}
+
+Разбивает строку на подстроки, используя в качестве разделителей пробельные символы.
+
+**Синтаксис**
+
+``` sql
+splitByWhitespace(s)
+```
+
+**Аргументы**
+
+- `s` — разбиваемая строка. [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемые значения**
+
+Возвращает массив подстрок.
+
+Тип: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
+
+**Пример**
+
+``` sql
+SELECT splitByWhitespace('  1!  a,  b.  ');
+```
+
+``` text
+┌─splitByWhitespace('  1!  a,  b.  ')─┐
+│ ['1!','a,','b.']                    │
+└─────────────────────────────────────┘
+```
+
+## splitByNonAlpha(s) {#splitbynonalphaseparator-s}
+
+Разбивает строку на подстроки, используя в качестве разделителей пробельные символы и символы пунктуации.
+
+**Синтаксис**
+
+``` sql
+splitByNonAlpha(s)
+```
+
+**Аргументы**
+
+- `s` — разбиваемая строка. [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемые значения**
+
+Возвращает массив подстрок.
+
+Тип: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
+
+**Пример**
+
+``` sql
+SELECT splitByNonAlpha('  1!  a,  b.  ');
+```
+
+``` text
+┌─splitByNonAlpha('  1!  a,  b.  ')─┐
+│ ['1','a','b']                     │
+└───────────────────────────────────┘
+```
+
## arrayStringConcat(arr\[, separator\]) {#arraystringconcatarr-separator}
Склеивает строки, перечисленные в массиве, с разделителем separator.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 31286c740d4..a99201e4aaa 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -473,6 +473,12 @@ endif ()
dbms_target_link_libraries(PRIVATE _boost_context)
+if (USE_NLP)
+ dbms_target_link_libraries (PUBLIC stemmer)
+ dbms_target_link_libraries (PUBLIC wnb)
+ dbms_target_link_libraries (PUBLIC lemmagen)
+endif()
+
include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake")
if (ENABLE_TESTS AND USE_GTEST)
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index dccc2898de3..64c4c5621b2 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -490,6 +490,7 @@ class IColumn;
\
/** Experimental functions */ \
M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \
+ M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \
\
\
/** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \
diff --git a/src/Core/config_core.h.in b/src/Core/config_core.h.in
index 45cbc6efe19..cc9c993b205 100644
--- a/src/Core/config_core.h.in
+++ b/src/Core/config_core.h.in
@@ -15,4 +15,5 @@
#cmakedefine01 USE_LIBPQXX
#cmakedefine01 USE_SQLITE
#cmakedefine01 USE_NURAFT
+#cmakedefine01 USE_NLP
#cmakedefine01 USE_KRB5
diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp
index 14092d7dd3d..765317093c1 100644
--- a/src/Functions/FunctionsStringArray.cpp
+++ b/src/Functions/FunctionsStringArray.cpp
@@ -9,6 +9,8 @@ void registerFunctionsStringArray(FunctionFactory & factory)
{
factory.registerFunction<FunctionExtractAll>();
factory.registerFunction<FunctionAlphaTokens>();
+ factory.registerFunction<FunctionSplitByNonAlpha>();
+ factory.registerFunction<FunctionSplitByWhitespace>();
factory.registerFunction<FunctionSplitByChar>();
factory.registerFunction<FunctionSplitByString>();
factory.registerFunction<FunctionSplitByRegexp>();
diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h
index 27f10797651..4d2312f207c 100644
--- a/src/Functions/FunctionsStringArray.h
+++ b/src/Functions/FunctionsStringArray.h
@@ -33,6 +33,9 @@ namespace ErrorCodes
* splitByString(sep, s)
* splitByRegexp(regexp, s)
*
+ * splitByWhitespace(s) - split the string by whitespace characters
+ * splitByNonAlpha(s) - split the string by whitespace and punctuation characters
+ *
* extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp.
* - first subpattern, if regexp has subpattern;
* - zero subpattern (the match part, otherwise);
@@ -111,6 +114,121 @@ public:
}
};
+class SplitByNonAlphaImpl
+{
+private:
+ Pos pos;
+ Pos end;
+
+public:
+ /// Get the name of the function.
+ static constexpr auto name = "splitByNonAlpha";
+ static String getName() { return name; }
+
+ static size_t getNumberOfArguments() { return 1; }
+
+ /// Check the type of the function's arguments.
+ static void checkArguments(const DataTypes & arguments)
+ {
+ if (!isString(arguments[0]))
+ throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
+ ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+ }
+
+ /// Initialize by the function arguments.
+ void init(const ColumnsWithTypeAndName & /*arguments*/) {}
+
+ /// Called for each next string.
+ void set(Pos pos_, Pos end_)
+ {
+ pos = pos_;
+ end = end_;
+ }
+
+ /// Returns the position of the argument, that is the column of strings
+ size_t getStringsArgumentPosition()
+ {
+ return 0;
+ }
+
+ /// Get the next token, if any, or return false.
+ bool get(Pos & token_begin, Pos & token_end)
+ {
+ /// Skip garbage
+ while (pos < end && (isWhitespaceASCII(*pos) || isPunctuationASCII(*pos)))
+ ++pos;
+
+ if (pos == end)
+ return false;
+
+ token_begin = pos;
+
+ while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos)))
+ ++pos;
+
+ token_end = pos;
+
+ return true;
+ }
+};
+
+class SplitByWhitespaceImpl
+{
+private:
+ Pos pos;
+ Pos end;
+
+public:
+ /// Get the name of the function.
+ static constexpr auto name = "splitByWhitespace";
+ static String getName() { return name; }
+
+ static size_t getNumberOfArguments() { return 1; }
+
+ /// Check the type of the function's arguments.
+ static void checkArguments(const DataTypes & arguments)
+ {
+ if (!isString(arguments[0]))
+ throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
+ ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+ }
+
+ /// Initialize by the function arguments.
+ void init(const ColumnsWithTypeAndName & /*arguments*/) {}
+
+ /// Called for each next string.
+ void set(Pos pos_, Pos end_)
+ {
+ pos = pos_;
+ end = end_;
+ }
+
+ /// Returns the position of the argument, that is the column of strings
+ size_t getStringsArgumentPosition()
+ {
+ return 0;
+ }
+
+ /// Get the next token, if any, or return false.
+ bool get(Pos & token_begin, Pos & token_end)
+ {
+ /// Skip garbage
+ while (pos < end && isWhitespaceASCII(*pos))
+ ++pos;
+
+ if (pos == end)
+ return false;
+
+ token_begin = pos;
+
+ while (pos < end && !isWhitespaceASCII(*pos))
+ ++pos;
+
+ token_end = pos;
+
+ return true;
+ }
+};
class SplitByCharImpl
{
@@ -662,6 +780,8 @@ public:
using FunctionAlphaTokens = FunctionTokens<AlphaTokensImpl>;
+using FunctionSplitByNonAlpha = FunctionTokens<SplitByNonAlphaImpl>;
+using FunctionSplitByWhitespace = FunctionTokens<SplitByWhitespaceImpl>;
using FunctionSplitByChar = FunctionTokens<SplitByCharImpl>;
using FunctionSplitByString = FunctionTokens<SplitByStringImpl>;
using FunctionSplitByRegexp = FunctionTokens<SplitByRegexpImpl>;
diff --git a/src/Functions/lemmatize.cpp b/src/Functions/lemmatize.cpp
new file mode 100644
index 00000000000..35d2bfebe08
--- /dev/null
+++ b/src/Functions/lemmatize.cpp
@@ -0,0 +1,130 @@
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <Columns/ColumnConst.h>
+#include <Columns/ColumnString.h>
+#include <DataTypes/DataTypeString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Interpreters/Context.h>
+#include <Interpreters/Lemmatizers.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int ILLEGAL_COLUMN;
+ extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+ extern const int SUPPORT_IS_DISABLED;
+}
+
+namespace
+{
+
+struct LemmatizeImpl
+{
+ static void vector(
+ const ColumnString::Chars & data,
+ const ColumnString::Offsets & offsets,
+ ColumnString::Chars & res_data,
+ ColumnString::Offsets & res_offsets,
+ Lemmatizers::LemmPtr & lemmatizer)
+ {
+ res_data.resize(data.size());
+ res_offsets.assign(offsets);
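+ /// A lemma may be longer than the source word, so res_data starts at the input size and is grown on demand below.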
+
+ UInt64 data_size = 0;
+ for (UInt64 i = 0; i < offsets.size(); ++i)
+ {
+ /// lemmatize() relies on the fact that each string in a ColumnString ends with '\0'
+ auto result = lemmatizer->lemmatize(reinterpret_cast<const char *>(data.data() + offsets[i - 1]));
+ size_t new_size = strlen(result.get()) + 1;
+
+ if (data_size + new_size > res_data.size())
+ res_data.resize(data_size + new_size);
+
+ memcpy(res_data.data() + data_size, reinterpret_cast<const UInt8 *>(result.get()), new_size);
+
+ data_size += new_size;
+ res_offsets[i] = data_size;
+ }
+ res_data.resize(data_size);
+ }
+};
+
+
+class FunctionLemmatize : public IFunction
+{
+public:
+ static constexpr auto name = "lemmatize";
+ static FunctionPtr create(ContextPtr context)
+ {
+ if (!context->getSettingsRef().allow_experimental_nlp_functions)
+ throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
+
+ return std::make_shared<FunctionLemmatize>(context->getLemmatizers());
+ }
+
+private:
+ Lemmatizers & lemmatizers;
+
+public:
+ explicit FunctionLemmatize(Lemmatizers & lemmatizers_)
+ : lemmatizers(lemmatizers_) {}
+
+ String getName() const override { return name; }
+
+ size_t getNumberOfArguments() const override { return 2; }
+
+ DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+ {
+ if (!isString(arguments[0]))
+ throw Exception(
+ "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+ if (!isString(arguments[1]))
+ throw Exception(
+ "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+ return arguments[1];
+ }
+
+ bool useDefaultImplementationForConstants() const override { return true; }
+
+ ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; }
+
+ ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
+ {
+ const auto & langcolumn = arguments[0].column;
+ const auto & strcolumn = arguments[1].column;
+
+ const ColumnConst * lang_col = checkAndGetColumn<ColumnConst>(langcolumn.get());
+ const ColumnString * words_col = checkAndGetColumn<ColumnString>(strcolumn.get());
+
+ if (!lang_col)
+ throw Exception(
+ "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
+ if (!words_col)
+ throw Exception(
+ "Illegal column " + arguments[1].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
+
+ String language = lang_col->getValue<String>();
+ auto lemmatizer = lemmatizers.getLemmatizer(language);
+
+ auto col_res = ColumnString::create();
+ LemmatizeImpl::vector(words_col->getChars(), words_col->getOffsets(), col_res->getChars(), col_res->getOffsets(), lemmatizer);
+ return col_res;
+ }
+};
+
+}
+
+void registerFunctionLemmatize(FunctionFactory & factory)
+{
+ factory.registerFunction<FunctionLemmatize>(FunctionFactory::CaseInsensitive);
+}
+
+}
+
+#endif
diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp
index b0b0e4434bc..ba6a294abba 100644
--- a/src/Functions/registerFunctionsString.cpp
+++ b/src/Functions/registerFunctionsString.cpp
@@ -1,5 +1,6 @@
#if !defined(ARCADIA_BUILD)
# include "config_functions.h"
+# include "config_core.h"
#endif
namespace DB
@@ -39,13 +40,18 @@ void registerFunctionDecodeXMLComponent(FunctionFactory &);
void registerFunctionExtractTextFromHTML(FunctionFactory &);
void registerFunctionToStringCutToZero(FunctionFactory &);
-
#if USE_BASE64
void registerFunctionBase64Encode(FunctionFactory &);
void registerFunctionBase64Decode(FunctionFactory &);
void registerFunctionTryBase64Decode(FunctionFactory &);
#endif
+#if USE_NLP
+void registerFunctionStem(FunctionFactory &);
+void registerFunctionSynonyms(FunctionFactory &);
+void registerFunctionLemmatize(FunctionFactory &);
+#endif
+
void registerFunctionsString(FunctionFactory & factory)
{
registerFunctionRepeat(factory);
@@ -79,11 +85,18 @@ void registerFunctionsString(FunctionFactory & factory)
registerFunctionDecodeXMLComponent(factory);
registerFunctionExtractTextFromHTML(factory);
registerFunctionToStringCutToZero(factory);
+
#if USE_BASE64
registerFunctionBase64Encode(factory);
registerFunctionBase64Decode(factory);
registerFunctionTryBase64Decode(factory);
#endif
+
+#if USE_NLP
+ registerFunctionStem(factory);
+ registerFunctionSynonyms(factory);
+ registerFunctionLemmatize(factory);
+#endif
}
}
diff --git a/src/Functions/stem.cpp b/src/Functions/stem.cpp
new file mode 100644
index 00000000000..98dcbccd005
--- /dev/null
+++ b/src/Functions/stem.cpp
@@ -0,0 +1,135 @@
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <Columns/ColumnConst.h>
+#include <Columns/ColumnString.h>
+#include <DataTypes/DataTypeString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Interpreters/Context.h>
+
+#include <libstemmer.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int ILLEGAL_COLUMN;
+ extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+ extern const int SUPPORT_IS_DISABLED;
+}
+
+namespace
+{
+
+struct StemImpl
+{
+ static void vector(
+ const ColumnString::Chars & data,
+ const ColumnString::Offsets & offsets,
+ ColumnString::Chars & res_data,
+ ColumnString::Offsets & res_offsets,
+ const String & language)
+ {
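+ /// A fresh stemmer is created per call: sb_stemmer objects are stateful and not safe to share across threads.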
+ sb_stemmer * stemmer = sb_stemmer_new(language.data(), "UTF_8");
+
+ if (stemmer == nullptr)
+ {
+ throw Exception(
+ "Language " + language + " is not supported for function stem",
+ ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+ }
+
+ res_data.resize(data.size());
+ res_offsets.assign(offsets);
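+ /// The code relies on a stem never being longer than the source word, so res_data is not grown inside the loop.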
+
+ UInt64 data_size = 0;
+ for (UInt64 i = 0; i < offsets.size(); ++i)
+ {
+ /// Note that accessing -1th element is valid for PaddedPODArray.
+ size_t original_size = offsets[i] - offsets[i - 1];
+ const sb_symbol * result = sb_stemmer_stem(stemmer,
+ reinterpret_cast<const sb_symbol *>(data.data() + offsets[i - 1]),
+ original_size - 1);
+ size_t new_size = sb_stemmer_length(stemmer) + 1;
+
+ memcpy(res_data.data() + data_size, result, new_size);
+
+ data_size += new_size;
+ res_offsets[i] = data_size;
+ }
+ res_data.resize(data_size);
+ sb_stemmer_delete(stemmer);
+ }
+};
+
+
+class FunctionStem : public IFunction
+{
+public:
+ static constexpr auto name = "stem";
+
+ static FunctionPtr create(ContextPtr context)
+ {
+ if (!context->getSettingsRef().allow_experimental_nlp_functions)
+ throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
+
+ return std::make_shared<FunctionStem>();
+ }
+
+ String getName() const override { return name; }
+
+ size_t getNumberOfArguments() const override { return 2; }
+
+ DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+ {
+ if (!isString(arguments[0]))
+ throw Exception(
+ "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+ if (!isString(arguments[1]))
+ throw Exception(
+ "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+ return arguments[1];
+ }
+
+ bool useDefaultImplementationForConstants() const override { return true; }
+
+ ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; }
+
+ ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
+ {
+ const auto & langcolumn = arguments[0].column;
+ const auto & strcolumn = arguments[1].column;
+
+ const ColumnConst * lang_col = checkAndGetColumn<ColumnConst>(langcolumn.get());
+ const ColumnString * words_col = checkAndGetColumn<ColumnString>(strcolumn.get());
+
+ if (!lang_col)
+ throw Exception(
+ "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
+ if (!words_col)
+ throw Exception(
+ "Illegal column " + arguments[1].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
+
+ String language = lang_col->getValue<String>();
+
+ auto col_res = ColumnString::create();
+ StemImpl::vector(words_col->getChars(), words_col->getOffsets(), col_res->getChars(), col_res->getOffsets(), language);
+ return col_res;
+ }
+};
+
+}
+
+void registerFunctionStem(FunctionFactory & factory)
+{
+ factory.registerFunction<FunctionStem>(FunctionFactory::CaseInsensitive);
+}
+
+}
+
+#endif
diff --git a/src/Functions/synonyms.cpp b/src/Functions/synonyms.cpp
new file mode 100644
index 00000000000..4201fbfa677
--- /dev/null
+++ b/src/Functions/synonyms.cpp
@@ -0,0 +1,128 @@
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <Columns/ColumnArray.h>
+#include <Columns/ColumnConst.h>
+#include <Columns/ColumnString.h>
+#include <Core/Field.h>
+#include <DataTypes/DataTypeArray.h>
+#include <DataTypes/DataTypeString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Interpreters/Context.h>
+#include <Interpreters/SynonymsExtensions.h>
+
+#include <string_view>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int ILLEGAL_COLUMN;
+ extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+ extern const int SUPPORT_IS_DISABLED;
+}
+
+class FunctionSynonyms : public IFunction
+{
+public:
+ static constexpr auto name = "synonyms";
+ static FunctionPtr create(ContextPtr context)
+ {
+ if (!context->getSettingsRef().allow_experimental_nlp_functions)
+ throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
+
+ return std::make_shared<FunctionSynonyms>(context->getSynonymsExtensions());
+ }
+
+private:
+ SynonymsExtensions & extensions;
+
+public:
+ explicit FunctionSynonyms(SynonymsExtensions & extensions_)
+ : extensions(extensions_) {}
+
+ String getName() const override { return name; }
+
+ size_t getNumberOfArguments() const override { return 2; }
+
+ DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+ {
+ if (!isString(arguments[0]))
+ throw Exception(
+ "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+ if (!isString(arguments[1]))
+ throw Exception(
+ "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+ return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
+ }
+
+ bool useDefaultImplementationForConstants() const override { return true; }
+
+ ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; }
+
+ ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
+ {
+ const auto & extcolumn = arguments[0].column;
+ const auto & strcolumn = arguments[1].column;
+
+ const ColumnConst * ext_col = checkAndGetColumn<ColumnConst>(extcolumn.get());
+ const ColumnString * word_col = checkAndGetColumn<ColumnString>(strcolumn.get());
+
+ if (!ext_col)
+ throw Exception(
+ "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(),
+ ErrorCodes::ILLEGAL_COLUMN);
+ if (!word_col)
+ throw Exception(
+ "Illegal column " + arguments[1].column->getName() + " of argument of function " + getName(),
+ ErrorCodes::ILLEGAL_COLUMN);
+
+ String ext_name = ext_col->getValue<String>();
+ auto extension = extensions.getExtension(ext_name);
+
+ /// Create and fill the result array.
+ const DataTypePtr & elem_type = static_cast<const DataTypeArray &>(*result_type).getNestedType();
+
+ auto out = ColumnArray::create(elem_type->createColumn());
+ IColumn & out_data = out->getData();
+ IColumn::Offsets & out_offsets = out->getOffsets();
+
+ const ColumnString::Chars & data = word_col->getChars();
+ const ColumnString::Offsets & offsets = word_col->getOffsets();
+ out_data.reserve(input_rows_count);
+ out_offsets.resize(input_rows_count);
+
+ IColumn::Offset current_offset = 0;
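+ /// A word without a known synset yields an empty array for that row (current_offset stays unchanged).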
+ for (size_t i = 0; i < offsets.size(); ++i)
+ {
+ std::string_view word(reinterpret_cast<const char *>(data.data() + offsets[i - 1]), offsets[i] - offsets[i - 1] - 1);
+
+ const auto * synset = extension->getSynonyms(word);
+
+ if (synset)
+ {
+ for (const auto & token : *synset)
+ out_data.insert(Field(token.data(), token.size()));
+
+ current_offset += synset->size();
+ }
+ out_offsets[i] = current_offset;
+ }
+
+ return out;
+ }
+};
+
+void registerFunctionSynonyms(FunctionFactory & factory)
+{
+ factory.registerFunction<FunctionSynonyms>(FunctionFactory::CaseInsensitive);
+}
+
+}
+
+#endif
diff --git a/src/Functions/ya.make b/src/Functions/ya.make
index d1def6ad90e..2b9b3d94313 100644
--- a/src/Functions/ya.make
+++ b/src/Functions/ya.make
@@ -340,6 +340,7 @@ SRCS(
jumpConsistentHash.cpp
lcm.cpp
least.cpp
+ lemmatize.cpp
lengthUTF8.cpp
less.cpp
lessOrEquals.cpp
@@ -481,6 +482,7 @@ SRCS(
sleepEachRow.cpp
sqrt.cpp
startsWith.cpp
+ stem.cpp
stringCutToZero.cpp
stringToH3.cpp
substring.cpp
@@ -493,6 +495,7 @@ SRCS(
subtractWeeks.cpp
subtractYears.cpp
svg.cpp
+ synonyms.cpp
tan.cpp
tanh.cpp
tcpPort.cpp
diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp
index effb1245644..4f1fbd7755b 100644
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@@ -77,6 +77,8 @@
#include
#include
#include
+#include <Interpreters/SynonymsExtensions.h>
+#include <Interpreters/Lemmatizers.h>
#include
@@ -349,6 +351,11 @@ struct ContextSharedPart
scope_guard dictionaries_xmls;
+#if USE_NLP
+ mutable std::optional<SynonymsExtensions> synonyms_extensions;
+ mutable std::optional<Lemmatizers> lemmatizers;
+#endif
+
String default_profile_name; /// Default profile name used for default values.
String system_profile_name; /// Profile used by system processes
String buffer_profile_name; /// Profile used by Buffer engine for flushing to the underlying
@@ -1505,6 +1512,29 @@ void Context::loadDictionaries(const Poco::Util::AbstractConfiguration & config)
std::make_unique(config, "dictionaries_config"));
}
+#if USE_NLP
+
+SynonymsExtensions & Context::getSynonymsExtensions() const
+{
+ auto lock = getLock();
+
+ if (!shared->synonyms_extensions)
+ shared->synonyms_extensions.emplace(getConfigRef());
+
+ return *shared->synonyms_extensions;
+}
+
+Lemmatizers & Context::getLemmatizers() const
+{
+ auto lock = getLock();
+
+ if (!shared->lemmatizers)
+ shared->lemmatizers.emplace(getConfigRef());
+
+ return *shared->lemmatizers;
+}
+#endif
+
void Context::setProgressCallback(ProgressCallback callback)
{
/// Callback is set to a session or to a query. In the session, only one query is processed at a time. Therefore, the lock is not needed.
diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h
index e792fe07ec8..af696d48c6b 100644
--- a/src/Interpreters/Context.h
+++ b/src/Interpreters/Context.h
@@ -114,6 +114,11 @@ using VolumePtr = std::shared_ptr<IVolume>;
struct NamedSession;
struct BackgroundTaskSchedulingSettings;
+#if USE_NLP
+ class SynonymsExtensions;
+ class Lemmatizers;
+#endif
+
class Throttler;
using ThrottlerPtr = std::shared_ptr<Throttler>;
@@ -534,6 +539,11 @@ public:
void tryCreateEmbeddedDictionaries() const;
void loadDictionaries(const Poco::Util::AbstractConfiguration & config);
+#if USE_NLP
+ SynonymsExtensions & getSynonymsExtensions() const;
+ Lemmatizers & getLemmatizers() const;
+#endif
+
void setExternalModelsConfig(const ConfigurationPtr & config, const std::string & config_name = "models_config");
/// I/O formats.
diff --git a/src/Interpreters/Lemmatizers.cpp b/src/Interpreters/Lemmatizers.cpp
new file mode 100644
index 00000000000..38cd4c33678
--- /dev/null
+++ b/src/Interpreters/Lemmatizers.cpp
@@ -0,0 +1,100 @@
+
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <Interpreters/Lemmatizers.h>
+#include <Common/Exception.h>
+#include <RdrLemmatizer.h>
+
+#include <cstring>
+#include <filesystem>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int UNKNOWN_ELEMENT_IN_CONFIG;
+ extern const int INVALID_CONFIG_PARAMETER;
+}
+
+
+class Lemmatizer : public ILemmatizer
+{
+private:
+ RdrLemmatizer lemmatizer;
+
+public:
+ explicit Lemmatizer(const String & path) : lemmatizer(path.data()) {}
+
+ TokenPtr lemmatize(const char * token) override
+ {
+ return TokenPtr(lemmatizer.Lemmatize(token));
+ }
+};
+
+/// Duplicate of code from StringUtils.h. Copied here for less dependencies.
+static bool startsWith(const std::string & s, const char * prefix)
+{
+ return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix));
+}
+
+Lemmatizers::Lemmatizers(const Poco::Util::AbstractConfiguration & config)
+{
+ String prefix = "lemmatizers";
+ Poco::Util::AbstractConfiguration::Keys keys;
+
+ if (!config.has(prefix))
+ throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "No lemmatizers specified in server config on prefix '{}'", prefix);
+
+ config.keys(prefix, keys);
+
+ for (const auto & key : keys)
+ {
+ if (startsWith(key, "lemmatizer"))
+ {
+ const auto & lemm_name = config.getString(prefix + "." + key + ".lang", "");
+ const auto & lemm_path = config.getString(prefix + "." + key + ".path", "");
+
+ if (lemm_name.empty())
+ throw Exception("Lemmatizer language in config is not specified here: " + prefix + "." + key + ".lang",
+ ErrorCodes::INVALID_CONFIG_PARAMETER);
+ if (lemm_path.empty())
+ throw Exception("Path to lemmatizer in config is not specified here: " + prefix + "." + key + ".path",
+ ErrorCodes::INVALID_CONFIG_PARAMETER);
+
+ paths[lemm_name] = lemm_path;
+ }
+ else
+ throw Exception("Unknown element in config: " + prefix + "." + key + ", must be 'lemmatizer'",
+ ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
+ }
+}
+
+Lemmatizers::LemmPtr Lemmatizers::getLemmatizer(const String & name)
+{
+ std::lock_guard guard(mutex);
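+ /// Lemmatizers are created lazily on first use and cached; the config supplies only language-to-path pairs.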
+
+ if (lemmatizers.find(name) != lemmatizers.end())
+ return lemmatizers[name];
+
+ if (paths.find(name) != paths.end())
+ {
+ if (!std::filesystem::exists(paths[name]))
+ throw Exception("Incorrect path to lemmatizer: " + paths[name],
+ ErrorCodes::INVALID_CONFIG_PARAMETER);
+
+ lemmatizers[name] = std::make_shared<Lemmatizer>(paths[name]);
+ return lemmatizers[name];
+ }
+
+ throw Exception("Lemmatizer named: '" + name + "' is not found",
+ ErrorCodes::INVALID_CONFIG_PARAMETER);
+}
+
+}
+
+#endif
diff --git a/src/Interpreters/Lemmatizers.h b/src/Interpreters/Lemmatizers.h
new file mode 100644
index 00000000000..6682afaa415
--- /dev/null
+++ b/src/Interpreters/Lemmatizers.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <common/types.h>
+#include <Poco/Util/AbstractConfiguration.h>
+
+#include <mutex>
+#include <unordered_map>
+
+
+namespace DB
+{
+
+class ILemmatizer
+{
+public:
+ using TokenPtr = std::shared_ptr<char>;
+
+ virtual TokenPtr lemmatize(const char * token) = 0;
+
+ virtual ~ILemmatizer() = default;
+};
+
+
+class Lemmatizers
+{
+public:
+ using LemmPtr = std::shared_ptr<ILemmatizer>;
+
+private:
+ std::mutex mutex;
+ std::unordered_map<String, LemmPtr> lemmatizers;
+ std::unordered_map<String, String> paths;
+
+public:
+ explicit Lemmatizers(const Poco::Util::AbstractConfiguration & config);
+
+ LemmPtr getLemmatizer(const String & name);
+};
+
+}
+
+#endif
diff --git a/src/Interpreters/SynonymsExtensions.cpp b/src/Interpreters/SynonymsExtensions.cpp
new file mode 100644
index 00000000000..22fa91a4349
--- /dev/null
+++ b/src/Interpreters/SynonymsExtensions.cpp
@@ -0,0 +1,157 @@
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <Interpreters/SynonymsExtensions.h>
+#include <Common/Exception.h>
+
+#include <fstream>
+#include <list>
+
+#include <boost/algorithm/string.hpp>
+#include <wnb/core/wordnet.hh>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int LOGICAL_ERROR;
+ extern const int UNKNOWN_ELEMENT_IN_CONFIG;
+ extern const int INVALID_CONFIG_PARAMETER;
+}
+
+class PlainSynonymsExtension : public ISynonymsExtension
+{
+private:
+ using Container = std::list<Synset>;
+ using LookupTable = std::unordered_map<std::string_view, Synset *>;
+
+ Container synsets;
+ LookupTable table;
+
+public:
+ explicit PlainSynonymsExtension(const String & path)
+ {
+ std::ifstream file(path);
+ if (!file.is_open())
+ throw Exception("Cannot find synonyms extension at: " + path,
+ ErrorCodes::INVALID_CONFIG_PARAMETER);
+
+ String line;
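+ /// Each line of a plain extension file is one synset: synonyms separated by tabs or spaces.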
+ while (std::getline(file, line))
+ {
+ Synset synset;
+ boost::split(synset, line, boost::is_any_of("\t "));
+ if (!synset.empty())
+ {
+ synsets.emplace_back(std::move(synset));
+
+ for (const auto & word : synsets.back())
+ table[word] = &synsets.back();
+ }
+ }
+ }
+
+ const Synset * getSynonyms(std::string_view token) const override
+ {
+ auto it = table.find(token);
+
+ if (it != table.end())
+ return (*it).second;
+
+ return nullptr;
+ }
+};
+
+class WordnetSynonymsExtension : public ISynonymsExtension
+{
+private:
+ wnb::wordnet wn;
+
+public:
+ explicit WordnetSynonymsExtension(const String & path) : wn(path) {}
+
+ const Synset * getSynonyms(std::string_view token) const override
+ {
+ return wn.get_synset(std::string(token));
+ }
+};
+
+/// Duplicate of code from StringUtils.h. Copied here for less dependencies.
+static bool startsWith(const std::string & s, const char * prefix)
+{
+ return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix));
+}
+
+SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration & config)
+{
+ String prefix = "synonyms_extensions";
+ Poco::Util::AbstractConfiguration::Keys keys;
+
+ if (!config.has(prefix))
+ throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER,
+ "No synonims extensions specified in server config on prefix '{}'", prefix);
+
+ config.keys(prefix, keys);
+
+ for (const auto & key : keys)
+ {
+ if (startsWith(key, "extension"))
+ {
+ const auto & ext_name = config.getString(prefix + "." + key + ".name", "");
+ const auto & ext_path = config.getString(prefix + "." + key + ".path", "");
+ const auto & ext_type = config.getString(prefix + "." + key + ".type", "");
+
+ if (ext_name.empty())
+ throw Exception("Extension name in config is not specified here: " + prefix + "." + key + ".name",
+ ErrorCodes::INVALID_CONFIG_PARAMETER);
+ if (ext_path.empty())
+ throw Exception("Extension path in config is not specified here: " + prefix + "." + key + ".path",
+ ErrorCodes::INVALID_CONFIG_PARAMETER);
+ if (ext_type.empty())
+ throw Exception("Extension type in config is not specified here: " + prefix + "." + key + ".type",
+ ErrorCodes::INVALID_CONFIG_PARAMETER);
+ if (ext_type != "plain" && ext_type != "wordnet")
+ throw Exception("Unknown extension type in config: " + prefix + "." + key + ".type, must be 'plain' or 'wordnet'",
+ ErrorCodes::INVALID_CONFIG_PARAMETER);
+
+ info[ext_name].path = ext_path;
+ info[ext_name].type = ext_type;
+ }
+ else
+ throw Exception("Unknown element in config: " + prefix + "." + key + ", must be 'extension'",
+ ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
+ }
+}
+
+SynonymsExtensions::ExtPtr SynonymsExtensions::getExtension(const String & name)
+{
+ std::lock_guard guard(mutex);
+
+ if (extensions.find(name) != extensions.end())
+ return extensions[name];
+
+ if (info.find(name) != info.end())
+ {
+ const Info & ext_info = info[name];
+
+ if (ext_info.type == "plain")
+ extensions[name] = std::make_shared<PlainSynonymsExtension>(ext_info.path);
+ else if (ext_info.type == "wordnet")
+ extensions[name] = std::make_shared<WordnetSynonymsExtension>(ext_info.path);
+ else
+ throw Exception("Unknown extension type: " + ext_info.type, ErrorCodes::LOGICAL_ERROR);
+
+ return extensions[name];
+ }
+
+ throw Exception("Extension named: '" + name + "' is not found",
+ ErrorCodes::INVALID_CONFIG_PARAMETER);
+}
+
+}
+
+#endif
diff --git a/src/Interpreters/SynonymsExtensions.h b/src/Interpreters/SynonymsExtensions.h
new file mode 100644
index 00000000000..fd2bf03e162
--- /dev/null
+++ b/src/Interpreters/SynonymsExtensions.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#if !defined(ARCADIA_BUILD)
+# include "config_core.h"
+#endif
+
+#if USE_NLP
+
+#include <common/types.h>
+#include <Poco/Util/AbstractConfiguration.h>
+
+#include <memory>
+#include <mutex>
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+
+namespace DB
+{
+
+class ISynonymsExtension
+{
+public:
+ using Synset = std::vector<String>;
+
+ virtual const Synset * getSynonyms(std::string_view token) const = 0;
+
+ virtual ~ISynonymsExtension() = default;
+};
+
+class SynonymsExtensions
+{
+public:
+ using ExtPtr = std::shared_ptr<ISynonymsExtension>;
+
+ explicit SynonymsExtensions(const Poco::Util::AbstractConfiguration & config);
+
+ ExtPtr getExtension(const String & name);
+
+private:
+ struct Info
+ {
+ String path;
+ String type;
+ };
+
+ using ExtContainer = std::unordered_map<String, ExtPtr>;
+ using InfoContainer = std::unordered_map<String, Info>;
+
+ std::mutex mutex;
+ ExtContainer extensions;
+ InfoContainer info;
+};
+
+}
+
+#endif
diff --git a/src/Interpreters/ya.make b/src/Interpreters/ya.make
index 29fc69bc33d..462c778bf3d 100644
--- a/src/Interpreters/ya.make
+++ b/src/Interpreters/ya.make
@@ -108,6 +108,7 @@ SRCS(
JoinSwitcher.cpp
JoinToSubqueryTransformVisitor.cpp
JoinedTables.cpp
+ Lemmatizers.cpp
LogicalExpressionsOptimizer.cpp
MarkTableIdentifiersVisitor.cpp
MergeJoin.cpp
@@ -145,6 +146,7 @@ SRCS(
SortedBlocksWriter.cpp
StorageID.cpp
SubqueryForSet.cpp
+ SynonymsExtensions.cpp
SystemLog.cpp
TableJoin.cpp
TablesStatus.cpp
diff --git a/tests/integration/test_nlp/__init__.py b/tests/integration/test_nlp/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/integration/test_nlp/configs/dicts_config.xml b/tests/integration/test_nlp/configs/dicts_config.xml
new file mode 100644
index 00000000000..435507ce1d8
--- /dev/null
+++ b/tests/integration/test_nlp/configs/dicts_config.xml
@@ -0,0 +1,22 @@
+<yandex>
+
+    <synonyms_extensions>
+        <extension>
+            <name>en</name>
+            <type>plain</type>
+            <path>/etc/clickhouse-server/dictionaries/ext-en.txt</path>
+        </extension>
+        <extension>
+            <name>ru</name>
+            <type>plain</type>
+            <path>/etc/clickhouse-server/dictionaries/ext-ru.txt</path>
+        </extension>
+    </synonyms_extensions>
+
+    <lemmatizers>
+        <lemmatizer>
+            <lang>en</lang>
+            <path>/etc/clickhouse-server/dictionaries/lem-en.bin</path>
+        </lemmatizer>
+    </lemmatizers>
+</yandex>
diff --git a/tests/integration/test_nlp/dictionaries/ext-en.txt b/tests/integration/test_nlp/dictionaries/ext-en.txt
new file mode 100644
index 00000000000..beb508e437d
--- /dev/null
+++ b/tests/integration/test_nlp/dictionaries/ext-en.txt
@@ -0,0 +1,4 @@
+important big critical crucial essential
+happy cheerful delighted ecstatic
+however nonetheless but yet
+quiz query check exam
diff --git a/tests/integration/test_nlp/dictionaries/ext-ru.txt b/tests/integration/test_nlp/dictionaries/ext-ru.txt
new file mode 100644
index 00000000000..5466354b264
--- /dev/null
+++ b/tests/integration/test_nlp/dictionaries/ext-ru.txt
@@ -0,0 +1,4 @@
+важный большой высокий хороший главный
+веселый счастливый живой яркий смешной
+хотя однако но правда
+экзамен испытание проверка
\ No newline at end of file
diff --git a/tests/integration/test_nlp/dictionaries/lem-en.bin b/tests/integration/test_nlp/dictionaries/lem-en.bin
new file mode 100644
index 00000000000..8981bc1ead0
Binary files /dev/null and b/tests/integration/test_nlp/dictionaries/lem-en.bin differ
diff --git a/tests/integration/test_nlp/test.py b/tests/integration/test_nlp/test.py
new file mode 100644
index 00000000000..24935153608
--- /dev/null
+++ b/tests/integration/test_nlp/test.py
@@ -0,0 +1,47 @@
+import os
+import sys
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+
+from helpers.cluster import ClickHouseCluster
+
+
+cluster = ClickHouseCluster(__file__)
+instance = cluster.add_instance('instance', main_configs=['configs/dicts_config.xml'])
+
+def copy_file_to_container(local_path, dist_path, container_id):
+ os.system("docker cp {local} {cont_id}:{dist}".format(local=local_path, cont_id=container_id, dist=dist_path))
+
+@pytest.fixture(scope="module")
+def start_cluster():
+ try:
+ cluster.start()
+
+ copy_file_to_container(os.path.join(SCRIPT_DIR, 'dictionaries/.'), '/etc/clickhouse-server/dictionaries', instance.docker_id)
+
+ yield cluster
+ finally:
+ cluster.shutdown()
+
+def test_lemmatize(start_cluster):
+ assert instance.query("SELECT lemmatize('en', 'wolves')", settings={"allow_experimental_nlp_functions": 1}) == "wolf\n"
+ assert instance.query("SELECT lemmatize('en', 'dogs')", settings={"allow_experimental_nlp_functions": 1}) == "dog\n"
+ assert instance.query("SELECT lemmatize('en', 'looking')", settings={"allow_experimental_nlp_functions": 1}) == "look\n"
+ assert instance.query("SELECT lemmatize('en', 'took')", settings={"allow_experimental_nlp_functions": 1}) == "take\n"
+ assert instance.query("SELECT lemmatize('en', 'imported')", settings={"allow_experimental_nlp_functions": 1}) == "import\n"
+ assert instance.query("SELECT lemmatize('en', 'tokenized')", settings={"allow_experimental_nlp_functions": 1}) == "tokenize\n"
+ assert instance.query("SELECT lemmatize('en', 'flown')", settings={"allow_experimental_nlp_functions": 1}) == "fly\n"
+
+def test_synonyms_extensions(start_cluster):
+ assert instance.query("SELECT synonyms('en', 'crucial')", settings={"allow_experimental_nlp_functions": 1}) == "['important','big','critical','crucial','essential']\n"
+ assert instance.query("SELECT synonyms('en', 'cheerful')", settings={"allow_experimental_nlp_functions": 1}) == "['happy','cheerful','delighted','ecstatic']\n"
+ assert instance.query("SELECT synonyms('en', 'yet')", settings={"allow_experimental_nlp_functions": 1}) == "['however','nonetheless','but','yet']\n"
+ assert instance.query("SELECT synonyms('en', 'quiz')", settings={"allow_experimental_nlp_functions": 1}) == "['quiz','query','check','exam']\n"
+
+ assert instance.query("SELECT synonyms('ru', 'главный')", settings={"allow_experimental_nlp_functions": 1}) == "['важный','большой','высокий','хороший','главный']\n"
+ assert instance.query("SELECT synonyms('ru', 'веселый')", settings={"allow_experimental_nlp_functions": 1}) == "['веселый','счастливый','живой','яркий','смешной']\n"
+ assert instance.query("SELECT synonyms('ru', 'правда')", settings={"allow_experimental_nlp_functions": 1}) == "['хотя','однако','но','правда']\n"
+ assert instance.query("SELECT synonyms('ru', 'экзамен')", settings={"allow_experimental_nlp_functions": 1}) == "['экзамен','испытание','проверка']\n"
diff --git a/tests/performance/nlp.xml b/tests/performance/nlp.xml
new file mode 100644
index 00000000000..e5006027c59
--- /dev/null
+++ b/tests/performance/nlp.xml
@@ -0,0 +1,20 @@
+<test>
+    <settings>
+        <allow_experimental_nlp_functions>1</allow_experimental_nlp_functions>
+    </settings>
+
+    <preconditions>
+        <table_exists>hits_100m_single</table_exists>
+    </preconditions>
+
+    <create_query>CREATE TABLE hits_100m_words (words Array(String), UserID UInt64) ENGINE Memory</create_query>
+    <create_query>CREATE TABLE hits_100m_words_ws (words Array(String), UserID UInt64) ENGINE Memory</create_query>
+
+    <fill_query>INSERT INTO hits_100m_words SELECT splitByNonAlpha(SearchPhrase) AS words, UserID FROM hits_100m_single WHERE length(words) > 0</fill_query>
+    <fill_query>INSERT INTO hits_100m_words_ws SELECT splitByWhitespace(SearchPhrase) AS words, UserID FROM hits_100m_single WHERE length(words) > 0</fill_query>
+
+    <query>SELECT arrayMap(x -> stem('ru', x), words) FROM hits_100m_words FORMAT Null</query>
+
+    <drop_query>DROP TABLE IF EXISTS hits_100m_words</drop_query>
+    <drop_query>DROP TABLE IF EXISTS hits_100m_words_ws</drop_query>
+</test>
diff --git a/tests/queries/0_stateless/01889_tokenize.reference b/tests/queries/0_stateless/01889_tokenize.reference
new file mode 100644
index 00000000000..4dd6f323929
--- /dev/null
+++ b/tests/queries/0_stateless/01889_tokenize.reference
@@ -0,0 +1,8 @@
+['It','is','quite','a','wonderful','day','isn','t','it']
+['There','is','so','much','to','learn']
+['22','00','email','yandex','ru']
+['Токенизация','каких','либо','других','языков']
+['It','is','quite','a','wonderful','day,','isn\'t','it?']
+['There','is....','so','much','to','learn!']
+['22:00','email@yandex.ru']
+['Токенизация','каких-либо','других','языков?']
diff --git a/tests/queries/0_stateless/01889_tokenize.sql b/tests/queries/0_stateless/01889_tokenize.sql
new file mode 100644
index 00000000000..c9d29a8632b
--- /dev/null
+++ b/tests/queries/0_stateless/01889_tokenize.sql
@@ -0,0 +1,11 @@
+SET allow_experimental_nlp_functions = 1;
+
+SELECT splitByNonAlpha('It is quite a wonderful day, isn\'t it?');
+SELECT splitByNonAlpha('There is.... so much to learn!');
+SELECT splitByNonAlpha('22:00 email@yandex.ru');
+SELECT splitByNonAlpha('Токенизация каких-либо других языков?');
+
+SELECT splitByWhitespace('It is quite a wonderful day, isn\'t it?');
+SELECT splitByWhitespace('There is.... so much to learn!');
+SELECT splitByWhitespace('22:00 email@yandex.ru');
+SELECT splitByWhitespace('Токенизация каких-либо других языков?');
diff --git a/tests/queries/0_stateless/01890_stem.reference b/tests/queries/0_stateless/01890_stem.reference
new file mode 100644
index 00000000000..33e18cd6775
--- /dev/null
+++ b/tests/queries/0_stateless/01890_stem.reference
@@ -0,0 +1,21 @@
+given
+combinatori
+collect
+possibl
+studi
+commonplac
+pack
+комбинаторн
+получ
+огранич
+конечн
+максимальн
+суммарн
+стоимост
+remplissag
+valeur
+maximis
+dépass
+intens
+étudi
+peuvent
diff --git a/tests/queries/0_stateless/01890_stem.sql b/tests/queries/0_stateless/01890_stem.sql
new file mode 100644
index 00000000000..472cfb54251
--- /dev/null
+++ b/tests/queries/0_stateless/01890_stem.sql
@@ -0,0 +1,25 @@
+SET allow_experimental_nlp_functions = 1;
+
+SELECT stem('en', 'given');
+SELECT stem('en', 'combinatorial');
+SELECT stem('en', 'collection');
+SELECT stem('en', 'possibility');
+SELECT stem('en', 'studied');
+SELECT stem('en', 'commonplace');
+SELECT stem('en', 'packing');
+
+SELECT stem('ru', 'комбинаторной');
+SELECT stem('ru', 'получила');
+SELECT stem('ru', 'ограничена');
+SELECT stem('ru', 'конечной');
+SELECT stem('ru', 'максимальной');
+SELECT stem('ru', 'суммарный');
+SELECT stem('ru', 'стоимостью');
+
+SELECT stem('fr', 'remplissage');
+SELECT stem('fr', 'valeur');
+SELECT stem('fr', 'maximiser');
+SELECT stem('fr', 'dépasser');
+SELECT stem('fr', 'intensivement');
+SELECT stem('fr', 'étudié');
+SELECT stem('fr', 'peuvent');