Merge pull request #24997 from evillique/nlp

NLP functions
alesapin 2021-08-02 10:50:40 +03:00 committed by GitHub
commit 181f93c60a
46 changed files with 1611 additions and 3 deletions

.gitmodules vendored

@@ -225,6 +225,15 @@
[submodule "contrib/yaml-cpp"]
path = contrib/yaml-cpp
url = https://github.com/ClickHouse-Extras/yaml-cpp.git
[submodule "contrib/libstemmer_c"]
path = contrib/libstemmer_c
url = https://github.com/ClickHouse-Extras/libstemmer_c.git
[submodule "contrib/wordnet-blast"]
path = contrib/wordnet-blast
url = https://github.com/ClickHouse-Extras/wordnet-blast.git
[submodule "contrib/lemmagen-c"]
path = contrib/lemmagen-c
url = https://github.com/ClickHouse-Extras/lemmagen-c.git
[submodule "contrib/libpqxx"]
path = contrib/libpqxx
url = https://github.com/ClickHouse-Extras/libpqxx.git

@@ -542,6 +542,7 @@ include (cmake/find/libpqxx.cmake)
include (cmake/find/nuraft.cmake)
include (cmake/find/yaml-cpp.cmake)
include (cmake/find/s2geometry.cmake)
include (cmake/find/nlp.cmake)
if(NOT USE_INTERNAL_PARQUET_LIBRARY)
set (ENABLE_ORC OFF CACHE INTERNAL "")

cmake/find/nlp.cmake Normal file

@@ -0,0 +1,32 @@
option(ENABLE_NLP "Enable NLP functions support" ${ENABLE_LIBRARIES})
if (NOT ENABLE_NLP)
message (STATUS "NLP functions disabled")
return()
endif()
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libstemmer_c/Makefile")
message (WARNING "submodule contrib/libstemmer_c is missing. to fix try run: \n git submodule update --init --recursive")
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal libstemmer_c library, NLP functions will be disabled")
set (USE_NLP 0)
return()
endif ()
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast/CMakeLists.txt")
message (WARNING "submodule contrib/wordnet-blast is missing. to fix try run: \n git submodule update --init --recursive")
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal wordnet-blast library, NLP functions will be disabled")
set (USE_NLP 0)
return()
endif ()
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/lemmagen-c/README.md")
message (WARNING "submodule contrib/lemmagen-c is missing. to fix try run: \n git submodule update --init --recursive")
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal lemmagen-c library, NLP functions will be disabled")
set (USE_NLP 0)
return()
endif ()
set (USE_NLP 1)
message (STATUS "Using Libraries for NLP functions: contrib/wordnet-blast, contrib/libstemmer_c, contrib/lemmagen-c")

@@ -328,6 +328,12 @@ endif()
add_subdirectory(fast_float)
if (USE_NLP)
add_subdirectory(libstemmer-c-cmake)
add_subdirectory(wordnet-blast-cmake)
add_subdirectory(lemmagen-c-cmake)
endif()
if (USE_SQLITE)
add_subdirectory(sqlite-cmake)
endif()

contrib/boost vendored

@@ -1 +1 @@
Subproject commit 1ccbb5a522a571ce83b606dbc2e1011c42ecccfb
Subproject commit 9cf09dbfd55a5c6202dedbdf40781a51b02c2675

@@ -13,11 +13,12 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
regex
context
coroutine
graph
)
if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_IOSTREAMS_LIBRARY AND
Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND
Boost_COROUTINE_LIBRARY)
Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY)
set(EXTERNAL_BOOST_FOUND 1)
@@ -32,6 +33,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
add_library (_boost_system INTERFACE)
add_library (_boost_context INTERFACE)
add_library (_boost_coroutine INTERFACE)
add_library (_boost_graph INTERFACE)
target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY})
target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY})
@@ -40,6 +42,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY})
target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY})
target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY})
target_link_libraries (_boost_graph INTERFACE ${Boost_GRAPH_LIBRARY})
add_library (boost::filesystem ALIAS _boost_filesystem)
add_library (boost::iostreams ALIAS _boost_iostreams)
@@ -48,6 +51,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
add_library (boost::system ALIAS _boost_system)
add_library (boost::context ALIAS _boost_context)
add_library (boost::coroutine ALIAS _boost_coroutine)
add_library (boost::graph ALIAS _boost_graph)
else()
set(EXTERNAL_BOOST_FOUND 0)
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost")
@@ -221,4 +225,17 @@ if (NOT EXTERNAL_BOOST_FOUND)
add_library (boost::coroutine ALIAS _boost_coroutine)
target_include_directories (_boost_coroutine PRIVATE ${LIBRARY_DIR})
target_link_libraries(_boost_coroutine PRIVATE _boost_context)
# graph
set (SRCS_GRAPH
"${LIBRARY_DIR}/libs/graph/src/graphml.cpp"
"${LIBRARY_DIR}/libs/graph/src/read_graphviz_new.cpp"
)
add_library (_boost_graph ${SRCS_GRAPH})
add_library (boost::graph ALIAS _boost_graph)
target_include_directories (_boost_graph PRIVATE ${LIBRARY_DIR})
target_link_libraries(_boost_graph PRIVATE _boost_regex)
endif ()

contrib/lemmagen-c vendored Submodule

@@ -0,0 +1 @@
Subproject commit 59537bdcf57bbed17913292cb4502d15657231f1

@@ -0,0 +1,9 @@
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/lemmagen-c")
set(LEMMAGEN_INCLUDE_DIR "${LIBRARY_DIR}/include")
set(SRCS
"${LIBRARY_DIR}/src/RdrLemmatizer.cpp"
)
add_library(lemmagen STATIC ${SRCS})
target_include_directories(lemmagen PUBLIC "${LEMMAGEN_INCLUDE_DIR}")

@@ -0,0 +1,31 @@
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/libstemmer_c")
set(STEMMER_INCLUDE_DIR "${LIBRARY_DIR}/include")
FILE ( READ "${LIBRARY_DIR}/mkinc.mak" _CONTENT )
# join '\'-newline continuations into one big line (prefixing each file with ${LIBRARY_DIR})
STRING ( REGEX REPLACE "\\\\\n " " ${LIBRARY_DIR}/" _CONTENT "${_CONTENT}" )
# escape ';' (if any)
STRING ( REGEX REPLACE ";" "\\\\;" _CONTENT "${_CONTENT}" )
# now replace each LF with ';' (turning the content into a list)
STRING ( REGEX REPLACE "\n" ";" _CONTENT "${_CONTENT}" )
FOREACH ( LINE ${_CONTENT} )
# skip comments (beginning with #)
IF ( NOT "${LINE}" MATCHES "^#.*" )
# parse 'name=value1 value2...' - extract the 'name' part
STRING ( REGEX REPLACE "=.*$" "" _NAME "${LINE}" )
# extract the list of values part
STRING ( REGEX REPLACE "^.*=" "" _LIST "${LINE}" )
# replace (multi)spaces with ';' (turning the values into a list)
STRING ( REGEX REPLACE " +" ";" _LIST "${_LIST}" )
# finally get our two variables
IF ( "${_NAME}" MATCHES "snowball_sources" )
SET ( _SOURCES "${_LIST}" )
ELSEIF ( "${_NAME}" MATCHES "snowball_headers" )
SET ( _HEADERS "${_LIST}" )
ENDIF ()
ENDIF ()
ENDFOREACH ()
# all the sources parsed. Now just add the lib
add_library ( stemmer STATIC ${_SOURCES} ${_HEADERS} )
target_include_directories (stemmer PUBLIC "${STEMMER_INCLUDE_DIR}")

contrib/libstemmer_c vendored Submodule

@@ -0,0 +1 @@
Subproject commit c753054304d87daf460057c1a649c482aa094835

contrib/wordnet-blast vendored Submodule

@@ -0,0 +1 @@
Subproject commit 1d16ac28036e19fe8da7ba72c16a307fbdf8c87e

@@ -0,0 +1,13 @@
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast")
set(SRCS
"${LIBRARY_DIR}/wnb/core/info_helper.cc"
"${LIBRARY_DIR}/wnb/core/load_wordnet.cc"
"${LIBRARY_DIR}/wnb/core/wordnet.cc"
)
add_library(wnb ${SRCS})
target_link_libraries(wnb PRIVATE boost::headers_only boost::graph)
target_include_directories(wnb PUBLIC "${LIBRARY_DIR}")

@@ -23,6 +23,7 @@ RUN apt-get update \
libboost-regex-dev \
libboost-context-dev \
libboost-coroutine-dev \
libboost-graph-dev \
zlib1g-dev \
liblz4-dev \
libdouble-conversion-dev \

@@ -311,6 +311,7 @@ function run_tests
01411_bayesian_ab_testing
01798_uniq_theta_sketch
01799_long_uniq_theta_sketch
01890_stem # depends on libstemmer_c
collate
collation
_orc_

@@ -0,0 +1,125 @@
---
toc_priority: 67
toc_title: NLP
---
# Natural Language Processing functions {#nlp-functions}
## stem {#stem}
Performs stemming on a previously tokenized text.
**Syntax**
``` sql
stem('language', word)
```
**Arguments**
- `language` — The language whose rules will be applied. Must be in lowercase. [String](../../sql-reference/data-types/string.md#string).
- `word` — Word that needs to be stemmed. Must be in lowercase. [String](../../sql-reference/data-types/string.md#string).
**Examples**
Query:
``` sql
SELECT arrayMap(x -> stem('en', x), ['I', 'think', 'it', 'is', 'a', 'blessing', 'in', 'disguise']) as res;
```
Result:
``` text
┌─res────────────────────────────────────────────────┐
│ ['I','think','it','is','a','bless','in','disguis'] │
└────────────────────────────────────────────────────┘
```
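The stateless test added later in this pull request also exercises Russian and French stemming; two of those cases, for illustration:

``` sql
SELECT stem('ru', 'стоимостью'), stem('fr', 'intensivement');
```

According to the test's reference file, this returns `стоимост` and `intens`.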
## lemmatize {#lemmatize}
Performs lemmatization on a given word.
**Syntax**
``` sql
lemmatize('language', word)
```
**Arguments**
- `language` — The language whose rules will be applied. [String](../../sql-reference/data-types/string.md#string).
- `word` — Word that needs to be lemmatized. Must be lowercase. [String](../../sql-reference/data-types/string.md#string).
**Examples**
Query:
``` sql
SELECT lemmatize('en', 'wolves');
```
Result:
``` text
┌─lemmatize('en', 'wolves')─┐
│ wolf                      │
└───────────────────────────┘
```
Configuration:
``` xml
<lemmatizers>
<lemmatizer>
<lang>en</lang>
<path>en.bin</path>
</lemmatizer>
</lemmatizers>
```
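Note that the functions on this page are experimental: this pull request gates them behind the `allow_experimental_nlp_functions` setting, so a session typically enables it first, as the tests added below do:

``` sql
SET allow_experimental_nlp_functions = 1;
SELECT lemmatize('en', 'wolves');
```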
## synonyms {#synonyms}
Finds synonyms to a given word.
**Syntax**
``` sql
synonyms('extension_name', word)
```
**Arguments**
- `extension_name` — Name of the extension in which the search will be performed. [String](../../sql-reference/data-types/string.md#string).
- `word` — Word that will be searched in extension. [String](../../sql-reference/data-types/string.md#string).
**Examples**
Query:
``` sql
SELECT synonyms('list', 'important');
```
Result:
``` text
┌─synonyms('list', 'important')────────────┐
│ ['important','big','critical','crucial'] │
└──────────────────────────────────────────┘
```
Configuration:
``` xml
<synonyms_extensions>
<extension>
<name>en</name>
<type>plain</type>
<path>en.txt</path>
</extension>
<extension>
<name>en</name>
<type>wordnet</type>
<path>en/</path>
</extension>
</synonyms_extensions>
```
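With a `plain` extension like the one in the integration test added below (its dictionary's first line is `important big critical crucial essential`), a lookup returns the whole synset containing the word:

``` sql
SELECT synonyms('en', 'crucial');
-- ['important','big','critical','crucial','essential']
```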

@@ -145,6 +145,72 @@ Result:
└────────────────────────────┘
```
## splitByWhitespace(s) {#splitbywhitespaceseparator-s}
Splits a string into substrings separated by whitespace characters.
Returns an array of selected substrings.
**Syntax**
``` sql
splitByWhitespace(s)
```
**Arguments**
- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
**Returned value(s)**
Returns an array of selected substrings.
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
**Example**
``` sql
SELECT splitByWhitespace(' 1! a, b. ');
```
``` text
┌─splitByWhitespace(' 1! a, b. ')─┐
│ ['1!','a,','b.']                │
└─────────────────────────────────┘
```
## splitByNonAlpha(s) {#splitbynonalphaseparator-s}
Splits a string into substrings separated by whitespace and punctuation characters.
Returns an array of selected substrings.
**Syntax**
``` sql
splitByNonAlpha(s)
```
**Arguments**
- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
**Returned value(s)**
Returns an array of selected substrings.
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
**Example**
``` sql
SELECT splitByNonAlpha(' 1! a, b. ');
```
``` text
┌─splitByNonAlpha(' 1! a, b. ')─┐
│ ['1','a','b']                 │
└───────────────────────────────┘
```
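These tokenizers combine naturally with the NLP functions introduced in this pull request; the performance test, for instance, stems whole token arrays. A minimal sketch of that pipeline:

``` sql
SELECT arrayMap(x -> stem('en', x), splitByNonAlpha('There is.... so much to learn!'));
```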
## arrayStringConcat(arr\[, separator\]) {#arraystringconcatarr-separator}
Concatenates the strings listed in the array with the separator. `separator` is an optional parameter: a constant string, set to an empty string by default.

@@ -0,0 +1,125 @@
---
toc_priority: 67
toc_title: NLP
---
# Natural language processing functions {#nlp-functions}
## stem {#stem}
This function performs stemming on a given word.
**Syntax**
``` sql
stem('language', word)
```
**Arguments**
- `language` — The language whose rules will be applied for stemming. Only lowercase is supported. [String](../../sql-reference/data-types/string.md#string).
- `word` — The word to be stemmed. Only lowercase is supported. [String](../../sql-reference/data-types/string.md#string).
**Examples**
Query:
``` sql
SELECT arrayMap(x -> stem('en', x), ['I', 'think', 'it', 'is', 'a', 'blessing', 'in', 'disguise']) as res;
```
Result:
``` text
┌─res────────────────────────────────────────────────┐
│ ['I','think','it','is','a','bless','in','disguis'] │
└────────────────────────────────────────────────────┘
```
## lemmatize {#lemmatize}
This function performs lemmatization on a given word.
**Syntax**
``` sql
lemmatize('language', word)
```
**Arguments**
- `language` — The language whose rules will be applied for lemmatization. [String](../../sql-reference/data-types/string.md#string).
- `word` — The word to be lemmatized. Only lowercase is supported. [String](../../sql-reference/data-types/string.md#string).
**Examples**
Query:
``` sql
SELECT lemmatize('en', 'wolves');
```
Result:
``` text
┌─lemmatize('en', 'wolves')─┐
│ wolf                      │
└───────────────────────────┘
```
Configuration:
``` xml
<lemmatizers>
<lemmatizer>
<lang>en</lang>
<path>en.bin</path>
</lemmatizer>
</lemmatizers>
```
## synonyms {#synonyms}
Finds synonyms for a given word.
**Syntax**
``` sql
synonyms('extension_name', word)
```
**Arguments**
- `extension_name` — The name of the extension in which the search will be performed. [String](../../sql-reference/data-types/string.md#string).
- `word` — The word to look up in the extension. [String](../../sql-reference/data-types/string.md#string).
**Examples**
Query:
``` sql
SELECT synonyms('list', 'important');
```
Result:
``` text
┌─synonyms('list', 'important')────────────┐
│ ['important','big','critical','crucial'] │
└──────────────────────────────────────────┘
```
Configuration:
``` xml
<synonyms_extensions>
<extension>
<name>en</name>
<type>plain</type>
<path>en.txt</path>
</extension>
<extension>
<name>en</name>
<type>wordnet</type>
<path>en/</path>
</extension>
</synonyms_extensions>
```

@@ -146,6 +146,70 @@ SELECT splitByRegexp('', 'abcde');
└────────────────────────────┘
```
## splitByWhitespace(s) {#splitbywhitespaceseparator-s}
Splits a string into substrings separated by whitespace characters.
**Syntax**
``` sql
splitByWhitespace(s)
```
**Arguments**
- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
**Returned value(s)**
Returns an array of selected substrings.
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
**Example**
``` sql
SELECT splitByWhitespace(' 1! a, b. ');
```
``` text
┌─splitByWhitespace(' 1! a, b. ')─┐
│ ['1!','a,','b.']                │
└─────────────────────────────────┘
```
## splitByNonAlpha(s) {#splitbynonalphaseparator-s}
Splits a string into substrings separated by whitespace and punctuation characters.
**Syntax**
``` sql
splitByNonAlpha(s)
```
**Arguments**
- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
**Returned value(s)**
Returns an array of selected substrings.
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
**Example**
``` sql
SELECT splitByNonAlpha(' 1! a, b. ');
```
``` text
┌─splitByNonAlpha(' 1! a, b. ')─┐
│ ['1','a','b']                 │
└───────────────────────────────┘
```
## arrayStringConcat(arr\[, separator\]) {#arraystringconcatarr-separator}
Concatenates the strings listed in the array with the separator.

@@ -473,6 +473,12 @@ endif ()
dbms_target_link_libraries(PRIVATE _boost_context)
if (USE_NLP)
dbms_target_link_libraries (PUBLIC stemmer)
dbms_target_link_libraries (PUBLIC wnb)
dbms_target_link_libraries (PUBLIC lemmagen)
endif()
include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake")
if (ENABLE_TESTS AND USE_GTEST)

@@ -490,6 +490,7 @@ class IColumn;
\
/** Experimental functions */ \
M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \
M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \
\
\
/** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \

@@ -15,4 +15,5 @@
#cmakedefine01 USE_LIBPQXX
#cmakedefine01 USE_SQLITE
#cmakedefine01 USE_NURAFT
#cmakedefine01 USE_NLP
#cmakedefine01 USE_KRB5

@@ -9,6 +9,8 @@ void registerFunctionsStringArray(FunctionFactory & factory)
{
factory.registerFunction<FunctionExtractAll>();
factory.registerFunction<FunctionAlphaTokens>();
factory.registerFunction<FunctionSplitByNonAlpha>();
factory.registerFunction<FunctionSplitByWhitespace>();
factory.registerFunction<FunctionSplitByChar>();
factory.registerFunction<FunctionSplitByString>();
factory.registerFunction<FunctionSplitByRegexp>();

@@ -33,6 +33,9 @@ namespace ErrorCodes
* splitByString(sep, s)
* splitByRegexp(regexp, s)
*
* splitByWhitespace(s) - split the string by whitespace characters
* splitByNonAlpha(s) - split the string by whitespace and punctuation characters
*
* extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp.
* - first subpattern, if regexp has subpattern;
* - zero subpattern (the match part, otherwise);
@@ -111,6 +114,121 @@ public:
}
};
class SplitByNonAlphaImpl
{
private:
Pos pos;
Pos end;
public:
/// Get the name of the function.
static constexpr auto name = "splitByNonAlpha";
static String getName() { return name; }
static size_t getNumberOfArguments() { return 1; }
/// Check the type of the function's arguments.
static void checkArguments(const DataTypes & arguments)
{
if (!isString(arguments[0]))
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
/// Initialize by the function arguments.
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
/// Called for each next string.
void set(Pos pos_, Pos end_)
{
pos = pos_;
end = end_;
}
/// Returns the position of the argument, that is the column of strings
size_t getStringsArgumentPosition()
{
return 0;
}
/// Get the next token, if any, or return false.
bool get(Pos & token_begin, Pos & token_end)
{
/// Skip garbage
while (pos < end && (isWhitespaceASCII(*pos) || isPunctuationASCII(*pos)))
++pos;
if (pos == end)
return false;
token_begin = pos;
while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos)))
++pos;
token_end = pos;
return true;
}
};
class SplitByWhitespaceImpl
{
private:
Pos pos;
Pos end;
public:
/// Get the name of the function.
static constexpr auto name = "splitByWhitespace";
static String getName() { return name; }
static size_t getNumberOfArguments() { return 1; }
/// Check the type of the function's arguments.
static void checkArguments(const DataTypes & arguments)
{
if (!isString(arguments[0]))
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
/// Initialize by the function arguments.
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
/// Called for each next string.
void set(Pos pos_, Pos end_)
{
pos = pos_;
end = end_;
}
/// Returns the position of the argument, that is the column of strings
size_t getStringsArgumentPosition()
{
return 0;
}
/// Get the next token, if any, or return false.
bool get(Pos & token_begin, Pos & token_end)
{
/// Skip garbage
while (pos < end && isWhitespaceASCII(*pos))
++pos;
if (pos == end)
return false;
token_begin = pos;
while (pos < end && !isWhitespaceASCII(*pos))
++pos;
token_end = pos;
return true;
}
};
class SplitByCharImpl
{
@@ -662,6 +780,8 @@ public:
using FunctionAlphaTokens = FunctionTokens<AlphaTokensImpl>;
using FunctionSplitByNonAlpha = FunctionTokens<SplitByNonAlphaImpl>;
using FunctionSplitByWhitespace = FunctionTokens<SplitByWhitespaceImpl>;
using FunctionSplitByChar = FunctionTokens<SplitByCharImpl>;
using FunctionSplitByString = FunctionTokens<SplitByStringImpl>;
using FunctionSplitByRegexp = FunctionTokens<SplitByRegexpImpl>;
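The only behavioral difference between SplitByNonAlphaImpl and SplitByWhitespaceImpl above is whether punctuation also terminates a token; the stateless test added below makes the contrast concrete:

``` sql
SELECT splitByNonAlpha('There is.... so much to learn!');   -- ['There','is','so','much','to','learn']
SELECT splitByWhitespace('There is.... so much to learn!'); -- ['There','is....','so','much','to','learn!']
```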

src/Functions/lemmatize.cpp Normal file

@@ -0,0 +1,130 @@
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Interpreters/Context.h>
#include <Interpreters/Lemmatizers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int SUPPORT_IS_DISABLED;
}
namespace
{
struct LemmatizeImpl
{
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets,
Lemmatizers::LemmPtr & lemmatizer)
{
res_data.resize(data.size());
res_offsets.assign(offsets);
UInt64 data_size = 0;
for (UInt64 i = 0; i < offsets.size(); ++i)
{
/// lemmatize() uses the fact that each string ends with '\0'
auto result = lemmatizer->lemmatize(reinterpret_cast<const char *>(data.data() + offsets[i - 1]));
size_t new_size = strlen(result.get()) + 1;
if (data_size + new_size > res_data.size())
res_data.resize(data_size + new_size);
memcpy(res_data.data() + data_size, reinterpret_cast<const unsigned char *>(result.get()), new_size);
data_size += new_size;
res_offsets[i] = data_size;
}
res_data.resize(data_size);
}
};
class FunctionLemmatize : public IFunction
{
public:
static constexpr auto name = "lemmatize";
static FunctionPtr create(ContextPtr context)
{
if (!context->getSettingsRef().allow_experimental_nlp_functions)
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
return std::make_shared<FunctionLemmatize>(context->getLemmatizers());
}
private:
Lemmatizers & lemmatizers;
public:
explicit FunctionLemmatize(Lemmatizers & lemmatizers_)
: lemmatizers(lemmatizers_) {}
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isString(arguments[1]))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return arguments[1];
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
{
const auto & langcolumn = arguments[0].column;
const auto & strcolumn = arguments[1].column;
const ColumnConst * lang_col = checkAndGetColumn<ColumnConst>(langcolumn.get());
const ColumnString * words_col = checkAndGetColumn<ColumnString>(strcolumn.get());
if (!lang_col)
throw Exception(
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
if (!words_col)
throw Exception(
"Illegal column " + arguments[1].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
String language = lang_col->getValue<String>();
auto lemmatizer = lemmatizers.getLemmatizer(language);
auto col_res = ColumnString::create();
LemmatizeImpl::vector(words_col->getChars(), words_col->getOffsets(), col_res->getChars(), col_res->getOffsets(), lemmatizer);
return col_res;
}
};
}
void registerFunctionLemmatize(FunctionFactory & factory)
{
factory.registerFunction<FunctionLemmatize>(FunctionFactory::CaseInsensitive);
}
}
#endif

@@ -1,5 +1,6 @@
#if !defined(ARCADIA_BUILD)
# include "config_functions.h"
# include "config_core.h"
#endif
namespace DB
@@ -39,13 +40,18 @@ void registerFunctionDecodeXMLComponent(FunctionFactory &);
void registerFunctionExtractTextFromHTML(FunctionFactory &);
void registerFunctionToStringCutToZero(FunctionFactory &);
#if USE_BASE64
void registerFunctionBase64Encode(FunctionFactory &);
void registerFunctionBase64Decode(FunctionFactory &);
void registerFunctionTryBase64Decode(FunctionFactory &);
#endif
#if USE_NLP
void registerFunctionStem(FunctionFactory &);
void registerFunctionSynonyms(FunctionFactory &);
void registerFunctionLemmatize(FunctionFactory &);
#endif
void registerFunctionsString(FunctionFactory & factory)
{
registerFunctionRepeat(factory);
@@ -79,11 +85,18 @@ void registerFunctionsString(FunctionFactory & factory)
registerFunctionDecodeXMLComponent(factory);
registerFunctionExtractTextFromHTML(factory);
registerFunctionToStringCutToZero(factory);
#if USE_BASE64
registerFunctionBase64Encode(factory);
registerFunctionBase64Decode(factory);
registerFunctionTryBase64Decode(factory);
#endif
#if USE_NLP
registerFunctionStem(factory);
registerFunctionSynonyms(factory);
registerFunctionLemmatize(factory);
#endif
}
}

src/Functions/stem.cpp Normal file

@@ -0,0 +1,135 @@
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Interpreters/Context.h>
#include <libstemmer.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int SUPPORT_IS_DISABLED;
}
namespace
{
struct StemImpl
{
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets,
const String & language)
{
sb_stemmer * stemmer = sb_stemmer_new(language.data(), "UTF_8");
if (stemmer == nullptr)
{
throw Exception(
"Language " + language + " is not supported for function stem",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
res_data.resize(data.size());
res_offsets.assign(offsets);
UInt64 data_size = 0;
for (UInt64 i = 0; i < offsets.size(); ++i)
{
/// Note that accessing -1th element is valid for PaddedPODArray.
size_t original_size = offsets[i] - offsets[i - 1];
const sb_symbol * result = sb_stemmer_stem(stemmer,
reinterpret_cast<const uint8_t *>(data.data() + offsets[i - 1]),
original_size - 1);
size_t new_size = sb_stemmer_length(stemmer) + 1;
memcpy(res_data.data() + data_size, result, new_size);
data_size += new_size;
res_offsets[i] = data_size;
}
res_data.resize(data_size);
sb_stemmer_delete(stemmer);
}
};
class FunctionStem : public IFunction
{
public:
static constexpr auto name = "stem";
static FunctionPtr create(ContextPtr context)
{
if (!context->getSettingsRef().allow_experimental_nlp_functions)
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
return std::make_shared<FunctionStem>();
}
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isString(arguments[1]))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return arguments[1];
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
{
const auto & langcolumn = arguments[0].column;
const auto & strcolumn = arguments[1].column;
const ColumnConst * lang_col = checkAndGetColumn<ColumnConst>(langcolumn.get());
const ColumnString * words_col = checkAndGetColumn<ColumnString>(strcolumn.get());
if (!lang_col)
throw Exception(
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
if (!words_col)
throw Exception(
"Illegal column " + arguments[1].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
String language = lang_col->getValue<String>();
auto col_res = ColumnString::create();
StemImpl::vector(words_col->getChars(), words_col->getOffsets(), col_res->getChars(), col_res->getOffsets(), language);
return col_res;
}
};
}
void registerFunctionStem(FunctionFactory & factory)
{
factory.registerFunction<FunctionStem>(FunctionFactory::CaseInsensitive);
}
}
#endif
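Since stem (like lemmatize and synonyms) is registered with FunctionFactory::CaseInsensitive, the function name itself may be written in any case; for example, these two calls are equivalent:

``` sql
SELECT stem('en', 'packing');
SELECT STEM('en', 'packing');
```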

src/Functions/synonyms.cpp Normal file

@@ -0,0 +1,128 @@
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <Columns/ColumnArray.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Interpreters/Context.h>
#include <Interpreters/SynonymsExtensions.h>
#include <string_view>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int SUPPORT_IS_DISABLED;
}
class FunctionSynonyms : public IFunction
{
public:
static constexpr auto name = "synonyms";
static FunctionPtr create(ContextPtr context)
{
if (!context->getSettingsRef().allow_experimental_nlp_functions)
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
return std::make_shared<FunctionSynonyms>(context->getSynonymsExtensions());
}
private:
SynonymsExtensions & extensions;
public:
explicit FunctionSynonyms(SynonymsExtensions & extensions_)
: extensions(extensions_) {}
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isString(arguments[1]))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
{
const auto & extcolumn = arguments[0].column;
const auto & strcolumn = arguments[1].column;
const ColumnConst * ext_col = checkAndGetColumn<ColumnConst>(extcolumn.get());
const ColumnString * word_col = checkAndGetColumn<ColumnString>(strcolumn.get());
if (!ext_col)
throw Exception(
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
if (!word_col)
throw Exception(
"Illegal column " + arguments[1].column->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
String ext_name = ext_col->getValue<String>();
auto extension = extensions.getExtension(ext_name);
/// Create and fill the result array.
const DataTypePtr & elem_type = static_cast<const DataTypeArray &>(*result_type).getNestedType();
auto out = ColumnArray::create(elem_type->createColumn());
IColumn & out_data = out->getData();
IColumn::Offsets & out_offsets = out->getOffsets();
const ColumnString::Chars & data = word_col->getChars();
const ColumnString::Offsets & offsets = word_col->getOffsets();
out_data.reserve(input_rows_count);
out_offsets.resize(input_rows_count);
IColumn::Offset current_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
std::string_view word(reinterpret_cast<const char *>(data.data() + offsets[i - 1]), offsets[i] - offsets[i - 1] - 1);
const auto * synset = extension->getSynonyms(word);
if (synset)
{
for (const auto & token : *synset)
out_data.insert(Field(token.data(), token.size()));
current_offset += synset->size();
}
out_offsets[i] = current_offset;
}
return out;
}
};
void registerFunctionSynonyms(FunctionFactory & factory)
{
factory.registerFunction<FunctionSynonyms>(FunctionFactory::CaseInsensitive);
}
}
#endif

@@ -340,6 +340,7 @@ SRCS(
jumpConsistentHash.cpp
lcm.cpp
least.cpp
lemmatize.cpp
lengthUTF8.cpp
less.cpp
lessOrEquals.cpp
@@ -481,6 +482,7 @@
sleepEachRow.cpp
sqrt.cpp
startsWith.cpp
stem.cpp
stringCutToZero.cpp
stringToH3.cpp
substring.cpp
@@ -493,6 +495,7 @@
subtractWeeks.cpp
subtractYears.cpp
svg.cpp
synonyms.cpp
tan.cpp
tanh.cpp
tcpPort.cpp

@@ -77,6 +77,8 @@
#include <Interpreters/JIT/CompiledExpressionCache.h>
#include <Storages/MergeTree/BackgroundJobsExecutor.h>
#include <Storages/MergeTree/MergeTreeDataPartUUID.h>
#include <Interpreters/SynonymsExtensions.h>
#include <Interpreters/Lemmatizers.h>
#include <filesystem>
@@ -349,6 +351,11 @@ struct ContextSharedPart
scope_guard dictionaries_xmls;
#if USE_NLP
mutable std::optional<SynonymsExtensions> synonyms_extensions;
mutable std::optional<Lemmatizers> lemmatizers;
#endif
String default_profile_name; /// Default profile name used for default values.
String system_profile_name; /// Profile used by system processes
String buffer_profile_name; /// Profile used by Buffer engine for flushing to the underlying
@@ -1505,6 +1512,29 @@ void Context::loadDictionaries(const Poco::Util::AbstractConfiguration & config)
std::make_unique<ExternalLoaderXMLConfigRepository>(config, "dictionaries_config"));
}
#if USE_NLP
SynonymsExtensions & Context::getSynonymsExtensions() const
{
auto lock = getLock();
if (!shared->synonyms_extensions)
shared->synonyms_extensions.emplace(getConfigRef());
return *shared->synonyms_extensions;
}
Lemmatizers & Context::getLemmatizers() const
{
auto lock = getLock();
if (!shared->lemmatizers)
shared->lemmatizers.emplace(getConfigRef());
return *shared->lemmatizers;
}
#endif
void Context::setProgressCallback(ProgressCallback callback)
{
/// Callback is set to a session or to a query. In the session, only one query is processed at a time. Therefore, the lock is not needed.

@@ -114,6 +114,11 @@ using VolumePtr = std::shared_ptr<IVolume>;
struct NamedSession;
struct BackgroundTaskSchedulingSettings;
#if USE_NLP
class SynonymsExtensions;
class Lemmatizers;
#endif
class Throttler;
using ThrottlerPtr = std::shared_ptr<Throttler>;
@@ -534,6 +539,11 @@ public:
void tryCreateEmbeddedDictionaries() const;
void loadDictionaries(const Poco::Util::AbstractConfiguration & config);
#if USE_NLP
SynonymsExtensions & getSynonymsExtensions() const;
Lemmatizers & getLemmatizers() const;
#endif
void setExternalModelsConfig(const ConfigurationPtr & config, const std::string & config_name = "models_config");
/// I/O formats.

@@ -0,0 +1,100 @@
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <Common/Exception.h>
#include <Interpreters/Lemmatizers.h>
#include <RdrLemmatizer.h>
#include <vector>
#include <filesystem>
namespace DB
{
namespace ErrorCodes
{
extern const int UNKNOWN_ELEMENT_IN_CONFIG;
extern const int INVALID_CONFIG_PARAMETER;
}
class Lemmatizer : public ILemmatizer
{
private:
RdrLemmatizer lemmatizer;
public:
explicit Lemmatizer(const String & path) : lemmatizer(path.data()) {}
TokenPtr lemmatize(const char * token) override
{
return TokenPtr(lemmatizer.Lemmatize(token));
}
};
/// Duplicate of code from StringUtils.h. Copied here to avoid extra dependencies.
static bool startsWith(const std::string & s, const char * prefix)
{
return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix));
}
Lemmatizers::Lemmatizers(const Poco::Util::AbstractConfiguration & config)
{
String prefix = "lemmatizers";
Poco::Util::AbstractConfiguration::Keys keys;
if (!config.has(prefix))
throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "No lemmatizers specified in server config on prefix '{}'", prefix);
config.keys(prefix, keys);
for (const auto & key : keys)
{
if (startsWith(key, "lemmatizer"))
{
const auto & lemm_name = config.getString(prefix + "." + key + ".lang", "");
const auto & lemm_path = config.getString(prefix + "." + key + ".path", "");
if (lemm_name.empty())
throw Exception("Lemmatizer language in config is not specified here: " + prefix + "." + key + ".lang",
ErrorCodes::INVALID_CONFIG_PARAMETER);
if (lemm_path.empty())
throw Exception("Path to lemmatizer in config is not specified here: " + prefix + "." + key + ".path",
ErrorCodes::INVALID_CONFIG_PARAMETER);
paths[lemm_name] = lemm_path;
}
else
throw Exception("Unknown element in config: " + prefix + "." + key + ", must be 'lemmatizer'",
ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
}
}
Lemmatizers::LemmPtr Lemmatizers::getLemmatizer(const String & name)
{
std::lock_guard guard(mutex);
if (lemmatizers.find(name) != lemmatizers.end())
return lemmatizers[name];
if (paths.find(name) != paths.end())
{
if (!std::filesystem::exists(paths[name]))
throw Exception("Incorrect path to lemmatizer: " + paths[name],
ErrorCodes::INVALID_CONFIG_PARAMETER);
lemmatizers[name] = std::make_shared<Lemmatizer>(paths[name]);
return lemmatizers[name];
}
throw Exception("Lemmatizer named: '" + name + "' is not found",
ErrorCodes::INVALID_CONFIG_PARAMETER);
}
}
#endif

@@ -0,0 +1,48 @@
#pragma once
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <common/types.h>
#include <Poco/Util/Application.h>
#include <mutex>
#include <unordered_map>
namespace DB
{
class ILemmatizer
{
public:
using TokenPtr = std::shared_ptr<char []>;
virtual TokenPtr lemmatize(const char * token) = 0;
virtual ~ILemmatizer() = default;
};
class Lemmatizers
{
public:
using LemmPtr = std::shared_ptr<ILemmatizer>;
private:
std::mutex mutex;
std::unordered_map<String, LemmPtr> lemmatizers;
std::unordered_map<String, String> paths;
public:
explicit Lemmatizers(const Poco::Util::AbstractConfiguration & config);
LemmPtr getLemmatizer(const String & name);
};
}
#endif

@@ -0,0 +1,157 @@
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <Common/Exception.h>
#include <Interpreters/SynonymsExtensions.h>
#include <fstream>
#include <list>
#include <boost/algorithm/string.hpp>
#include <wnb/core/wordnet.hh>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int UNKNOWN_ELEMENT_IN_CONFIG;
extern const int INVALID_CONFIG_PARAMETER;
}
class PlainSynonymsExtension : public ISynonymsExtension
{
private:
using Container = std::list<Synset>;
using LookupTable = std::unordered_map<std::string_view, Synset *>;
Container synsets;
LookupTable table;
public:
explicit PlainSynonymsExtension(const String & path)
{
std::ifstream file(path);
if (!file.is_open())
throw Exception("Cannot find synonyms extension at: " + path,
ErrorCodes::INVALID_CONFIG_PARAMETER);
String line;
while (std::getline(file, line))
{
Synset synset;
boost::split(synset, line, boost::is_any_of("\t "));
if (!synset.empty())
{
synsets.emplace_back(std::move(synset));
for (const auto &word : synsets.back())
table[word] = &synsets.back();
}
}
}
const Synset * getSynonyms(std::string_view token) const override
{
auto it = table.find(token);
if (it != table.end())
return (*it).second;
return nullptr;
}
};
class WordnetSynonymsExtension : public ISynonymsExtension
{
private:
wnb::wordnet wn;
public:
explicit WordnetSynonymsExtension(const String & path) : wn(path) {}
const Synset * getSynonyms(std::string_view token) const override
{
return wn.get_synset(std::string(token));
}
};
/// Duplicate of code from StringUtils.h. Copied here to avoid extra dependencies.
static bool startsWith(const std::string & s, const char * prefix)
{
return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix));
}
SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration & config)
{
String prefix = "synonyms_extensions";
Poco::Util::AbstractConfiguration::Keys keys;
if (!config.has(prefix))
throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER,
"No synonims extensions specified in server config on prefix '{}'", prefix);
config.keys(prefix, keys);
for (const auto & key : keys)
{
if (startsWith(key, "extension"))
{
const auto & ext_name = config.getString(prefix + "." + key + ".name", "");
const auto & ext_path = config.getString(prefix + "." + key + ".path", "");
const auto & ext_type = config.getString(prefix + "." + key + ".type", "");
if (ext_name.empty())
throw Exception("Extension name in config is not specified here: " + prefix + "." + key + ".name",
ErrorCodes::INVALID_CONFIG_PARAMETER);
if (ext_path.empty())
throw Exception("Extension path in config is not specified here: " + prefix + "." + key + ".path",
ErrorCodes::INVALID_CONFIG_PARAMETER);
if (ext_type.empty())
throw Exception("Extension type in config is not specified here: " + prefix + "." + key + ".type",
ErrorCodes::INVALID_CONFIG_PARAMETER);
if (ext_type != "plain" && ext_type != "wordnet")
throw Exception("Unknown extension type in config: " + prefix + "." + key + ".type, must be 'plain' or 'wordnet'",
ErrorCodes::INVALID_CONFIG_PARAMETER);
info[ext_name].path = ext_path;
info[ext_name].type = ext_type;
}
else
throw Exception("Unknown element in config: " + prefix + "." + key + ", must be 'extension'",
ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
}
}
SynonymsExtensions::ExtPtr SynonymsExtensions::getExtension(const String & name)
{
std::lock_guard guard(mutex);
if (extensions.find(name) != extensions.end())
return extensions[name];
if (info.find(name) != info.end())
{
const Info & ext_info = info[name];
if (ext_info.type == "plain")
extensions[name] = std::make_shared<PlainSynonymsExtension>(ext_info.path);
else if (ext_info.type == "wordnet")
extensions[name] = std::make_shared<WordnetSynonymsExtension>(ext_info.path);
else
throw Exception("Unknown extension type: " + ext_info.type, ErrorCodes::LOGICAL_ERROR);
return extensions[name];
}
throw Exception("Extension named: '" + name + "' is not found",
ErrorCodes::INVALID_CONFIG_PARAMETER);
}
}
#endif

@@ -0,0 +1,57 @@
#pragma once
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_NLP
#include <common/types.h>
#include <Poco/Util/Application.h>
#include <memory>
#include <mutex>
#include <string_view>
#include <vector>
#include <unordered_map>
namespace DB
{
class ISynonymsExtension
{
public:
using Synset = std::vector<String>;
virtual const Synset * getSynonyms(std::string_view token) const = 0;
virtual ~ISynonymsExtension() = default;
};
class SynonymsExtensions
{
public:
using ExtPtr = std::shared_ptr<ISynonymsExtension>;
explicit SynonymsExtensions(const Poco::Util::AbstractConfiguration & config);
ExtPtr getExtension(const String & name);
private:
struct Info
{
String path;
String type;
};
using ExtContainer = std::unordered_map<String, ExtPtr>;
using InfoContainer = std::unordered_map<String, Info>;
std::mutex mutex;
ExtContainer extensions;
InfoContainer info;
};
}
#endif

@@ -108,6 +108,7 @@ SRCS(
JoinSwitcher.cpp
JoinToSubqueryTransformVisitor.cpp
JoinedTables.cpp
Lemmatizers.cpp
LogicalExpressionsOptimizer.cpp
MarkTableIdentifiersVisitor.cpp
MergeJoin.cpp
@@ -145,6 +146,7 @@
SortedBlocksWriter.cpp
StorageID.cpp
SubqueryForSet.cpp
SynonymsExtensions.cpp
SystemLog.cpp
TableJoin.cpp
TablesStatus.cpp

@@ -0,0 +1,22 @@
<?xml version="1.0"?>
<yandex>
<synonyms_extensions>
<extension>
<name>en</name>
<type>plain</type>
<path>/etc/clickhouse-server/dictionaries/ext-en.txt</path>
</extension>
<extension>
<name>ru</name>
<type>plain</type>
<path>/etc/clickhouse-server/dictionaries/ext-ru.txt</path>
</extension>
</synonyms_extensions>
<lemmatizers>
<lemmatizer>
<lang>en</lang>
<path>/etc/clickhouse-server/dictionaries/lem-en.bin</path>
</lemmatizer>
</lemmatizers>
</yandex>

@@ -0,0 +1,4 @@
important big critical crucial essential
happy cheerful delighted ecstatic
however nonetheless but yet
quiz query check exam

@@ -0,0 +1,4 @@
важный большой высокий хороший главный
веселый счастливый живой яркий смешной
хотя однако но правда
экзамен испытание проверка

Binary file not shown.

@@ -0,0 +1,47 @@
import os
import sys
import pytest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
from helpers.cluster import ClickHouseCluster
cluster = ClickHouseCluster(__file__)
instance = cluster.add_instance('instance', main_configs=['configs/dicts_config.xml'])
def copy_file_to_container(local_path, dist_path, container_id):
os.system("docker cp {local} {cont_id}:{dist}".format(local=local_path, cont_id=container_id, dist=dist_path))
@pytest.fixture(scope="module")
def start_cluster():
try:
cluster.start()
copy_file_to_container(os.path.join(SCRIPT_DIR, 'dictionaries/.'), '/etc/clickhouse-server/dictionaries', instance.docker_id)
yield cluster
finally:
cluster.shutdown()
def test_lemmatize(start_cluster):
assert instance.query("SELECT lemmatize('en', 'wolves')", settings={"allow_experimental_nlp_functions": 1}) == "wolf\n"
assert instance.query("SELECT lemmatize('en', 'dogs')", settings={"allow_experimental_nlp_functions": 1}) == "dog\n"
assert instance.query("SELECT lemmatize('en', 'looking')", settings={"allow_experimental_nlp_functions": 1}) == "look\n"
assert instance.query("SELECT lemmatize('en', 'took')", settings={"allow_experimental_nlp_functions": 1}) == "take\n"
assert instance.query("SELECT lemmatize('en', 'imported')", settings={"allow_experimental_nlp_functions": 1}) == "import\n"
assert instance.query("SELECT lemmatize('en', 'tokenized')", settings={"allow_experimental_nlp_functions": 1}) == "tokenize\n"
assert instance.query("SELECT lemmatize('en', 'flown')", settings={"allow_experimental_nlp_functions": 1}) == "fly\n"
def test_synonyms_extensions(start_cluster):
assert instance.query("SELECT synonyms('en', 'crucial')", settings={"allow_experimental_nlp_functions": 1}) == "['important','big','critical','crucial','essential']\n"
assert instance.query("SELECT synonyms('en', 'cheerful')", settings={"allow_experimental_nlp_functions": 1}) == "['happy','cheerful','delighted','ecstatic']\n"
assert instance.query("SELECT synonyms('en', 'yet')", settings={"allow_experimental_nlp_functions": 1}) == "['however','nonetheless','but','yet']\n"
assert instance.query("SELECT synonyms('en', 'quiz')", settings={"allow_experimental_nlp_functions": 1}) == "['quiz','query','check','exam']\n"
assert instance.query("SELECT synonyms('ru', 'главный')", settings={"allow_experimental_nlp_functions": 1}) == "['важный','большой','высокий','хороший','главный']\n"
assert instance.query("SELECT synonyms('ru', 'веселый')", settings={"allow_experimental_nlp_functions": 1}) == "['веселый','счастливый','живой','яркий','смешной']\n"
assert instance.query("SELECT synonyms('ru', 'правда')", settings={"allow_experimental_nlp_functions": 1}) == "['хотя','однако','но','правда']\n"
assert instance.query("SELECT synonyms('ru', 'экзамен')", settings={"allow_experimental_nlp_functions": 1}) == "['экзамен','испытание','проверка']\n"

tests/performance/nlp.xml Normal file

@@ -0,0 +1,20 @@
<test>
<settings>
<allow_experimental_nlp_functions>1</allow_experimental_nlp_functions>
</settings>
<preconditions>
<table_exists>hits_100m_single</table_exists>
</preconditions>
<create_query>CREATE TABLE hits_100m_words (words Array(String), UserID UInt64) ENGINE Memory</create_query>
<create_query>CREATE TABLE hits_100m_words_ws (words Array(String), UserID UInt64) ENGINE Memory</create_query>
<query>INSERT INTO hits_100m_words SELECT splitByNonAlpha(SearchPhrase) AS words, UserID FROM hits_100m_single WHERE length(words) > 0</query>
<query>INSERT INTO hits_100m_words_ws SELECT splitByWhitespace(SearchPhrase) AS words, UserID FROM hits_100m_single WHERE length(words) > 0</query>
<query>SELECT arrayMap(x -> stem('ru', x), words) FROM hits_100m_words FORMAT Null</query>
<drop_query>DROP TABLE IF EXISTS hits_100m_words</drop_query>
<drop_query>DROP TABLE IF EXISTS hits_100m_words_ws</drop_query>
</test>

@@ -0,0 +1,8 @@
['It','is','quite','a','wonderful','day','isn','t','it']
['There','is','so','much','to','learn']
['22','00','email','yandex','ru']
['Токенизация','каких','либо','других','языков']
['It','is','quite','a','wonderful','day,','isn\'t','it?']
['There','is....','so','much','to','learn!']
['22:00','email@yandex.ru']
['Токенизация','каких-либо','других','языков?']

@@ -0,0 +1,11 @@
SET allow_experimental_nlp_functions = 1;
SELECT splitByNonAlpha('It is quite a wonderful day, isn\'t it?');
SELECT splitByNonAlpha('There is.... so much to learn!');
SELECT splitByNonAlpha('22:00 email@yandex.ru');
SELECT splitByNonAlpha('Токенизация каких-либо других языков?');
SELECT splitByWhitespace('It is quite a wonderful day, isn\'t it?');
SELECT splitByWhitespace('There is.... so much to learn!');
SELECT splitByWhitespace('22:00 email@yandex.ru');
SELECT splitByWhitespace('Токенизация каких-либо других языков?');

@@ -0,0 +1,21 @@
given
combinatori
collect
possibl
studi
commonplac
pack
комбинаторн
получ
огранич
конечн
максимальн
суммарн
стоимост
remplissag
valeur
maximis
dépass
intens
étudi
peuvent

@@ -0,0 +1,25 @@
SET allow_experimental_nlp_functions = 1;
SELECT stem('en', 'given');
SELECT stem('en', 'combinatorial');
SELECT stem('en', 'collection');
SELECT stem('en', 'possibility');
SELECT stem('en', 'studied');
SELECT stem('en', 'commonplace');
SELECT stem('en', 'packing');
SELECT stem('ru', 'комбинаторной');
SELECT stem('ru', 'получила');
SELECT stem('ru', 'ограничена');
SELECT stem('ru', 'конечной');
SELECT stem('ru', 'максимальной');
SELECT stem('ru', 'суммарный');
SELECT stem('ru', 'стоимостью');
SELECT stem('fr', 'remplissage');
SELECT stem('fr', 'valeur');
SELECT stem('fr', 'maximiser');
SELECT stem('fr', 'dépasser');
SELECT stem('fr', 'intensivement');
SELECT stem('fr', 'étudié');
SELECT stem('fr', 'peuvent');