2022-09-28 13:29:29 +00:00
|
|
|
#include "config.h"
|
2021-07-30 13:30:30 +00:00
|
|
|
|
|
|
|
#if USE_NLP
|
|
|
|
|
2021-06-05 00:52:35 +00:00
|
|
|
#include <Common/Exception.h>
|
2021-10-27 23:10:39 +00:00
|
|
|
#include <Interpreters/Lemmatizers.h>
|
|
|
|
#include <RdrLemmatizer.h>
|
2021-06-05 00:52:35 +00:00
|
|
|
|
|
|
|
#include <vector>
|
|
|
|
#include <filesystem>
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int UNKNOWN_ELEMENT_IN_CONFIG;
|
|
|
|
extern const int INVALID_CONFIG_PARAMETER;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-06-20 12:31:07 +00:00
|
|
|
class Lemmatizer : public ILemmatizer
|
2021-06-05 00:52:35 +00:00
|
|
|
{
|
|
|
|
private:
|
|
|
|
RdrLemmatizer lemmatizer;
|
|
|
|
|
|
|
|
public:
|
2021-06-20 12:31:07 +00:00
|
|
|
explicit Lemmatizer(const String & path) : lemmatizer(path.data()) {}
|
2021-06-05 00:52:35 +00:00
|
|
|
|
2021-06-20 12:31:07 +00:00
|
|
|
TokenPtr lemmatize(const char * token) override
|
2021-06-05 00:52:35 +00:00
|
|
|
{
|
|
|
|
return TokenPtr(lemmatizer.Lemmatize(token));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Duplicate of code from StringUtils.h. Copied here to keep dependencies to a minimum.
///
/// Returns true if `s` begins with the NUL-terminated string `prefix`.
/// An empty `prefix` matches every string. `prefix` must not be null.
static bool startsWith(const std::string & s, const char * prefix)
{
    /// Measure the prefix once instead of once per comparison operand.
    const auto prefix_size = std::char_traits<char>::length(prefix);

    /// compare() restricts the comparison to the first `prefix_size` characters of `s`.
    return s.size() >= prefix_size && s.compare(0, prefix_size, prefix, prefix_size) == 0;
}
|
|
|
|
|
|
|
|
/// Reads the lemmatizer definitions from the `lemmatizers` section of the server config.
///
/// Every child key of that section has to start with "lemmatizer" and must provide
/// non-empty `lang` and `path` values; each (lang, path) pair is recorded in `paths`.
///
/// Throws INVALID_CONFIG_PARAMETER when the section is absent or when `lang` / `path`
/// is empty, and UNKNOWN_ELEMENT_IN_CONFIG for a child key with any other name.
Lemmatizers::Lemmatizers(const Poco::Util::AbstractConfiguration & config)
{
    const String prefix = "lemmatizers";

    if (!config.has(prefix))
        throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "No lemmatizers specified in server config on prefix '{}'", prefix);

    Poco::Util::AbstractConfiguration::Keys keys;
    config.keys(prefix, keys);

    for (const auto & key : keys)
    {
        /// Reject anything that is not a lemmatizer entry before reading its settings.
        if (!startsWith(key, "lemmatizer"))
            throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, "Unknown element in config: {}.{}, must be 'lemmatizer'",
                prefix, key);

        const String entry = prefix + "." + key;
        const String language = config.getString(entry + ".lang", "");
        const String model_path = config.getString(entry + ".path", "");

        if (language.empty())
            throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Lemmatizer language in config is not specified here: "
                "{}.{}.lang", prefix, key);

        if (model_path.empty())
            throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Path to lemmatizer in config is not specified here: {}.{}.path",
                prefix, key);

        paths[language] = model_path;
    }
}
|
|
|
|
|
2021-06-19 18:52:09 +00:00
|
|
|
/// Returns the lemmatizer registered under `name`, constructing it on first request.
///
/// The whole call runs while holding `mutex`. A lemmatizer that is already present in
/// `lemmatizers` is returned directly. Otherwise the path stored for `name` in `paths`
/// is checked for existence and used to construct a Lemmatizer, which is stored in
/// `lemmatizers` and returned.
///
/// Throws INVALID_CONFIG_PARAMETER if the stored path does not exist on disk or if
/// nothing is stored for `name`.
Lemmatizers::LemmPtr Lemmatizers::getLemmatizer(const String & name)
{
    std::lock_guard guard(mutex);

    /// One lookup per container; the iterators are reused instead of calling operator[] again.
    if (auto loaded = lemmatizers.find(name); loaded != lemmatizers.end())
        return loaded->second;

    const auto configured = paths.find(name);
    if (configured == paths.end())
        throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Lemmatizer named: '{}' is not found", name);

    const auto & path = configured->second;
    if (!std::filesystem::exists(path))
        throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Incorrect path to lemmatizer: {}", path);

    /// Construct first and store afterwards, so a failed construction leaves no entry behind.
    LemmPtr lemmatizer = std::make_shared<Lemmatizer>(path);
    lemmatizers[name] = lemmatizer;
    return lemmatizer;
}
|
|
|
|
|
2021-06-05 00:52:35 +00:00
|
|
|
}
|
2021-07-30 13:30:30 +00:00
|
|
|
|
|
|
|
#endif
|