2023-07-23 02:56:47 +00:00
|
|
|
#include <Common/FrequencyHolder.h>
|
|
|
|
|
2023-07-23 15:59:49 +00:00
|
|
|
#if USE_NLP
|
|
|
|
|
2023-07-23 02:56:47 +00:00
|
|
|
#include <incbin.h>
|
|
|
|
|
|
|
|
/// Embedded SQL definitions
|
2023-07-23 22:03:40 +00:00
|
|
|
INCBIN(resource_charset_zst, SOURCE_DIR "/contrib/nlp-data/charset.zst");
|
|
|
|
INCBIN(resource_tonality_ru_zst, SOURCE_DIR "/contrib/nlp-data/tonality_ru.zst");
|
|
|
|
INCBIN(resource_programming_zst, SOURCE_DIR "/contrib/nlp-data/programming.zst");
|
2023-07-23 02:56:47 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int FILE_DOESNT_EXIST;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
FrequencyHolder & FrequencyHolder::getInstance()
|
|
|
|
{
|
|
|
|
static FrequencyHolder instance;
|
|
|
|
return instance;
|
|
|
|
}
|
|
|
|
|
|
|
|
FrequencyHolder::FrequencyHolder()
|
|
|
|
{
|
|
|
|
loadEmotionalDict();
|
|
|
|
loadEncodingsFrequency();
|
|
|
|
loadProgrammingFrequency();
|
|
|
|
}
|
|
|
|
|
|
|
|
void FrequencyHolder::loadEncodingsFrequency()
|
|
|
|
{
|
|
|
|
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
|
|
|
|
|
|
|
|
LOG_TRACE(log, "Loading embedded charset frequencies");
|
|
|
|
|
|
|
|
std::string_view resource(reinterpret_cast<const char *>(gresource_charset_zstData), gresource_charset_zstSize);
|
|
|
|
if (resource.empty())
|
|
|
|
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded charset frequencies");
|
|
|
|
|
|
|
|
String line;
|
|
|
|
UInt16 bigram;
|
|
|
|
Float64 frequency;
|
|
|
|
String charset_name;
|
|
|
|
|
|
|
|
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
|
|
|
ZstdInflatingReadBuffer in(std::move(buf));
|
|
|
|
|
|
|
|
while (!in.eof())
|
|
|
|
{
|
|
|
|
readString(line, in);
|
|
|
|
in.ignore();
|
|
|
|
|
|
|
|
if (line.empty())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ReadBufferFromString buf_line(line);
|
|
|
|
|
|
|
|
// Start loading a new charset
|
|
|
|
if (line.starts_with("// "))
|
|
|
|
{
|
|
|
|
// Skip "// "
|
|
|
|
buf_line.ignore(3);
|
|
|
|
readString(charset_name, buf_line);
|
|
|
|
|
|
|
|
/* In our dictionary we have lines with form: <Language>_<Charset>
|
|
|
|
* If we need to find language of data, we return <Language>
|
|
|
|
* If we need to find charset of data, we return <Charset>.
|
|
|
|
*/
|
|
|
|
size_t sep = charset_name.find('_');
|
|
|
|
|
|
|
|
Encoding enc;
|
|
|
|
enc.lang = charset_name.substr(0, sep);
|
|
|
|
enc.name = charset_name.substr(sep + 1);
|
|
|
|
encodings_freq.push_back(std::move(enc));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
readIntText(bigram, buf_line);
|
|
|
|
buf_line.ignore();
|
|
|
|
readFloatText(frequency, buf_line);
|
|
|
|
|
|
|
|
encodings_freq.back().map[bigram] = frequency;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
|
|
|
|
}
|
|
|
|
|
|
|
|
void FrequencyHolder::loadEmotionalDict()
|
|
|
|
{
|
|
|
|
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
|
|
|
|
LOG_TRACE(log, "Loading embedded emotional dictionary");
|
|
|
|
|
|
|
|
std::string_view resource(reinterpret_cast<const char *>(gresource_tonality_ru_zstData), gresource_tonality_ru_zstSize);
|
|
|
|
if (resource.empty())
|
|
|
|
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded emotional dictionary");
|
|
|
|
|
|
|
|
String line;
|
|
|
|
String word;
|
|
|
|
Float64 tonality;
|
|
|
|
size_t count = 0;
|
|
|
|
|
|
|
|
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
|
|
|
ZstdInflatingReadBuffer in(std::move(buf));
|
|
|
|
|
|
|
|
while (!in.eof())
|
|
|
|
{
|
|
|
|
readString(line, in);
|
|
|
|
in.ignore();
|
|
|
|
|
|
|
|
if (line.empty())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ReadBufferFromString buf_line(line);
|
|
|
|
|
|
|
|
readStringUntilWhitespace(word, buf_line);
|
|
|
|
buf_line.ignore();
|
|
|
|
readFloatText(tonality, buf_line);
|
|
|
|
|
|
|
|
StringRef ref{string_pool.insert(word.data(), word.size()), word.size()};
|
|
|
|
emotional_dict[ref] = tonality;
|
|
|
|
++count;
|
|
|
|
}
|
|
|
|
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
|
|
|
|
}
|
|
|
|
|
|
|
|
void FrequencyHolder::loadProgrammingFrequency()
|
|
|
|
{
|
|
|
|
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
|
|
|
|
|
|
|
|
LOG_TRACE(log, "Loading embedded programming languages frequencies loading");
|
|
|
|
|
|
|
|
std::string_view resource(reinterpret_cast<const char *>(gresource_programming_zstData), gresource_programming_zstSize);
|
|
|
|
if (resource.empty())
|
|
|
|
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded programming languages frequencies");
|
|
|
|
|
|
|
|
String line;
|
|
|
|
String bigram;
|
|
|
|
Float64 frequency;
|
|
|
|
String programming_language;
|
|
|
|
|
|
|
|
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
|
|
|
ZstdInflatingReadBuffer in(std::move(buf));
|
|
|
|
|
|
|
|
while (!in.eof())
|
|
|
|
{
|
|
|
|
readString(line, in);
|
|
|
|
in.ignore();
|
|
|
|
|
|
|
|
if (line.empty())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ReadBufferFromString buf_line(line);
|
|
|
|
|
|
|
|
// Start loading a new language
|
|
|
|
if (line.starts_with("// "))
|
|
|
|
{
|
|
|
|
// Skip "// "
|
|
|
|
buf_line.ignore(3);
|
|
|
|
readString(programming_language, buf_line);
|
|
|
|
|
|
|
|
Language lang;
|
|
|
|
lang.name = programming_language;
|
|
|
|
programming_freq.push_back(std::move(lang));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
readStringUntilWhitespace(bigram, buf_line);
|
|
|
|
buf_line.ignore();
|
|
|
|
readFloatText(frequency, buf_line);
|
|
|
|
|
|
|
|
StringRef ref{string_pool.insert(bigram.data(), bigram.size()), bigram.size()};
|
|
|
|
programming_freq.back().map[ref] = frequency;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
LOG_TRACE(log, "Programming languages frequencies was added");
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
2023-07-23 15:59:49 +00:00
|
|
|
|
|
|
|
#endif
|