2021-03-18 14:05:28 +00:00
|
|
|
#pragma once
|
2022-01-10 15:36:32 +00:00
|
|
|
|
2022-03-02 14:46:06 +00:00
|
|
|
#include <base/StringRef.h>
|
2022-04-27 15:05:45 +00:00
|
|
|
#include <Common/logger_useful.h>
|
2022-03-02 14:46:06 +00:00
|
|
|
|
|
|
|
#include <string_view>
|
|
|
|
#include <unordered_map>
|
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
#include <Common/Arena.h>
|
|
|
|
#include <Common/getResource.h>
|
|
|
|
#include <Common/HashTable/HashMap.h>
|
2021-03-18 21:57:42 +00:00
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
|
|
|
#include <IO/ReadBufferFromFile.h>
|
2021-03-23 18:55:14 +00:00
|
|
|
#include <IO/ReadBufferFromString.h>
|
|
|
|
#include <IO/ReadHelpers.h>
|
|
|
|
#include <IO/readFloatText.h>
|
2021-12-22 21:03:42 +00:00
|
|
|
#include <IO/ZstdInflatingReadBuffer.h>
|
2021-04-16 11:44:09 +00:00
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
namespace ErrorCodes
{
    /// Used by FrequencyHolder when an embedded dictionary resource turns out to be empty.
    extern const int FILE_DOESNT_EXIST;
}
|
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
/// FrequencyHolder class is responsible for storing and loading dictionaries
|
|
|
|
/// needed for text classification functions:
|
|
|
|
///
|
|
|
|
/// 1. detectLanguageUnknown
|
|
|
|
/// 2. detectCharset
|
|
|
|
/// 3. detectTonality
|
|
|
|
/// 4. detectProgrammingLanguage
|
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
class FrequencyHolder
|
|
|
|
{
|
|
|
|
public:
|
2021-12-30 02:14:57 +00:00
|
|
|
struct Language
|
|
|
|
{
|
|
|
|
String name;
|
|
|
|
HashMap<StringRef, Float64> map;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct Encoding
|
|
|
|
{
|
|
|
|
String name;
|
2022-01-10 15:36:32 +00:00
|
|
|
String lang;
|
2021-12-30 02:14:57 +00:00
|
|
|
HashMap<UInt16, Float64> map;
|
|
|
|
};
|
|
|
|
|
|
|
|
public:
|
|
|
|
using Map = HashMap<StringRef, Float64>;
|
|
|
|
using Container = std::vector<Language>;
|
2022-03-02 14:46:06 +00:00
|
|
|
|
2021-12-30 02:14:57 +00:00
|
|
|
using EncodingMap = HashMap<UInt16, Float64>;
|
|
|
|
using EncodingContainer = std::vector<Encoding>;
|
2021-03-18 14:05:28 +00:00
|
|
|
|
|
|
|
static FrequencyHolder & getInstance()
|
|
|
|
{
|
|
|
|
static FrequencyHolder instance;
|
|
|
|
return instance;
|
|
|
|
}
|
|
|
|
|
2022-03-15 15:43:31 +00:00
|
|
|
const Map & getEmotionalDict() const
|
2022-03-02 14:46:06 +00:00
|
|
|
{
|
|
|
|
return emotional_dict;
|
|
|
|
}
|
|
|
|
|
2022-03-15 15:43:31 +00:00
|
|
|
const EncodingContainer & getEncodingsFrequency() const
|
2022-03-02 14:46:06 +00:00
|
|
|
{
|
|
|
|
return encodings_freq;
|
|
|
|
}
|
|
|
|
|
2022-03-15 15:43:31 +00:00
|
|
|
const Container & getProgrammingFrequency() const
|
2022-03-02 14:46:06 +00:00
|
|
|
{
|
|
|
|
return programming_freq;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
2022-03-16 16:16:08 +00:00
|
|
|
FrequencyHolder()
|
|
|
|
{
|
2022-03-15 15:43:31 +00:00
|
|
|
loadEmotionalDict();
|
|
|
|
loadEncodingsFrequency();
|
|
|
|
loadProgrammingFrequency();
|
|
|
|
}
|
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
void loadEncodingsFrequency()
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
2021-04-14 18:42:33 +00:00
|
|
|
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
|
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
LOG_TRACE(log, "Loading embedded charset frequencies");
|
2021-04-14 18:42:33 +00:00
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
auto resource = getResource("charset.zst");
|
2021-12-22 21:03:42 +00:00
|
|
|
if (resource.empty())
|
|
|
|
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded charset frequencies");
|
2021-03-23 18:55:14 +00:00
|
|
|
|
2021-12-30 02:14:57 +00:00
|
|
|
String line;
|
|
|
|
UInt16 bigram;
|
|
|
|
Float64 frequency;
|
|
|
|
String charset_name;
|
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
2022-01-10 15:36:32 +00:00
|
|
|
ZstdInflatingReadBuffer in(std::move(buf));
|
2021-03-18 21:57:42 +00:00
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
while (!in.eof())
|
2021-12-22 21:03:42 +00:00
|
|
|
{
|
2022-01-10 15:36:32 +00:00
|
|
|
readString(line, in);
|
|
|
|
in.ignore();
|
2021-03-18 21:57:42 +00:00
|
|
|
|
|
|
|
if (line.empty())
|
|
|
|
continue;
|
2021-12-30 02:14:57 +00:00
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
ReadBufferFromString buf_line(line);
|
|
|
|
|
|
|
|
// Start loading a new charset
|
2022-01-10 15:36:32 +00:00
|
|
|
if (line.starts_with("// "))
|
2021-03-18 21:57:42 +00:00
|
|
|
{
|
2022-01-10 15:36:32 +00:00
|
|
|
// Skip "// "
|
2021-12-22 21:03:42 +00:00
|
|
|
buf_line.ignore(3);
|
|
|
|
readString(charset_name, buf_line);
|
2021-12-30 02:14:57 +00:00
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
/* In our dictionary we have lines with form: <Language>_<Charset>
|
|
|
|
* If we need to find language of data, we return <Language>
|
|
|
|
* If we need to find charset of data, we return <Charset>.
|
|
|
|
*/
|
|
|
|
size_t sep = charset_name.find('_');
|
|
|
|
|
2021-12-30 02:14:57 +00:00
|
|
|
Encoding enc;
|
2022-01-10 15:36:32 +00:00
|
|
|
enc.lang = charset_name.substr(0, sep);
|
|
|
|
enc.name = charset_name.substr(sep + 1);
|
2021-12-30 02:14:57 +00:00
|
|
|
encodings_freq.push_back(std::move(enc));
|
2021-12-22 21:03:42 +00:00
|
|
|
}
|
|
|
|
else
|
2021-03-18 21:57:42 +00:00
|
|
|
{
|
2021-03-23 18:55:14 +00:00
|
|
|
readIntText(bigram, buf_line);
|
|
|
|
buf_line.ignore();
|
|
|
|
readFloatText(frequency, buf_line);
|
2021-12-30 02:14:57 +00:00
|
|
|
|
|
|
|
encodings_freq.back().map[bigram] = frequency;
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
}
|
2021-12-22 21:03:42 +00:00
|
|
|
LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
void loadEmotionalDict()
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
2021-04-14 18:42:33 +00:00
|
|
|
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
|
2022-01-10 15:36:32 +00:00
|
|
|
LOG_TRACE(log, "Loading embedded emotional dictionary");
|
2021-12-22 21:03:42 +00:00
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
auto resource = getResource("tonality_ru.zst");
|
2021-12-22 21:03:42 +00:00
|
|
|
if (resource.empty())
|
|
|
|
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded emotional dictionary");
|
|
|
|
|
2021-12-30 02:14:57 +00:00
|
|
|
String line;
|
|
|
|
String word;
|
|
|
|
Float64 tonality;
|
|
|
|
size_t count = 0;
|
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
2022-01-10 15:36:32 +00:00
|
|
|
ZstdInflatingReadBuffer in(std::move(buf));
|
2021-04-14 18:42:33 +00:00
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
while (!in.eof())
|
2021-03-19 09:34:33 +00:00
|
|
|
{
|
2022-01-10 15:36:32 +00:00
|
|
|
readString(line, in);
|
|
|
|
in.ignore();
|
2021-03-19 09:34:33 +00:00
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
if (line.empty())
|
|
|
|
continue;
|
2021-12-30 02:14:57 +00:00
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
ReadBufferFromString buf_line(line);
|
2021-04-18 17:03:56 +00:00
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
readStringUntilWhitespace(word, buf_line);
|
|
|
|
buf_line.ignore();
|
|
|
|
readFloatText(tonality, buf_line);
|
2021-03-19 09:34:33 +00:00
|
|
|
|
2021-12-30 02:14:57 +00:00
|
|
|
StringRef ref{string_pool.insert(word.data(), word.size()), word.size()};
|
|
|
|
emotional_dict[ref] = tonality;
|
2021-04-18 17:03:56 +00:00
|
|
|
++count;
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
2021-04-18 17:03:56 +00:00
|
|
|
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
void loadProgrammingFrequency()
|
2021-04-15 12:02:53 +00:00
|
|
|
{
|
|
|
|
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
|
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
LOG_TRACE(log, "Loading embedded programming languages frequencies loading");
|
2021-04-15 12:02:53 +00:00
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
auto resource = getResource("programming.zst");
|
2021-12-22 21:03:42 +00:00
|
|
|
if (resource.empty())
|
|
|
|
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded programming languages frequencies");
|
2021-04-15 12:02:53 +00:00
|
|
|
|
2021-12-30 02:14:57 +00:00
|
|
|
String line;
|
|
|
|
String bigram;
|
|
|
|
Float64 frequency;
|
|
|
|
String programming_language;
|
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
2022-01-10 15:36:32 +00:00
|
|
|
ZstdInflatingReadBuffer in(std::move(buf));
|
2021-04-15 12:02:53 +00:00
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
while (!in.eof())
|
2021-12-22 21:03:42 +00:00
|
|
|
{
|
2022-01-10 15:36:32 +00:00
|
|
|
readString(line, in);
|
|
|
|
in.ignore();
|
2021-04-15 12:02:53 +00:00
|
|
|
|
|
|
|
if (line.empty())
|
|
|
|
continue;
|
2021-12-30 02:14:57 +00:00
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
ReadBufferFromString buf_line(line);
|
|
|
|
|
|
|
|
// Start loading a new language
|
2022-01-10 15:36:32 +00:00
|
|
|
if (line.starts_with("// "))
|
2021-04-15 12:02:53 +00:00
|
|
|
{
|
2022-01-10 15:36:32 +00:00
|
|
|
// Skip "// "
|
2021-12-22 21:03:42 +00:00
|
|
|
buf_line.ignore(3);
|
|
|
|
readString(programming_language, buf_line);
|
2021-12-30 02:14:57 +00:00
|
|
|
|
|
|
|
Language lang;
|
|
|
|
lang.name = programming_language;
|
|
|
|
programming_freq.push_back(std::move(lang));
|
2021-05-23 16:39:40 +00:00
|
|
|
}
|
|
|
|
else
|
2021-04-15 12:02:53 +00:00
|
|
|
{
|
|
|
|
readStringUntilWhitespace(bigram, buf_line);
|
|
|
|
buf_line.ignore();
|
|
|
|
readFloatText(frequency, buf_line);
|
2021-12-30 02:14:57 +00:00
|
|
|
|
|
|
|
StringRef ref{string_pool.insert(bigram.data(), bigram.size()), bigram.size()};
|
|
|
|
programming_freq.back().map[ref] = frequency;
|
2021-04-15 12:02:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
LOG_TRACE(log, "Programming languages frequencies was added");
|
|
|
|
}
|
|
|
|
|
2021-12-30 02:14:57 +00:00
|
|
|
Arena string_pool;
|
2021-03-18 14:05:28 +00:00
|
|
|
|
2021-12-30 02:14:57 +00:00
|
|
|
Map emotional_dict;
|
|
|
|
Container programming_freq;
|
|
|
|
EncodingContainer encodings_freq;
|
2021-03-18 14:05:28 +00:00
|
|
|
};
|
|
|
|
}
|