ClickHouse/src/Common/FrequencyHolder.h

238 lines
6.3 KiB
C++
Raw Normal View History

2021-03-18 14:05:28 +00:00
#pragma once
2021-03-18 21:57:42 +00:00
#include <Common/StringUtils/StringUtils.h>
#include <IO/ReadBufferFromFile.h>
2021-03-23 18:55:14 +00:00
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/readFloatText.h>
#include <IO/Operators.h>
2021-12-22 21:03:42 +00:00
#include <IO/ZstdInflatingReadBuffer.h>
2021-04-16 11:44:09 +00:00
2021-12-30 02:14:57 +00:00
#include <Common/Arena.h>
#include <base/StringRef.h>
#include <Common/HashTable/HashMap.h>
2021-03-18 21:57:42 +00:00
#include <string_view>
2021-03-18 14:05:28 +00:00
#include <string>
#include <cstring>
#include <unordered_map>
2021-11-17 18:45:52 +00:00
#include <base/logger_useful.h>
2021-12-30 03:15:21 +00:00
#include <Common/getResource.h>
2021-03-23 18:55:14 +00:00
2021-03-18 14:05:28 +00:00
namespace DB
{
2021-12-22 21:03:42 +00:00
/// Error codes referenced by this header.
/// Only declared here (`extern`); the definitions are provided by another translation unit.
namespace ErrorCodes
{
    /// Thrown by the FrequencyHolder loaders when an embedded resource turns out to be empty.
    extern const int FILE_DOESNT_EXIST;
}
2021-03-18 14:05:28 +00:00
class FrequencyHolder
{
2021-03-18 14:05:28 +00:00
public:
2021-12-30 02:14:57 +00:00
struct Language
{
String name;
HashMap<StringRef, Float64> map;
};
struct Encoding
{
String name;
HashMap<UInt16, Float64> map;
};
public:
using Map = HashMap<StringRef, Float64>;
using Container = std::vector<Language>;
using EncodingMap = HashMap<UInt16, Float64>;
using EncodingContainer = std::vector<Encoding>;
2021-03-18 14:05:28 +00:00
static FrequencyHolder & getInstance()
{
static FrequencyHolder instance;
return instance;
}
2021-12-22 21:03:42 +00:00
void loadEncodingsFrequency()
2021-03-18 14:05:28 +00:00
{
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
2021-12-22 21:03:42 +00:00
LOG_TRACE(log, "Loading embedded charset frequencies");
2021-12-22 21:03:42 +00:00
auto resource = getResource("charset_freq.txt.zst");
if (resource.empty())
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded charset frequencies");
2021-03-23 18:55:14 +00:00
2021-12-30 02:14:57 +00:00
String line;
UInt16 bigram;
Float64 frequency;
String charset_name;
2021-12-22 21:03:42 +00:00
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
std::unique_ptr<ReadBuffer> in = std::make_unique<ZstdInflatingReadBuffer>(std::move(buf));
2021-03-18 21:57:42 +00:00
2021-12-22 21:03:42 +00:00
while (!in->eof())
{
readString(line, *in);
++in->position();
2021-03-18 21:57:42 +00:00
if (line.empty())
continue;
2021-12-30 02:14:57 +00:00
2021-12-22 21:03:42 +00:00
ReadBufferFromString buf_line(line);
// Start loading a new charset
if (line.starts_with("//"))
2021-03-18 21:57:42 +00:00
{
2021-12-22 21:03:42 +00:00
buf_line.ignore(3);
readString(charset_name, buf_line);
2021-12-30 02:14:57 +00:00
Encoding enc;
enc.name = charset_name;
encodings_freq.push_back(std::move(enc));
2021-12-22 21:03:42 +00:00
}
else
2021-03-18 21:57:42 +00:00
{
2021-03-23 18:55:14 +00:00
readIntText(bigram, buf_line);
buf_line.ignore();
readFloatText(frequency, buf_line);
2021-12-30 02:14:57 +00:00
encodings_freq.back().map[bigram] = frequency;
2021-03-18 14:05:28 +00:00
}
}
2021-12-22 21:03:42 +00:00
LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
2021-03-18 14:05:28 +00:00
}
2021-12-22 21:03:42 +00:00
void loadEmotionalDict()
2021-03-18 14:05:28 +00:00
{
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
2021-12-22 21:03:42 +00:00
LOG_TRACE(log, "Loading embedded emotional dictionary (RU)");
auto resource = getResource("emotional_dictionary_rus.txt.zst");
if (resource.empty())
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded emotional dictionary");
2021-12-30 02:14:57 +00:00
String line;
String word;
Float64 tonality;
size_t count = 0;
2021-12-22 21:03:42 +00:00
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
std::unique_ptr<ReadBuffer> in = std::make_unique<ZstdInflatingReadBuffer>(std::move(buf));
2021-12-22 21:03:42 +00:00
while (!in->eof())
2021-03-19 09:34:33 +00:00
{
2021-12-22 21:03:42 +00:00
readString(line, *in);
++in->position();
2021-03-19 09:34:33 +00:00
2021-12-22 21:03:42 +00:00
if (line.empty())
continue;
2021-12-30 02:14:57 +00:00
2021-12-22 21:03:42 +00:00
ReadBufferFromString buf_line(line);
2021-04-18 17:03:56 +00:00
2021-03-23 18:55:14 +00:00
readStringUntilWhitespace(word, buf_line);
buf_line.ignore();
readFloatText(tonality, buf_line);
2021-03-19 09:34:33 +00:00
2021-12-30 02:14:57 +00:00
StringRef ref{string_pool.insert(word.data(), word.size()), word.size()};
emotional_dict[ref] = tonality;
2021-04-18 17:03:56 +00:00
++count;
2021-03-18 14:05:28 +00:00
}
2021-04-18 17:03:56 +00:00
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
2021-03-18 14:05:28 +00:00
}
2021-12-22 21:03:42 +00:00
void loadProgrammingFrequency()
2021-04-15 12:02:53 +00:00
{
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
2021-12-22 21:03:42 +00:00
LOG_TRACE(log, "Loading embedded programming languages frequencies loading");
2021-04-15 12:02:53 +00:00
2021-12-22 21:03:42 +00:00
auto resource = getResource("prog_freq.txt.zst");
if (resource.empty())
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded programming languages frequencies");
2021-04-15 12:02:53 +00:00
2021-12-30 02:14:57 +00:00
String line;
String bigram;
Float64 frequency;
String programming_language;
2021-12-22 21:03:42 +00:00
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
std::unique_ptr<ReadBuffer> in = std::make_unique<ZstdInflatingReadBuffer>(std::move(buf));
2021-04-15 12:02:53 +00:00
2021-12-22 21:03:42 +00:00
while (!in->eof())
{
readString(line, *in);
++in->position();
2021-04-15 12:02:53 +00:00
if (line.empty())
continue;
2021-12-30 02:14:57 +00:00
2021-12-22 21:03:42 +00:00
ReadBufferFromString buf_line(line);
// Start loading a new language
if (line.starts_with("//"))
2021-04-15 12:02:53 +00:00
{
2021-12-22 21:03:42 +00:00
buf_line.ignore(3);
readString(programming_language, buf_line);
2021-12-30 02:14:57 +00:00
Language lang;
lang.name = programming_language;
programming_freq.push_back(std::move(lang));
2021-05-23 16:39:40 +00:00
}
else
2021-04-15 12:02:53 +00:00
{
readStringUntilWhitespace(bigram, buf_line);
buf_line.ignore();
readFloatText(frequency, buf_line);
2021-12-30 02:14:57 +00:00
StringRef ref{string_pool.insert(bigram.data(), bigram.size()), bigram.size()};
programming_freq.back().map[ref] = frequency;
2021-04-15 12:02:53 +00:00
}
}
LOG_TRACE(log, "Programming languages frequencies was added");
}
2021-12-30 02:14:57 +00:00
const Map & getEmotionalDict()
2021-03-18 14:05:28 +00:00
{
2021-12-22 21:03:42 +00:00
std::lock_guard lock(mutex);
if (emotional_dict.empty())
loadEmotionalDict();
2021-03-18 14:05:28 +00:00
return emotional_dict;
}
2021-03-19 10:06:21 +00:00
2021-12-30 02:14:57 +00:00
const EncodingContainer & getEncodingsFrequency()
2021-03-18 14:05:28 +00:00
{
2021-12-22 21:03:42 +00:00
std::lock_guard lock(mutex);
if (encodings_freq.empty())
loadEncodingsFrequency();
2021-03-18 14:05:28 +00:00
return encodings_freq;
}
2021-12-30 02:14:57 +00:00
const Container & getProgrammingFrequency()
2021-04-15 12:02:53 +00:00
{
2021-12-22 21:03:42 +00:00
std::lock_guard lock(mutex);
2021-12-30 02:14:57 +00:00
if (programming_freq.empty())
2021-12-22 21:03:42 +00:00
loadProgrammingFrequency();
2021-04-15 12:02:53 +00:00
return programming_freq;
}
2021-03-18 14:05:28 +00:00
2021-04-16 11:44:09 +00:00
private:
2021-12-30 02:14:57 +00:00
Arena string_pool;
2021-03-18 14:05:28 +00:00
2021-12-30 02:14:57 +00:00
Map emotional_dict;
Container programming_freq;
EncodingContainer encodings_freq;
2021-04-16 11:44:09 +00:00
2021-12-22 21:03:42 +00:00
std::mutex mutex;
2021-03-18 14:05:28 +00:00
};
}