ClickHouse/src/Common/FrequencyHolder.h

190 lines
5.5 KiB
C++
Raw Normal View History

2021-03-18 14:05:28 +00:00
#pragma once
2021-03-18 21:57:42 +00:00
#include <Common/TLDListsHolder.h>
#include <Common/StringUtils/StringUtils.h>
#include <IO/ReadBufferFromFile.h>
2021-03-23 18:55:14 +00:00
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/readFloatText.h>
#include <IO/Operators.h>
2021-03-18 21:57:42 +00:00
#include <string_view>
2021-03-18 14:05:28 +00:00
#include <string>
2021-03-23 18:55:14 +00:00
#include <common/find_symbols.h>
2021-03-18 14:05:28 +00:00
#include <fstream>
#include <algorithm>
#include <cstring>
#include <limits>
#include <unordered_map>
#include <common/logger_useful.h>
2021-03-18 14:05:28 +00:00
2021-03-23 18:55:14 +00:00
2021-03-18 14:05:28 +00:00
namespace DB
{
class FrequencyHolder
{
2021-03-18 14:05:28 +00:00
public:
2021-03-23 18:55:14 +00:00
using Map = std::unordered_map<UInt16, Float64>;
using Container = std::unordered_map<String, Map>;
2021-03-18 14:05:28 +00:00
static FrequencyHolder & getInstance()
{
static FrequencyHolder instance;
return instance;
}
2021-03-23 18:55:14 +00:00
void parseDictionaries(const String & pt)
2021-03-18 14:05:28 +00:00
{
is_true = pt;
loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
2021-04-15 12:02:53 +00:00
loadProgrammingFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/programming_freq.txt");
2021-03-18 14:05:28 +00:00
}
2021-03-23 18:55:14 +00:00
void loadEncodingsFrequency(const String & path_to_charset_freq)
2021-03-18 14:05:28 +00:00
{
2021-03-19 09:34:33 +00:00
UInt16 bigram;
2021-03-23 18:55:14 +00:00
Float64 frequency;
String charset_name;
2021-03-19 10:06:21 +00:00
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
LOG_TRACE(log, "Charset frequencies loading from {}", path_to_charset_freq);
2021-03-18 21:57:42 +00:00
ReadBufferFromFile in(path_to_charset_freq);
while (!in.eof())
{
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
2021-03-23 18:55:14 +00:00
2021-03-18 21:57:42 +00:00
if (newline >= in.buffer().end())
break;
std::string_view line(in.position(), newline - in.position());
if (line.empty())
continue;
// Start load new charset
if (line.size() > 2 && line[0] == '/' && line[1] == '/')
{
2021-03-23 18:55:14 +00:00
ReadBufferFromMemory bufline(in.position() + 3, newline - in.position());
readString(charset_name, bufline);
2021-03-18 21:57:42 +00:00
} else
{
2021-03-23 18:55:14 +00:00
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
readIntText(bigram, buf_line);
buf_line.ignore();
readFloatText(frequency, buf_line);
2021-03-18 21:57:42 +00:00
encodings_freq[charset_name][bigram] = frequency;
2021-03-18 14:05:28 +00:00
}
2021-03-23 18:55:14 +00:00
in.position() = newline + 1;
2021-03-18 14:05:28 +00:00
}
LOG_TRACE(log, "Charset frequencies was added");
2021-03-18 14:05:28 +00:00
}
2021-03-23 18:55:14 +00:00
void loadEmotionalDict(const String & path_to_emotional_dict)
2021-03-18 14:05:28 +00:00
{
2021-03-19 09:34:33 +00:00
2021-03-23 18:55:14 +00:00
String word;
Float64 tonality;
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
LOG_TRACE(log, "Emotional dictionary loading from {}", path_to_emotional_dict);
2021-03-19 09:34:33 +00:00
ReadBufferFromFile in(path_to_emotional_dict);
while (!in.eof())
{
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
2021-03-23 18:55:14 +00:00
if (newline >= in.buffer().end()) { break; }
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
2021-03-19 09:34:33 +00:00
in.position() = newline + 1;
2021-03-23 18:55:14 +00:00
readStringUntilWhitespace(word, buf_line);
buf_line.ignore();
readFloatText(tonality, buf_line);
2021-03-19 09:34:33 +00:00
emotional_dict[word] = tonality;
2021-03-18 14:05:28 +00:00
}
LOG_TRACE(log, "Emotional dictionary was added");
2021-03-18 14:05:28 +00:00
}
2021-04-15 12:02:53 +00:00
void loadProgrammingFrequency(const String & path_to_programming_freq)
{
String bigram;
Float64 frequency;
String programming_language;
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
LOG_TRACE(log, "Programming langugages frequencies loading from {}", path_to_programming_freq);
ReadBufferFromFile in(path_to_programming_freq);
while (!in.eof())
{
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
if (newline >= in.buffer().end())
break;
std::string_view line(in.position(), newline - in.position());
if (line.empty())
continue;
// Start load new charset
if (line.size() > 2 && line[0] == '/' && line[1] == '/')
{
ReadBufferFromMemory bufline(in.position() + 3, newline - in.position());
readString(programming_language, bufline);
} else
{
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
readStringUntilWhitespace(bigram, buf_line);
buf_line.ignore();
readFloatText(frequency, buf_line);
programming_freq[programming_language][bigram] = frequency;
}
in.position() = newline + 1;
}
LOG_TRACE(log, "Programming languages frequencies was added");
}
2021-03-23 18:55:14 +00:00
const String & get_path()
2021-03-18 14:05:28 +00:00
{
return is_true;
}
2021-03-19 10:06:21 +00:00
const std::unordered_map<String, Float64> & getEmotionalDict()
2021-03-18 14:05:28 +00:00
{
return emotional_dict;
}
2021-03-19 10:06:21 +00:00
const Container & getEncodingsFrequency()
2021-03-18 14:05:28 +00:00
{
return encodings_freq;
}
2021-04-15 12:02:53 +00:00
const std::unordered_map<String, std::unordered_map<String, Float64>> & getProgrammingFrequency()
{
return programming_freq;
}
2021-03-18 14:05:28 +00:00
protected:
2021-03-23 18:55:14 +00:00
String is_true;
std::unordered_map<String, Float64> emotional_dict;
2021-03-18 14:05:28 +00:00
Container encodings_freq;
2021-04-15 12:02:53 +00:00
std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq;
2021-03-18 14:05:28 +00:00
};
}