2021-03-18 14:05:28 +00:00
|
|
|
#pragma once
|
2021-03-18 21:57:42 +00:00
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
|
|
|
#include <IO/ReadBufferFromFile.h>
|
2021-03-23 18:55:14 +00:00
|
|
|
#include <IO/ReadBufferFromString.h>
|
|
|
|
#include <IO/ReadHelpers.h>
|
|
|
|
#include <IO/readFloatText.h>
|
|
|
|
#include <IO/Operators.h>
|
2021-04-16 11:44:09 +00:00
|
|
|
|
2021-03-18 21:57:42 +00:00
|
|
|
#include <string_view>
|
2021-03-18 14:05:28 +00:00
|
|
|
#include <string>
|
|
|
|
#include <cstring>
|
|
|
|
#include <unordered_map>
|
2021-04-14 18:42:33 +00:00
|
|
|
#include <common/logger_useful.h>
|
2021-03-18 14:05:28 +00:00
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
class FrequencyHolder
|
|
|
|
{
|
2021-04-14 18:42:33 +00:00
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
public:
|
2021-03-23 18:55:14 +00:00
|
|
|
using Map = std::unordered_map<UInt16, Float64>;
|
|
|
|
using Container = std::unordered_map<String, Map>;
|
2021-03-18 14:05:28 +00:00
|
|
|
|
|
|
|
static FrequencyHolder & getInstance()
|
|
|
|
{
|
|
|
|
static FrequencyHolder instance;
|
|
|
|
return instance;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-04-16 11:44:09 +00:00
|
|
|
void parseEncodingFrequencies(const String & pt)
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
2021-04-16 11:44:09 +00:00
|
|
|
path_to_enc_freq = pt;
|
2021-05-21 14:01:07 +00:00
|
|
|
loadEncodingsFrequency(pt);
|
|
|
|
//loadEncodingsFrequency("/home/sergey/ClickHouse/programs/server/charset_freq.txt");
|
2021-04-16 11:44:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void parseEmotionalDict(const String & pt)
|
|
|
|
{
|
|
|
|
path_to_emo_dict = pt;
|
2021-05-21 14:01:07 +00:00
|
|
|
loadEmotionalDict(pt);
|
|
|
|
//loadEmotionalDict("/home/sergey/ClickHouse/programs/server/emotional_dictionary_rus.txt");
|
2021-04-16 11:44:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void parseProgrammingFrequency(const String & pt)
|
|
|
|
{
|
|
|
|
path_to_prog_freq = pt;
|
2021-05-21 14:01:07 +00:00
|
|
|
loadProgrammingFrequency(pt);
|
|
|
|
//loadProgrammingFrequency("/home/sergey/ClickHouse/programs/server/prog_freq.txt");
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
void loadEncodingsFrequency(const String & path_to_charset_freq)
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
2021-03-19 09:34:33 +00:00
|
|
|
UInt16 bigram;
|
2021-03-23 18:55:14 +00:00
|
|
|
Float64 frequency;
|
|
|
|
String charset_name;
|
2021-03-19 10:06:21 +00:00
|
|
|
|
2021-04-14 18:42:33 +00:00
|
|
|
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
|
|
|
|
|
|
|
|
LOG_TRACE(log, "Charset frequencies loading from {}", path_to_charset_freq);
|
|
|
|
|
2021-03-18 21:57:42 +00:00
|
|
|
ReadBufferFromFile in(path_to_charset_freq);
|
|
|
|
while (!in.eof())
|
|
|
|
{
|
|
|
|
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
|
2021-03-23 18:55:14 +00:00
|
|
|
|
2021-03-18 21:57:42 +00:00
|
|
|
if (newline >= in.buffer().end())
|
|
|
|
break;
|
|
|
|
|
|
|
|
std::string_view line(in.position(), newline - in.position());
|
|
|
|
|
|
|
|
if (line.empty())
|
|
|
|
continue;
|
|
|
|
// Start load new charset
|
|
|
|
if (line.size() > 2 && line[0] == '/' && line[1] == '/')
|
|
|
|
{
|
2021-03-23 18:55:14 +00:00
|
|
|
ReadBufferFromMemory bufline(in.position() + 3, newline - in.position());
|
|
|
|
readString(charset_name, bufline);
|
2021-03-18 21:57:42 +00:00
|
|
|
} else
|
|
|
|
{
|
2021-03-23 18:55:14 +00:00
|
|
|
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
|
|
|
|
readIntText(bigram, buf_line);
|
|
|
|
buf_line.ignore();
|
|
|
|
readFloatText(frequency, buf_line);
|
2021-03-18 21:57:42 +00:00
|
|
|
encodings_freq[charset_name][bigram] = frequency;
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
2021-03-23 18:55:14 +00:00
|
|
|
in.position() = newline + 1;
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
2021-04-14 18:42:33 +00:00
|
|
|
LOG_TRACE(log, "Charset frequencies was added");
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
void loadEmotionalDict(const String & path_to_emotional_dict)
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
2021-03-19 09:34:33 +00:00
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
String word;
|
|
|
|
Float64 tonality;
|
|
|
|
|
2021-04-14 18:42:33 +00:00
|
|
|
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
|
|
|
|
LOG_TRACE(log, "Emotional dictionary loading from {}", path_to_emotional_dict);
|
|
|
|
|
2021-04-18 17:03:56 +00:00
|
|
|
size_t buf_size = 10000000;
|
|
|
|
ReadBufferFromFile in(path_to_emotional_dict, buf_size);
|
|
|
|
size_t count = 0;
|
2021-03-19 09:34:33 +00:00
|
|
|
while (!in.eof())
|
|
|
|
{
|
|
|
|
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
|
2021-03-19 09:34:33 +00:00
|
|
|
in.position() = newline + 1;
|
|
|
|
|
2021-04-18 17:03:56 +00:00
|
|
|
if (newline >= in.buffer().end())
|
|
|
|
break;
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
readStringUntilWhitespace(word, buf_line);
|
|
|
|
buf_line.ignore();
|
|
|
|
readFloatText(tonality, buf_line);
|
2021-03-19 09:34:33 +00:00
|
|
|
|
|
|
|
emotional_dict[word] = tonality;
|
2021-04-18 17:03:56 +00:00
|
|
|
++count;
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
2021-04-18 17:03:56 +00:00
|
|
|
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-04-15 12:02:53 +00:00
|
|
|
void loadProgrammingFrequency(const String & path_to_programming_freq)
|
|
|
|
{
|
|
|
|
String bigram;
|
|
|
|
Float64 frequency;
|
|
|
|
String programming_language;
|
|
|
|
|
|
|
|
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
|
|
|
|
|
2021-05-21 13:58:48 +00:00
|
|
|
LOG_TRACE(log, "Programming languages frequencies loading from {}", path_to_programming_freq);
|
2021-04-15 12:02:53 +00:00
|
|
|
|
2021-04-18 17:03:56 +00:00
|
|
|
size_t buf_size = 10000000;
|
|
|
|
ReadBufferFromFile in(path_to_programming_freq, buf_size);
|
2021-04-15 12:02:53 +00:00
|
|
|
while (!in.eof())
|
|
|
|
{
|
|
|
|
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
|
|
|
|
|
|
|
|
if (newline >= in.buffer().end())
|
|
|
|
break;
|
|
|
|
|
|
|
|
std::string_view line(in.position(), newline - in.position());
|
|
|
|
|
|
|
|
if (line.empty())
|
|
|
|
continue;
|
2021-05-21 13:48:18 +00:00
|
|
|
// Start load new language
|
2021-04-15 12:02:53 +00:00
|
|
|
if (line.size() > 2 && line[0] == '/' && line[1] == '/')
|
|
|
|
{
|
|
|
|
ReadBufferFromMemory bufline(in.position() + 3, newline - in.position());
|
|
|
|
readString(programming_language, bufline);
|
2021-05-21 13:48:18 +00:00
|
|
|
LOG_TRACE(log, "Loading {}", programming_language);
|
2021-04-15 12:02:53 +00:00
|
|
|
} else
|
|
|
|
{
|
|
|
|
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
|
|
|
|
readStringUntilWhitespace(bigram, buf_line);
|
|
|
|
buf_line.ignore();
|
|
|
|
readFloatText(frequency, buf_line);
|
|
|
|
programming_freq[programming_language][bigram] = frequency;
|
2021-05-21 13:48:18 +00:00
|
|
|
LOG_TRACE(log, "Word {}", bigram);
|
2021-04-15 12:02:53 +00:00
|
|
|
}
|
|
|
|
in.position() = newline + 1;
|
|
|
|
}
|
|
|
|
LOG_TRACE(log, "Programming languages frequencies was added");
|
|
|
|
}
|
|
|
|
|
2021-04-14 18:42:33 +00:00
|
|
|
const std::unordered_map<String, Float64> & getEmotionalDict()
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
|
|
|
return emotional_dict;
|
|
|
|
}
|
|
|
|
|
2021-03-19 10:06:21 +00:00
|
|
|
|
2021-04-14 18:42:33 +00:00
|
|
|
const Container & getEncodingsFrequency()
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
|
|
|
return encodings_freq;
|
|
|
|
}
|
|
|
|
|
2021-04-15 12:02:53 +00:00
|
|
|
const std::unordered_map<String, std::unordered_map<String, Float64>> & getProgrammingFrequency()
|
|
|
|
{
|
|
|
|
return programming_freq;
|
|
|
|
}
|
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
|
2021-04-16 11:44:09 +00:00
|
|
|
private:
|
2021-03-18 14:05:28 +00:00
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
std::unordered_map<String, Float64> emotional_dict;
|
2021-03-18 14:05:28 +00:00
|
|
|
Container encodings_freq;
|
2021-04-15 12:02:53 +00:00
|
|
|
std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq;
|
2021-04-16 11:44:09 +00:00
|
|
|
|
|
|
|
String path_to_emo_dict;
|
|
|
|
String path_to_enc_freq;
|
|
|
|
String path_to_prog_freq;
|
2021-03-18 14:05:28 +00:00
|
|
|
};
|
|
|
|
}
|