2021-03-18 14:05:28 +00:00
|
|
|
#pragma once
|
2021-03-18 21:57:42 +00:00
|
|
|
#include <Common/TLDListsHolder.h>
|
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
|
|
|
#include <IO/ReadBufferFromFile.h>
|
|
|
|
#include <string_view>
|
2021-03-18 14:05:28 +00:00
|
|
|
#include <string>
|
|
|
|
#include <fstream>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <cstring>
|
|
|
|
#include <limits>
|
|
|
|
#include <unordered_map>
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
class FrequencyHolder
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
using Map = std::unordered_map<UInt16, double>;
|
|
|
|
using Container = std::unordered_map<std::string, Map>;
|
|
|
|
|
|
|
|
|
|
|
|
static FrequencyHolder & getInstance()
|
|
|
|
{
|
|
|
|
static FrequencyHolder instance;
|
|
|
|
return instance;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void parseDictionaries(const std::string& pt)
|
|
|
|
{
|
|
|
|
is_true = pt;
|
2021-03-18 21:57:42 +00:00
|
|
|
loadEmotionalDict("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/emotional_dictionary_rus.txt");
|
|
|
|
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/charset_freq.txt");
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-03-18 21:57:42 +00:00
|
|
|
void loadEncodingsFrequency(const std::string path_to_charset_freq)
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
2021-03-18 21:57:42 +00:00
|
|
|
char charset_name_buf [40];
|
2021-03-19 09:34:33 +00:00
|
|
|
UInt16 bigram;
|
|
|
|
double frequency;
|
2021-03-18 21:57:42 +00:00
|
|
|
std::string charset_name;
|
2021-03-19 10:06:21 +00:00
|
|
|
|
2021-03-18 21:57:42 +00:00
|
|
|
ReadBufferFromFile in(path_to_charset_freq);
|
|
|
|
while (!in.eof())
|
|
|
|
{
|
|
|
|
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
|
|
|
|
|
|
|
|
if (newline >= in.buffer().end())
|
|
|
|
break;
|
|
|
|
|
|
|
|
std::string_view line(in.position(), newline - in.position());
|
|
|
|
in.position() = newline + 1;
|
|
|
|
|
|
|
|
if (line.empty())
|
|
|
|
continue;
|
|
|
|
// Start load new charset
|
|
|
|
if (line.size() > 2 && line[0] == '/' && line[1] == '/')
|
|
|
|
{
|
|
|
|
const char * st = line.data();
|
|
|
|
sscanf(st + 2, "%39s", charset_name_buf);
|
|
|
|
std::string s(charset_name_buf);
|
|
|
|
charset_name = s;
|
|
|
|
} else
|
|
|
|
{
|
|
|
|
const char * st = line.data();
|
|
|
|
sscanf(st, "%hd %lg", &bigram, &frequency);
|
|
|
|
encodings_freq[charset_name][bigram] = frequency;
|
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void loadEmotionalDict(const std::string path_to_emotional_dict)
|
|
|
|
{
|
2021-03-19 09:34:33 +00:00
|
|
|
|
|
|
|
char word_buf [40];
|
|
|
|
double tonality;
|
|
|
|
ReadBufferFromFile in(path_to_emotional_dict);
|
|
|
|
while (!in.eof())
|
|
|
|
{
|
|
|
|
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
|
|
|
|
|
|
|
|
if (newline >= in.buffer().end())
|
|
|
|
break;
|
|
|
|
|
|
|
|
std::string_view line(in.position(), newline - in.position());
|
|
|
|
in.position() = newline + 1;
|
|
|
|
|
|
|
|
if (line.empty())
|
|
|
|
continue;
|
|
|
|
const char * st = line.data();
|
|
|
|
sscanf(st, "%39s %lg", word_buf, &tonality);
|
|
|
|
std::string word(word_buf);
|
|
|
|
|
|
|
|
emotional_dict[word] = tonality;
|
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const std::string & get_path()
|
|
|
|
{
|
|
|
|
return is_true;
|
|
|
|
}
|
|
|
|
|
2021-03-19 10:06:21 +00:00
|
|
|
|
|
|
|
const std::unordered_map<std::string, double> getEmotionalDict()
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
|
|
|
return emotional_dict;
|
|
|
|
}
|
|
|
|
|
2021-03-19 10:06:21 +00:00
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
const Container getEncodingsFrequency()
|
|
|
|
{
|
|
|
|
return encodings_freq;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
protected:
|
|
|
|
|
|
|
|
std::string is_true;
|
2021-03-19 09:34:33 +00:00
|
|
|
std::unordered_map<std::string, double> emotional_dict;
|
2021-03-18 14:05:28 +00:00
|
|
|
Container encodings_freq;
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|