2021-03-18 14:05:28 +00:00
|
|
|
#pragma once
|
2021-03-18 21:57:42 +00:00
|
|
|
#include <Common/TLDListsHolder.h>
|
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
|
|
|
#include <IO/ReadBufferFromFile.h>
|
2021-03-23 18:55:14 +00:00
|
|
|
#include <IO/ReadBufferFromString.h>
|
|
|
|
#include <IO/ReadHelpers.h>
|
|
|
|
#include <IO/readFloatText.h>
|
|
|
|
#include <IO/Operators.h>
|
2021-03-18 21:57:42 +00:00
|
|
|
#include <string_view>
|
2021-03-18 14:05:28 +00:00
|
|
|
#include <string>
|
2021-03-23 18:55:14 +00:00
|
|
|
#include <common/find_symbols.h>
|
2021-03-18 14:05:28 +00:00
|
|
|
#include <fstream>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <cstring>
|
|
|
|
#include <limits>
|
|
|
|
#include <unordered_map>
|
|
|
|
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
class FrequencyHolder
|
|
|
|
{
|
|
|
|
public:
|
2021-03-23 18:55:14 +00:00
|
|
|
using Map = std::unordered_map<UInt16, Float64>;
|
|
|
|
using Container = std::unordered_map<String, Map>;
|
2021-03-18 14:05:28 +00:00
|
|
|
|
|
|
|
|
|
|
|
static FrequencyHolder & getInstance()
|
|
|
|
{
|
|
|
|
static FrequencyHolder instance;
|
|
|
|
return instance;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
void parseDictionaries(const String & pt)
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
|
|
|
is_true = pt;
|
2021-03-18 21:57:42 +00:00
|
|
|
loadEmotionalDict("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/emotional_dictionary_rus.txt");
|
|
|
|
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/charset_freq.txt");
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
void loadEncodingsFrequency(const String & path_to_charset_freq)
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
2021-03-19 09:34:33 +00:00
|
|
|
UInt16 bigram;
|
2021-03-23 18:55:14 +00:00
|
|
|
Float64 frequency;
|
|
|
|
String charset_name;
|
2021-03-19 10:06:21 +00:00
|
|
|
|
2021-03-18 21:57:42 +00:00
|
|
|
ReadBufferFromFile in(path_to_charset_freq);
|
|
|
|
while (!in.eof())
|
|
|
|
{
|
|
|
|
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
|
2021-03-23 18:55:14 +00:00
|
|
|
|
2021-03-18 21:57:42 +00:00
|
|
|
if (newline >= in.buffer().end())
|
|
|
|
break;
|
|
|
|
|
|
|
|
std::string_view line(in.position(), newline - in.position());
|
|
|
|
|
|
|
|
if (line.empty())
|
|
|
|
continue;
|
|
|
|
// Start load new charset
|
|
|
|
if (line.size() > 2 && line[0] == '/' && line[1] == '/')
|
|
|
|
{
|
2021-03-23 18:55:14 +00:00
|
|
|
ReadBufferFromMemory bufline(in.position() + 3, newline - in.position());
|
|
|
|
readString(charset_name, bufline);
|
2021-03-18 21:57:42 +00:00
|
|
|
} else
|
|
|
|
{
|
2021-03-23 18:55:14 +00:00
|
|
|
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
|
|
|
|
readIntText(bigram, buf_line);
|
|
|
|
buf_line.ignore();
|
|
|
|
readFloatText(frequency, buf_line);
|
2021-03-18 21:57:42 +00:00
|
|
|
encodings_freq[charset_name][bigram] = frequency;
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
2021-03-23 18:55:14 +00:00
|
|
|
in.position() = newline + 1;
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
void loadEmotionalDict(const String & path_to_emotional_dict)
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
2021-03-19 09:34:33 +00:00
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
String word;
|
|
|
|
Float64 tonality;
|
|
|
|
|
2021-03-19 09:34:33 +00:00
|
|
|
ReadBufferFromFile in(path_to_emotional_dict);
|
|
|
|
while (!in.eof())
|
|
|
|
{
|
|
|
|
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
if (newline >= in.buffer().end()) { break; }
|
|
|
|
|
|
|
|
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
|
2021-03-19 09:34:33 +00:00
|
|
|
in.position() = newline + 1;
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
readStringUntilWhitespace(word, buf_line);
|
|
|
|
buf_line.ignore();
|
|
|
|
readFloatText(tonality, buf_line);
|
2021-03-19 09:34:33 +00:00
|
|
|
|
|
|
|
emotional_dict[word] = tonality;
|
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
const String & get_path()
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
|
|
|
return is_true;
|
|
|
|
}
|
|
|
|
|
2021-03-19 10:06:21 +00:00
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
const std::unordered_map<String, Float64> getEmotionalDict()
|
2021-03-18 14:05:28 +00:00
|
|
|
{
|
|
|
|
return emotional_dict;
|
|
|
|
}
|
|
|
|
|
2021-03-19 10:06:21 +00:00
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
const Container getEncodingsFrequency()
|
|
|
|
{
|
|
|
|
return encodings_freq;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
protected:
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
String is_true;
|
|
|
|
std::unordered_map<String, Float64> emotional_dict;
|
2021-03-18 14:05:28 +00:00
|
|
|
Container encodings_freq;
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|