ClickHouse/src/Common/FrequencyHolder.h

131 lines
3.2 KiB
C++
Raw Normal View History

2021-03-18 14:05:28 +00:00
#pragma once
2021-03-18 21:57:42 +00:00
#include <Common/TLDListsHolder.h>
#include <Common/StringUtils/StringUtils.h>
#include <IO/ReadBufferFromFile.h>
#include <string_view>
2021-03-18 14:05:28 +00:00
#include <string>
#include <fstream>
#include <algorithm>
#include <cstring>
#include <limits>
#include <unordered_map>
namespace DB
{
class FrequencyHolder
{
public:
using Map = std::unordered_map<UInt16, double>;
using Container = std::unordered_map<std::string, Map>;
static FrequencyHolder & getInstance()
{
static FrequencyHolder instance;
return instance;
}
void parseDictionaries(const std::string& pt)
{
is_true = pt;
2021-03-18 21:57:42 +00:00
loadEmotionalDict("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/emotional_dictionary_rus.txt");
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/charset_freq.txt");
2021-03-18 14:05:28 +00:00
}
2021-03-18 21:57:42 +00:00
void loadEncodingsFrequency(const std::string path_to_charset_freq)
2021-03-18 14:05:28 +00:00
{
2021-03-18 21:57:42 +00:00
char charset_name_buf [40];
2021-03-19 09:34:33 +00:00
UInt16 bigram;
double frequency;
2021-03-18 21:57:42 +00:00
std::string charset_name;
2021-03-19 10:06:21 +00:00
2021-03-18 21:57:42 +00:00
ReadBufferFromFile in(path_to_charset_freq);
while (!in.eof())
{
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
if (newline >= in.buffer().end())
break;
std::string_view line(in.position(), newline - in.position());
in.position() = newline + 1;
if (line.empty())
continue;
// Start load new charset
if (line.size() > 2 && line[0] == '/' && line[1] == '/')
{
const char * st = line.data();
sscanf(st + 2, "%39s", charset_name_buf);
std::string s(charset_name_buf);
charset_name = s;
} else
{
const char * st = line.data();
sscanf(st, "%hd %lg", &bigram, &frequency);
encodings_freq[charset_name][bigram] = frequency;
2021-03-18 14:05:28 +00:00
}
}
}
void loadEmotionalDict(const std::string path_to_emotional_dict)
{
2021-03-19 09:34:33 +00:00
char word_buf [40];
double tonality;
ReadBufferFromFile in(path_to_emotional_dict);
while (!in.eof())
{
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
if (newline >= in.buffer().end())
break;
std::string_view line(in.position(), newline - in.position());
in.position() = newline + 1;
if (line.empty())
continue;
const char * st = line.data();
sscanf(st, "%39s %lg", word_buf, &tonality);
std::string word(word_buf);
emotional_dict[word] = tonality;
2021-03-18 14:05:28 +00:00
}
}
const std::string & get_path()
{
return is_true;
}
2021-03-19 10:06:21 +00:00
const std::unordered_map<std::string, double> getEmotionalDict()
2021-03-18 14:05:28 +00:00
{
return emotional_dict;
}
2021-03-19 10:06:21 +00:00
2021-03-18 14:05:28 +00:00
const Container getEncodingsFrequency()
{
return encodings_freq;
}
protected:
std::string is_true;
2021-03-19 09:34:33 +00:00
std::unordered_map<std::string, double> emotional_dict;
2021-03-18 14:05:28 +00:00
Container encodings_freq;
};
}