Add charset_freq

This commit is contained in:
s-kat 2021-03-19 00:57:42 +03:00
parent 274601232b
commit 473c47abcc
4 changed files with 35783 additions and 57 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -1,9 +1,10 @@
#pragma once
#include <Functions/FunctionsTextClassification.h>
#include <Common/TLDListsHolder.h>
#include <Common/StringUtils/StringUtils.h>
#include <IO/ReadBufferFromFile.h>
#include <string_view>
#include <string>
#include <Functions/FunctionFactory.h>
#include <fstream>
#include <algorithm>
#include <cstring>
@ -31,29 +32,44 @@ public:
// Stores `pt` in `is_true` and then loads both classification dictionaries
// (the emotional dictionary and the per-charset bigram frequencies).
//
// Each dictionary is loaded exactly once. The earlier calls that pointed at
// "/home/sergey/datadump/..." and at the "freq_enc/" directory are dropped: they
// loaded the same dictionaries a second time from superseded locations, and the
// charset-frequency path among them was a directory rather than a file.
//
// TODO(review): both paths are absolute and specific to one developer machine;
// they should presumably come from configuration (possibly from `pt`) - confirm.
void parseDictionaries(const std::string& pt)
{
    is_true = pt;
    loadEmotionalDict("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/emotional_dictionary_rus.txt");
    loadEncodingsFrequency("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/charset_freq.txt");
}
void loadEncodingsFrequency(const std::string path_to_encodings_freq)
void loadEncodingsFrequency(const std::string path_to_charset_freq)
{
std::vector<std::string> languages = {"freq_CP866", "freq_ISO", "freq_WINDOWS-1251", "freq_UTF-8"};
for (std::string & lang : languages) {
std::ifstream file(path_to_encodings_freq + lang + ".txt");
Map new_lang;
UInt16 bigram;
double count;
double total = 0;
while (file >> bigram >> count) {
new_lang[bigram] = count;
total += count;
char charset_name_buf [40];
std::string charset_name;
ReadBufferFromFile in(path_to_charset_freq);
while (!in.eof())
{
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
if (newline >= in.buffer().end())
break;
std::string_view line(in.position(), newline - in.position());
in.position() = newline + 1;
if (line.empty())
continue;
// Start load new charset
if (line.size() > 2 && line[0] == '/' && line[1] == '/')
{
const char * st = line.data();
sscanf(st + 2, "%39s", charset_name_buf);
std::string s(charset_name_buf);
charset_name = s;
} else
{
const char * st = line.data();
UInt16 bigram;
double frequency;
sscanf(st, "%hd %lg", &bigram, &frequency);
encodings_freq[charset_name][bigram] = frequency;
}
for (auto & el : new_lang) {
el.second /= total;
}
encodings_freq[lang] = new_lang;
file.close();
}
}

View File

@ -71,15 +71,26 @@ struct TextClassificationImpl
static double Naive_bayes(std::unordered_map<UInt16, double> standart, std::unordered_map<UInt16, double> model)
{
double res = 1;
double res = 0;
for (auto & el : model) {
if (standart[el.first] != 0) {
res += el.second * log(standart[el.first]);
} else {
res += el.second * log(0.00001);
}
}
return res;
}
static double Simple(std::unordered_map<UInt16, double> standart, std::unordered_map<UInt16, double> model)
{
double res = 0;
for (auto & el : model) {
res += el.second * standart[el.first];
}
return res;
}
static ALWAYS_INLINE size_t readCodePoints(CodePoint * code_points, const char *& pos, const char * end)
{
@ -151,11 +162,6 @@ struct TextClassificationImpl
static std::unordered_map<std::string, std::vector<double>> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
/*
static TextClassificationDictionaries classification_dictionaries;
static std::unordered_map<std::string, std::vector<double>> emotional_dict = classification_dictionaries.emotional_dict;
static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = classification_dictionaries.encodings_frequency;
*/
if (!Emo)
{
@ -163,23 +169,20 @@ struct TextClassificationImpl
std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
size_t len = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
std::string ans;
double count_bigram = data.size() - 1;
// double count_bigram = data.size() - 1;
std::unordered_map<UInt16, double> model;
for (size_t i = 0; i < len; ++i) {
ans += std::to_string(ngram_storage.get()[i]) + " " + std::to_string(static_cast<double>(common_stats.get()[ngram_storage.get()[i]]) / count_bigram) + "\n";
model[ngram_storage.get()[i]] = static_cast<double>(common_stats.get()[ngram_storage.get()[i]]) / count_bigram;
ans += std::to_string(ngram_storage.get()[i]) + " " + std::to_string(static_cast<double>(common_stats.get()[ngram_storage.get()[i]])) + "\n";
model[ngram_storage.get()[i]] = static_cast<double>(common_stats.get()[ngram_storage.get()[i]]);
}
double res1 = L2_distance(encodings_freq["freq_CP866"], model);
double res2 = L2_distance(encodings_freq["freq_ISO"], model);
double res3 = L2_distance(encodings_freq["freq_WINDOWS-1251"], model);
double res4 = L2_distance(encodings_freq["freq_UTF-8"], model);
ans += std::to_string(res1) + " " + std::to_string(res2) + " " + std::to_string(res3) + " " + std::to_string(res4) + "\n";
for (const auto& item : encodings_freq) {
ans += item.first + " " + std::to_string(Simple(item.second, model)) + "\n";
}
res = ans;
}
else
{
double freq = 0;
double count_words = 0;
@ -245,33 +248,17 @@ struct TextClassificationImpl
size_t len = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
std::string prom;
double count_bigram = data.size() - 1;
std::unordered_map<UInt16, double> model1;
std::unordered_map<UInt16, double> model2;
// double count_bigram = data.size() - 1;
std::unordered_map<UInt16, double> model;
for (size_t j = 0; j < len; ++j)
{
model2[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]);
model[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]);
}
for (size_t j = 0; j < len; ++j)
{
model1[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]) / count_bigram;
for (const auto& item : encodings_freq) {
prom += item.first + " " + std::to_string(Simple(item.second, model)) + "\n";
}
double res1 = L2_distance(encodings_freq["freq_CP866"], model1);
double res2 = L2_distance(encodings_freq["freq_ISO"], model1);
double res3 = L2_distance(encodings_freq["freq_WINDOWS-1251"], model1);
double res4 = L2_distance(encodings_freq["freq_UTF-8"], model1);
prom += std::to_string(res1) + " " + std::to_string(res2) + " " + std::to_string(res3) + " " + std::to_string(res4) + "\n";
double res12 = Naive_bayes(encodings_freq["freq_CP866"], model2);
double res22 = Naive_bayes(encodings_freq["freq_ISO"], model2);
double res32 = Naive_bayes(encodings_freq["freq_WINDOWS-1251"], model2);
double res42 = Naive_bayes(encodings_freq["freq_UTF-8"], model2);
prom += std::to_string(res12) + " " + std::to_string(res22) + " " + std::to_string(res32) + " " + std::to_string(res42) + "\n";
const auto ans = prom.c_str();