Major improvements

s-kat 2021-03-18 17:05:28 +03:00
parent 4c44e21c29
commit ff30b40bf6
13 changed files with 13783 additions and 37 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -70,6 +70,8 @@
#include <Server/MySQLHandlerFactory.h>
#include <Server/PostgreSQLHandlerFactory.h>
#include <Server/ProtocolServerAdapter.h>
#include <Functions/FrequencyHolder.h>
#include <Functions/FunctionsTextClassification.h>
#if !defined(ARCADIA_BUILD)
@@ -631,6 +633,12 @@ int Server::main(const std::vector<std::string> & /*args*/)
TLDListsHolder::getInstance().parseConfig(top_level_domains_path, config());
}
/// my test
{
const std::string & encode_frequency_path = config().getString("encodings_frequency_path", path + "textclassification_frequency/");
FrequencyHolder::getInstance().parseDictionaries(encode_frequency_path);
}
{
Poco::File(path + "data/").createDirectories();
Poco::File(path + "metadata/").createDirectories();

View File

@@ -5,4 +5,5 @@
<format_schema_path replace="replace">./format_schemas/</format_schema_path>
<access_control_path replace="replace">./access/</access_control_path>
<top_level_domains_path replace="replace">./top_level_domains/</top_level_domains_path>
<encodings_frequency_path replace="replace">./encoding_frequency/</encodings_frequency_path>
</yandex>

View File

@@ -130,7 +130,7 @@
<password></password>
</interserver_http_credentials>-->
<!-- Listen specified address.
/home/sergey/datadump/test.txt <!-- Listen specified address.
Use :: (wildcard IPv6 address), if you want to accept connections both with IPv4 and IPv6 from everywhere.
Notes:
If you open connections from wildcard address, make sure that at least one of the following measures applied:
@@ -842,6 +842,11 @@
Changes will not be applied w/o server restart.
Path to the list is under top_level_domains_path (see above).
-->
<!-- MY CHANGES -->
<encodings_frequency_path>/var/lib/clickhouse/encodings_frequency/</encodings_frequency_path>
<top_level_domains_lists>
<!--
<public_suffix_list>/path/to/public_suffix_list.dat</public_suffix_list>

View File

@@ -0,0 +1 @@
Hello!

View File

@@ -0,0 +1,11 @@
/*
#include "FrequencyHolder.h"
namespace DB
{
static FrequencyHolder & getInstance() {
static FrequencyHolder instance;
return instance;
}
}
*/

View File

@@ -0,0 +1,97 @@
#pragma once
#include <Functions/FunctionsTextClassification.h>
#include <string>
#include <Functions/FunctionFactory.h>
#include <fstream>
#include <algorithm>
#include <cstring>
#include <limits>
#include <unordered_map>
namespace DB
{
class FrequencyHolder
{
public:
using Map = std::unordered_map<UInt16, double>;
using Container = std::unordered_map<std::string, Map>;
static FrequencyHolder & getInstance()
{
static FrequencyHolder instance;
return instance;
}
void parseDictionaries(const std::string& pt)
{
is_true = pt;
loadEmotionalDict("/home/sergey/datadump/myemo2.txt");
loadEncodingsFrequency("/home/sergey/data/dumps/encodings/russian/freq_enc/");
}
void loadEncodingsFrequency(const std::string path_to_encodings_freq)
{
std::vector<std::string> languages = {"freq_CP866", "freq_ISO", "freq_WINDOWS-1251", "freq_UTF-8"};
for (std::string & lang : languages) {
std::ifstream file(path_to_encodings_freq + lang + ".txt");
Map new_lang;
UInt16 bigram;
double count;
double total = 0;
while (file >> bigram >> count) {
new_lang[bigram] = count;
total += count;
}
for (auto & el : new_lang) {
el.second /= total;
}
encodings_freq[lang] = new_lang;
file.close();
}
}
void loadEmotionalDict(const std::string path_to_emotional_dict)
{
std::ifstream file(path_to_emotional_dict);
std::string term, tag;
double val;
while (file >> term >> tag >> val) {
std::vector<double> cur = {val};
emotional_dict[term] = cur;
}
file.close();
}
const std::string & get_path()
{
return is_true;
}
const std::unordered_map<std::string, std::vector<double>> getEmotionalDict()
{
return emotional_dict;
}
const Container getEncodingsFrequency()
{
return encodings_freq;
}
protected:
std::string is_true;
std::unordered_map<std::string, std::vector<double>> emotional_dict;
Container encodings_freq;
};
}
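A minimal usage sketch (not part of this commit), assuming the on-disk formats implied by the parsing loops above: each freq_<ENCODING>.txt is expected to hold one "<bigram_code> <count>" pair per line, and the emotional dictionary one "<word> <tag> <score>" triple per line; the directory path below is hypothetical.

// Illustrative sketch only, not code from this commit.
#include <Functions/FrequencyHolder.h>
#include <iostream>

int main()
{
    auto & holder = DB::FrequencyHolder::getInstance();
    // Expects freq_CP866.txt, freq_ISO.txt, freq_WINDOWS-1251.txt and freq_UTF-8.txt
    // in the (hypothetical) directory; counts are normalised to frequencies on load.
    holder.loadEncodingsFrequency("/path/to/freq_enc/");
    for (const auto & [encoding, bigrams] : holder.getEncodingsFrequency())
        std::cout << encoding << ": " << bigrams.size() << " bigrams\n";
    return 0;
}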

View File

@@ -1,29 +1,42 @@
#include <Functions/FunctionsTextClassification.h>
#include "FrequencyHolder.h"
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsHashing.h>
#include <Common/HashTable/ClearableHashMap.h>
#include <Common/HashTable/Hash.h>
#include <Common/UTF8Helpers.h>
#include <Core/Defines.h>
#include <common/unaligned.h>
#include <algorithm>
#include <climits>
#include <cstring>
#include <cmath>
#include <limits>
#include <map>
#include <unordered_map>
#include <memory>
#include <utility>
#include <sstream>
#include <set>
namespace DB
{
template <size_t N>
/*
struct TextClassificationDictionaries
{
const std::unordered_map<std::string, std::vector<double>> emotional_dict;
const std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_frequency;
const std::string path;
TextClassificationDictionaries()
: emotional_dict(FrequencyHolder::getInstance().getEmotionalDict()),
encodings_frequency(FrequencyHolder::getInstance().getEncodingsFrequency()),
path(FrequencyHolder::getInstance().get_path())
{
}
};
*/
// static std::unordered_map<std::string, std::vector<double>> emotional_dict = classification_dictionaries.getEncodingsFrequency();
// static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = classification_dictionaries.getEmotionalDict();
template <size_t N, bool Emo>
struct TextClassificationImpl
{
using ResultType = Float32;
using ResultType = std::string;
using CodePoint = UInt8;
/// map_size for ngram count.
static constexpr size_t map_size = 1u << 16;
@@ -43,6 +56,31 @@ struct TextClassificationImpl
*/
using NgramCount = UInt16;
static double L2_distance(std::unordered_map<UInt16, double> standart, std::unordered_map<UInt16, double> model)
{
double res = 0;
for (auto& el : standart) {
if (model.find(el.first) != model.end()) {
res += ((model[el.first] - el.second) * (model[el.first] - el.second));
}
}
return res;
}
static double Naive_bayes(std::unordered_map<UInt16, double> standart, std::unordered_map<UInt16, double> model)
{
double res = 1;
for (auto & el : model) {
if (standart[el.first] != 0) {
res += el.second * log(standart[el.first]);
}
}
return res;
}
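As a reading aid (a sketch of what the two functions above compute, not text from the commit):

\[
\mathrm{L2}(s, m) = \sum_{k \,\in\, \mathrm{keys}(s) \,\cap\, \mathrm{keys}(m)} (m_k - s_k)^2,
\qquad
\mathrm{NB}(s, m) = 1 + \sum_{k \,\in\, \mathrm{keys}(m),\; s_k \neq 0} m_k \log s_k
\]

where s is the reference ("standart") table and m is the table built from the input text; roughly, a smaller L2 distance and a larger naive-Bayes log-score both indicate a closer match to that reference encoding.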
static ALWAYS_INLINE size_t readCodePoints(CodePoint * code_points, const char *& pos, const char * end)
{
constexpr size_t padding_offset = default_padding - N + 1;
@@ -54,6 +92,7 @@ struct TextClassificationImpl
return default_padding;
}
static ALWAYS_INLINE inline size_t calculateStats(
const char * data,
const size_t size,
@@ -91,34 +130,166 @@
}
static void constant(std::string data, Float32 & res)
static void word_processing(std::string & word)
{
std::set<char> to_skip {',', '.', '!', '?', ')', '(', '\"', '\'', '[', ']', '{', '}', ':', ';'};
while (to_skip.find(word.back()) != to_skip.end())
{
word.pop_back();
}
while (to_skip.find(word.front()) != to_skip.end())
{
word.erase(0, 1);
}
}
static void constant(std::string data, std::string & res)
{
static std::unordered_map<std::string, std::vector<double>> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
/*
static TextClassificationDictionaries classification_dictionaries;
static std::unordered_map<std::string, std::vector<double>> emotional_dict = classification_dictionaries.emotional_dict;
static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = classification_dictionaries.encodings_frequency;
*/
if (!Emo)
{
std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
res = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
size_t len = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
std::string ans;
double count_bigram = data.size() - 1;
std::unordered_map<UInt16, double> model;
for (size_t i = 0; i < len; ++i) {
ans += std::to_string(ngram_storage.get()[i]) + " " + std::to_string(static_cast<double>(common_stats.get()[ngram_storage.get()[i]]) / count_bigram) + "\n";
model[ngram_storage.get()[i]] = static_cast<double>(common_stats.get()[ngram_storage.get()[i]]) / count_bigram;
}
double res1 = L2_distance(encodings_freq["freq_CP866"], model);
double res2 = L2_distance(encodings_freq["freq_ISO"], model);
double res3 = L2_distance(encodings_freq["freq_WINDOWS-1251"], model);
double res4 = L2_distance(encodings_freq["freq_UTF-8"], model);
ans += std::to_string(res1) + " " + std::to_string(res2) + " " + std::to_string(res3) + " " + std::to_string(res4) + "\n";
res = ans;
}
else
{
double freq = 0;
double count_words = 0;
std::string ans;
std::stringstream ss;
ss << data;
std::string to_check;
while (ss >> to_check)
{
word_processing(to_check);
if (emotional_dict.find(to_check) != emotional_dict.cend())
{
count_words += 1;
ans += to_check + " " + std::to_string(emotional_dict[to_check][0]) + "\n";
freq += emotional_dict[to_check][0];
}
}
double total_tonality = freq / count_words;
if (total_tonality < 0.5)
{
ans += "NEG";
}
else if (total_tonality > 1)
{
ans += "POS";
}
else
{
ans += "NEUT";
}
ans += " " + std::to_string(total_tonality) + "\n";
res = ans;
}
}
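A worked example of the sentiment branch above (illustrative numbers, not from the commit): an input containing two dictionary words scored 0.3 and 0.9 gives total_tonality = (0.3 + 0.9) / 2 = 0.6, which is neither below 0.5 nor above 1, so the result is tagged NEUT; an average below 0.5 yields NEG and an average above 1 yields POS.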
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
PaddedPODArray<Float32> & res)
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
const size_t offsets_size = offsets.size();
size_t prev_offset = 0;
static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
for (size_t i = 0; i < offsets_size; ++i)
/*
static TextClassificationDictionaries classification_dictionaries;
static std::unordered_map<std::string, std::vector<double>> emotional_dict = classification_dictionaries.emotional_dict;
static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = classification_dictionaries.encodings_frequency;
*/
res_data.reserve(1024);
res_offsets.resize(offsets.size());
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
std::string str = haystack;
std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
res[i] = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
prev_offset = offsets[i];
size_t len = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
std::string prom;
double count_bigram = data.size() - 1;
std::unordered_map<UInt16, double> model1;
std::unordered_map<UInt16, double> model2;
for (size_t j = 0; j < len; ++j)
{
model2[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]);
}
for (size_t j = 0; j < len; ++j)
{
model1[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]) / count_bigram;
}
double res1 = L2_distance(encodings_freq["freq_CP866"], model1);
double res2 = L2_distance(encodings_freq["freq_ISO"], model1);
double res3 = L2_distance(encodings_freq["freq_WINDOWS-1251"], model1);
double res4 = L2_distance(encodings_freq["freq_UTF-8"], model1);
prom += std::to_string(res1) + " " + std::to_string(res2) + " " + std::to_string(res3) + " " + std::to_string(res4) + "\n";
double res12 = Naive_bayes(encodings_freq["freq_CP866"], model2);
double res22 = Naive_bayes(encodings_freq["freq_ISO"], model2);
double res32 = Naive_bayes(encodings_freq["freq_WINDOWS-1251"], model2);
double res42 = Naive_bayes(encodings_freq["freq_UTF-8"], model2);
prom += std::to_string(res12) + " " + std::to_string(res22) + " " + std::to_string(res32) + " " + std::to_string(res42) + "\n";
const auto ans = prom.c_str();
size_t cur_offset = offsets[i];
res_data.resize(res_offset + strlen(ans) + 1);
memcpy(&res_data[res_offset], ans, strlen(ans));
res_offset += strlen(ans);
res_data[res_offset] = 0;
++res_offset;
res_offsets[i] = res_offset;
prev_offset = cur_offset;
}
}
};
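For orientation, the loop above fills a standard ClickHouse ColumnString: all row values are stored back to back in the chars array, each terminated by a zero byte, and offsets[i] points one past the terminator of row i. A minimal layout sketch (assumed convention, not code from this commit):

// chars:   'f' 'o' 'o' '\0' 'b' 'a' 'r' '\0'
// offsets: [4, 8]   -> row 0 is "foo", row 1 is "bar"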
@@ -126,13 +297,18 @@ struct NameBiGramcount
{
static constexpr auto name = "biGramcount";
};
struct NameGetEmo
{
static constexpr auto name = "getEmo";
};
using FunctionBiGramcount = FunctionsTextClassification<TextClassificationImpl<2>, NameBiGramcount>;
using FunctionBiGramcount = FunctionsTextClassification<TextClassificationImpl<2, false>, NameBiGramcount>;
using FunctionGetEmo = FunctionsTextClassification<TextClassificationImpl<2, true>, NameGetEmo>;
void registerFunctionsTextClassification(FunctionFactory & factory)
{
factory.registerFunction<FunctionBiGramcount>();
factory.registerFunction<FunctionGetEmo>();
}
}

View File

@@ -34,7 +34,7 @@ public:
if (!isString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeNumber<typename Impl::ResultType>>();
return arguments[0];
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
@@ -47,21 +47,19 @@ public:
if (col_const)
{
ResultType res{};
ResultType res;
Impl::constant(col_const->getValue<String>(), res);
return result_type->createColumnConst(col_const->size(), toField(res));
}
auto col_res = ColumnVector<ResultType>::create();
typename ColumnVector<ResultType>::Container & vec_res = col_res->getData();
vec_res.resize(column->size());
const ColumnString * col_vector = checkAndGetColumn<ColumnString>(&*column);
if (col_vector)
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
{
Impl::vector(col_vector->getChars(), col_vector->getOffsets(), vec_res);
auto col_res = ColumnString::create();
ColumnString::Chars & vec_res = col_res->getChars();
ColumnString::Offsets & offsets_res = col_res->getOffsets();
Impl::vector(col->getChars(), col->getOffsets(), vec_res, offsets_res);
return col_res;
}
else
{
@@ -69,8 +67,6 @@ public:
"Illegal columns " + arguments[0].column->getName() + " of arguments of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
return col_res;
}
};

src/Functions/test.txt Normal file
View File

@@ -0,0 +1,3 @@
12 123
54 2323
abcd 123