Mirror of https://github.com/ClickHouse/ClickHouse.git

commit ff30b40bf6
parent 4c44e21c29

    Major improvements
 2683  my_data/lang_models/bigram_english   (new file; diff suppressed, too large)
 3776  my_data/lang_models/bigram_french    (new file; diff suppressed, too large)
 3237  my_data/lang_models/bigram_german    (new file; diff suppressed, too large)
 3752  my_data/lang_models/bigram_russian   (new file; diff suppressed, too large)
@@ -70,6 +70,8 @@
#include <Server/MySQLHandlerFactory.h>
#include <Server/PostgreSQLHandlerFactory.h>
#include <Server/ProtocolServerAdapter.h>
#include <Functions/FrequencyHolder.h>
#include <Functions/FunctionsTextClassification.h>


#if !defined(ARCADIA_BUILD)
@@ -631,6 +633,12 @@ int Server::main(const std::vector<std::string> & /*args*/)
        TLDListsHolder::getInstance().parseConfig(top_level_domains_path, config());
    }

    /// my test
    {
        const std::string & encode_frequency_path = config().getString("encodings_frequency_path", path + "textclassification_frequency/");
        FrequencyHolder::getInstance().parseDictionaries(encode_frequency_path);
    }

    {
        Poco::File(path + "data/").createDirectories();
        Poco::File(path + "metadata/").createDirectories();
@@ -5,4 +5,5 @@
    <format_schema_path replace="replace">./format_schemas/</format_schema_path>
    <access_control_path replace="replace">./access/</access_control_path>
    <top_level_domains_path replace="replace">./top_level_domains/</top_level_domains_path>
    <encodings_frequency_path replace="replace">./encoding_frequency/</encodings_frequency_path>
</yandex>
@@ -130,7 +130,7 @@
        <password></password>
    </interserver_http_credentials>-->

    <!-- Listen specified address.
    /home/sergey/datadump/test.txt <!-- Listen specified address.
         Use :: (wildcard IPv6 address), if you want to accept connections both with IPv4 and IPv6 from everywhere.
         Notes:
         If you open connections from wildcard address, make sure that at least one of the following measures applied:
@@ -842,6 +842,11 @@
         Changes will not be applied w/o server restart.
         Path to the list is under top_level_domains_path (see above).
    -->

    <!-- MY CHANGES -->

    <encodings_frequency_path>/var/lib/clickhouse/encodings_frequency/</encodings_frequency_path>

    <top_level_domains_lists>
        <!--
        <public_suffix_list>/path/to/public_suffix_list.dat</public_suffix_list>
 1  programs/server/textclassification_frequency/test.txt  (new file)
@@ -0,0 +1 @@
Hello!
 11  src/Functions/FrequencyHolder.cpp  (new file)
@@ -0,0 +1,11 @@
/*
#include "FrequencyHolder.h"

namespace DB
{
static FrequencyHolder & getInstance() {
    static FrequencyHolder instance;
    return instance;
}
}
*/
 97  src/Functions/FrequencyHolder.h  (new file)
@@ -0,0 +1,97 @@
#pragma once

#include <Functions/FunctionsTextClassification.h>

#include <string>
#include <Functions/FunctionFactory.h>
#include <fstream>
#include <algorithm>
#include <cstring>
#include <limits>
#include <unordered_map>


namespace DB
{

class FrequencyHolder
{
public:
    using Map = std::unordered_map<UInt16, double>;
    using Container = std::unordered_map<std::string, Map>;


    static FrequencyHolder & getInstance()
    {
        static FrequencyHolder instance;
        return instance;
    }


    void parseDictionaries(const std::string& pt)
    {
        is_true = pt;
        loadEmotionalDict("/home/sergey/datadump/myemo2.txt");
        loadEncodingsFrequency("/home/sergey/data/dumps/encodings/russian/freq_enc/");
    }


    void loadEncodingsFrequency(const std::string path_to_encodings_freq)
    {
        std::vector<std::string> languages = {"freq_CP866", "freq_ISO", "freq_WINDOWS-1251", "freq_UTF-8"};
        for (std::string & lang : languages) {
            std::ifstream file(path_to_encodings_freq + lang + ".txt");
            Map new_lang;
            UInt16 bigram;
            double count;
            double total = 0;
            while (file >> bigram >> count) {
                new_lang[bigram] = count;
                total += count;
            }
            for (auto & el : new_lang) {
                el.second /= total;
            }
            encodings_freq[lang] = new_lang;
            file.close();
        }
    }


    void loadEmotionalDict(const std::string path_to_emotional_dict)
    {
        std::ifstream file(path_to_emotional_dict);
        std::string term, tag;
        double val;
        while (file >> term >> tag >> val) {
            std::vector<double> cur = {val};
            emotional_dict[term] = cur;
        }
        file.close();
    }


    const std::string & get_path()
    {
        return is_true;
    }

    const std::unordered_map<std::string, std::vector<double>> getEmotionalDict()
    {
        return emotional_dict;
    }

    const Container getEncodingsFrequency()
    {
        return encodings_freq;
    }


protected:

    std::string is_true;
    std::unordered_map<std::string, std::vector<double>> emotional_dict;
    Container encodings_freq;
};
}
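As read by loadEncodingsFrequency and loadEmotionalDict above, the frequency files are plain whitespace-separated "<bigram code> <count>" pairs, and the emotional dictionary is read as "<term> <tag> <value>" triples. A minimal standalone sketch of that parsing and of the count-to-frequency normalisation (the file name is hypothetical and not part of the commit):

// Standalone sketch, not part of the commit: reads "<bigram> <count>" lines the
// same way loadEncodingsFrequency does and normalises raw counts to frequencies.
#include <cstdint>
#include <fstream>
#include <iostream>
#include <unordered_map>

int main()
{
    std::unordered_map<uint16_t, double> freq;
    double total = 0;

    std::ifstream file("freq_UTF-8.txt"); // hypothetical file name, same naming scheme as the loader
    uint16_t bigram;
    double count;
    while (file >> bigram >> count)
    {
        freq[bigram] = count;
        total += count;
    }

    for (auto & el : freq)
        el.second /= total; // raw counts -> relative frequencies, as FrequencyHolder does

    std::cout << "bigrams: " << freq.size() << ", total count: " << total << "\n";
    return 0;
}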
@@ -1,29 +1,42 @@
#include <Functions/FunctionsTextClassification.h>
#include "FrequencyHolder.h"
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsHashing.h>
#include <Common/HashTable/ClearableHashMap.h>
#include <Common/HashTable/Hash.h>
#include <Common/UTF8Helpers.h>

#include <Core/Defines.h>

#include <common/unaligned.h>

#include <algorithm>
#include <climits>
#include <cstring>
#include <cmath>
#include <limits>
#include <map>
#include <unordered_map>
#include <memory>
#include <utility>
#include <sstream>
#include <set>

namespace DB
{
template <size_t N>
/*
struct TextClassificationDictionaries
{
    const std::unordered_map<std::string, std::vector<double>> emotional_dict;
    const std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_frequency;
    const std::string path;
    TextClassificationDictionaries()
        : emotional_dict(FrequencyHolder::getInstance().getEmotionalDict()),
        encodings_frequency(FrequencyHolder::getInstance().getEncodingsFrequency()),
        path(FrequencyHolder::getInstance().get_path())
    {
    }
};
*/
// static std::unordered_map<std::string, std::vector<double>> emotional_dict = classification_dictionaries.getEncodingsFrequency();
// static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = classification_dictionaries.getEmotionalDict();

template <size_t N, bool Emo>
struct TextClassificationImpl
{

    using ResultType = Float32;
    using ResultType = std::string;
    using CodePoint = UInt8;
    /// map_size for ngram count.
    static constexpr size_t map_size = 1u << 16;
@@ -43,6 +56,31 @@ struct TextClassificationImpl
    */
    using NgramCount = UInt16;


    static double L2_distance(std::unordered_map<UInt16, double> standart, std::unordered_map<UInt16, double> model)
    {
        double res = 0;
        for (auto& el : standart) {
            if (model.find(el.first) != model.end()) {
                res += ((model[el.first] - el.second) * (model[el.first] - el.second));
            }
        }
        return res;
    }


    static double Naive_bayes(std::unordered_map<UInt16, double> standart, std::unordered_map<UInt16, double> model)
    {
        double res = 1;
        for (auto & el : model) {
            if (standart[el.first] != 0) {
                res += el.second * log(standart[el.first]);
            }
        }
        return res;
    }


    static ALWAYS_INLINE size_t readCodePoints(CodePoint * code_points, const char *& pos, const char * end)
    {
        constexpr size_t padding_offset = default_padding - N + 1;
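The two helpers above do the actual encoding scoring: L2_distance sums squared frequency differences over the bigrams of the reference table (smaller means closer), and Naive_bayes adds the observed bigram weight times the log of the reference frequency for bigrams the reference knows about (larger means more likely). A simplified standalone sketch of this kind of scoring on toy tables (assumed names, not part of the commit):

// Standalone sketch of the two scores used above, applied to toy bigram tables.
#include <cmath>
#include <cstdint>
#include <iostream>
#include <unordered_map>

using Map = std::unordered_map<uint16_t, double>;

// Sum of squared frequency differences over the reference keys (lower = closer).
static double l2_distance(const Map & reference, const Map & sample)
{
    double res = 0;
    for (const auto & el : reference)
    {
        auto it = sample.find(el.first);
        if (it != sample.end())
            res += (it->second - el.second) * (it->second - el.second);
    }
    return res;
}

// Weight-times-log-probability score of the sample under the reference (higher = more likely).
static double naive_bayes_score(const Map & reference, const Map & sample)
{
    double res = 0;
    for (const auto & el : sample)
    {
        auto it = reference.find(el.first);
        if (it != reference.end() && it->second > 0)
            res += el.second * std::log(it->second);
    }
    return res;
}

int main()
{
    Map reference = {{1, 0.5}, {2, 0.3}, {3, 0.2}};   // hypothetical reference frequencies
    Map close     = {{1, 0.45}, {2, 0.35}, {3, 0.2}}; // similar distribution
    Map far       = {{1, 0.1}, {2, 0.1}, {3, 0.8}};   // dissimilar distribution

    std::cout << "L2 close: " << l2_distance(reference, close)
              << "  L2 far: " << l2_distance(reference, far) << "\n";
    std::cout << "NB close: " << naive_bayes_score(reference, close)
              << "  NB far: " << naive_bayes_score(reference, far) << "\n";
    return 0;
}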
@@ -54,6 +92,7 @@ struct TextClassificationImpl
        return default_padding;
    }


    static ALWAYS_INLINE inline size_t calculateStats(
        const char * data,
        const size_t size,
@@ -91,34 +130,166 @@ struct TextClassificationImpl
    }


    static void constant(std::string data, Float32 & res)
    static void word_processing(std::string & word)
    {
        std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
        std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
        res = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
        std::set<char> to_skip {',', '.', '!', '?', ')', '(', '\"', '\'', '[', ']', '{', '}', ':', ';'};

        while (to_skip.find(word.back()) != to_skip.end())
        {
            word.pop_back();
        }

        while (to_skip.find(word.front()) != to_skip.end())
        {
            word.erase(0, 1);
        }
    }


    static void constant(std::string data, std::string & res)
    {
        static std::unordered_map<std::string, std::vector<double>> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
        static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();

        /*
        static TextClassificationDictionaries classification_dictionaries;
        static std::unordered_map<std::string, std::vector<double>> emotional_dict = classification_dictionaries.emotional_dict;
        static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = classification_dictionaries.encodings_frequency;
        */
        if (!Emo)
        {

            std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
            std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
            size_t len = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
            std::string ans;
            double count_bigram = data.size() - 1;
            std::unordered_map<UInt16, double> model;
            for (size_t i = 0; i < len; ++i) {
                ans += std::to_string(ngram_storage.get()[i]) + " " + std::to_string(static_cast<double>(common_stats.get()[ngram_storage.get()[i]]) / count_bigram) + "\n";
                model[ngram_storage.get()[i]] = static_cast<double>(common_stats.get()[ngram_storage.get()[i]]) / count_bigram;
            }

            double res1 = L2_distance(encodings_freq["freq_CP866"], model);
            double res2 = L2_distance(encodings_freq["freq_ISO"], model);
            double res3 = L2_distance(encodings_freq["freq_WINDOWS-1251"], model);
            double res4 = L2_distance(encodings_freq["freq_UTF-8"], model);
            ans += std::to_string(res1) + " " + std::to_string(res2) + " " + std::to_string(res3) + " " + std::to_string(res4) + "\n";
            res = ans;
        }
        else
        {

            double freq = 0;
            double count_words = 0;

            std::string ans;
            std::stringstream ss;
            ss << data;
            std::string to_check;

            while (ss >> to_check)
            {
                word_processing(to_check);

                if (emotional_dict.find(to_check) != emotional_dict.cend())
                {
                    count_words += 1;
                    ans += to_check + " " + std::to_string(emotional_dict[to_check][0]) + "\n";
                    freq += emotional_dict[to_check][0];
                }
            }
            double total_tonality = freq / count_words;
            if (total_tonality < 0.5)
            {
                ans += "NEG";
            }
            else if (total_tonality > 1)
            {
                ans += "POS";
            }
            else
            {
                ans += "NEUT";
            }
            ans += " " + std::to_string(total_tonality) + "\n";
            res = ans;
        }
    }

    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        PaddedPODArray<Float32> & res)
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        const size_t offsets_size = offsets.size();
        size_t prev_offset = 0;
        static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();

        for (size_t i = 0; i < offsets_size; ++i)
        /*
        static TextClassificationDictionaries classification_dictionaries;
        static std::unordered_map<std::string, std::vector<double>> emotional_dict = classification_dictionaries.emotional_dict;
        static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = classification_dictionaries.encodings_frequency;
        */
        res_data.reserve(1024);
        res_offsets.resize(offsets.size());

        size_t prev_offset = 0;
        size_t res_offset = 0;

        for (size_t i = 0; i < offsets.size(); ++i)
        {
            const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
            std::string str = haystack;

            std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
            std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams

            res[i] = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
            prev_offset = offsets[i];
            size_t len = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
            std::string prom;
            double count_bigram = data.size() - 1;
            std::unordered_map<UInt16, double> model1;

            std::unordered_map<UInt16, double> model2;

            for (size_t j = 0; j < len; ++j)
            {
                model2[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]);
            }

            for (size_t j = 0; j < len; ++j)
            {
                model1[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]) / count_bigram;
            }

            double res1 = L2_distance(encodings_freq["freq_CP866"], model1);
            double res2 = L2_distance(encodings_freq["freq_ISO"], model1);
            double res3 = L2_distance(encodings_freq["freq_WINDOWS-1251"], model1);
            double res4 = L2_distance(encodings_freq["freq_UTF-8"], model1);
            prom += std::to_string(res1) + " " + std::to_string(res2) + " " + std::to_string(res3) + " " + std::to_string(res4) + "\n";


            double res12 = Naive_bayes(encodings_freq["freq_CP866"], model2);
            double res22 = Naive_bayes(encodings_freq["freq_ISO"], model2);
            double res32 = Naive_bayes(encodings_freq["freq_WINDOWS-1251"], model2);
            double res42 = Naive_bayes(encodings_freq["freq_UTF-8"], model2);
            prom += std::to_string(res12) + " " + std::to_string(res22) + " " + std::to_string(res32) + " " + std::to_string(res42) + "\n";

            const auto ans = prom.c_str();

            size_t cur_offset = offsets[i];

            res_data.resize(res_offset + strlen(ans) + 1);
            memcpy(&res_data[res_offset], ans, strlen(ans));
            res_offset += strlen(ans);

            res_data[res_offset] = 0;
            ++res_offset;

            res_offsets[i] = res_offset;
            prev_offset = cur_offset;
        }
    }


};


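In the Emo branch of constant above, the label comes from the average dictionary weight of the recognised words: each hit contributes its first weight, and the mean is bucketed as NEG below 0.5, POS above 1, and NEUT otherwise. A small standalone sketch of that averaging and bucketing with a hypothetical three-word dictionary (illustration only, not part of the commit):

// Standalone sketch of the tonality averaging used by the Emo branch above.
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>

int main()
{
    // Hypothetical emotional dictionary: word -> weight (the real one is loaded
    // by FrequencyHolder::loadEmotionalDict from "term tag value" lines).
    std::unordered_map<std::string, double> emotional_dict = {
        {"good", 1.5}, {"bad", 0.2}, {"fine", 0.8}};

    std::string text = "good fine unknown";
    std::stringstream ss(text);

    double freq = 0;
    double count_words = 0;
    std::string word;
    while (ss >> word)
    {
        auto it = emotional_dict.find(word);
        if (it != emotional_dict.end())
        {
            count_words += 1; // only dictionary hits are counted
            freq += it->second;
        }
    }

    double total_tonality = freq / count_words; // (1.5 + 0.8) / 2 = 1.15
    std::string label = total_tonality < 0.5 ? "NEG" : (total_tonality > 1 ? "POS" : "NEUT");
    std::cout << label << " " << total_tonality << "\n"; // prints: POS 1.15
    return 0;
}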
@@ -126,13 +297,18 @@ struct NameBiGramcount
{
    static constexpr auto name = "biGramcount";
};
struct NameGetEmo
{
    static constexpr auto name = "getEmo";
};


using FunctionBiGramcount = FunctionsTextClassification<TextClassificationImpl<2>, NameBiGramcount>;

using FunctionBiGramcount = FunctionsTextClassification<TextClassificationImpl<2, false>, NameBiGramcount>;
using FunctionGetEmo = FunctionsTextClassification<TextClassificationImpl<2, true>, NameGetEmo>;
void registerFunctionsTextClassification(FunctionFactory & factory)
{
    factory.registerFunction<FunctionBiGramcount>();
    factory.registerFunction<FunctionGetEmo>();
}

}
@@ -34,7 +34,7 @@ public:
        if (!isString(arguments[0]))
            throw Exception(
                "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
        return std::make_shared<DataTypeNumber<typename Impl::ResultType>>();
        return arguments[0];
    }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
@@ -47,21 +47,19 @@ public:

        if (col_const)
        {
            ResultType res{};
            ResultType res;
            Impl::constant(col_const->getValue<String>(), res);
            return result_type->createColumnConst(col_const->size(), toField(res));
        }

        auto col_res = ColumnVector<ResultType>::create();

        typename ColumnVector<ResultType>::Container & vec_res = col_res->getData();
        vec_res.resize(column->size());

        const ColumnString * col_vector = checkAndGetColumn<ColumnString>(&*column);

        if (col_vector)
        if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
        {
            Impl::vector(col_vector->getChars(), col_vector->getOffsets(), vec_res);
            auto col_res = ColumnString::create();
            ColumnString::Chars & vec_res = col_res->getChars();
            ColumnString::Offsets & offsets_res = col_res->getOffsets();
            Impl::vector(col->getChars(), col->getOffsets(), vec_res, offsets_res);
            return col_res;
        }
        else
        {
@@ -69,8 +67,6 @@ public:
                "Illegal columns " + arguments[0].column->getName() + " of arguments of function " + getName(),
                ErrorCodes::ILLEGAL_COLUMN);
        }

        return col_res;
    }
};

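The executeImpl change above switches the result from a numeric ColumnVector to a ColumnString built from a flat char buffer plus running offsets, which is exactly what the rewritten vector() fills. A standalone sketch of that buffer/offset layout using plain std::vector stand-ins (illustration only, not the ClickHouse column types):

// Standalone sketch of how vector() assembles a ColumnString-style result:
// a flat char buffer with zero-terminated values plus cumulative offsets.
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    std::vector<char> res_data;      // stands in for ColumnString::Chars
    std::vector<size_t> res_offsets; // stands in for ColumnString::Offsets

    std::vector<std::string> answers = {"0.12 0.03 0.95 0.40", "NEUT 0.8"}; // per-row result strings
    size_t res_offset = 0;

    for (const std::string & ans : answers)
    {
        res_data.resize(res_offset + ans.size() + 1);
        std::memcpy(res_data.data() + res_offset, ans.data(), ans.size());
        res_offset += ans.size();
        res_data[res_offset] = 0; // every value ends with a zero byte
        ++res_offset;
        res_offsets.push_back(res_offset); // offset points just past the terminator
    }

    std::cout << "values: " << res_offsets.size() << ", bytes: " << res_data.size() << "\n";
    return 0;
}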
 3  src/Functions/test.txt  (new file)
@@ -0,0 +1,3 @@
12 123
54 2323
abcd 123