add FunctionsTonalityClassification.cpp

This commit is contained in:
s-kat 2021-04-14 21:42:33 +03:00
parent 4412aa39bb
commit cdf8ab71d2
10 changed files with 91478 additions and 134 deletions

View File

@ -634,7 +634,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
/// my test
{
const std::string & encode_frequency_path = config().getString("encodings_frequency_path", path + "textclassification_frequency/");
const std::string & encode_frequency_path = config().getString("encodings_frequency_path", path + "encodings_frequency/");
FrequencyHolder::getInstance().parseDictionaries(encode_frequency_path);
}

View File

@ -5,5 +5,4 @@
<format_schema_path replace="replace">./format_schemas/</format_schema_path>
<access_control_path replace="replace">./access/</access_control_path>
<top_level_domains_path replace="replace">./top_level_domains/</top_level_domains_path>
<encodings_frequency_path replace="replace">./encoding_frequency/</encodings_frequency_path>
</yandex>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -14,7 +14,7 @@
#include <cstring>
#include <limits>
#include <unordered_map>
#include <common/logger_useful.h>
namespace DB
@ -22,11 +22,11 @@ namespace DB
class FrequencyHolder
{
public:
using Map = std::unordered_map<UInt16, Float64>;
using Container = std::unordered_map<String, Map>;
static FrequencyHolder & getInstance()
{
static FrequencyHolder instance;
@ -37,8 +37,8 @@ public:
void parseDictionaries(const String & pt)
{
is_true = pt;
loadEmotionalDict("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/emotional_dictionary_rus.txt");
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/charset_freq.txt");
loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
}
@ -48,6 +48,10 @@ public:
Float64 frequency;
String charset_name;
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
LOG_TRACE(log, "Charset frequencies loading from {}", path_to_charset_freq);
ReadBufferFromFile in(path_to_charset_freq);
while (!in.eof())
{
@ -75,6 +79,7 @@ public:
}
in.position() = newline + 1;
}
LOG_TRACE(log, "Charset frequencies was added");
}
@ -84,6 +89,9 @@ public:
String word;
Float64 tonality;
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
LOG_TRACE(log, "Emotional dictionary loading from {}", path_to_emotional_dict);
ReadBufferFromFile in(path_to_emotional_dict);
while (!in.eof())
{
@ -101,6 +109,7 @@ public:
emotional_dict[word] = tonality;
}
LOG_TRACE(log, "Emotional dictionary was added");
}
@ -110,13 +119,13 @@ public:
}
const std::unordered_map<String, Float64> getEmotionalDict()
const std::unordered_map<String, Float64> & getEmotionalDict()
{
return emotional_dict;
}
const Container getEncodingsFrequency()
const Container & getEncodingsFrequency()
{
return encodings_freq;
}

View File

@ -19,7 +19,7 @@ namespace DB
{
template <size_t N, bool Tonality>
template <size_t N>
struct TextClassificationImpl
{
@ -43,19 +43,6 @@ struct TextClassificationImpl
*/
using NgramCount = UInt16;
/// Sum of squared differences between `standart` and `model`, taken over the keys that are
/// present in both maps. A key found in only one of the maps contributes nothing.
///
/// Both maps are accepted by const reference: the function only reads them, and taking them
/// by value copied two hash maps on every call.
static ALWAYS_INLINE inline Float64 L2_distance(
    const std::unordered_map<UInt16, Float64> & standart,
    const std::unordered_map<UInt16, Float64> & model)
{
    Float64 res = 0;
    for (const auto & el : standart)
    {
        /// A single lookup per key instead of find() followed by operator[] twice.
        const auto it = model.find(el.first);
        if (it != model.end())
        {
            const Float64 diff = it->second - el.second;
            res += diff * diff;
        }
    }
    return res;
}
static ALWAYS_INLINE inline Float64 Naive_bayes(std::unordered_map<UInt16, Float64> standart, std::unordered_map<UInt16, Float64> model)
{
Float64 res = 0;
@ -63,7 +50,7 @@ struct TextClassificationImpl
if (standart[el.first] != 0) {
res += el.second * log(standart[el.first]);
} else {
res += el.second * log(0.0000001);
res += el.second * log(0.000001);
}
}
return res;
@ -118,82 +105,32 @@ struct TextClassificationImpl
return len;
}
/// Strips punctuation characters from both ends of `word`, in place.
/// A word that consists only of punctuation becomes empty.
static ALWAYS_INLINE inline void word_processing(String & word)
{
    /// Built once instead of on every call.
    static const std::set<char> to_skip {',', '.', '!', '?', ')', '(', '\"', '\'', '[', ']', '{', '}', ':', ';'};

    /// The emptiness checks are required: back()/front() on an empty string is undefined
    /// behaviour, and an all-punctuation word (e.g. "...") shrinks to empty in this loop.
    while (!word.empty() && to_skip.find(word.back()) != to_skip.end())
    {
        word.pop_back();
    }

    /// Count the leading punctuation first and erase it in one go.
    size_t prefix_len = 0;
    while (prefix_len < word.size() && to_skip.find(word[prefix_len]) != to_skip.end())
    {
        ++prefix_len;
    }
    word.erase(0, prefix_len);
}
/// Maps a numeric tonality level to a label:
/// above 1 -> "POS", below 0.5 -> "NEG", anything else -> "NEUT".
static String get_tonality(const Float64 & tonality_level)
{
    if (tonality_level > 1)
        return "POS";

    return tonality_level < 0.5 ? "NEG" : "NEUT";
}
static void constant(String data, String & res)
{
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
static std::unordered_map<String, std::unordered_map<UInt16, Float64>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
if (!Tonality)
{
std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
size_t len = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
String ans;
// Float64 count_bigram = data.size() - 1;
std::unordered_map<UInt16, Float64> model;
for (size_t i = 0; i < len; ++i) {
ans += std::to_string(ngram_storage.get()[i]) + " " + std::to_string(static_cast<Float64>(common_stats.get()[ngram_storage.get()[i]])) + "\n";
model[ngram_storage.get()[i]] = static_cast<Float64>(common_stats.get()[ngram_storage.get()[i]]);
}
for (const auto& item : encodings_freq) {
ans += item.first + " " + std::to_string(Naive_bayes(item.second, model)) + "\n";
}
res = ans;
std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
size_t len = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
String ans;
// Float64 count_bigram = data.size() - 1;
std::unordered_map<UInt16, Float64> model;
for (size_t i = 0; i < len; ++i) {
ans += std::to_string(ngram_storage.get()[i]) + " " + std::to_string(static_cast<Float64>(common_stats.get()[ngram_storage.get()[i]])) + "\n";
model[ngram_storage.get()[i]] = static_cast<Float64>(common_stats.get()[ngram_storage.get()[i]]);
}
else
{
Float64 freq = 0;
Float64 count_words = 0;
String ans;
String to_check;
ReadBufferFromString in(data);
while (!in.eof())
{
readString(to_check, in);
word_processing(to_check);
if (emotional_dict.find(to_check) != emotional_dict.cend())
{
count_words += 1;
ans += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
freq += emotional_dict[to_check];
}
}
Float64 total_tonality = freq / count_words;
ans += get_tonality(total_tonality) + std::to_string(total_tonality) + std::to_string(emotional_dict.size()) + "\n";
res = ans;
for (const auto& item : encodings_freq) {
ans += item.first + " " + std::to_string(Naive_bayes(item.second, model)) + "\n";
}
res = ans;
}
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
@ -215,50 +152,35 @@ struct TextClassificationImpl
String str = haystack;
String prom;
if (!Tonality)
std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
size_t len = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
// Float64 count_bigram = data.size() - 1;
std::unordered_map<UInt16, Float64> model;
for (size_t j = 0; j < len; ++j)
{
std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
model[ngram_storage.get()[j]] = static_cast<Float64>(common_stats.get()[ngram_storage.get()[j]]);
}
size_t len = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
// Float64 count_bigram = data.size() - 1;
std::unordered_map<UInt16, Float64> model;
std::vector<std::pair<std::string, Float64>> results;
for (size_t j = 0; j < len; ++j)
{
model[ngram_storage.get()[j]] = static_cast<Float64>(common_stats.get()[ngram_storage.get()[j]]);
}
for (const auto& item : encodings_freq)
{
results.push_back(std::make_pair(item.first, Naive_bayes(item.second, model)));
}
for (const auto& item : encodings_freq) {
prom += item.first + " " + std::to_string(Naive_bayes(item.second, model)) + "\n";
}
std::sort(results.begin(), results.end(), [](auto &left, auto &right)
{
return left.second > right.second;
});
for (size_t ind = 0; ind < 3; ++ind) {
prom += results[ind].first + " result=" + std::to_string(results[ind].second) + "\n";
}
}
else
{
Float64 freq = 0;
Float64 count_words = 0;
String to_check;
ReadBufferFromString in(str);
while (!in.eof())
{
readString(to_check, in);
word_processing(to_check);
if (emotional_dict.find(to_check) != emotional_dict.cend())
{
count_words += 1;
prom += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
freq += emotional_dict[to_check];
}
}
Float64 total_tonality = freq / count_words;
prom += get_tonality(total_tonality) + std::to_string(total_tonality) + "\n";
}
const auto ans = prom.c_str();
size_t cur_offset = offsets[i];
@ -283,19 +205,13 @@ struct NameCharsetDetect
{
static constexpr auto name = "charsetDetect";
};
struct NameGetTonality
{
static constexpr auto name = "getTonality";
};
using FunctionCharsetDetect = FunctionsTextClassification<TextClassificationImpl<2, false>, NameCharsetDetect>;
using FunctionGetTonality = FunctionsTextClassification<TextClassificationImpl<2, true>, NameGetTonality>;
using FunctionCharsetDetect = FunctionsTextClassification<TextClassificationImpl<2>, NameCharsetDetect>;
void registerFunctionsTextClassification(FunctionFactory & factory)
{
factory.registerFunction<FunctionCharsetDetect>();
factory.registerFunction<FunctionGetTonality>();
}
}

View File

@ -14,7 +14,6 @@ namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
extern const int TOO_LARGE_STRING_SIZE;
}
template <typename Impl, typename Name>

View File

@ -0,0 +1,163 @@
#include <Functions/FunctionsTextClassification.h>
#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Common/UTF8Helpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <algorithm>
#include <cstring>
#include <cmath>
#include <limits>
#include <unordered_map>
#include <memory>
#include <utility>
#include <sstream>
#include <set>
namespace DB
{
struct TonalityClassificationImpl
{
using ResultType = String;
static ALWAYS_INLINE inline void word_processing(String & word)
{
std::set<char> to_skip {',', '.', '!', '?', ')', '(', '\"', '\'', '[', ']', '{', '}', ':', ';'};
while (to_skip.find(word.back()) != to_skip.end())
{
word.pop_back();
}
while (to_skip.find(word.front()) != to_skip.end())
{
word.erase(0, 1);
}
}
static String get_tonality(const Float64 & tonality_level)
{
if (tonality_level < 0.5) { return "NEG"; }
if (tonality_level > 1) { return "POS"; }
return "NEUT";
}
static void constant(String data, String & res)
{
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
Float64 freq = 0;
Float64 count_words = 0;
String ans;
ReadBufferFromMemory in(data.data(), data.size() + 1);
skipWhitespaceIfAny(in);
String to_check;
while (!in.eof())
{
if (data.size() - (in.position() - data.data()) <= 3) {
break;
}
readStringUntilWhitespace(to_check, in);
skipWhitespaceIfAny(in);
word_processing(to_check);
if (emotional_dict.find(to_check) != emotional_dict.cend())
{
count_words += 1;
ans += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
freq += emotional_dict[to_check];
}
}
Float64 total_tonality = freq / count_words;
ans += get_tonality(total_tonality) + std::to_string(total_tonality) + std::to_string(emotional_dict.size()) + "\n";
res = ans;
}
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
res_data.reserve(1024);
res_offsets.resize(offsets.size());
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
String str = haystack;
String prom;
Float64 freq = 0;
Float64 count_words = 0;
ReadBufferFromMemory in(str.data(), str.size() + 1);
skipWhitespaceIfAny(in);
String to_check;
while (!in.eof())
{
if (str.size() - (in.position() - str.data()) <= 3) {
break;
}
readStringUntilWhitespace(to_check, in);
skipWhitespaceIfAny(in);
if (emotional_dict.find(to_check) != emotional_dict.cend())
{
count_words += 1;
prom += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
freq += emotional_dict[to_check];
}
}
Float64 total_tonality = freq / count_words;
prom += get_tonality(total_tonality) + std::to_string(total_tonality) + "\n";
const auto ans = prom.c_str();
size_t cur_offset = offsets[i];
res_data.resize(res_offset + strlen(ans) + 1);
memcpy(&res_data[res_offset], ans, strlen(ans));
res_offset += strlen(ans);
res_data[res_offset] = 0;
++res_offset;
res_offsets[i] = res_offset;
prev_offset = cur_offset;
}
}
};
/// Name tag passed to FunctionsTextClassification for the tonality function.
struct NameGetTonality
{
    /// The string exposed through the `name` member.
    static constexpr const char * name = "getTonality";
};
/// The tonality function: the generic FunctionsTextClassification wrapper instantiated with
/// TonalityClassificationImpl as the implementation and NameGetTonality as the name tag.
using FunctionGetTonality = FunctionsTextClassification<TonalityClassificationImpl, NameGetTonality>;
/// Registers FunctionGetTonality in the given function factory.
void registerFunctionsTonalityClassification(FunctionFactory & factory)
{
factory.registerFunction<FunctionGetTonality>();
}
}

View File

@ -34,6 +34,7 @@ void registerFunctionsStringSearch(FunctionFactory &);
void registerFunctionsStringRegexp(FunctionFactory &);
void registerFunctionsStringSimilarity(FunctionFactory &);
void registerFunctionsTextClassification(FunctionFactory &);
void registerFunctionsTonalityClassification(FunctionFactory &);
void registerFunctionsURL(FunctionFactory &);
void registerFunctionsVisitParam(FunctionFactory &);
void registerFunctionsMath(FunctionFactory &);
@ -93,6 +94,7 @@ void registerFunctions()
registerFunctionsStringRegexp(factory);
registerFunctionsStringSimilarity(factory);
registerFunctionsTextClassification(factory);
registerFunctionsTonalityClassification(factory);
registerFunctionsURL(factory);
registerFunctionsVisitParam(factory);
registerFunctionsMath(factory);

View File

@ -56,6 +56,7 @@ SRCS(
FunctionsStringHash.cpp
FunctionsStringSimilarity.cpp
FunctionsTextClassification.cpp
FunctionsTonalityClassification.cpp
GatherUtils/concat.cpp
GatherUtils/createArraySink.cpp
GatherUtils/createArraySource.cpp