mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-27 01:51:59 +00:00
add FunctionsTonalityClassification.cpp
This commit is contained in:
parent
4412aa39bb
commit
cdf8ab71d2
@ -634,7 +634,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
||||
|
||||
/// my test
|
||||
{
|
||||
const std::string & encode_frequency_path = config().getString("encodings_frequency_path", path + "textclassification_frequency/");
|
||||
const std::string & encode_frequency_path = config().getString("encodings_frequency_path", path + "encodings_frequency/");
|
||||
FrequencyHolder::getInstance().parseDictionaries(encode_frequency_path);
|
||||
}
|
||||
|
||||
|
@ -5,5 +5,4 @@
|
||||
<format_schema_path replace="replace">./format_schemas/</format_schema_path>
|
||||
<access_control_path replace="replace">./access/</access_control_path>
|
||||
<top_level_domains_path replace="replace">./top_level_domains/</top_level_domains_path>
|
||||
<encodings_frequency_path replace="replace">./encoding_frequency/</encodings_frequency_path>
|
||||
</yandex>
|
||||
|
35722
src/Common/ClassificationDictionaries/charset_freq.txt
Normal file
35722
src/Common/ClassificationDictionaries/charset_freq.txt
Normal file
File diff suppressed because it is too large
Load Diff
55533
src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt
Normal file
55533
src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -14,7 +14,7 @@
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <common/logger_useful.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -22,11 +22,11 @@ namespace DB
|
||||
|
||||
class FrequencyHolder
|
||||
{
|
||||
|
||||
public:
|
||||
using Map = std::unordered_map<UInt16, Float64>;
|
||||
using Container = std::unordered_map<String, Map>;
|
||||
|
||||
|
||||
static FrequencyHolder & getInstance()
|
||||
{
|
||||
static FrequencyHolder instance;
|
||||
@ -37,8 +37,8 @@ public:
|
||||
void parseDictionaries(const String & pt)
|
||||
{
|
||||
is_true = pt;
|
||||
loadEmotionalDict("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/emotional_dictionary_rus.txt");
|
||||
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/charset_freq.txt");
|
||||
loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
|
||||
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
|
||||
}
|
||||
|
||||
|
||||
@ -48,6 +48,10 @@ public:
|
||||
Float64 frequency;
|
||||
String charset_name;
|
||||
|
||||
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
|
||||
|
||||
LOG_TRACE(log, "Charset frequencies loading from {}", path_to_charset_freq);
|
||||
|
||||
ReadBufferFromFile in(path_to_charset_freq);
|
||||
while (!in.eof())
|
||||
{
|
||||
@ -75,6 +79,7 @@ public:
|
||||
}
|
||||
in.position() = newline + 1;
|
||||
}
|
||||
LOG_TRACE(log, "Charset frequencies was added");
|
||||
}
|
||||
|
||||
|
||||
@ -84,6 +89,9 @@ public:
|
||||
String word;
|
||||
Float64 tonality;
|
||||
|
||||
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
|
||||
LOG_TRACE(log, "Emotional dictionary loading from {}", path_to_emotional_dict);
|
||||
|
||||
ReadBufferFromFile in(path_to_emotional_dict);
|
||||
while (!in.eof())
|
||||
{
|
||||
@ -101,6 +109,7 @@ public:
|
||||
emotional_dict[word] = tonality;
|
||||
|
||||
}
|
||||
LOG_TRACE(log, "Emotional dictionary was added");
|
||||
}
|
||||
|
||||
|
||||
@ -110,13 +119,13 @@ public:
|
||||
}
|
||||
|
||||
|
||||
const std::unordered_map<String, Float64> getEmotionalDict()
|
||||
const std::unordered_map<String, Float64> & getEmotionalDict()
|
||||
{
|
||||
return emotional_dict;
|
||||
}
|
||||
|
||||
|
||||
const Container getEncodingsFrequency()
|
||||
const Container & getEncodingsFrequency()
|
||||
{
|
||||
return encodings_freq;
|
||||
}
|
||||
|
@ -19,7 +19,7 @@ namespace DB
|
||||
{
|
||||
|
||||
|
||||
template <size_t N, bool Tonality>
|
||||
template <size_t N>
|
||||
struct TextClassificationImpl
|
||||
{
|
||||
|
||||
@ -43,19 +43,6 @@ struct TextClassificationImpl
|
||||
*/
|
||||
using NgramCount = UInt16;
|
||||
|
||||
|
||||
static ALWAYS_INLINE inline Float64 L2_distance(std::unordered_map<UInt16, Float64> standart, std::unordered_map<UInt16, Float64> model)
|
||||
{
|
||||
Float64 res = 0;
|
||||
for (auto& el : standart) {
|
||||
if (model.find(el.first) != model.end()) {
|
||||
res += ((model[el.first] - el.second) * (model[el.first] - el.second));
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
static ALWAYS_INLINE inline Float64 Naive_bayes(std::unordered_map<UInt16, Float64> standart, std::unordered_map<UInt16, Float64> model)
|
||||
{
|
||||
Float64 res = 0;
|
||||
@ -63,7 +50,7 @@ struct TextClassificationImpl
|
||||
if (standart[el.first] != 0) {
|
||||
res += el.second * log(standart[el.first]);
|
||||
} else {
|
||||
res += el.second * log(0.0000001);
|
||||
res += el.second * log(0.000001);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
@ -120,38 +107,14 @@ struct TextClassificationImpl
|
||||
}
|
||||
|
||||
|
||||
static ALWAYS_INLINE inline void word_processing(String & word)
|
||||
{
|
||||
std::set<char> to_skip {',', '.', '!', '?', ')', '(', '\"', '\'', '[', ']', '{', '}', ':', ';'};
|
||||
|
||||
while (to_skip.find(word.back()) != to_skip.end())
|
||||
{
|
||||
word.pop_back();
|
||||
}
|
||||
|
||||
while (to_skip.find(word.front()) != to_skip.end())
|
||||
{
|
||||
word.erase(0, 1);
|
||||
}
|
||||
}
|
||||
|
||||
static String get_tonality(const Float64 & tonality_level)
|
||||
{
|
||||
if (tonality_level < 0.5) { return "NEG"; }
|
||||
if (tonality_level > 1) { return "POS"; }
|
||||
return "NEUT";
|
||||
}
|
||||
|
||||
static void constant(String data, String & res)
|
||||
{
|
||||
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
|
||||
static std::unordered_map<String, std::unordered_map<UInt16, Float64>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
|
||||
|
||||
if (!Tonality)
|
||||
{
|
||||
|
||||
std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
|
||||
std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
|
||||
|
||||
size_t len = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
|
||||
String ans;
|
||||
// Float64 count_bigram = data.size() - 1;
|
||||
@ -166,33 +129,7 @@ struct TextClassificationImpl
|
||||
}
|
||||
res = ans;
|
||||
}
|
||||
else
|
||||
{
|
||||
Float64 freq = 0;
|
||||
Float64 count_words = 0;
|
||||
|
||||
String ans;
|
||||
|
||||
String to_check;
|
||||
ReadBufferFromString in(data);
|
||||
|
||||
while (!in.eof())
|
||||
{
|
||||
readString(to_check, in);
|
||||
word_processing(to_check);
|
||||
|
||||
if (emotional_dict.find(to_check) != emotional_dict.cend())
|
||||
{
|
||||
count_words += 1;
|
||||
ans += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
|
||||
freq += emotional_dict[to_check];
|
||||
}
|
||||
}
|
||||
Float64 total_tonality = freq / count_words;
|
||||
ans += get_tonality(total_tonality) + std::to_string(total_tonality) + std::to_string(emotional_dict.size()) + "\n";
|
||||
res = ans;
|
||||
}
|
||||
}
|
||||
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
@ -215,8 +152,7 @@ struct TextClassificationImpl
|
||||
String str = haystack;
|
||||
|
||||
String prom;
|
||||
if (!Tonality)
|
||||
{
|
||||
|
||||
std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
|
||||
std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
|
||||
|
||||
@ -229,36 +165,22 @@ struct TextClassificationImpl
|
||||
model[ngram_storage.get()[j]] = static_cast<Float64>(common_stats.get()[ngram_storage.get()[j]]);
|
||||
}
|
||||
|
||||
for (const auto& item : encodings_freq) {
|
||||
prom += item.first + " " + std::to_string(Naive_bayes(item.second, model)) + "\n";
|
||||
}
|
||||
std::vector<std::pair<std::string, Float64>> results;
|
||||
|
||||
}
|
||||
else
|
||||
for (const auto& item : encodings_freq)
|
||||
{
|
||||
Float64 freq = 0;
|
||||
Float64 count_words = 0;
|
||||
results.push_back(std::make_pair(item.first, Naive_bayes(item.second, model)));
|
||||
}
|
||||
|
||||
|
||||
String to_check;
|
||||
ReadBufferFromString in(str);
|
||||
|
||||
while (!in.eof())
|
||||
std::sort(results.begin(), results.end(), [](auto &left, auto &right)
|
||||
{
|
||||
readString(to_check, in);
|
||||
return left.second > right.second;
|
||||
});
|
||||
|
||||
word_processing(to_check);
|
||||
for (size_t ind = 0; ind < 3; ++ind) {
|
||||
prom += results[ind].first + " result=" + std::to_string(results[ind].second) + "\n";
|
||||
}
|
||||
|
||||
if (emotional_dict.find(to_check) != emotional_dict.cend())
|
||||
{
|
||||
count_words += 1;
|
||||
prom += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
|
||||
freq += emotional_dict[to_check];
|
||||
}
|
||||
}
|
||||
Float64 total_tonality = freq / count_words;
|
||||
prom += get_tonality(total_tonality) + std::to_string(total_tonality) + "\n";
|
||||
}
|
||||
|
||||
const auto ans = prom.c_str();
|
||||
size_t cur_offset = offsets[i];
|
||||
@ -283,19 +205,13 @@ struct NameCharsetDetect
|
||||
{
|
||||
static constexpr auto name = "charsetDetect";
|
||||
};
|
||||
struct NameGetTonality
|
||||
{
|
||||
static constexpr auto name = "getTonality";
|
||||
};
|
||||
|
||||
|
||||
using FunctionCharsetDetect = FunctionsTextClassification<TextClassificationImpl<2, false>, NameCharsetDetect>;
|
||||
using FunctionGetTonality = FunctionsTextClassification<TextClassificationImpl<2, true>, NameGetTonality>;
|
||||
using FunctionCharsetDetect = FunctionsTextClassification<TextClassificationImpl<2>, NameCharsetDetect>;
|
||||
|
||||
void registerFunctionsTextClassification(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionCharsetDetect>();
|
||||
factory.registerFunction<FunctionGetTonality>();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -14,7 +14,6 @@ namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int TOO_LARGE_STRING_SIZE;
|
||||
}
|
||||
|
||||
template <typename Impl, typename Name>
|
||||
|
163
src/Functions/FunctionsTonalityClassification.cpp
Normal file
163
src/Functions/FunctionsTonalityClassification.cpp
Normal file
@ -0,0 +1,163 @@
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
#include <Common/FrequencyHolder.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Common/UTF8Helpers.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <sstream>
|
||||
#include <set>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
|
||||
struct TonalityClassificationImpl
|
||||
{
|
||||
|
||||
using ResultType = String;
|
||||
|
||||
|
||||
static ALWAYS_INLINE inline void word_processing(String & word)
|
||||
{
|
||||
std::set<char> to_skip {',', '.', '!', '?', ')', '(', '\"', '\'', '[', ']', '{', '}', ':', ';'};
|
||||
|
||||
while (to_skip.find(word.back()) != to_skip.end())
|
||||
{
|
||||
word.pop_back();
|
||||
}
|
||||
|
||||
while (to_skip.find(word.front()) != to_skip.end())
|
||||
{
|
||||
word.erase(0, 1);
|
||||
}
|
||||
}
|
||||
|
||||
static String get_tonality(const Float64 & tonality_level)
|
||||
{
|
||||
if (tonality_level < 0.5) { return "NEG"; }
|
||||
if (tonality_level > 1) { return "POS"; }
|
||||
return "NEUT";
|
||||
}
|
||||
|
||||
static void constant(String data, String & res)
|
||||
{
|
||||
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
|
||||
|
||||
Float64 freq = 0;
|
||||
Float64 count_words = 0;
|
||||
|
||||
String ans;
|
||||
|
||||
ReadBufferFromMemory in(data.data(), data.size() + 1);
|
||||
skipWhitespaceIfAny(in);
|
||||
String to_check;
|
||||
while (!in.eof())
|
||||
{
|
||||
if (data.size() - (in.position() - data.data()) <= 3) {
|
||||
break;
|
||||
}
|
||||
readStringUntilWhitespace(to_check, in);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
word_processing(to_check);
|
||||
|
||||
|
||||
if (emotional_dict.find(to_check) != emotional_dict.cend())
|
||||
{
|
||||
count_words += 1;
|
||||
ans += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
|
||||
freq += emotional_dict[to_check];
|
||||
}
|
||||
}
|
||||
Float64 total_tonality = freq / count_words;
|
||||
ans += get_tonality(total_tonality) + std::to_string(total_tonality) + std::to_string(emotional_dict.size()) + "\n";
|
||||
res = ans;
|
||||
}
|
||||
|
||||
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
ColumnString::Chars & res_data,
|
||||
ColumnString::Offsets & res_offsets)
|
||||
{
|
||||
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
|
||||
|
||||
res_data.reserve(1024);
|
||||
res_offsets.resize(offsets.size());
|
||||
|
||||
size_t prev_offset = 0;
|
||||
size_t res_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < offsets.size(); ++i)
|
||||
{
|
||||
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
|
||||
String str = haystack;
|
||||
|
||||
String prom;
|
||||
|
||||
Float64 freq = 0;
|
||||
Float64 count_words = 0;
|
||||
|
||||
|
||||
ReadBufferFromMemory in(str.data(), str.size() + 1);
|
||||
skipWhitespaceIfAny(in);
|
||||
String to_check;
|
||||
while (!in.eof())
|
||||
{
|
||||
if (str.size() - (in.position() - str.data()) <= 3) {
|
||||
break;
|
||||
}
|
||||
readStringUntilWhitespace(to_check, in);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
if (emotional_dict.find(to_check) != emotional_dict.cend())
|
||||
{
|
||||
count_words += 1;
|
||||
prom += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
|
||||
freq += emotional_dict[to_check];
|
||||
}
|
||||
}
|
||||
Float64 total_tonality = freq / count_words;
|
||||
prom += get_tonality(total_tonality) + std::to_string(total_tonality) + "\n";
|
||||
|
||||
const auto ans = prom.c_str();
|
||||
size_t cur_offset = offsets[i];
|
||||
|
||||
res_data.resize(res_offset + strlen(ans) + 1);
|
||||
memcpy(&res_data[res_offset], ans, strlen(ans));
|
||||
res_offset += strlen(ans);
|
||||
|
||||
res_data[res_offset] = 0;
|
||||
++res_offset;
|
||||
|
||||
res_offsets[i] = res_offset;
|
||||
prev_offset = cur_offset;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
/// Name tag: the identifier under which the tonality classifier is registered.
struct NameGetTonality
{
    static constexpr const char * name = "getTonality";
};
|
||||
|
||||
|
||||
using FunctionGetTonality = FunctionsTextClassification<TonalityClassificationImpl, NameGetTonality>;
|
||||
|
||||
/// Makes the getTonality function available through the given factory.
void registerFunctionsTonalityClassification(FunctionFactory & factory)
{
    factory.registerFunction<FunctionGetTonality>();
}
|
||||
|
||||
}
|
@ -34,6 +34,7 @@ void registerFunctionsStringSearch(FunctionFactory &);
|
||||
void registerFunctionsStringRegexp(FunctionFactory &);
|
||||
void registerFunctionsStringSimilarity(FunctionFactory &);
|
||||
void registerFunctionsTextClassification(FunctionFactory &);
|
||||
void registerFunctionsTonalityClassification(FunctionFactory &);
|
||||
void registerFunctionsURL(FunctionFactory &);
|
||||
void registerFunctionsVisitParam(FunctionFactory &);
|
||||
void registerFunctionsMath(FunctionFactory &);
|
||||
@ -93,6 +94,7 @@ void registerFunctions()
|
||||
registerFunctionsStringRegexp(factory);
|
||||
registerFunctionsStringSimilarity(factory);
|
||||
registerFunctionsTextClassification(factory);
|
||||
registerFunctionsTonalityClassification(factory);
|
||||
registerFunctionsURL(factory);
|
||||
registerFunctionsVisitParam(factory);
|
||||
registerFunctionsMath(factory);
|
||||
|
@ -56,6 +56,7 @@ SRCS(
|
||||
FunctionsStringHash.cpp
|
||||
FunctionsStringSimilarity.cpp
|
||||
FunctionsTextClassification.cpp
|
||||
FunctionsTonalityClassification.cpp
|
||||
GatherUtils/concat.cpp
|
||||
GatherUtils/createArraySink.cpp
|
||||
GatherUtils/createArraySource.cpp
|
||||
|
Loading…
Reference in New Issue
Block a user