ClickHouse/src/Functions/FunctionsTextClassification.cpp

302 lines
10 KiB
C++
Raw Normal View History

2021-02-07 18:40:55 +00:00
#include <Functions/FunctionsTextClassification.h>
2021-03-19 10:06:21 +00:00
#include <Common/FrequencyHolder.h>
2021-02-07 18:40:55 +00:00
#include <Functions/FunctionFactory.h>
#include <Common/UTF8Helpers.h>
2021-03-23 19:32:54 +00:00
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
2021-02-07 18:40:55 +00:00
#include <algorithm>
#include <cstring>
2021-03-18 14:05:28 +00:00
#include <cmath>
2021-02-07 18:40:55 +00:00
#include <limits>
2021-03-18 14:05:28 +00:00
#include <unordered_map>
2021-02-07 18:40:55 +00:00
#include <memory>
#include <utility>
2021-03-18 14:05:28 +00:00
#include <sstream>
#include <set>
2021-02-07 18:40:55 +00:00
namespace DB
{
2021-03-18 14:05:28 +00:00
2021-03-23 18:55:14 +00:00
template <size_t N, bool Tonality>
2021-02-07 18:40:55 +00:00
struct TextClassificationImpl
{
2021-03-23 18:55:14 +00:00
using ResultType = String;
2021-02-07 18:40:55 +00:00
using CodePoint = UInt8;
/// map_size for ngram count.
static constexpr size_t map_size = 1u << 16;
/// If the data size is bigger than this, behaviour is unspecified for this function.
static constexpr size_t max_string_size = 1u << 15;
/// Default padding to read safely.
static constexpr size_t default_padding = 16;
/// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding.
static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1;
/** map_size of this fits mostly in L2 cache all the time.
* Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed
* integer array.
*/
using NgramCount = UInt16;
2021-03-18 14:05:28 +00:00
2021-03-23 18:55:14 +00:00
/** Sum of squared differences between two n-gram profiles.
  *
  * Only n-grams present in BOTH maps contribute; an n-gram missing from either side is ignored.
  * No square root is taken, so the result is the squared L2 distance over the shared keys.
  *
  * @param standart  reference profile (n-gram hash -> weight).
  * @param model     profile of the analysed text (n-gram hash -> weight).
  * @return          accumulated squared difference, 0 when the maps share no key.
  *
  * The maps are taken by const reference: the previous by-value signature copied both maps on every call.
  */
static ALWAYS_INLINE inline Float64 L2_distance(
    const std::unordered_map<UInt16, Float64> & standart,
    const std::unordered_map<UInt16, Float64> & model)
{
    Float64 res = 0;
    for (const auto & el : standart)
    {
        /// Single lookup instead of find() followed by two operator[] calls.
        const auto it = model.find(el.first);
        if (it == model.end())
            continue;

        const Float64 diff = it->second - el.second;
        res += diff * diff;
    }
    return res;
}
2021-03-23 18:55:14 +00:00
/** Log-likelihood style score of `model` under the reference profile `standart`.
  *
  * For every n-gram of `model` the score grows by count * log(reference frequency). An n-gram that the
  * reference profile does not contain, or lists with frequency 0, is scored with log(0.0000001) instead,
  * so unseen n-grams are penalised heavily without producing -inf.
  *
  * @param standart  reference profile (n-gram hash -> frequency).
  * @param model     n-gram counts of the analysed text.
  * @return          sum over all n-grams of `model`; 0 for an empty model.
  *
  * Both maps are taken by const reference. The previous version copied them on each call and used
  * operator[] on the copy, which inserted a zero entry for every missing key.
  */
static ALWAYS_INLINE inline Float64 Naive_bayes(
    const std::unordered_map<UInt16, Float64> & standart,
    const std::unordered_map<UInt16, Float64> & model)
{
    /// Loop-invariant: logarithm used for n-grams unknown to the reference profile.
    const Float64 unseen_log_frequency = std::log(0.0000001);

    Float64 res = 0;
    for (const auto & el : model)
    {
        const auto it = standart.find(el.first);
        const bool known = it != standart.end() && it->second != 0;
        res += el.second * (known ? std::log(it->second) : unseen_log_frequency);
    }
    return res;
}
2021-03-18 21:57:42 +00:00
2021-03-18 14:05:28 +00:00
2021-02-07 18:40:55 +00:00
/** Refill the sliding window `code_points` with the next chunk of input bytes.
  *
  * Window layout (simultaneously_codepoints_num = default_padding + N - 1 entries):
  *   [0, N - 1)                    tail of the previous chunk, so n-grams crossing a chunk border are kept;
  *   [N - 1, N - 1 + default_padding)  bytes read from `pos` by this call.
  * Each call consumes default_padding - N + 1 input bytes.
  *
  * @param code_points  window of simultaneously_codepoints_num entries, reused between calls.
  * @param pos          current read position, advanced by this call (never beyond `end`).
  * @param end          one past the last input byte.
  * @return             number of leading window entries that hold meaningful data for this chunk.
  *
  * Unlike the previous version, at most `end - pos` bytes are copied. The unconditional 16-byte copy
  * relied on the source buffer having read padding; the callers in this file pass String buffers, for
  * which that was an out-of-bounds read. Window entries past the returned count may hold stale bytes.
  */
static ALWAYS_INLINE size_t readCodePoints(CodePoint * code_points, const char *& pos, const char * end)
{
    /// Number of input bytes consumed per call; also the window index where the carried-over tail starts.
    constexpr size_t padding_offset = default_padding - N + 1;

    /// Move the tail of the previous chunk to the front. Copying a power-of-two number of bytes is
    /// enough for N - 1 entries; any extra entry is overwritten by the copy below.
    memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));

    const size_t remaining = pos < end ? static_cast<size_t>(end - pos) : 0;
    const size_t to_copy = std::min<size_t>(default_padding, remaining);
    memcpy(code_points + (N - 1), pos, to_copy * sizeof(CodePoint));

    if (remaining < padding_offset)
    {
        /// Last, partial chunk: everything was consumed. Equals default_padding - (pos + padding_offset - end).
        pos = end;
        return remaining + (N - 1);
    }

    pos += padding_offset;
    return default_padding;
}
2021-03-18 14:05:28 +00:00
2021-02-07 18:40:55 +00:00
/** Count the byte n-grams of `data`.
  *
  * The hash of an n-gram is simply its N bytes packed big-endian into an integer.
  *
  * @param data              input bytes.
  * @param size              number of input bytes.
  * @param ngram_stats       table of map_size counters indexed by hash; must be zeroed by the caller.
  * @param read_code_points  chunk reader, see readCodePoints().
  * @param ngram_storage     receives each distinct hash once, in order of first appearance;
  *                          needs room for every distinct n-gram (map_size entries are always enough).
  * @return                  number of distinct n-grams written to ngram_storage.
  *
  * Counters saturate at the maximum of NgramCount. Previously a counter wrapped to 0 after 65536
  * occurrences, which made the n-gram look new and appended it to ngram_storage a second time.
  */
static ALWAYS_INLINE inline size_t calculateStats(
    const char * data,
    const size_t size,
    NgramCount * ngram_stats,
    size_t (*read_code_points)(CodePoint *, const char *&, const char *),
    NgramCount * ngram_storage)
{
    /// The packed hash is used as an index into ngram_stats and is stored as NgramCount.
    static_assert(N >= 1 && N <= sizeof(NgramCount), "n-gram hash must fit into NgramCount");
    static_assert(map_size > std::numeric_limits<NgramCount>::max(), "every hash value needs its own counter slot");

    const char * start = data;
    const char * end = data + size;
    CodePoint cp[simultaneously_codepoints_num] = {};

    /// read_code_points returns the position of cp where it stopped reading codepoints.
    size_t found = read_code_points(cp, start, end);
    /// On the first chunk the leading N - 1 window entries carry no data yet, so skip them.
    size_t i = N - 1;
    size_t len = 0;
    do
    {
        for (; i + N <= found; ++i)
        {
            UInt32 hash = 0;
            for (size_t j = 0; j < N; ++j)
                hash = (hash << 8) | cp[i + j];

            NgramCount & count = ngram_stats[hash];
            if (count == 0)
                ngram_storage[len++] = static_cast<NgramCount>(hash);
            if (count != std::numeric_limits<NgramCount>::max())
                ++count;
        }
        /// Later chunks start with the carried-over tail, so n-grams begin at window index 0.
        i = 0;
    } while (start < end && (found = read_code_points(cp, start, end)));

    return len;
}
2021-03-23 18:55:14 +00:00
/** Strip leading and trailing punctuation from `word` in place.
  *
  * Removed characters: , . ! ? ) ( " ' [ ] { } : ;
  * Punctuation inside the word is kept. A word that is empty or consists only of punctuation becomes
  * (or stays) empty; the previous version called back()/front() on an empty string in that case,
  * which is undefined behaviour.
  */
static ALWAYS_INLINE inline void word_processing(String & word)
{
    static constexpr char to_skip[] = ",.!?)(\"'[]{}:;";

    const size_t last_kept = word.find_last_not_of(to_skip);
    if (last_kept == String::npos)
    {
        /// Nothing but punctuation (or nothing at all).
        word.clear();
        return;
    }

    /// Trim the tail first so the head is erased with a single shift of the remaining characters.
    word.erase(last_kept + 1);
    word.erase(0, word.find_first_not_of(to_skip));
}
2021-03-23 19:32:54 +00:00
/** Map an average tonality value to a label.
  *
  *   tonality_level < 0.5   -> "NEG"
  *   tonality_level > 1     -> "POS"
  *   anything else          -> "NEUT" (this includes NaN, for which both comparisons are false)
  */
static String get_tonality(const Float64 & tonality_level)
{
    const char * label = "NEUT";
    if (tonality_level < 0.5)
        label = "NEG";
    else if (tonality_level > 1)
        label = "POS";
    return label;
}
2021-03-18 14:05:28 +00:00
2021-03-23 18:55:14 +00:00
/** Build the text report for a single constant string.
  *
  * Tonality == false (charset detection):
  *   one line "<n-gram hash> <count>" per distinct n-gram of `data`, followed by one line
  *   "<encoding name> <Naive_bayes score>" per encoding profile of FrequencyHolder.
  * Tonality == true:
  *   one line "<word> <weight>" per whitespace-separated word found in the emotional dictionary
  *   (after word_processing()), followed by a summary line made of the label from get_tonality(),
  *   the average weight and the dictionary size, concatenated without separators.
  *
  * @param data  text to analyse.
  * @param res   receives the report; previous content is replaced.
  *
  * Changes against the previous version:
  *  - words are split on whitespace. readString() reads up to a tab or newline and leaves the delimiter
  *    unread, so space-separated words were never separated and a tab/newline in `data` made the loop
  *    spin forever;
  *  - the dictionaries are bound by reference and only the one needed by the branch is touched,
  *    instead of copying both into function-local statics;
  *  - every dictionary hit uses one lookup instead of three;
  *  - with no dictionary hit the average is an explicit NaN (label "NEUT") rather than a 0/0 division.
  */
static void constant(String data, String & res)
{
    String ans;

    if (!Tonality)
    {
        static const std::unordered_map<String, std::unordered_map<UInt16, Float64>> & encodings_freq
            = FrequencyHolder::getInstance().getEncodingsFrequency();

        /// make_unique value-initialises the arrays, i.e. all counters start at zero.
        const auto common_stats = std::make_unique<NgramCount[]>(map_size); /// frequency of N-grams
        const auto ngram_storage = std::make_unique<NgramCount[]>(map_size); /// list of N-grams
        const size_t len
            = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); /// count of N-grams

        std::unordered_map<UInt16, Float64> model;
        model.reserve(len);
        for (size_t i = 0; i < len; ++i)
        {
            const NgramCount ngram = ngram_storage[i];
            const Float64 count = static_cast<Float64>(common_stats[ngram]);

            ans += std::to_string(ngram);
            ans += " ";
            ans += std::to_string(count);
            ans += "\n";
            model[ngram] = count;
        }

        for (const auto & item : encodings_freq)
        {
            ans += item.first;
            ans += " ";
            ans += std::to_string(Naive_bayes(item.second, model));
            ans += "\n";
        }
    }
    else
    {
        static const std::unordered_map<String, Float64> & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
        static constexpr char whitespace[] = " \t\n\r\f\v";

        Float64 freq = 0;
        Float64 count_words = 0;
        String to_check;

        size_t pos = data.find_first_not_of(whitespace);
        while (pos != String::npos)
        {
            /// [pos, word_end) is the next word; word_end == npos means it runs to the end of data.
            const size_t word_end = data.find_first_of(whitespace, pos);
            to_check.assign(data, pos, word_end == String::npos ? String::npos : word_end - pos);
            pos = data.find_first_not_of(whitespace, word_end);

            word_processing(to_check);
            const auto it = emotional_dict.find(to_check);
            if (it == emotional_dict.end())
                continue;

            count_words += 1;
            ans += to_check;
            ans += " ";
            ans += std::to_string(it->second);
            ans += "\n";
            freq += it->second;
        }

        /// NaN keeps the "NEUT" label for texts without any dictionary word.
        const Float64 total_tonality = count_words > 0 ? freq / count_words : std::numeric_limits<Float64>::quiet_NaN();
        ans += get_tonality(total_tonality);
        ans += std::to_string(total_tonality);
        ans += std::to_string(emotional_dict.size());
        ans += "\n";
    }

    res = std::move(ans);
}
/** Build the text report for every row of a string column.
  *
  * The input and the result use the same layout: row i occupies [offsets[i - 1], offsets[i]) of the
  * character array and ends with a zero byte that is counted in the offset (this function writes its
  * result exactly that way).
  *
  * Tonality == false: one line "<encoding name> <Naive_bayes score>" per encoding profile.
  * Tonality == true:  one line "<word> <weight>" per whitespace-separated word found in the emotional
  *                    dictionary, then a summary line with the get_tonality() label immediately
  *                    followed by the average weight.
  *
  * @param data         characters of the source column.
  * @param offsets      end offsets of the source rows.
  * @param res_data     receives the characters of the result column.
  * @param res_offsets  receives the end offsets of the result rows.
  *
  * Changes against the previous version:
  *  - the row length comes from `offsets`; the row used to be taken as a C string, which ignored the
  *    offsets and cut the row at the first zero byte;
  *  - the result is copied with its real size instead of three strlen() calls on c_str();
  *  - words are split on whitespace. readString() reads up to a tab or newline and leaves the delimiter
  *    unread, so space-separated words were never separated and a tab/newline made the loop spin forever;
  *  - the n-gram tables are allocated once and only the touched counters are reset per row, instead
  *    of allocating and zeroing two 64K-entry tables for every row;
  *  - the dictionaries are bound by reference and only the one needed by the branch is touched;
  *  - with no dictionary hit the average is an explicit NaN (label "NEUT") rather than a 0/0 division.
  */
static void vector(
    const ColumnString::Chars & data,
    const ColumnString::Offsets & offsets,
    ColumnString::Chars & res_data,
    ColumnString::Offsets & res_offsets)
{
    res_data.reserve(1024);
    res_offsets.resize(offsets.size());

    /// Scratch tables of the charset branch, shared by all rows.
    std::unique_ptr<NgramCount[]> common_stats; /// frequency of N-grams
    std::unique_ptr<NgramCount[]> ngram_storage; /// list of N-grams
    if (!Tonality)
    {
        /// make_unique value-initialises the arrays, i.e. all counters start at zero.
        common_stats = std::make_unique<NgramCount[]>(map_size);
        ngram_storage = std::make_unique<NgramCount[]>(map_size);
    }

    size_t prev_offset = 0;
    size_t res_offset = 0;

    /// Buffers reused between rows to avoid reallocations.
    String prom;
    String str;
    String to_check;

    for (size_t i = 0; i < offsets.size(); ++i)
    {
        const size_t cur_offset = offsets[i];
        const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
        /// Exclude the terminating zero byte of the row.
        const size_t haystack_size = cur_offset > prev_offset ? cur_offset - prev_offset - 1 : 0;

        prom.clear();
        if (!Tonality)
        {
            static const std::unordered_map<String, std::unordered_map<UInt16, Float64>> & encodings_freq
                = FrequencyHolder::getInstance().getEncodingsFrequency();

            /// The statistics are computed directly on the column bytes, no per-row copy is needed.
            const size_t len
                = calculateStats(haystack, haystack_size, common_stats.get(), readCodePoints, ngram_storage.get()); /// count of N-grams

            std::unordered_map<UInt16, Float64> model;
            model.reserve(len);
            for (size_t j = 0; j < len; ++j)
                model[ngram_storage[j]] = static_cast<Float64>(common_stats[ngram_storage[j]]);

            /// Every touched counter is listed in ngram_storage, so this restores an all-zero table.
            for (size_t j = 0; j < len; ++j)
                common_stats[ngram_storage[j]] = 0;

            for (const auto & item : encodings_freq)
            {
                prom += item.first;
                prom += " ";
                prom += std::to_string(Naive_bayes(item.second, model));
                prom += "\n";
            }
        }
        else
        {
            static const std::unordered_map<String, Float64> & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
            static constexpr char whitespace[] = " \t\n\r\f\v";

            Float64 freq = 0;
            Float64 count_words = 0;
            str.assign(haystack, haystack_size);

            size_t pos = str.find_first_not_of(whitespace);
            while (pos != String::npos)
            {
                /// [pos, word_end) is the next word; word_end == npos means it runs to the end of the row.
                const size_t word_end = str.find_first_of(whitespace, pos);
                to_check.assign(str, pos, word_end == String::npos ? String::npos : word_end - pos);
                pos = str.find_first_not_of(whitespace, word_end);

                word_processing(to_check);
                const auto it = emotional_dict.find(to_check);
                if (it == emotional_dict.end())
                    continue;

                count_words += 1;
                prom += to_check;
                prom += " ";
                prom += std::to_string(it->second);
                prom += "\n";
                freq += it->second;
            }

            /// NaN keeps the "NEUT" label for rows without any dictionary word.
            const Float64 total_tonality = count_words > 0 ? freq / count_words : std::numeric_limits<Float64>::quiet_NaN();
            prom += get_tonality(total_tonality);
            prom += std::to_string(total_tonality);
            prom += "\n";
        }

        /// Append the row followed by its terminating zero byte.
        res_data.resize(res_offset + prom.size() + 1);
        memcpy(&res_data[res_offset], prom.data(), prom.size());
        res_offset += prom.size();
        res_data[res_offset] = 0;
        ++res_offset;
        res_offsets[i] = res_offset;

        prev_offset = cur_offset;
    }
}
2021-03-18 14:05:28 +00:00
2021-02-07 18:40:55 +00:00
};
2021-03-23 18:55:14 +00:00
/// Name tag handed to FunctionsTextClassification for the charset detection instantiation.
struct NameCharsetDetect
{
    static constexpr auto name = "charsetDetect";
};
2021-03-23 18:55:14 +00:00
/// Name tag handed to FunctionsTextClassification for the tonality instantiation.
struct NameGetTonality
{
    static constexpr auto name = "getTonality";
};
2021-02-07 19:46:33 +00:00
2021-02-07 18:40:55 +00:00
2021-03-23 18:55:14 +00:00
/// charsetDetect: Tonality = false selects the n-gram branch; N = 2 means byte bigrams are counted
/// and scored against the encoding frequency profiles.
using FunctionCharsetDetect = FunctionsTextClassification<TextClassificationImpl<2, false>, NameCharsetDetect>;
/// getTonality: Tonality = true selects the word dictionary branch, which does not compute n-grams,
/// so the value of N has no effect on its result.
using FunctionGetTonality = FunctionsTextClassification<TextClassificationImpl<2, true>, NameGetTonality>;
2021-02-07 18:40:55 +00:00
/// Registers both text classification functions (FunctionCharsetDetect and FunctionGetTonality)
/// in the given factory.
void registerFunctionsTextClassification(FunctionFactory & factory)
{
    factory.registerFunction<FunctionCharsetDetect>();
    factory.registerFunction<FunctionGetTonality>();
}
}