2021-02-07 18:40:55 +00:00
|
|
|
|
#include <Functions/FunctionsTextClassification.h>
|
2021-03-19 10:06:21 +00:00
|
|
|
|
#include <Common/FrequencyHolder.h>
|
2021-02-07 18:40:55 +00:00
|
|
|
|
#include <Functions/FunctionFactory.h>
|
|
|
|
|
#include <Common/UTF8Helpers.h>
|
2021-03-23 19:32:54 +00:00
|
|
|
|
#include <IO/ReadBufferFromString.h>
|
|
|
|
|
#include <IO/ReadHelpers.h>
|
2021-05-21 13:48:18 +00:00
|
|
|
|
|
2021-02-07 18:40:55 +00:00
|
|
|
|
#include <cstring>
|
2021-03-18 14:05:28 +00:00
|
|
|
|
#include <cmath>
|
|
|
|
|
#include <unordered_map>
|
2021-02-07 18:40:55 +00:00
|
|
|
|
#include <memory>
|
|
|
|
|
#include <utility>
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
|
{
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/* Determine language and charset of text data. For each text, we build the distribution of byte bigrams.
 * Then we use marked-up dictionaries with distributions of byte bigrams for various languages and charsets.
 * Using a naive Bayes classifier, we find the most likely charset and language and return it.
 */
|
2021-03-23 18:55:14 +00:00
|
|
|
|
|
2021-05-07 14:18:06 +00:00
|
|
|
|
template <size_t N, bool detect_language>
|
2021-04-15 17:16:32 +00:00
|
|
|
|
struct CharsetClassificationImpl
|
2021-02-07 18:40:55 +00:00
|
|
|
|
{
|
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
|
using ResultType = String;
|
2021-02-07 18:40:55 +00:00
|
|
|
|
using CodePoint = UInt8;
|
2021-05-06 07:04:00 +00:00
|
|
|
|
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/* We need to solve zero-frequency problem for Naive Bayes Classifier
|
|
|
|
|
* If the bigram is not found in the text, we assume that the probability of its meeting is 1e-06.
|
|
|
|
|
* 1e-06 is minimal value in our marked-up dictionary.
|
|
|
|
|
*/
|
|
|
|
|
static constexpr Float64 zero_frequency = 1e-06;
|
2021-02-07 18:40:55 +00:00
|
|
|
|
|
|
|
|
|
/// If the data size is bigger than this, behaviour is unspecified for this function.
|
|
|
|
|
static constexpr size_t max_string_size = 1u << 15;
|
|
|
|
|
|
|
|
|
|
/// Default padding to read safely.
|
|
|
|
|
static constexpr size_t default_padding = 16;
|
|
|
|
|
|
|
|
|
|
/// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding.
|
|
|
|
|
static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1;
|
|
|
|
|
|
|
|
|
|
|
2021-05-19 10:01:09 +00:00
|
|
|
|
static ALWAYS_INLINE inline Float64 Naive_bayes(std::unordered_map<UInt16, Float64>& standart,
|
|
|
|
|
std::unordered_map<UInt16, Float64>& model,
|
|
|
|
|
Float64 max_result)
|
2021-03-18 14:05:28 +00:00
|
|
|
|
{
|
2021-03-23 18:55:14 +00:00
|
|
|
|
Float64 res = 0;
|
2021-05-06 07:04:00 +00:00
|
|
|
|
for (auto & el : model)
|
|
|
|
|
{
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/// Try to find bigram in the dictionary.
|
2021-05-19 10:01:09 +00:00
|
|
|
|
if (standart.find(el.first) != standart.end())
|
2021-05-06 07:04:00 +00:00
|
|
|
|
{
|
2021-03-18 14:05:28 +00:00
|
|
|
|
res += el.second * log(standart[el.first]);
|
2021-05-06 07:04:00 +00:00
|
|
|
|
} else
|
|
|
|
|
{
|
|
|
|
|
res += el.second * log(zero_frequency);
|
2021-03-18 14:05:28 +00:00
|
|
|
|
}
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/// If at some step the result has become less than the current maximum, then it makes no sense to count it fully.
|
2021-05-19 10:01:09 +00:00
|
|
|
|
if (res < max_result) {
|
|
|
|
|
return res;
|
|
|
|
|
}
|
2021-03-18 14:05:28 +00:00
|
|
|
|
}
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
|
2021-03-18 21:57:42 +00:00
|
|
|
|
|
2021-02-07 18:40:55 +00:00
|
|
|
|
static ALWAYS_INLINE size_t readCodePoints(CodePoint * code_points, const char *& pos, const char * end)
|
|
|
|
|
{
|
|
|
|
|
constexpr size_t padding_offset = default_padding - N + 1;
|
|
|
|
|
memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));
|
|
|
|
|
memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint));
|
|
|
|
|
pos += padding_offset;
|
|
|
|
|
if (pos > end)
|
|
|
|
|
return default_padding - (pos - end);
|
|
|
|
|
return default_padding;
|
|
|
|
|
}
|
|
|
|
|
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/// Сount how many times each bigram occurs in the text.
|
2021-02-07 18:40:55 +00:00
|
|
|
|
static ALWAYS_INLINE inline size_t calculateStats(
|
|
|
|
|
const char * data,
|
|
|
|
|
const size_t size,
|
|
|
|
|
size_t (*read_code_points)(CodePoint *, const char *&, const char *),
|
2021-05-06 07:04:00 +00:00
|
|
|
|
std::unordered_map<UInt16, Float64>& model)
|
2021-02-07 18:40:55 +00:00
|
|
|
|
{
|
2021-05-06 07:04:00 +00:00
|
|
|
|
|
2021-02-07 18:40:55 +00:00
|
|
|
|
const char * start = data;
|
|
|
|
|
const char * end = data + size;
|
|
|
|
|
CodePoint cp[simultaneously_codepoints_num] = {};
|
|
|
|
|
/// read_code_points returns the position of cp where it stopped reading codepoints.
|
|
|
|
|
size_t found = read_code_points(cp, start, end);
|
|
|
|
|
/// We need to start for the first time here, because first N - 1 codepoints mean nothing.
|
|
|
|
|
size_t i = N - 1;
|
|
|
|
|
size_t len = 0;
|
|
|
|
|
do
|
|
|
|
|
{
|
|
|
|
|
for (; i + N <= found; ++i)
|
|
|
|
|
{
|
2021-02-08 12:23:51 +00:00
|
|
|
|
UInt32 hash = 0;
|
2021-02-07 18:40:55 +00:00
|
|
|
|
for (size_t j = 0; j < N; ++j) {
|
|
|
|
|
hash <<= 8;
|
|
|
|
|
hash += *(cp + i + j);
|
|
|
|
|
}
|
2021-05-06 07:04:00 +00:00
|
|
|
|
if (model[hash] == 0) {
|
|
|
|
|
model[hash] = 1;
|
2021-02-07 18:40:55 +00:00
|
|
|
|
++len;
|
|
|
|
|
}
|
2021-05-06 07:04:00 +00:00
|
|
|
|
++model[hash];
|
2021-02-07 18:40:55 +00:00
|
|
|
|
}
|
|
|
|
|
i = 0;
|
|
|
|
|
} while (start < end && (found = read_code_points(cp, start, end)));
|
|
|
|
|
|
|
|
|
|
return len;
|
|
|
|
|
}
|
2021-03-18 14:05:28 +00:00
|
|
|
|
|
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
|
static void constant(String data, String & res)
|
2021-03-18 14:05:28 +00:00
|
|
|
|
{
|
2021-03-23 18:55:14 +00:00
|
|
|
|
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
|
|
|
|
|
static std::unordered_map<String, std::unordered_map<UInt16, Float64>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
|
2021-03-18 14:05:28 +00:00
|
|
|
|
|
2021-05-06 07:04:00 +00:00
|
|
|
|
std::unordered_map<UInt16, Float64> model;
|
|
|
|
|
calculateStats(data.data(), data.size(), readCodePoints, model);
|
2021-04-15 17:16:32 +00:00
|
|
|
|
|
2021-05-19 10:01:09 +00:00
|
|
|
|
Float64 max_result = log(zero_frequency) * (max_string_size);
|
2021-05-07 14:18:06 +00:00
|
|
|
|
String poss_ans;
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/// Go through the dictionary and find the charset with the highest weight
|
2021-05-19 10:01:09 +00:00
|
|
|
|
for (auto& item : encodings_freq)
|
2021-04-15 17:16:32 +00:00
|
|
|
|
{
|
2021-05-19 10:01:09 +00:00
|
|
|
|
Float64 score = Naive_bayes(item.second, model, max_result);
|
|
|
|
|
if (max_result < score)
|
2021-05-06 07:04:00 +00:00
|
|
|
|
{
|
2021-05-07 14:18:06 +00:00
|
|
|
|
poss_ans = item.first;
|
|
|
|
|
max_result = score;
|
2021-05-06 07:04:00 +00:00
|
|
|
|
}
|
2021-03-18 14:05:28 +00:00
|
|
|
|
}
|
2021-05-21 13:48:18 +00:00
|
|
|
|
|
|
|
|
|
/* In our dictionary we have lines with form: <Language>_<Charset>
|
|
|
|
|
* If we need to find language of data, we return <Language>
|
|
|
|
|
* If we need to find charset of data, we return <Charset>.
|
|
|
|
|
*/
|
|
|
|
|
|
2021-05-07 14:18:06 +00:00
|
|
|
|
size_t sep = poss_ans.find('_');
|
|
|
|
|
if (detect_language)
|
|
|
|
|
{
|
|
|
|
|
res = poss_ans.erase(0, sep + 1);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
res = poss_ans.erase(sep, poss_ans.size() - sep);
|
|
|
|
|
}
|
2021-02-07 18:40:55 +00:00
|
|
|
|
}
|
|
|
|
|
|
2021-04-14 18:42:33 +00:00
|
|
|
|
|
2021-02-07 18:40:55 +00:00
|
|
|
|
static void vector(
|
|
|
|
|
const ColumnString::Chars & data,
|
|
|
|
|
const ColumnString::Offsets & offsets,
|
2021-03-18 14:05:28 +00:00
|
|
|
|
ColumnString::Chars & res_data,
|
|
|
|
|
ColumnString::Offsets & res_offsets)
|
2021-02-07 18:40:55 +00:00
|
|
|
|
{
|
2021-03-23 18:55:14 +00:00
|
|
|
|
static std::unordered_map<String, std::unordered_map<UInt16, Float64>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
|
|
|
|
|
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
|
2021-03-18 14:05:28 +00:00
|
|
|
|
|
|
|
|
|
res_data.reserve(1024);
|
|
|
|
|
res_offsets.resize(offsets.size());
|
|
|
|
|
|
2021-02-07 18:40:55 +00:00
|
|
|
|
size_t prev_offset = 0;
|
2021-03-18 14:05:28 +00:00
|
|
|
|
size_t res_offset = 0;
|
2021-02-07 18:40:55 +00:00
|
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
|
for (size_t i = 0; i < offsets.size(); ++i)
|
2021-02-07 18:40:55 +00:00
|
|
|
|
{
|
|
|
|
|
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
|
2021-03-23 18:55:14 +00:00
|
|
|
|
String str = haystack;
|
2021-03-18 14:05:28 +00:00
|
|
|
|
|
2021-05-07 14:18:06 +00:00
|
|
|
|
String poss_ans;
|
2021-03-23 18:55:14 +00:00
|
|
|
|
|
2021-04-14 18:42:33 +00:00
|
|
|
|
std::unordered_map<UInt16, Float64> model;
|
2021-05-06 07:04:00 +00:00
|
|
|
|
calculateStats(str.data(), str.size(), readCodePoints, model);
|
2021-03-23 18:55:14 +00:00
|
|
|
|
|
2021-05-19 10:01:09 +00:00
|
|
|
|
Float64 max_result = log(zero_frequency) * (max_string_size);
|
|
|
|
|
for (auto& item : encodings_freq)
|
2021-04-14 18:42:33 +00:00
|
|
|
|
{
|
2021-05-19 10:01:09 +00:00
|
|
|
|
Float64 score = Naive_bayes(item.second, model, max_result);
|
|
|
|
|
if (max_result < score)
|
2021-05-06 07:04:00 +00:00
|
|
|
|
{
|
2021-05-07 14:18:06 +00:00
|
|
|
|
max_result = score;
|
|
|
|
|
poss_ans = item.first;
|
2021-05-06 07:04:00 +00:00
|
|
|
|
}
|
2021-04-14 18:42:33 +00:00
|
|
|
|
}
|
2021-05-07 14:18:06 +00:00
|
|
|
|
|
|
|
|
|
size_t sep = poss_ans.find('_');
|
|
|
|
|
String ans_str;
|
2021-05-19 10:01:09 +00:00
|
|
|
|
|
2021-05-07 14:18:06 +00:00
|
|
|
|
if (detect_language)
|
2021-05-06 10:04:38 +00:00
|
|
|
|
{
|
2021-05-07 14:18:06 +00:00
|
|
|
|
ans_str = poss_ans.erase(0, sep + 1);
|
2021-05-06 10:04:38 +00:00
|
|
|
|
}
|
2021-05-07 14:18:06 +00:00
|
|
|
|
else
|
2021-05-06 10:04:38 +00:00
|
|
|
|
{
|
2021-05-07 14:18:06 +00:00
|
|
|
|
ans_str = poss_ans.erase(sep, poss_ans.size() - sep);
|
|
|
|
|
}
|
2021-05-06 10:04:38 +00:00
|
|
|
|
|
2021-05-19 10:01:09 +00:00
|
|
|
|
ans_str = poss_ans;
|
|
|
|
|
|
2021-05-07 14:18:06 +00:00
|
|
|
|
const auto ans = ans_str.c_str();
|
2021-03-18 14:05:28 +00:00
|
|
|
|
size_t cur_offset = offsets[i];
|
|
|
|
|
|
2021-05-07 14:18:06 +00:00
|
|
|
|
size_t ans_size = strlen(ans);
|
|
|
|
|
res_data.resize(res_offset + ans_size + 1);
|
|
|
|
|
memcpy(&res_data[res_offset], ans, ans_size);
|
|
|
|
|
res_offset += ans_size;
|
2021-03-18 14:05:28 +00:00
|
|
|
|
|
|
|
|
|
res_data[res_offset] = 0;
|
|
|
|
|
++res_offset;
|
|
|
|
|
|
|
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
|
prev_offset = cur_offset;
|
2021-02-07 18:40:55 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-03-18 14:05:28 +00:00
|
|
|
|
|
2021-02-07 18:40:55 +00:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
2021-03-23 18:55:14 +00:00
|
|
|
|
/// Name tag for the charset detection function; used as the Name argument of FunctionsTextClassification.
struct NameCharsetDetect
{
    static constexpr const char * name = "detectCharset";
};
|
|
|
|
|
|
|
|
|
|
/// Name tag for the language detection function; used as the Name argument of FunctionsTextClassification.
struct NameLanguageDetect
{
    static constexpr const char * name = "detectLanguage";
};
|
2021-02-07 19:46:33 +00:00
|
|
|
|
|
2021-02-07 18:40:55 +00:00
|
|
|
|
|
2021-05-07 14:18:06 +00:00
|
|
|
|
/// Both functions classify by byte bigrams (N == 2) and differ only in which half of the winning
/// dictionary key they return: `true` selects the part after the first '_', `false` the part before it.
using FunctionCharsetDetect
    = FunctionsTextClassification<CharsetClassificationImpl<2, true>, NameCharsetDetect>;
using FunctionLanguageDetect
    = FunctionsTextClassification<CharsetClassificationImpl<2, false>, NameLanguageDetect>;
|
2021-03-23 18:55:14 +00:00
|
|
|
|
|
2021-04-15 17:16:32 +00:00
|
|
|
|
/// Registers the charset and language detection functions (FunctionCharsetDetect, FunctionLanguageDetect)
/// in the given function factory.
void registerFunctionsCharsetClassification(FunctionFactory & factory)
{
    factory.registerFunction<FunctionCharsetDetect>();
    factory.registerFunction<FunctionLanguageDetect>();
}
|
|
|
|
|
|
|
|
|
|
}
|