#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>

#include <memory>
#include <unordered_map>


namespace DB
{

namespace
{
    /* We need to solve the zero-frequency problem of the Naive Bayes classifier:
     * if a bigram from the text is not found in the dictionary, we assume that the probability of encountering it is 1e-06.
     * 1e-06 is the minimal value in our marked-up dictionary.
     */
    constexpr Float64 zero_frequency = 1e-06;

    /// If the data size is bigger than this, the behaviour of this function is unspecified.
    constexpr size_t max_string_size = 1UL << 15;
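
    /** Score one candidate charset/language with the Naive Bayes model:
      * the result is the sum over the text's bigrams of count(bigram) * log(P(bigram | candidate)),
      * i.e. the log-likelihood of the text under the candidate's bigram distribution.
      * Dictionary frequencies are at most 1, so every term is non-positive and the partial sum only decreases;
      * once it drops below the best score seen so far (max_result), the candidate cannot win and we return early.
      */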
    template <typename ModelMap>
    ALWAYS_INLINE inline Float64 naiveBayes(
        const FrequencyHolder::EncodingMap & standard,
        const ModelMap & model,
        Float64 max_result)
    {
        Float64 res = 0;
        for (const auto & el : model)
        {
            /// Try to find the bigram in the dictionary.
            const auto * it = standard.find(el.getKey());
            if (it != standard.end())
            {
                res += el.getMapped() * log(it->getMapped());
            }
            else
            {
                res += el.getMapped() * log(zero_frequency);
            }
            /// If at some step the result has become less than the current maximum, there is no point in computing it fully.
            if (res < max_result)
            {
                return res;
            }
        }
        return res;
    }

    /// Count how many times each bigram occurs in the text.
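    /// The model key is a UInt16 rolling hash of two consecutive bytes (the previous byte in the high byte,
    /// the current byte in the low byte), so each key identifies one byte bigram; the very first byte of the
    /// text contributes a degenerate "bigram" with a zero high byte.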
    template <typename ModelMap>
    ALWAYS_INLINE inline void calculateStats(
        const UInt8 * data,
        const size_t size,
        ModelMap & model)
    {
        UInt16 hash = 0;
        for (size_t i = 0; i < size; ++i)
        {
            hash <<= 8;
            hash += *(data + i);
            ++model[hash];
        }
    }
}

/* Determine the language or charset of text data. For each text, we build the distribution of byte bigrams.
 * Then we use marked-up dictionaries with the byte-bigram distributions of various languages and charsets.
 * Using a Naive Bayes classifier, we find the most likely charset or language and return it.
 */
template <bool detect_language>
struct CharsetClassificationImpl
{
    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();

        if constexpr (detect_language)
            /// 2 chars for the ISO code + 1 zero byte
            res_data.reserve(offsets.size() * 3);
        else
            /// The mean length of a charset name is 8
            res_data.reserve(offsets.size() * 8);

        res_offsets.resize(offsets.size());

        size_t current_result_offset = 0;

        double zero_frequency_log = log(zero_frequency);

        for (size_t i = 0; i < offsets.size(); ++i)
        {
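            /// Rows are stored null-terminated in `data`; offsets[i] points one past the terminator of row i
            /// (and offsets[i - 1] is 0 for i == 0), so str_len below excludes the trailing zero byte.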
            const UInt8 * str = data.data() + offsets[i - 1];
            const size_t str_len = offsets[i] - offsets[i - 1] - 1;

            HashMapWithStackMemory<UInt16, UInt64, DefaultHash<UInt16>, 4> model;
            calculateStats(str, str_len, model);

            std::string_view result_value;

            /// Go through the dictionary and find the charset (or language) with the highest score.
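            /// Start the running maximum at the lowest score possible for a string within the size limit:
            /// max_string_size bigrams that are all missing from the dictionary. No candidate can score lower,
            /// and this value also serves as the initial pruning bound passed to naiveBayes().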
            Float64 max_result = zero_frequency_log * max_string_size;
            for (const auto & item : encodings_freq)
            {
                Float64 score = naiveBayes(item.map, model, max_result);
                if (max_result < score)
                {
                    max_result = score;

                    if constexpr (detect_language)
                        result_value = item.lang;
                    else
                        result_value = item.name;
                }
            }
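
            /// Append the winning name to the result column as a null-terminated string and record in
            /// res_offsets the offset one past the terminator (the standard ColumnString layout).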
            size_t result_value_size = result_value.size();
            res_data.resize(current_result_offset + result_value_size + 1);
            memcpy(&res_data[current_result_offset], result_value.data(), result_value_size);
            res_data[current_result_offset + result_value_size] = '\0';
            current_result_offset += result_value_size + 1;

            res_offsets[i] = current_result_offset;
        }
    }
};

struct NameDetectCharset
{
    static constexpr auto name = "detectCharset";
};

struct NameDetectLanguageUnknown
{
    static constexpr auto name = "detectLanguageUnknown";
};


using FunctionDetectCharset = FunctionTextClassificationString<CharsetClassificationImpl<false>, NameDetectCharset>;
using FunctionDetectLanguageUnknown = FunctionTextClassificationString<CharsetClassificationImpl<true>, NameDetectLanguageUnknown>;
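
/** Usage sketch (illustrative; the exact set of returned names depends on the embedded frequency dictionaries):
  *   SELECT detectCharset(str)          -- name of the most likely charset of the string
  *   SELECT detectLanguageUnknown(str)  -- two-letter ISO code of the most likely language
  */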

void registerFunctionDetectCharset(FunctionFactory & factory)
{
    factory.registerFunction<FunctionDetectCharset>();
    factory.registerFunction<FunctionDetectLanguageUnknown>();
}

}