#include <Functions/FunctionsTextClassification.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Common/UTF8Helpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>

#include <unordered_map>

namespace DB
{

/**
  * Determines the sentiment of text data.
  * Uses a marked-up sentiment dictionary, in which each word has a tonality ranging from -3 to 3.
  * For each text, the average sentiment value of its words is calculated, and NEG, POS or NEUT is returned.
  */
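
/** Usage sketch (illustrative only; the function is registered below under the name "getTonality",
  * and the sample argument is an arbitrary Russian phrase):
  *
  *   SELECT getTonality('мне это нравится');  -- returns 'POS', 'NEG' or 'NEUT'
  */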

struct TonalityClassificationImpl
{
    using ResultType = String;

    static String get_tonality(const Float64 & tonality_level)
    {
        if (tonality_level < 0.25) { return "NEG"; }
        if (tonality_level > 0.5) { return "POS"; }
        return "NEUT";
    }

    static void constant(String data, String & res)
    {
        /// Map of word -> tonality score, loaded once and shared between calls.
        static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();

        Float64 weight = 0;
        Float64 count_words = 0;

        String word;
        /// Select all Russian words from the string
        for (size_t i = 0; i < data.size();)
        {
            /// Assume that all non-ASCII characters are Russian letters
            if (!isASCII(data[i]))
            {
                word.push_back(data[i]);
                ++i;

                while ((i < data.size()) && (!isASCII(data[i])))
                {
                    word.push_back(data[i]);
                    ++i;
                }
                /// Try to find the Russian word in the tonality dictionary
                const auto it = emotional_dict.find(word);
                if (it != emotional_dict.cend())
                {
                    count_words += 1;
                    weight += it->second;
                }
                word.clear();
            }
            else
            {
                ++i;
            }
        }
        /// Calculate the average tonality. If no dictionary words were found, the division yields NaN
        /// and get_tonality() falls through to "NEUT".
        Float64 total_tonality = weight / count_words;
        res += get_tonality(total_tonality);
    }

    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        /// Map of word -> tonality score, loaded once and shared between calls.
        static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();

        res_data.reserve(1024);
        res_offsets.resize(offsets.size());

        size_t prev_offset = 0;
        size_t res_offset = 0;

        for (size_t i = 0; i < offsets.size(); ++i)
        {
            /// Rows of a ColumnString are stored null-terminated, one after another.
            const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
            String str = haystack;

            Float64 weight = 0;
            Float64 count_words = 0;

            String word;
            /// Select all Russian words from the string
            for (size_t ind = 0; ind < str.size();)
            {
                /// Assume that all non-ASCII characters are Russian letters
                if (!isASCII(str[ind]))
                {
                    word.push_back(str[ind]);
                    ++ind;

                    while ((ind < str.size()) && (!isASCII(str[ind])))
                    {
                        word.push_back(str[ind]);
                        ++ind;
                    }
                    /// Try to find the Russian word in the tonality dictionary
                    const auto it = emotional_dict.find(word);
                    if (it != emotional_dict.cend())
                    {
                        count_words += 1;
                        weight += it->second;
                    }
                    word.clear();
                }
                else
                {
                    ++ind;
                }
            }
            /// Calculate the average tonality. If no dictionary words were found, the division yields NaN
            /// and get_tonality() falls through to "NEUT".
            Float64 total_tonality = weight / count_words;
            String buf = get_tonality(total_tonality);

            /// Append the null-terminated answer to the result column.
            size_t cur_offset = offsets[i];
            size_t ans_size = buf.size();
            res_data.resize(res_offset + ans_size + 1);
            memcpy(&res_data[res_offset], buf.data(), ans_size);
            res_offset += ans_size;

            res_data[res_offset] = 0;
            ++res_offset;

            res_offsets[i] = res_offset;
            prev_offset = cur_offset;
        }
    }
};

struct NameGetTonality
{
    static constexpr auto name = "getTonality";
};

using FunctionGetTonality = FunctionsTextClassification<TonalityClassificationImpl, NameGetTonality>;

void registerFunctionsTonalityClassification(FunctionFactory & factory)
{
    factory.registerFunction<FunctionGetTonality>();
}

}