/// Source: ClickHouse/src/Functions/FunctionsTonalityClassification.cpp (C++, 90 lines, 2.6 KiB)

#include <Common/FrequencyHolder.h>
#include <Common/StringUtils/StringUtils.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>
#include <unordered_map>
namespace DB
{
/**
  * Determines the sentiment of text data.
  * Uses a marked-up sentiment dictionary in which each word has a tonality ranging from -12 to 6.
  * For each text, calculates the average sentiment value of its words and returns it in the range [-1, 1].
  */
struct FunctionDetectTonalityImpl
{
2022-01-12 16:32:17 +00:00
static ALWAYS_INLINE inline Float32 detectTonality(
const UInt8 * str,
const size_t str_len,
const FrequencyHolder::Map & emotional_dict)
{
2021-04-18 17:03:56 +00:00
Float64 weight = 0;
2022-01-10 15:36:32 +00:00
UInt64 count_words = 0;
2021-04-18 17:03:56 +00:00
String word;
/// Select all Russian words from the string
2022-01-12 16:32:17 +00:00
for (size_t ind = 0; ind < str_len; ++ind)
{
2022-01-12 16:32:17 +00:00
/// Split words by whitespaces and punctuation signs
if (isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind]))
continue;
while (ind < str_len && !(isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind])))
2021-04-18 17:03:56 +00:00
{
2022-01-10 15:36:32 +00:00
word.push_back(str[ind]);
++ind;
2021-05-23 16:39:40 +00:00
}
2022-01-12 16:32:17 +00:00
/// Try to find a russian word in the tonality dictionary
const auto * it = emotional_dict.find(word);
if (it != emotional_dict.end())
{
2022-01-12 16:32:17 +00:00
count_words += 1;
weight += it->getMapped();
2021-04-18 17:03:56 +00:00
}
2022-01-12 16:32:17 +00:00
word.clear();
}
2022-01-12 16:32:17 +00:00
if (!count_words)
return 0;
2022-01-10 15:36:32 +00:00
/// Calculate average value of tonality.
/// Convert values -12..6 to -1..1
2022-01-17 10:01:06 +00:00
if (weight > 0)
return weight / count_words / 6;
else
return weight / count_words / 12;
}
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
2022-01-10 15:36:32 +00:00
PaddedPODArray<Float32> & res)
{
2021-12-30 02:14:57 +00:00
const auto & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
2022-01-10 15:36:32 +00:00
size_t size = offsets.size();
size_t prev_offset = 0;
2022-01-10 15:36:32 +00:00
for (size_t i = 0; i < size; ++i)
{
2022-01-10 15:36:32 +00:00
res[i] = detectTonality(data.data() + prev_offset, offsets[i] - 1 - prev_offset, emotional_dict);
prev_offset = offsets[i];
}
}
};
2022-01-10 15:36:32 +00:00
/// Name holder passed as the second template argument of FunctionTextClassificationFloat.
struct NameDetectTonality
{
    /// Function name string exposed through the `name` member.
    static constexpr auto name = "detectTonality";
};
/// Function type: the FunctionTextClassificationFloat wrapper instantiated with the tonality implementation and its name holder.
using FunctionDetectTonality = FunctionTextClassificationFloat<FunctionDetectTonalityImpl, NameDetectTonality>;
/// Registers FunctionDetectTonality in the given function factory.
void registerFunctionDetectTonality(FunctionFactory & factory)
{
    factory.registerFunction<FunctionDetectTonality>();
}
}