ClickHouse/src/Functions/FunctionsTonalityClassification.cpp

#include <Functions/FunctionsTextClassification.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Common/UTF8Helpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>

#include <unordered_map>

namespace DB
{
/**
  * Determines the sentiment (tonality) of text data.
  * Uses a marked-up sentiment dictionary in which each word has a tonality ranging from -3 to 3.
  * For each text, calculates the average tonality of its words that are found in the dictionary
  * and returns NEG, POS or NEUT.
  */
struct TonalityClassificationImpl
{

    using ResultType = String;


    static String get_tonality(const Float64 & tonality_level)
    {
        if (tonality_level < 0.25) { return "NEG"; }
        if (tonality_level > 0.5) { return "POS"; }
        return "NEUT";
    } 
    
    static void constant(String data, String & res)
    {
        static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();

        Float64 weight = 0;
        Float64 count_words = 0;

        String answer;
        String word;
        /// Select all Russian words from the string
        for (size_t i = 0; i < data.size();)
        {
            /// Assume that all non-Ascii characters are Russian letters
            if (!isASCII(data[i]))
            {
                word.push_back(data[i]);
                ++i;

                while ((i < data.size()) && (!isASCII(data[i]))) {
                    word.push_back(data[i]);
                    ++i;
                }
                /// Try to find a russian word in the tonality dictionary
                if (emotional_dict.find(word) != emotional_dict.cend())
                {
                    count_words += 1;
                    weight += emotional_dict[word];
                }
                word = "";
            } 
            else
            {
                ++i;
            }
        }
        /// Calculate average value of tonality
        Float64 total_tonality = weight / count_words;
        res += get_tonality(total_tonality);
    }


    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();

        res_data.reserve(1024);
        res_offsets.resize(offsets.size());

        size_t prev_offset = 0;
        size_t res_offset = 0;

        for (size_t i = 0; i < offsets.size(); ++i)
        {
            const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
            String str = haystack;

            String buf;

            Float64 weight = 0;
            Float64 count_words = 0;


            String answer;
            String word;
            /// Select all Russian words from the string
            for (size_t ind = 0; ind < str.size();)
            {
                if (!isASCII(str[ind]))
                {
                    word.push_back(str[ind]);
                    ++ind;

                    while ((ind < str.size()) && (!isASCII(str[ind]))) {
                        word.push_back(str[ind]);
                        ++ind;
                    }
                    /// Try to find a russian word in the tonality dictionary
                    if (emotional_dict.find(word) != emotional_dict.cend())
                    {
                        count_words += 1;
                        weight += emotional_dict[word];
                    }
                    word = "";
                }
                else
                {
                    ++ind;
                }
            }
            /// Calculate average value of tonality
            Float64 total_tonality = weight / count_words;
            buf = get_tonality(total_tonality);

            const auto ans = buf.c_str();
            size_t cur_offset = offsets[i];
            size_t ans_size = strlen(ans);
            res_data.resize(res_offset + ans_size + 1);
            memcpy(&res_data[res_offset], ans, ans_size);
            res_offset += ans_size;

            res_data[res_offset] = 0;
            ++res_offset;

            res_offsets[i] = res_offset;
            prev_offset = cur_offset;
        }
    }


};

/// Name tag passed as the second template argument of FunctionsTextClassification.
struct NameGetTonality
{
    static constexpr const char * name = "getTonality";
};


/// Text classification function built from the tonality implementation and its name tag.
using FunctionGetTonality = FunctionsTextClassification<TonalityClassificationImpl, NameGetTonality>;

/// Registers FunctionGetTonality in the given function factory.
/// NOTE(review): presumably it becomes available under NameGetTonality::name ("getTonality") - confirm in FunctionsTextClassification.
void registerFunctionsTonalityClassification(FunctionFactory & factory)
{
    factory.registerFunction<FunctionGetTonality>();
}

}
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`#include <Functions/FunctionsTextClassification.h>`
Fix functions 2021-04-18 17:03:56 +00:00			`#include <Common/StringUtils/StringUtils.h>`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`#include <Common/FrequencyHolder.h>`
			`#include <Functions/FunctionFactory.h>`
			`#include <Common/UTF8Helpers.h>`
			`#include <IO/ReadBufferFromString.h>`
			`#include <IO/ReadHelpers.h>`

			`#include <unordered_map>`
Add new programming languages and update detection 2021-05-21 13:48:18 +00:00
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`namespace DB`
			`{`
Add new programming languages and update detection 2021-05-21 13:48:18 +00:00			`/**`
			`* Determines the sentiment of text data.`
			`* Uses a marked-up sentiment dictionary, each word has a tonality ranging from -3 to 3.`
			`* For each text, calculate the average sentiment value of its words and return NEG, POS or NEUT`
			`*/`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`struct TonalityClassificationImpl`
			`{`

			`using ResultType = String;`


			`static String get_tonality(const Float64 & tonality_level)`
			`{`
Fix functions 2021-04-18 17:03:56 +00:00			`if (tonality_level < 0.25) { return "NEG"; }`
			`if (tonality_level > 0.5) { return "POS"; }`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`return "NEUT";`
			`}`

			`static void constant(String data, String & res)`
			`{`
			`static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();`

Fix functions 2021-04-18 17:03:56 +00:00			`Float64 weight = 0;`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`Float64 count_words = 0;`

Add detection of programming language 2021-04-15 12:02:53 +00:00			`String answer;`
Fix functions 2021-04-18 17:03:56 +00:00			`String word;`
Add new programming languages and update detection 2021-05-21 13:48:18 +00:00			`/// Select all Russian words from the string`
Fix functions 2021-04-18 17:03:56 +00:00			`for (size_t i = 0; i < data.size();)`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`{`
Add new programming languages and update detection 2021-05-21 13:48:18 +00:00			`/// Assume that all non-Ascii characters are Russian letters`
Fix functions 2021-04-18 17:03:56 +00:00			`if (!isASCII(data[i]))`
			`{`
			`word.push_back(data[i]);`
			`++i;`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00
Fix functions 2021-04-18 17:03:56 +00:00			`while ((i < data.size()) && (!isASCII(data[i]))) {`
			`word.push_back(data[i]);`
			`++i;`
			`}`
Add new programming languages and update detection 2021-05-21 13:48:18 +00:00			`/// Try to find a russian word in the tonality dictionary`
Fix functions 2021-04-18 17:03:56 +00:00			`if (emotional_dict.find(word) != emotional_dict.cend())`
			`{`
			`count_words += 1;`
			`weight += emotional_dict[word];`
			`}`
			`word = "";`
			`}`
			`else`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`{`
Fix functions 2021-04-18 17:03:56 +00:00			`++i;`
			`}`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`}`
Add new programming languages and update detection 2021-05-21 13:48:18 +00:00			`/// Calculate average value of tonality`
Fix functions 2021-04-18 17:03:56 +00:00			`Float64 total_tonality = weight / count_words;`
			`res += get_tonality(total_tonality);`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`}`


			`static void vector(`
			`const ColumnString::Chars & data,`
			`const ColumnString::Offsets & offsets,`
			`ColumnString::Chars & res_data,`
			`ColumnString::Offsets & res_offsets)`
			`{`
			`static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();`

			`res_data.reserve(1024);`
			`res_offsets.resize(offsets.size());`

			`size_t prev_offset = 0;`
			`size_t res_offset = 0;`

			`for (size_t i = 0; i < offsets.size(); ++i)`
			`{`
			`const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);`
			`String str = haystack;`

Add detection of programming language 2021-04-15 12:02:53 +00:00			`String buf;`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00
Fix functions 2021-04-18 17:03:56 +00:00			`Float64 weight = 0;`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`Float64 count_words = 0;`


Fix functions 2021-04-18 17:03:56 +00:00			`String answer;`
			`String word;`
Add new programming languages and update detection 2021-05-21 13:48:18 +00:00			`/// Select all Russian words from the string`
Fix functions 2021-04-18 17:03:56 +00:00			`for (size_t ind = 0; ind < str.size();)`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`{`
Fix functions 2021-04-18 17:03:56 +00:00			`if (!isASCII(str[ind]))`
			`{`
			`word.push_back(str[ind]);`
			`++ind;`

			`while ((ind < str.size()) && (!isASCII(str[ind]))) {`
			`word.push_back(str[ind]);`
			`++ind;`
			`}`
Add new programming languages and update detection 2021-05-21 13:48:18 +00:00			`/// Try to find a russian word in the tonality dictionary`
Fix functions 2021-04-18 17:03:56 +00:00			`if (emotional_dict.find(word) != emotional_dict.cend())`
			`{`
			`count_words += 1;`
			`weight += emotional_dict[word];`
			`}`
			`word = "";`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`}`
Fix functions 2021-04-18 17:03:56 +00:00			`else`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`{`
Fix functions 2021-04-18 17:03:56 +00:00			`++ind;`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`}`
			`}`
Add new programming languages and update detection 2021-05-21 13:48:18 +00:00			`/// Calculate average value of tonality`
Fix functions 2021-04-18 17:03:56 +00:00			`Float64 total_tonality = weight / count_words;`
Minor fixes 2021-04-15 17:16:32 +00:00			`buf = get_tonality(total_tonality);`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00
Add detection of programming language 2021-04-15 12:02:53 +00:00			`const auto ans = buf.c_str();`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`size_t cur_offset = offsets[i];`
Little changes 2021-05-18 19:36:46 +00:00			`size_t ans_size = strlen(ans);`
			`res_data.resize(res_offset + ans_size + 1);`
			`memcpy(&res_data[res_offset], ans, ans_size);`
			`res_offset += ans_size;`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00
			`res_data[res_offset] = 0;`
			`++res_offset;`

			`res_offsets[i] = res_offset;`
			`prev_offset = cur_offset;`
			`}`
			`}`


			`};`

			`struct NameGetTonality`
			`{`
			`static constexpr auto name = "getTonality";`
			`};`


			`using FunctionGetTonality = FunctionsTextClassification<TonalityClassificationImpl, NameGetTonality>;`

			`void registerFunctionsTonalityClassification(FunctionFactory & factory)`
			`{`
			`factory.registerFunction<FunctionGetTonality>();`
			`}`

			`}`