ClickHouse/src/Functions/FunctionsProgrammingClassification.cpp

171 lines
4.4 KiB
C++
Raw Normal View History

2021-04-15 12:02:53 +00:00
#include <Functions/FunctionsTextClassification.h>
#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <IO/ReadHelpers.h>
#include <unordered_map>
namespace DB
{
struct ProgrammingClassificationImpl
{
using ResultType = String;
static ALWAYS_INLINE inline Float64 state_machine(std::unordered_map<String, Float64> standart, std::unordered_map<String, Float64> model)
{
Float64 res = 0;
2021-04-16 11:44:09 +00:00
for (auto & el : model)
{
2021-04-15 12:02:53 +00:00
res += el.second * standart[el.first];
}
return res;
}
static void constant(String data, String & res)
{
static std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
std::unordered_map<String, Float64> data_freq;
String answer;
ReadBufferFromMemory in(data.data(), data.size() + 1);
skipWhitespaceIfAny(in);
String prev = "";
String new_word;
while (!in.eof())
{
2021-04-16 11:44:09 +00:00
if (data.size() - (in.position() - data.data()) <= 3)
{
2021-04-15 12:02:53 +00:00
break;
}
readStringUntilWhitespace(new_word, in);
skipWhitespaceIfAny(in);
2021-04-16 11:44:09 +00:00
if (prev == "")
{
2021-04-15 12:02:53 +00:00
prev = new_word;
2021-04-16 11:44:09 +00:00
}
else
{
2021-04-15 12:02:53 +00:00
data_freq[prev + new_word] += 1;
prev = new_word;
}
}
String most_liked;
Float64 max_result = 0;
2021-04-16 11:44:09 +00:00
for (const auto& item : programming_freq)
{
2021-04-15 12:02:53 +00:00
Float64 result = state_machine(item.second, data_freq);
2021-04-16 11:44:09 +00:00
if (result > max_result)
{
2021-04-15 12:02:53 +00:00
max_result = result;
most_liked = item.first;
}
}
res = most_liked;
}
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
static std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
std::unordered_map<String, Float64> data_freq;
res_data.reserve(1024);
res_offsets.resize(offsets.size());
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
String str = haystack;
String buf;
ReadBufferFromMemory in(str.data(), str.size() + 1);
skipWhitespaceIfAny(in);
String new_word;
String prev;
while (!in.eof())
{
2021-04-16 11:44:09 +00:00
if (str.size() - (in.position() - str.data()) <= 3)
{
2021-04-15 12:02:53 +00:00
break;
}
readStringUntilWhitespace(new_word, in);
skipWhitespaceIfAny(in);
2021-04-16 11:44:09 +00:00
if (prev == "")
{
2021-04-15 12:02:53 +00:00
prev = new_word;
2021-04-16 11:44:09 +00:00
}
else
{
2021-04-15 12:02:53 +00:00
data_freq[prev + new_word] += 1;
prev = new_word;
}
}
String most_liked;
Float64 max_result = 0;
2021-04-16 11:44:09 +00:00
for (const auto& item : programming_freq)
{
2021-04-15 12:02:53 +00:00
Float64 result = state_machine(item.second, data_freq);
2021-04-16 11:44:09 +00:00
if (result > max_result)
{
2021-04-15 12:02:53 +00:00
max_result = result;
most_liked = item.first;
}
}
const auto ans = most_liked.c_str();
size_t cur_offset = offsets[i];
res_data.resize(res_offset + strlen(ans) + 1);
memcpy(&res_data[res_offset], ans, strlen(ans));
res_offset += strlen(ans);
res_data[res_offset] = 0;
++res_offset;
res_offsets[i] = res_offset;
prev_offset = cur_offset;
}
}
};
/// Name tag handed to FunctionsTextClassification as its second template argument;
/// carries the function name as a compile-time string.
struct NameGetProgramming
{
    static constexpr const char * name = "getProgrammingLanguage";
};
/// The generic text classification function instantiated with the programming language
/// classifier and its name tag.
using FunctionGetProgramming
    = FunctionsTextClassification<ProgrammingClassificationImpl, NameGetProgramming>;

/// Registers FunctionGetProgramming in the given function factory.
void registerFunctionsProgrammingClassification(FunctionFactory & factory)
{
    factory.registerFunction<FunctionGetProgramming>();
}
}