ClickHouse/src/Functions/FunctionsProgrammingClassification.cpp

121 lines
3.9 KiB
C++
Raw Normal View History

2021-04-15 12:02:53 +00:00
#include <Common/FrequencyHolder.h>
2022-01-12 16:32:17 +00:00
#include <Common/StringUtils/StringUtils.h>
2021-04-15 12:02:53 +00:00
#include <Functions/FunctionFactory.h>
2022-01-12 16:32:17 +00:00
#include <Functions/FunctionsTextClassification.h>
2021-04-15 12:02:53 +00:00
#include <unordered_map>
2022-01-10 15:36:32 +00:00
#include <string_view>
2021-04-15 12:02:53 +00:00
namespace DB
{
2022-01-10 15:36:32 +00:00
/**
  * Determines the programming language of a piece of source code.
  * We count all unigrams and bigrams of commands (whitespace-separated tokens) in the source code.
  * Then, using a labeled dictionary that holds the weights of command unigrams and bigrams
  * for various programming languages, we compute the total weight of each language
  * and return the language with the highest weight.
  */
2022-01-12 16:32:17 +00:00
struct FunctionDetectProgrammingLanguageImpl
2021-04-15 12:02:53 +00:00
{
/// Calculate total weight
2022-01-10 15:36:32 +00:00
static ALWAYS_INLINE inline Float64 stateMachine(
const FrequencyHolder::Map & standard,
const std::unordered_map<String, Float64> & model)
2021-04-15 12:02:53 +00:00
{
Float64 res = 0;
2022-01-10 15:36:32 +00:00
for (const auto & el : model)
2021-04-16 11:44:09 +00:00
{
/// Try to find each n-gram in dictionary
2022-01-10 15:36:32 +00:00
const auto * it = standard.find(el.first);
2021-12-30 02:14:57 +00:00
if (it != standard.end())
res += el.second * it->getMapped();
2021-04-15 12:02:53 +00:00
}
return res;
}
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
2022-01-10 15:36:32 +00:00
const auto & programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
2021-04-15 12:02:53 +00:00
2022-01-10 15:36:32 +00:00
/// Constant 5 is arbitrary
res_data.reserve(offsets.size() * 5);
2021-04-15 12:02:53 +00:00
res_offsets.resize(offsets.size());
size_t res_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
2022-01-10 15:36:32 +00:00
const UInt8 * str = data.data() + offsets[i - 1];
const size_t str_len = offsets[i] - offsets[i - 1] - 1;
2021-05-31 13:38:51 +00:00
std::unordered_map<String, Float64> data_freq;
2022-01-17 10:01:06 +00:00
StringRef prev_command;
StringRef command;
2022-01-12 16:32:17 +00:00
/// Select all commands from the string
2022-01-12 16:32:17 +00:00
for (size_t ind = 0; ind < str_len; ++ind)
2021-04-15 12:02:53 +00:00
{
2021-12-30 03:35:37 +00:00
/// Assume that all commands are split by spaces
2022-01-12 16:32:17 +00:00
if (isWhitespaceASCII(str[ind]))
continue;
2022-01-17 10:01:06 +00:00
size_t prev_ind = ind;
2022-01-12 16:32:17 +00:00
while (ind < str_len && !isWhitespaceASCII(str[ind]))
2021-04-18 17:03:56 +00:00
++ind;
2022-01-17 10:01:06 +00:00
command = {str + prev_ind, ind - prev_ind};
2022-01-12 16:32:17 +00:00
/// We add both unigrams and bigrams to later search for them in the dictionary
2022-01-17 10:01:06 +00:00
if (prev_command.data)
data_freq[prev_command.toString() + command.toString()] += 1;
2022-01-12 16:32:17 +00:00
2022-01-17 10:01:06 +00:00
data_freq[command.toString()] += 1;
prev_command = command;
2021-04-15 12:02:53 +00:00
}
2022-01-12 16:32:17 +00:00
std::string_view res;
2021-04-15 12:02:53 +00:00
Float64 max_result = 0;
/// Iterate over all programming languages and find the language with the highest weight
2022-01-10 15:36:32 +00:00
for (const auto & item : programming_freq)
2021-04-16 11:44:09 +00:00
{
2021-12-30 02:14:57 +00:00
Float64 result = stateMachine(item.map, data_freq);
2021-04-16 11:44:09 +00:00
if (result > max_result)
{
2021-04-15 12:02:53 +00:00
max_result = result;
2022-01-10 15:36:32 +00:00
res = item.name;
2021-04-15 12:02:53 +00:00
}
}
/// If all weights are zero, then we assume that the language is undefined
2022-01-10 15:36:32 +00:00
if (res.empty())
res = "Undefined";
2021-04-16 12:26:46 +00:00
2022-01-10 15:36:32 +00:00
res_data.resize(res_offset + res.size() + 1);
memcpy(&res_data[res_offset], res.data(), res.size());
2021-04-15 12:02:53 +00:00
2022-01-10 15:36:32 +00:00
res_data[res_offset + res.size()] = 0;
res_offset += res.size() + 1;
2021-04-15 12:02:53 +00:00
res_offsets[i] = res_offset;
}
}
};
2022-01-12 16:32:17 +00:00
/// Holds the function name as a compile-time constant.
struct NameDetectProgrammingLanguage
{
    static constexpr auto name = "detectProgrammingLanguage";
};
2022-01-12 16:32:17 +00:00
/// The function type: FunctionTextClassificationString instantiated with the implementation struct and the name struct.
using FunctionDetectProgrammingLanguage = FunctionTextClassificationString<FunctionDetectProgrammingLanguageImpl, NameDetectProgrammingLanguage>;
2021-04-15 12:02:53 +00:00
/// Registers FunctionDetectProgrammingLanguage in the function factory.
REGISTER_FUNCTION(DetectProgrammingLanguage)
{
    factory.registerFunction<FunctionDetectProgrammingLanguage>();
}
}