2021-04-15 12:02:53 +00:00
|
|
|
|
#include <Common/FrequencyHolder.h>
|
2022-01-12 16:32:17 +00:00
|
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
2021-04-15 12:02:53 +00:00
|
|
|
|
#include <Functions/FunctionFactory.h>
|
2022-01-12 16:32:17 +00:00
|
|
|
|
#include <Functions/FunctionsTextClassification.h>
|
2021-04-15 12:02:53 +00:00
|
|
|
|
|
|
|
|
|
#include <unordered_map>
|
2022-01-10 15:36:32 +00:00
|
|
|
|
#include <string_view>
|
2021-04-15 12:02:53 +00:00
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
|
{
|
2022-01-10 15:36:32 +00:00
|
|
|
|
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/**
  * Determine the programming language of a piece of source code.
  * We count all the unigrams and bigrams of commands (whitespace-separated tokens) in the source code.
  * Then, using a marked-up dictionary with weights of unigrams and bigrams of commands for various programming languages,
  * we find the programming language with the biggest total weight and return its name.
  */
|
2022-01-12 16:32:17 +00:00
|
|
|
|
struct FunctionDetectProgrammingLanguageImpl
|
2021-04-15 12:02:53 +00:00
|
|
|
|
{
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/// Calculate total weight
|
2022-01-10 15:36:32 +00:00
|
|
|
|
static ALWAYS_INLINE inline Float64 stateMachine(
|
|
|
|
|
const FrequencyHolder::Map & standard,
|
|
|
|
|
const std::unordered_map<String, Float64> & model)
|
2021-04-15 12:02:53 +00:00
|
|
|
|
{
|
|
|
|
|
Float64 res = 0;
|
2022-01-10 15:36:32 +00:00
|
|
|
|
for (const auto & el : model)
|
2021-04-16 11:44:09 +00:00
|
|
|
|
{
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/// Try to find each n-gram in dictionary
|
2022-01-10 15:36:32 +00:00
|
|
|
|
const auto * it = standard.find(el.first);
|
2021-12-30 02:14:57 +00:00
|
|
|
|
if (it != standard.end())
|
|
|
|
|
res += el.second * it->getMapped();
|
2021-04-15 12:02:53 +00:00
|
|
|
|
}
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void vector(
|
|
|
|
|
const ColumnString::Chars & data,
|
|
|
|
|
const ColumnString::Offsets & offsets,
|
|
|
|
|
ColumnString::Chars & res_data,
|
|
|
|
|
ColumnString::Offsets & res_offsets)
|
|
|
|
|
{
|
2022-01-10 15:36:32 +00:00
|
|
|
|
const auto & programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
|
2021-04-15 12:02:53 +00:00
|
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
|
/// Constant 5 is arbitrary
|
|
|
|
|
res_data.reserve(offsets.size() * 5);
|
2021-04-15 12:02:53 +00:00
|
|
|
|
res_offsets.resize(offsets.size());
|
|
|
|
|
|
|
|
|
|
size_t res_offset = 0;
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < offsets.size(); ++i)
|
|
|
|
|
{
|
2022-01-10 15:36:32 +00:00
|
|
|
|
const UInt8 * str = data.data() + offsets[i - 1];
|
|
|
|
|
const size_t str_len = offsets[i] - offsets[i - 1] - 1;
|
|
|
|
|
|
2021-05-31 13:38:51 +00:00
|
|
|
|
std::unordered_map<String, Float64> data_freq;
|
2022-01-17 10:01:06 +00:00
|
|
|
|
StringRef prev_command;
|
|
|
|
|
StringRef command;
|
2022-01-12 16:32:17 +00:00
|
|
|
|
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/// Select all commands from the string
|
2022-01-12 16:32:17 +00:00
|
|
|
|
for (size_t ind = 0; ind < str_len; ++ind)
|
2021-04-15 12:02:53 +00:00
|
|
|
|
{
|
2021-12-30 03:35:37 +00:00
|
|
|
|
/// Assume that all commands are split by spaces
|
2022-01-12 16:32:17 +00:00
|
|
|
|
if (isWhitespaceASCII(str[ind]))
|
|
|
|
|
continue;
|
2022-01-18 21:34:07 +00:00
|
|
|
|
|
2022-01-17 10:01:06 +00:00
|
|
|
|
size_t prev_ind = ind;
|
2022-01-12 16:32:17 +00:00
|
|
|
|
while (ind < str_len && !isWhitespaceASCII(str[ind]))
|
2021-04-18 17:03:56 +00:00
|
|
|
|
++ind;
|
2022-01-17 10:01:06 +00:00
|
|
|
|
|
|
|
|
|
command = {str + prev_ind, ind - prev_ind};
|
2022-01-12 16:32:17 +00:00
|
|
|
|
|
|
|
|
|
/// We add both unigrams and bigrams to later search for them in the dictionary
|
2022-01-17 10:01:06 +00:00
|
|
|
|
if (prev_command.data)
|
|
|
|
|
data_freq[prev_command.toString() + command.toString()] += 1;
|
2022-01-12 16:32:17 +00:00
|
|
|
|
|
2022-01-17 10:01:06 +00:00
|
|
|
|
data_freq[command.toString()] += 1;
|
|
|
|
|
prev_command = command;
|
2021-04-15 12:02:53 +00:00
|
|
|
|
}
|
|
|
|
|
|
2022-01-12 16:32:17 +00:00
|
|
|
|
std::string_view res;
|
2021-04-15 12:02:53 +00:00
|
|
|
|
Float64 max_result = 0;
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/// Iterate over all programming languages and find the language with the highest weight
|
2022-01-10 15:36:32 +00:00
|
|
|
|
for (const auto & item : programming_freq)
|
2021-04-16 11:44:09 +00:00
|
|
|
|
{
|
2021-12-30 02:14:57 +00:00
|
|
|
|
Float64 result = stateMachine(item.map, data_freq);
|
2021-04-16 11:44:09 +00:00
|
|
|
|
if (result > max_result)
|
|
|
|
|
{
|
2021-04-15 12:02:53 +00:00
|
|
|
|
max_result = result;
|
2022-01-10 15:36:32 +00:00
|
|
|
|
res = item.name;
|
2021-04-15 12:02:53 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
2021-05-21 13:48:18 +00:00
|
|
|
|
/// If all weights are zero, then we assume that the language is undefined
|
2022-01-10 15:36:32 +00:00
|
|
|
|
if (res.empty())
|
|
|
|
|
res = "Undefined";
|
2021-04-16 12:26:46 +00:00
|
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
|
res_data.resize(res_offset + res.size() + 1);
|
|
|
|
|
memcpy(&res_data[res_offset], res.data(), res.size());
|
2021-04-15 12:02:53 +00:00
|
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
|
res_data[res_offset + res.size()] = 0;
|
|
|
|
|
res_offset += res.size() + 1;
|
2021-04-15 12:02:53 +00:00
|
|
|
|
|
|
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
2022-01-12 16:32:17 +00:00
|
|
|
|
/// Carries the name of the function as a compile-time constant.
struct NameDetectProgrammingLanguage
{
    static constexpr const char * name = "detectProgrammingLanguage";
};
|
|
|
|
|
|
|
|
|
|
|
2022-01-12 16:32:17 +00:00
|
|
|
|
/// The function type: the FunctionTextClassificationString template instantiated with
/// the implementation struct and the struct that carries the function name.
using FunctionDetectProgrammingLanguage = FunctionTextClassificationString<FunctionDetectProgrammingLanguageImpl, NameDetectProgrammingLanguage>;
|
2021-04-15 12:02:53 +00:00
|
|
|
|
|
2022-07-04 07:01:39 +00:00
|
|
|
|
/// Registers FunctionDetectProgrammingLanguage in the function factory.
REGISTER_FUNCTION(DetectProgrammingLanguage)
{
    factory.registerFunction<FunctionDetectProgrammingLanguage>();
}
|
|
|
|
|
|
|
|
|
|
}
|