Add detection of programming language

This commit is contained in:
s-kat 2021-04-15 15:02:53 +03:00
parent cdf8ab71d2
commit 5b381029f9
9 changed files with 19336 additions and 10 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,12 @@
# One-off converter: rewrites "total.txt" into "programming_freq2.txt".
#
# Rules applied to every input line:
#   * exactly one whitespace-separated token  -> written back prefixed with "// "
#     (presumably a language header; the C++ loader treats "//" lines as such --
#     confirm against the data file);
#   * more than one token and the first starts with "//" -> dropped;
#   * anything else (including blank lines)   -> copied unchanged.
#
# Both files are managed by `with`, so they are closed even if reading or
# writing raises part-way through.
with open("total.txt") as source, open("programming_freq2.txt", "w") as target:
    for line in source:
        tokens = line.split()
        if len(tokens) == 1:
            target.write("// " + line)
        elif tokens and tokens[0].startswith("//"):
            continue
        else:
            target.write(line)

File diff suppressed because it is too large Load Diff

View File

@ -39,6 +39,7 @@ public:
is_true = pt;
loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
loadProgrammingFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/programming_freq.txt");
}
@ -113,6 +114,47 @@ public:
}
/// Loads per-language bigram frequencies from a text file into programming_freq.
///
/// Layout, as parsed below:
///   - a line longer than 2 characters that starts with "//" opens a new language section;
///     the language name is read starting at the 4th character of the line
///     (i.e. after a 3-character prefix such as "// ");
///   - any other non-empty line is "<bigram><one separator character><frequency>" and is
///     stored under the most recently seen language (the empty name if none was seen yet);
///   - empty lines are ignored.
///
/// @param path_to_programming_freq  path of the frequency file to read.
void loadProgrammingFrequency(const String & path_to_programming_freq)
{
    String bigram;
    Float64 frequency = 0;
    String programming_language;

    Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");

    LOG_TRACE(log, "Programming langugages frequencies loading from {}", path_to_programming_freq);

    ReadBufferFromFile in(path_to_programming_freq);

    while (!in.eof())
    {
        char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());

        // NOTE(review): when no '\n' is found in the currently buffered data, loading stops.
        // That looks like it drops the remainder of the file if a line crosses a buffer
        // boundary or the last line has no trailing newline -- confirm this is acceptable.
        if (newline >= in.buffer().end())
            break;

        std::string_view line(in.position(), newline - in.position());

        if (line.size() > 2 && line[0] == '/' && line[1] == '/')
        {
            // Start of a new programming language section.
            // The buffer covers exactly the rest of this line: the start is shifted by the
            // 3-character prefix, so the length has to shrink by 3 as well.
            ReadBufferFromMemory buf_line(in.position() + 3, line.size() - 3);
            readString(programming_language, buf_line);
        }
        else if (!line.empty())
        {
            ReadBufferFromMemory buf_line(in.position(), line.size());
            readStringUntilWhitespace(bigram, buf_line);
            buf_line.ignore(); // skip the single separator between bigram and frequency
            readFloatText(frequency, buf_line);
            programming_freq[programming_language][bigram] = frequency;
        }

        // Always step past the newline, including for empty lines; otherwise an empty
        // line would be re-read forever.
        in.position() = newline + 1;
    }

    LOG_TRACE(log, "Programming languages frequencies was added");
}
const String & get_path()
{
return is_true;
@ -130,12 +172,18 @@ public:
return encodings_freq;
}
/// Read-only access to the table filled by loadProgrammingFrequency():
/// programming language name -> (bigram -> frequency).
const std::unordered_map<String, std::unordered_map<String, Float64>> & getProgrammingFrequency()
{
return programming_freq;
}
protected:
String is_true;
std::unordered_map<String, Float64> emotional_dict;
Container encodings_freq;
std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq;
};
}

View File

@ -0,0 +1,167 @@
#include <Functions/FunctionsTextClassification.h>
#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Common/UTF8Helpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <algorithm>
#include <cstring>
#include <cmath>
#include <limits>
#include <unordered_map>
#include <memory>
#include <utility>
#include <sstream>
#include <set>
namespace DB
{
struct ProgrammingClassificationImpl
{
using ResultType = String;
static ALWAYS_INLINE inline Float64 state_machine(std::unordered_map<String, Float64> standart, std::unordered_map<String, Float64> model)
{
Float64 res = 0;
for (auto & el : model) {
res += el.second * standart[el.first];
}
return res;
}
static void constant(String data, String & res)
{
static std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
std::unordered_map<String, Float64> data_freq;
String answer;
ReadBufferFromMemory in(data.data(), data.size() + 1);
skipWhitespaceIfAny(in);
String prev = "";
String new_word;
while (!in.eof())
{
if (data.size() - (in.position() - data.data()) <= 3) {
break;
}
readStringUntilWhitespace(new_word, in);
skipWhitespaceIfAny(in);
if (prev == "") {
prev = new_word;
} else {
data_freq[prev + new_word] += 1;
prev = new_word;
}
}
String most_liked;
Float64 max_result = 0;
for (const auto& item : programming_freq) {
Float64 result = state_machine(item.second, data_freq);
if (result > max_result) {
max_result = result;
most_liked = item.first;
}
}
res = most_liked;
}
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
static std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
std::unordered_map<String, Float64> data_freq;
res_data.reserve(1024);
res_offsets.resize(offsets.size());
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
String str = haystack;
String buf;
ReadBufferFromMemory in(str.data(), str.size() + 1);
skipWhitespaceIfAny(in);
String new_word;
String prev;
while (!in.eof())
{
if (str.size() - (in.position() - str.data()) <= 3) {
break;
}
readStringUntilWhitespace(new_word, in);
skipWhitespaceIfAny(in);
if (prev == "") {
prev = new_word;
} else {
data_freq[prev + new_word] += 1;
prev = new_word;
}
}
String most_liked;
Float64 max_result = 0;
for (const auto& item : programming_freq) {
Float64 result = state_machine(item.second, data_freq);
if (result > max_result) {
max_result = result;
most_liked = item.first;
}
}
const auto ans = most_liked.c_str();
size_t cur_offset = offsets[i];
res_data.resize(res_offset + strlen(ans) + 1);
memcpy(&res_data[res_offset], ans, strlen(ans));
res_offset += strlen(ans);
res_data[res_offset] = 0;
++res_offset;
res_offsets[i] = res_offset;
prev_offset = cur_offset;
}
}
};
/// Name tag passed to FunctionsTextClassification as its second template argument.
struct NameGetProgramming
{
    static constexpr const char * name = "getProgrammingLanguage";
};
using FunctionGetProgramming = FunctionsTextClassification<ProgrammingClassificationImpl, NameGetProgramming>;
/// Registers FunctionGetProgramming (the programming-language classifier) in the given factory.
void registerFunctionsProgrammingClassification(FunctionFactory & factory)
{
factory.registerFunction<FunctionGetProgramming>();
}
}

View File

@ -9,7 +9,15 @@
namespace DB
{
/** Functions for text classification:
*
* charsetDetect(string data) - detect charset of data.
* Returns string name of most likely charset.
* .
* getTonality(string data) - defines the emotional coloring of the text.
* Returns NEG if text is negative, POS if text is positive or NEUT if text is neutral.
*
*/
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;

View File

@ -54,10 +54,11 @@ struct TonalityClassificationImpl
Float64 freq = 0;
Float64 count_words = 0;
String ans;
String answer;
ReadBufferFromMemory in(data.data(), data.size() + 1);
skipWhitespaceIfAny(in);
String to_check;
while (!in.eof())
{
@ -66,20 +67,19 @@ struct TonalityClassificationImpl
}
readStringUntilWhitespace(to_check, in);
skipWhitespaceIfAny(in);
word_processing(to_check);
if (emotional_dict.find(to_check) != emotional_dict.cend())
{
count_words += 1;
ans += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
freq += emotional_dict[to_check];
}
}
Float64 total_tonality = freq / count_words;
ans += get_tonality(total_tonality) + std::to_string(total_tonality) + std::to_string(emotional_dict.size()) + "\n";
res = ans;
answer += get_tonality(total_tonality) + std::to_string(total_tonality) + "\n";
res = answer;
}
@ -102,13 +102,14 @@ struct TonalityClassificationImpl
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
String str = haystack;
String prom;
String buf;
Float64 freq = 0;
Float64 count_words = 0;
ReadBufferFromMemory in(str.data(), str.size() + 1);
skipWhitespaceIfAny(in);
String to_check;
while (!in.eof())
@ -122,14 +123,13 @@ struct TonalityClassificationImpl
if (emotional_dict.find(to_check) != emotional_dict.cend())
{
count_words += 1;
prom += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
freq += emotional_dict[to_check];
}
}
Float64 total_tonality = freq / count_words;
prom += get_tonality(total_tonality) + std::to_string(total_tonality) + "\n";
buf += get_tonality(total_tonality) + std::to_string(total_tonality) + "\n";
const auto ans = prom.c_str();
const auto ans = buf.c_str();
size_t cur_offset = offsets[i];
res_data.resize(res_offset + strlen(ans) + 1);

View File

@ -35,6 +35,7 @@ void registerFunctionsStringRegexp(FunctionFactory &);
void registerFunctionsStringSimilarity(FunctionFactory &);
void registerFunctionsTextClassification(FunctionFactory &);
void registerFunctionsTonalityClassification(FunctionFactory &);
void registerFunctionsProgrammingClassification(FunctionFactory &);
void registerFunctionsURL(FunctionFactory &);
void registerFunctionsVisitParam(FunctionFactory &);
void registerFunctionsMath(FunctionFactory &);
@ -95,6 +96,7 @@ void registerFunctions()
registerFunctionsStringSimilarity(factory);
registerFunctionsTextClassification(factory);
registerFunctionsTonalityClassification(factory);
registerFunctionsProgrammingClassification(factory);
registerFunctionsURL(factory);
registerFunctionsVisitParam(factory);
registerFunctionsMath(factory);

View File

@ -57,6 +57,7 @@ SRCS(
FunctionsStringSimilarity.cpp
FunctionsTextClassification.cpp
FunctionsTonalityClassification.cpp
FunctionsProgrammingClassification.cpp
GatherUtils/concat.cpp
GatherUtils/createArraySink.cpp
GatherUtils/createArraySource.cpp