mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-27 01:51:59 +00:00
Add detection of programming language
This commit is contained in:
parent
cdf8ab71d2
commit
5b381029f9
9434
src/Common/ClassificationDictionaries/programming_freq.txt
Normal file
9434
src/Common/ClassificationDictionaries/programming_freq.txt
Normal file
File diff suppressed because it is too large
Load Diff
12
src/Common/ClassificationDictionaries/scr.py
Normal file
12
src/Common/ClassificationDictionaries/scr.py
Normal file
@ -0,0 +1,12 @@
|
||||
# Rewrites total.txt into programming_freq2.txt:
#   * a line holding exactly one whitespace-separated token is written with a "// " prefix;
#   * any other line whose first token starts with "//" is dropped;
#   * every remaining line (including blank ones) is copied unchanged.
# Both files are handled by `with` so they are closed even if reading or writing fails.
with open("total.txt") as source, open("programming_freq2.txt", "w") as target:
    for line in source:
        tokens = line.split()
        if len(tokens) == 1:
            target.write("// " + line)
        elif tokens and tokens[0].startswith("//"):
            continue
        else:
            target.write(line)
|
9654
src/Common/ClassificationDictionaries/total.txt
Normal file
9654
src/Common/ClassificationDictionaries/total.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -39,6 +39,7 @@ public:
|
||||
is_true = pt;
|
||||
loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
|
||||
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
|
||||
loadProgrammingFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/programming_freq.txt");
|
||||
}
|
||||
|
||||
|
||||
@ -113,6 +114,47 @@ public:
|
||||
}
|
||||
|
||||
|
||||
void loadProgrammingFrequency(const String & path_to_programming_freq)
|
||||
{
|
||||
String bigram;
|
||||
Float64 frequency;
|
||||
String programming_language;
|
||||
|
||||
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
|
||||
|
||||
LOG_TRACE(log, "Programming langugages frequencies loading from {}", path_to_programming_freq);
|
||||
|
||||
ReadBufferFromFile in(path_to_programming_freq);
|
||||
while (!in.eof())
|
||||
{
|
||||
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
|
||||
|
||||
if (newline >= in.buffer().end())
|
||||
break;
|
||||
|
||||
std::string_view line(in.position(), newline - in.position());
|
||||
|
||||
if (line.empty())
|
||||
continue;
|
||||
// Start load new charset
|
||||
if (line.size() > 2 && line[0] == '/' && line[1] == '/')
|
||||
{
|
||||
ReadBufferFromMemory bufline(in.position() + 3, newline - in.position());
|
||||
readString(programming_language, bufline);
|
||||
} else
|
||||
{
|
||||
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
|
||||
readStringUntilWhitespace(bigram, buf_line);
|
||||
buf_line.ignore();
|
||||
readFloatText(frequency, buf_line);
|
||||
programming_freq[programming_language][bigram] = frequency;
|
||||
}
|
||||
in.position() = newline + 1;
|
||||
}
|
||||
LOG_TRACE(log, "Programming languages frequencies was added");
|
||||
}
|
||||
|
||||
|
||||
const String & get_path()
|
||||
{
|
||||
return is_true;
|
||||
@ -130,12 +172,18 @@ public:
|
||||
return encodings_freq;
|
||||
}
|
||||
|
||||
const std::unordered_map<String, std::unordered_map<String, Float64>> & getProgrammingFrequency()
|
||||
{
|
||||
return programming_freq;
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
|
||||
String is_true;
|
||||
std::unordered_map<String, Float64> emotional_dict;
|
||||
Container encodings_freq;
|
||||
std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq;
|
||||
};
|
||||
}
|
||||
|
||||
|
167
src/Functions/FunctionsProgrammingClassification.cpp
Normal file
167
src/Functions/FunctionsProgrammingClassification.cpp
Normal file
@ -0,0 +1,167 @@
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
#include <Common/FrequencyHolder.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Common/UTF8Helpers.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <sstream>
|
||||
#include <set>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
|
||||
struct ProgrammingClassificationImpl
|
||||
{
|
||||
|
||||
using ResultType = String;
|
||||
|
||||
static ALWAYS_INLINE inline Float64 state_machine(std::unordered_map<String, Float64> standart, std::unordered_map<String, Float64> model)
|
||||
{
|
||||
Float64 res = 0;
|
||||
for (auto & el : model) {
|
||||
res += el.second * standart[el.first];
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void constant(String data, String & res)
|
||||
{
|
||||
static std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
|
||||
std::unordered_map<String, Float64> data_freq;
|
||||
|
||||
String answer;
|
||||
|
||||
ReadBufferFromMemory in(data.data(), data.size() + 1);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
String prev = "";
|
||||
String new_word;
|
||||
|
||||
while (!in.eof())
|
||||
{
|
||||
if (data.size() - (in.position() - data.data()) <= 3) {
|
||||
break;
|
||||
}
|
||||
readStringUntilWhitespace(new_word, in);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
if (prev == "") {
|
||||
prev = new_word;
|
||||
} else {
|
||||
data_freq[prev + new_word] += 1;
|
||||
prev = new_word;
|
||||
}
|
||||
}
|
||||
|
||||
String most_liked;
|
||||
Float64 max_result = 0;
|
||||
|
||||
for (const auto& item : programming_freq) {
|
||||
Float64 result = state_machine(item.second, data_freq);
|
||||
if (result > max_result) {
|
||||
max_result = result;
|
||||
most_liked = item.first;
|
||||
}
|
||||
}
|
||||
|
||||
res = most_liked;
|
||||
}
|
||||
|
||||
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
ColumnString::Chars & res_data,
|
||||
ColumnString::Offsets & res_offsets)
|
||||
{
|
||||
static std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
|
||||
std::unordered_map<String, Float64> data_freq;
|
||||
|
||||
res_data.reserve(1024);
|
||||
res_offsets.resize(offsets.size());
|
||||
|
||||
size_t prev_offset = 0;
|
||||
size_t res_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < offsets.size(); ++i)
|
||||
{
|
||||
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
|
||||
String str = haystack;
|
||||
|
||||
String buf;
|
||||
|
||||
ReadBufferFromMemory in(str.data(), str.size() + 1);
|
||||
|
||||
skipWhitespaceIfAny(in);
|
||||
String new_word;
|
||||
String prev;
|
||||
while (!in.eof())
|
||||
{
|
||||
if (str.size() - (in.position() - str.data()) <= 3) {
|
||||
break;
|
||||
}
|
||||
readStringUntilWhitespace(new_word, in);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
if (prev == "") {
|
||||
prev = new_word;
|
||||
} else {
|
||||
data_freq[prev + new_word] += 1;
|
||||
prev = new_word;
|
||||
}
|
||||
}
|
||||
|
||||
String most_liked;
|
||||
Float64 max_result = 0;
|
||||
|
||||
for (const auto& item : programming_freq) {
|
||||
Float64 result = state_machine(item.second, data_freq);
|
||||
if (result > max_result) {
|
||||
max_result = result;
|
||||
most_liked = item.first;
|
||||
}
|
||||
}
|
||||
|
||||
const auto ans = most_liked.c_str();
|
||||
size_t cur_offset = offsets[i];
|
||||
|
||||
res_data.resize(res_offset + strlen(ans) + 1);
|
||||
memcpy(&res_data[res_offset], ans, strlen(ans));
|
||||
res_offset += strlen(ans);
|
||||
|
||||
res_data[res_offset] = 0;
|
||||
++res_offset;
|
||||
|
||||
res_offsets[i] = res_offset;
|
||||
prev_offset = cur_offset;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
/// Name tag handed to FunctionsTextClassification for the programming language classifier.
struct NameGetProgramming
{
    static constexpr const char * name = "getProgrammingLanguage";
};
|
||||
|
||||
|
||||
using FunctionGetProgramming = FunctionsTextClassification<ProgrammingClassificationImpl, NameGetProgramming>;
|
||||
|
||||
/// Adds the programming language classifier (ProgrammingClassificationImpl under the name
/// carried by NameGetProgramming) to the given function factory.
void registerFunctionsProgrammingClassification(FunctionFactory & factory)
{
    factory.registerFunction<FunctionsTextClassification<ProgrammingClassificationImpl, NameGetProgramming>>();
}
|
||||
|
||||
}
|
@ -9,7 +9,15 @@
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/** Functions for text classification:
|
||||
*
|
||||
* charsetDetect(string data) - detect charset of data.
|
||||
* Returns string name of most likely charset.
|
||||
* .
|
||||
* getTonality(string data) - defines the emotional coloring of the text.
|
||||
* Returns NEG if text is negative, POS if text is positive or NEUT if text is neutral.
|
||||
*
|
||||
*/
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
|
@ -54,10 +54,11 @@ struct TonalityClassificationImpl
|
||||
Float64 freq = 0;
|
||||
Float64 count_words = 0;
|
||||
|
||||
String ans;
|
||||
String answer;
|
||||
|
||||
ReadBufferFromMemory in(data.data(), data.size() + 1);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
String to_check;
|
||||
while (!in.eof())
|
||||
{
|
||||
@ -66,20 +67,19 @@ struct TonalityClassificationImpl
|
||||
}
|
||||
readStringUntilWhitespace(to_check, in);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
|
||||
word_processing(to_check);
|
||||
|
||||
|
||||
if (emotional_dict.find(to_check) != emotional_dict.cend())
|
||||
{
|
||||
count_words += 1;
|
||||
ans += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
|
||||
freq += emotional_dict[to_check];
|
||||
}
|
||||
}
|
||||
Float64 total_tonality = freq / count_words;
|
||||
ans += get_tonality(total_tonality) + std::to_string(total_tonality) + std::to_string(emotional_dict.size()) + "\n";
|
||||
res = ans;
|
||||
answer += get_tonality(total_tonality) + std::to_string(total_tonality) + "\n";
|
||||
res = answer;
|
||||
}
|
||||
|
||||
|
||||
@ -102,13 +102,14 @@ struct TonalityClassificationImpl
|
||||
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
|
||||
String str = haystack;
|
||||
|
||||
String prom;
|
||||
String buf;
|
||||
|
||||
Float64 freq = 0;
|
||||
Float64 count_words = 0;
|
||||
|
||||
|
||||
ReadBufferFromMemory in(str.data(), str.size() + 1);
|
||||
|
||||
skipWhitespaceIfAny(in);
|
||||
String to_check;
|
||||
while (!in.eof())
|
||||
@ -122,14 +123,13 @@ struct TonalityClassificationImpl
|
||||
if (emotional_dict.find(to_check) != emotional_dict.cend())
|
||||
{
|
||||
count_words += 1;
|
||||
prom += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
|
||||
freq += emotional_dict[to_check];
|
||||
}
|
||||
}
|
||||
Float64 total_tonality = freq / count_words;
|
||||
prom += get_tonality(total_tonality) + std::to_string(total_tonality) + "\n";
|
||||
buf += get_tonality(total_tonality) + std::to_string(total_tonality) + "\n";
|
||||
|
||||
const auto ans = prom.c_str();
|
||||
const auto ans = buf.c_str();
|
||||
size_t cur_offset = offsets[i];
|
||||
|
||||
res_data.resize(res_offset + strlen(ans) + 1);
|
||||
|
@ -35,6 +35,7 @@ void registerFunctionsStringRegexp(FunctionFactory &);
|
||||
void registerFunctionsStringSimilarity(FunctionFactory &);
|
||||
void registerFunctionsTextClassification(FunctionFactory &);
|
||||
void registerFunctionsTonalityClassification(FunctionFactory &);
|
||||
void registerFunctionsProgrammingClassification(FunctionFactory &);
|
||||
void registerFunctionsURL(FunctionFactory &);
|
||||
void registerFunctionsVisitParam(FunctionFactory &);
|
||||
void registerFunctionsMath(FunctionFactory &);
|
||||
@ -95,6 +96,7 @@ void registerFunctions()
|
||||
registerFunctionsStringSimilarity(factory);
|
||||
registerFunctionsTextClassification(factory);
|
||||
registerFunctionsTonalityClassification(factory);
|
||||
registerFunctionsProgrammingClassification(factory);
|
||||
registerFunctionsURL(factory);
|
||||
registerFunctionsVisitParam(factory);
|
||||
registerFunctionsMath(factory);
|
||||
|
@ -57,6 +57,7 @@ SRCS(
|
||||
FunctionsStringSimilarity.cpp
|
||||
FunctionsTextClassification.cpp
|
||||
FunctionsTonalityClassification.cpp
|
||||
FunctionsProgrammingClassification.cpp
|
||||
GatherUtils/concat.cpp
|
||||
GatherUtils/createArraySink.cpp
|
||||
GatherUtils/createArraySource.cpp
|
||||
|
Loading…
Reference in New Issue
Block a user