This commit is contained in:
s-kat 2021-04-16 14:44:09 +03:00
parent a7258d2e14
commit af63bc3347
7 changed files with 9509 additions and 50 deletions

View File

@ -632,12 +632,25 @@ int Server::main(const std::vector<std::string> & /*args*/)
TLDListsHolder::getInstance().parseConfig(top_level_domains_path, config());
}
/// my test
/// encoding frequencies
{
const std::string & encode_frequency_path = config().getString("encodings_frequency_path", path + "encodings_frequency/");
FrequencyHolder::getInstance().parseDictionaries(encode_frequency_path);
const std::string & encode_frequency_path = config().getString("encoding_frequencies_path", path);
FrequencyHolder::getInstance().parseEncodingFrequencies(encode_frequency_path);
}
/// programming languages frequencies
{
const std::string & programming_frequency_path = config().getString("programming_lang_frequencies_path", path);
FrequencyHolder::getInstance().parseProgrammingFrequency(programming_frequency_path);
}
/// emotional dictionary
{
const std::string & emotional_dict_path = config().getString("emotional_dict_path", path);
FrequencyHolder::getInstance().parseEmotionalDict(emotional_dict_path);
}
{
Poco::File(path + "data/").createDirectories();
Poco::File(path + "metadata/").createDirectories();

View File

@ -843,9 +843,11 @@
Path to the list is under top_level_domains_path (see above).
-->
<!-- MY CHANGES -->
<!-- Text classification -->
<encodings_frequency_path>/var/lib/clickhouse/encodings_frequency/</encodings_frequency_path>
<encoding_frequencies_path>/ClassificationDictionaries/charset_freq.txt</encoding_frequencies_path>
<programming_lang_frequencies_path>/ClassificationDictionaries/programming_freq.txt</programming_lang_frequencies_path>
<emotional_dict_path>/ClassificationDictionaries/emotional_dictionary_rus.txt</emotional_dict_path>
<top_level_domains_lists>
<!--

View File

@ -6,13 +6,10 @@
#include <IO/ReadHelpers.h>
#include <IO/readFloatText.h>
#include <IO/Operators.h>
#include <string_view>
#include <string>
#include <common/find_symbols.h>
#include <fstream>
#include <algorithm>
#include <cstring>
#include <limits>
#include <unordered_map>
#include <common/logger_useful.h>
@ -34,11 +31,24 @@ public:
}
void parseDictionaries(const String & pt)
/// Stores `pt` in `is_true` and `path_to_enc_freq`, then loads the emotional dictionary and
/// the encoding frequencies.
/// NOTE(review): this text comes from a rendered diff, so the body presumably interleaves
/// removed and added lines (the `is_true` / loadEmotionalDict pair looks like the old body) —
/// confirm against the actual header before relying on it.
/// @param pt  Path supplied by the caller. It is only stored; neither load call below uses it.
void parseEncodingFrequencies(const String & pt)
{
is_true = pt;
/// NOTE(review): hard-coded path inside a developer's home directory, not derived from `pt`.
loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
path_to_enc_freq = pt;
//loadEncodingsFrequency(pt);
/// NOTE(review): same problem — the commented-out call above suggests `pt` was the intended argument.
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
}
/// Remembers where the emotional dictionary lives and loads it from that location.
/// @param pt  Path supplied by the caller; stored in `path_to_emo_dict` and passed
///            to loadEmotionalDict.
void parseEmotionalDict(const String & pt)
{
    path_to_emo_dict = pt;
    /// Load from the caller-supplied path. The previous code ignored `pt` and read a file
    /// from a developer's home directory, which exists only on that one machine.
    loadEmotionalDict(pt);
}
/// Remembers where the programming-language frequency file lives and loads it from there.
/// @param pt  Path supplied by the caller; stored in `path_to_prog_freq` and passed
///            to loadProgrammingFrequency.
void parseProgrammingFrequency(const String & pt)
{
    path_to_prog_freq = pt;
    /// Load from the caller-supplied path. The previous code ignored `pt` and read a file
    /// from a developer's home directory, which exists only on that one machine.
    loadProgrammingFrequency(pt);
}
@ -154,13 +164,6 @@ public:
LOG_TRACE(log, "Programming languages frequencies was added");
}
/// Returns a reference to the `is_true` member, i.e. the path string last assigned to it.
/// The reference points into this object, so it must not outlive it.
const String & get_path()
{
return is_true;
}
const std::unordered_map<String, Float64> & getEmotionalDict()
{
return emotional_dict;
@ -178,12 +181,15 @@ public:
}
protected:
private:
String is_true;
std::unordered_map<String, Float64> emotional_dict;
Container encodings_freq;
std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq;
String path_to_emo_dict;
String path_to_enc_freq;
String path_to_prog_freq;
};
}

File diff suppressed because it is too large Load Diff

View File

@ -1,19 +1,9 @@
#include <Functions/FunctionsTextClassification.h>
#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Common/UTF8Helpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <algorithm>
#include <cstring>
#include <cmath>
#include <limits>
#include <unordered_map>
#include <memory>
#include <utility>
#include <sstream>
#include <set>
namespace DB
{
@ -27,7 +17,8 @@ struct ProgrammingClassificationImpl
static ALWAYS_INLINE inline Float64 state_machine(std::unordered_map<String, Float64> standart, std::unordered_map<String, Float64> model)
{
Float64 res = 0;
for (auto & el : model) {
for (auto & el : model)
{
res += el.second * standart[el.first];
}
return res;
@ -50,15 +41,19 @@ struct ProgrammingClassificationImpl
while (!in.eof())
{
if (data.size() - (in.position() - data.data()) <= 3) {
if (data.size() - (in.position() - data.data()) <= 3)
{
break;
}
readStringUntilWhitespace(new_word, in);
skipWhitespaceIfAny(in);
if (prev == "") {
if (prev == "")
{
prev = new_word;
} else {
}
else
{
data_freq[prev + new_word] += 1;
prev = new_word;
}
@ -67,9 +62,11 @@ struct ProgrammingClassificationImpl
String most_liked;
Float64 max_result = 0;
for (const auto& item : programming_freq) {
for (const auto& item : programming_freq)
{
Float64 result = state_machine(item.second, data_freq);
if (result > max_result) {
if (result > max_result)
{
max_result = result;
most_liked = item.first;
}
@ -108,15 +105,19 @@ struct ProgrammingClassificationImpl
String prev;
while (!in.eof())
{
if (str.size() - (in.position() - str.data()) <= 3) {
if (str.size() - (in.position() - str.data()) <= 3)
{
break;
}
readStringUntilWhitespace(new_word, in);
skipWhitespaceIfAny(in);
if (prev == "") {
if (prev == "")
{
prev = new_word;
} else {
}
else
{
data_freq[prev + new_word] += 1;
prev = new_word;
}
@ -125,9 +126,11 @@ struct ProgrammingClassificationImpl
String most_liked;
Float64 max_result = 0;
for (const auto& item : programming_freq) {
for (const auto& item : programming_freq)
{
Float64 result = state_machine(item.second, data_freq);
if (result > max_result) {
if (result > max_result)
{
max_result = result;
most_liked = item.first;
}

View File

@ -17,6 +17,7 @@ namespace DB
* getTonality(string data) - defines the emotional coloring of the text.
* Returns NEG if text is negative, POS if text is positive or NEUT if text is neutral.
*
* getProgrammingLanguage(string data) - detects the programming language of the text.
*/
namespace ErrorCodes
{

View File

@ -5,15 +5,15 @@
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <algorithm>
#include <cstring>
#include <cmath>
#include <limits>
//#include <algorithm>
//#include <cstring>
//#include <cmath>
//#include <limits>
#include <unordered_map>
#include <memory>
#include <utility>
#include <sstream>
#include <set>
//#include <memory>
//#include <utility>
//#include <sstream>
//#include <set>
namespace DB
{