mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-30 03:22:14 +00:00
Fixes
This commit is contained in:
parent
a7258d2e14
commit
af63bc3347
@ -632,12 +632,25 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
||||
TLDListsHolder::getInstance().parseConfig(top_level_domains_path, config());
|
||||
}
|
||||
|
||||
/// my test
|
||||
/// encoding frequencies
|
||||
{
|
||||
const std::string & encode_frequency_path = config().getString("encodings_frequency_path", path + "encodings_frequency/");
|
||||
FrequencyHolder::getInstance().parseDictionaries(encode_frequency_path);
|
||||
const std::string & encode_frequency_path = config().getString("encoding_frequencies_path", path);
|
||||
FrequencyHolder::getInstance().parseEncodingFrequencies(encode_frequency_path);
|
||||
}
|
||||
|
||||
/// programming languages frequencies
|
||||
{
|
||||
const std::string & programming_frequency_path = config().getString("programming_lang_frequencies_path", path);
|
||||
FrequencyHolder::getInstance().parseProgrammingFrequency(programming_frequency_path);
|
||||
}
|
||||
|
||||
/// emotional dictionary
|
||||
{
|
||||
const std::string & emotional_dict_path = config().getString("emotional_dict_path", path);
|
||||
FrequencyHolder::getInstance().parseEmotionalDict(emotional_dict_path);
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
Poco::File(path + "data/").createDirectories();
|
||||
Poco::File(path + "metadata/").createDirectories();
|
||||
|
@ -843,9 +843,11 @@
|
||||
Path to the list is under top_level_domains_path (see above).
|
||||
-->
|
||||
|
||||
<!-- MY CHANGES -->
|
||||
<!-- Text classification -->
|
||||
|
||||
<encodings_frequency_path>/var/lib/clickhouse/encodings_frequency/</encodings_frequency_path>
|
||||
<encoding_frequencies_path>/ClassificationDictionaries/charset_freq.txt</encoding_frequencies_path>
|
||||
<programming_lang_frequencies_path>/ClassificationDictionaries/programming_freq.txt</programming_lang_frequencies_path>
|
||||
<emotional_dict_path>/ClassificationDictionaries/emotional_dictionary_rus.txt</emotional_dict_path>
|
||||
|
||||
<top_level_domains_lists>
|
||||
<!--
|
||||
|
@ -6,13 +6,10 @@
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/readFloatText.h>
|
||||
#include <IO/Operators.h>
|
||||
|
||||
#include <string_view>
|
||||
#include <string>
|
||||
#include <common/find_symbols.h>
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <unordered_map>
|
||||
#include <common/logger_useful.h>
|
||||
|
||||
@ -34,11 +31,24 @@ public:
|
||||
}
|
||||
|
||||
|
||||
void parseDictionaries(const String & pt)
|
||||
void parseEncodingFrequencies(const String & pt)
|
||||
{
|
||||
is_true = pt;
|
||||
loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
|
||||
path_to_enc_freq = pt;
|
||||
//loadEncodingsFrequency(pt);
|
||||
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
|
||||
}
|
||||
|
||||
void parseEmotionalDict(const String & pt)
|
||||
{
|
||||
path_to_emo_dict = pt;
|
||||
//loadEmotionalDict(pt);
|
||||
loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
|
||||
}
|
||||
|
||||
void parseProgrammingFrequency(const String & pt)
|
||||
{
|
||||
path_to_prog_freq = pt;
|
||||
//loadProgrammingFrequency(pt);
|
||||
loadProgrammingFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/programming_freq.txt");
|
||||
}
|
||||
|
||||
@ -154,13 +164,6 @@ public:
|
||||
LOG_TRACE(log, "Programming languages frequencies was added");
|
||||
}
|
||||
|
||||
|
||||
const String & get_path()
|
||||
{
|
||||
return is_true;
|
||||
}
|
||||
|
||||
|
||||
const std::unordered_map<String, Float64> & getEmotionalDict()
|
||||
{
|
||||
return emotional_dict;
|
||||
@ -178,12 +181,15 @@ public:
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
private:
|
||||
|
||||
String is_true;
|
||||
std::unordered_map<String, Float64> emotional_dict;
|
||||
Container encodings_freq;
|
||||
std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq;
|
||||
|
||||
String path_to_emo_dict;
|
||||
String path_to_enc_freq;
|
||||
String path_to_prog_freq;
|
||||
};
|
||||
}
|
||||
|
||||
|
9434
src/Functions/ClassificationDictionaries/programming_freq.txt
Normal file
9434
src/Functions/ClassificationDictionaries/programming_freq.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,19 +1,9 @@
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
#include <Common/FrequencyHolder.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Common/UTF8Helpers.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <sstream>
|
||||
#include <set>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -27,7 +17,8 @@ struct ProgrammingClassificationImpl
|
||||
static ALWAYS_INLINE inline Float64 state_machine(std::unordered_map<String, Float64> standart, std::unordered_map<String, Float64> model)
|
||||
{
|
||||
Float64 res = 0;
|
||||
for (auto & el : model) {
|
||||
for (auto & el : model)
|
||||
{
|
||||
res += el.second * standart[el.first];
|
||||
}
|
||||
return res;
|
||||
@ -50,15 +41,19 @@ struct ProgrammingClassificationImpl
|
||||
|
||||
while (!in.eof())
|
||||
{
|
||||
if (data.size() - (in.position() - data.data()) <= 3) {
|
||||
if (data.size() - (in.position() - data.data()) <= 3)
|
||||
{
|
||||
break;
|
||||
}
|
||||
readStringUntilWhitespace(new_word, in);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
if (prev == "") {
|
||||
if (prev == "")
|
||||
{
|
||||
prev = new_word;
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
data_freq[prev + new_word] += 1;
|
||||
prev = new_word;
|
||||
}
|
||||
@ -67,9 +62,11 @@ struct ProgrammingClassificationImpl
|
||||
String most_liked;
|
||||
Float64 max_result = 0;
|
||||
|
||||
for (const auto& item : programming_freq) {
|
||||
for (const auto& item : programming_freq)
|
||||
{
|
||||
Float64 result = state_machine(item.second, data_freq);
|
||||
if (result > max_result) {
|
||||
if (result > max_result)
|
||||
{
|
||||
max_result = result;
|
||||
most_liked = item.first;
|
||||
}
|
||||
@ -108,15 +105,19 @@ struct ProgrammingClassificationImpl
|
||||
String prev;
|
||||
while (!in.eof())
|
||||
{
|
||||
if (str.size() - (in.position() - str.data()) <= 3) {
|
||||
if (str.size() - (in.position() - str.data()) <= 3)
|
||||
{
|
||||
break;
|
||||
}
|
||||
readStringUntilWhitespace(new_word, in);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
if (prev == "") {
|
||||
if (prev == "")
|
||||
{
|
||||
prev = new_word;
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
data_freq[prev + new_word] += 1;
|
||||
prev = new_word;
|
||||
}
|
||||
@ -125,9 +126,11 @@ struct ProgrammingClassificationImpl
|
||||
String most_liked;
|
||||
Float64 max_result = 0;
|
||||
|
||||
for (const auto& item : programming_freq) {
|
||||
for (const auto& item : programming_freq)
|
||||
{
|
||||
Float64 result = state_machine(item.second, data_freq);
|
||||
if (result > max_result) {
|
||||
if (result > max_result)
|
||||
{
|
||||
max_result = result;
|
||||
most_liked = item.first;
|
||||
}
|
||||
|
@ -17,6 +17,7 @@ namespace DB
|
||||
* getTonality(string data) - defines the emotional coloring of the text.
|
||||
* Returns NEG if text is negative, POS if text is positive or NEUT if text is neutral.
|
||||
*
|
||||
* getProgrammingLanguage(string data) - detects the programming language of the text.
|
||||
*/
|
||||
namespace ErrorCodes
|
||||
{
|
||||
|
@ -5,15 +5,15 @@
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
//#include <algorithm>
|
||||
//#include <cstring>
|
||||
//#include <cmath>
|
||||
//#include <limits>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <sstream>
|
||||
#include <set>
|
||||
//#include <memory>
|
||||
//#include <utility>
|
||||
//#include <sstream>
|
||||
//#include <set>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user