Little changes

This commit is contained in:
s-kat 2021-05-18 22:36:46 +03:00
parent 4c80b5f8a9
commit 4e935baccb
8 changed files with 21 additions and 39 deletions

View File

@ -32,7 +32,6 @@ if (SANITIZE STREQUAL "undefined")
endif()
# Mark everything added from this directory as EXCLUDE_FROM_ALL. The property
# is set before the add_subdirectory() calls that follow it.
set_directory_properties(PROPERTIES EXCLUDE_FROM_ALL 1)

# Bundled third-party libraries, one CMake wrapper directory per library.
add_subdirectory(abseil-cpp-cmake)
add_subdirectory(antlr4-runtime-cmake)
add_subdirectory(boost-cmake)

View File

@ -28,19 +28,9 @@ set (SRCS
${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc
)
# Build CLD2 from the bundled sources listed in SRCS.
# The target is defined exactly once: a second add_library(cld2 ...) call is a
# configure-time error. No STATIC/SHARED keyword is given, so the library type
# follows BUILD_SHARED_LIBS like any other default library target.
add_library(cld2 ${SRCS})

# Warning suppressions for cld2's own sources. PRIVATE, so they do not
# propagate to targets that link cld2.
# A bare "-Wl" is intentionally not passed: it is only the prefix of the
# linker pass-through form "-Wl,<option>" and is not a compile option.
target_compile_options(cld2
    PRIVATE
        -Wno-old-style-cast
        -Wno-inconsistent-missing-destructor-override
        -Wno-deprecated
        -Wno-unused-parameter
        -Wno-shadow
        -Wno-tautological-type-limit-compare
        -Wno-extra-semi
        -Wno-narrowing
)

# Public headers, propagated to every target that links against cld2.
# Quoted so a path containing spaces or semicolons stays a single argument.
target_include_directories(cld2 PUBLIC "${LIBRARY_DIR}/public")

# The library is built from SRCS above, so it is not additionally linked
# against a prebuilt libcld2.so inside the source tree: that file is not
# produced by this build.

View File

@ -908,9 +908,9 @@
<!-- Text classification -->
<!-- Each dictionary path is declared exactly once. Relative paths are used so the
     configuration is not tied to one particular installation directory
     (for example /ClickHouse/programs/server). -->
<encoding_frequencies_path>charset_freq.txt</encoding_frequencies_path>
<programming_lang_frequencies_path>programming_freq.txt</programming_lang_frequencies_path>
<emotional_dict_path>emotional_dictionary_rus.txt</emotional_dict_path>
<top_level_domains_lists>
<!--

View File

@ -34,22 +34,22 @@ public:
/// Stores the path of the encoding-frequency dictionary and loads the dictionary from it.
/// @param pt  path to the dictionary file, as supplied by the caller.
void parseEncodingFrequencies(const String & pt)
{
    path_to_enc_freq = pt;
    /// Load only from the path passed in. A hard-coded path into a developer's
    /// home directory would ignore `pt` and does not exist on other machines.
    loadEncodingsFrequency(pt);
}
/// Stores the path of the emotional dictionary and loads the dictionary from it.
/// @param pt  path to the dictionary file, as supplied by the caller.
void parseEmotionalDict(const String & pt)
{
    path_to_emo_dict = pt;
    /// Load only from the path passed in. A hard-coded path into a developer's
    /// home directory would ignore `pt` and does not exist on other machines.
    loadEmotionalDict(pt);
}
/// Stores the path of the programming-language frequency dictionary and loads the dictionary from it.
/// @param pt  path to the dictionary file, as supplied by the caller.
void parseProgrammingFrequency(const String & pt)
{
    path_to_prog_freq = pt;
    /// Load only from the path passed in. A hard-coded path into a developer's
    /// home directory would ignore `pt` and does not exist on other machines.
    loadProgrammingFrequency(pt);
}

Binary file not shown.

View File

@ -4,7 +4,6 @@
#include <Common/UTF8Helpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <algorithm>
#include <cstring>
#include <cmath>
@ -27,8 +26,6 @@ struct CharsetClassificationImpl
using CodePoint = UInt8;
static constexpr Float64 zero_frequency = 0.000001;
/// map_size for ngram count.
static constexpr size_t map_size = 1u << 16;
/// If the data size is bigger than this, behaviour is unspecified for this function.
static constexpr size_t max_string_size = 1u << 15;
@ -39,10 +36,6 @@ struct CharsetClassificationImpl
/// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding.
static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1;
/** map_size of this fits mostly in L2 cache all the time.
* Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed
* integer array.
*/
using NgramCount = UInt16;
static ALWAYS_INLINE inline Float64 Naive_bayes(std::unordered_map<UInt16, Float64> standart, std::unordered_map<UInt16, Float64> model)

View File

@ -151,10 +151,10 @@ struct ProgrammingClassificationImpl
const auto ans = most_liked.c_str();
size_t cur_offset = offsets[i];
res_data.resize(res_offset + strlen(ans) + 1);
memcpy(&res_data[res_offset], ans, strlen(ans));
res_offset += strlen(ans);
size_t ans_size = strlen(ans);
res_data.resize(res_offset + ans_size + 1);
memcpy(&res_data[res_offset], ans, ans_size);
res_offset += ans_size;
res_data[res_offset] = 0;
++res_offset;

View File

@ -120,10 +120,10 @@ struct TonalityClassificationImpl
const auto ans = buf.c_str();
size_t cur_offset = offsets[i];
res_data.resize(res_offset + strlen(ans) + 1);
memcpy(&res_data[res_offset], ans, strlen(ans));
res_offset += strlen(ans);
size_t ans_size = strlen(ans);
res_data.resize(res_offset + ans_size + 1);
memcpy(&res_data[res_offset], ans, ans_size);
res_offset += ans_size;
res_data[res_offset] = 0;
++res_offset;