mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Little changes
This commit is contained in:
parent
4c80b5f8a9
commit
4e935baccb
1
contrib/CMakeLists.txt
vendored
1
contrib/CMakeLists.txt
vendored
@ -32,7 +32,6 @@ if (SANITIZE STREQUAL "undefined")
|
||||
endif()
|
||||
|
||||
set_property(DIRECTORY PROPERTY EXCLUDE_FROM_ALL 1)
|
||||
|
||||
add_subdirectory (abseil-cpp-cmake)
|
||||
add_subdirectory (antlr4-runtime-cmake)
|
||||
add_subdirectory (boost-cmake)
|
||||
|
@ -28,19 +28,9 @@ set (SRCS
|
||||
${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc
|
||||
)
|
||||
|
||||
add_library(cld2 ${SRCS})
|
||||
add_library(cld2 SHARED ${SRCS})
|
||||
target_compile_options (cld2 PRIVATE -Wl -Wno-narrowing)
|
||||
|
||||
target_compile_options (cld2
|
||||
PRIVATE
|
||||
-Wno-old-style-cast
|
||||
-Wno-inconsistent-missing-destructor-override
|
||||
-Wno-deprecated
|
||||
-Wno-unused-parameter
|
||||
-Wno-shadow
|
||||
-Wno-tautological-type-limit-compare
|
||||
-Wno-extra-semi
|
||||
-Wno-narrowing
|
||||
-Wl
|
||||
)
|
||||
|
||||
target_include_directories (cld2 PUBLIC ${LIBRARY_DIR}/public)
|
||||
target_link_libraries (cld2 PRIVATE ${LIBRARY_DIR}/internal/libcld2.so)
|
||||
target_include_directories(cld2 PUBLIC "${LIBRARY_DIR}/public")
|
||||
|
@ -908,9 +908,9 @@
|
||||
|
||||
<!-- Text classification -->
|
||||
|
||||
<encoding_frequencies_path>charset_freq.txt</encoding_frequencies_path>
|
||||
<programming_lang_frequencies_path>programming_freq.txt</programming_lang_frequencies_path>
|
||||
<emotional_dict_path>emotional_dictionary_rus.txt</emotional_dict_path>
|
||||
<encoding_frequencies_path>/ClickHouse/programs/server/charset_freq.txt</encoding_frequencies_path>
|
||||
<programming_lang_frequencies_path>/ClickHouse/programs/server/programming_freq.txt</programming_lang_frequencies_path>
|
||||
<emotional_dict_path>/ClickHouse/programs/server/emotional_dictionary_rus.txt</emotional_dict_path>
|
||||
|
||||
<top_level_domains_lists>
|
||||
<!--
|
||||
|
@ -34,22 +34,22 @@ public:
|
||||
void parseEncodingFrequencies(const String & pt)
|
||||
{
|
||||
path_to_enc_freq = pt;
|
||||
loadEncodingsFrequency(pt);
|
||||
// loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
|
||||
//loadEncodingsFrequency(pt);
|
||||
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
|
||||
}
|
||||
|
||||
void parseEmotionalDict(const String & pt)
|
||||
{
|
||||
path_to_emo_dict = pt;
|
||||
loadEmotionalDict(pt);
|
||||
//loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
|
||||
//loadEmotionalDict(pt);
|
||||
loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
|
||||
}
|
||||
|
||||
void parseProgrammingFrequency(const String & pt)
|
||||
{
|
||||
path_to_prog_freq = pt;
|
||||
loadProgrammingFrequency(pt);
|
||||
//loadProgrammingFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/programming_freq.txt");
|
||||
//loadProgrammingFrequency(pt);
|
||||
loadProgrammingFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/programming_freq.txt");
|
||||
}
|
||||
|
||||
|
||||
|
BIN
src/Common/t
BIN
src/Common/t
Binary file not shown.
@ -4,7 +4,6 @@
|
||||
#include <Common/UTF8Helpers.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
@ -27,8 +26,6 @@ struct CharsetClassificationImpl
|
||||
using CodePoint = UInt8;
|
||||
|
||||
static constexpr Float64 zero_frequency = 0.000001;
|
||||
/// map_size for ngram count.
|
||||
static constexpr size_t map_size = 1u << 16;
|
||||
|
||||
/// If the data size is bigger than this, behaviour is unspecified for this function.
|
||||
static constexpr size_t max_string_size = 1u << 15;
|
||||
@ -39,10 +36,6 @@ struct CharsetClassificationImpl
|
||||
/// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding.
|
||||
static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1;
|
||||
|
||||
/** map_size of this fits mostly in L2 cache all the time.
|
||||
* Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed
|
||||
* integer array.
|
||||
*/
|
||||
using NgramCount = UInt16;
|
||||
|
||||
static ALWAYS_INLINE inline Float64 Naive_bayes(std::unordered_map<UInt16, Float64> standart, std::unordered_map<UInt16, Float64> model)
|
||||
|
@ -151,10 +151,10 @@ struct ProgrammingClassificationImpl
|
||||
|
||||
const auto ans = most_liked.c_str();
|
||||
size_t cur_offset = offsets[i];
|
||||
|
||||
res_data.resize(res_offset + strlen(ans) + 1);
|
||||
memcpy(&res_data[res_offset], ans, strlen(ans));
|
||||
res_offset += strlen(ans);
|
||||
size_t ans_size = strlen(ans);
|
||||
res_data.resize(res_offset + ans_size + 1);
|
||||
memcpy(&res_data[res_offset], ans, ans_size);
|
||||
res_offset += ans_size;
|
||||
|
||||
res_data[res_offset] = 0;
|
||||
++res_offset;
|
||||
|
@ -120,10 +120,10 @@ struct TonalityClassificationImpl
|
||||
|
||||
const auto ans = buf.c_str();
|
||||
size_t cur_offset = offsets[i];
|
||||
|
||||
res_data.resize(res_offset + strlen(ans) + 1);
|
||||
memcpy(&res_data[res_offset], ans, strlen(ans));
|
||||
res_offset += strlen(ans);
|
||||
size_t ans_size = strlen(ans);
|
||||
res_data.resize(res_offset + ans_size + 1);
|
||||
memcpy(&res_data[res_offset], ans, ans_size);
|
||||
res_offset += ans_size;
|
||||
|
||||
res_data[res_offset] = 0;
|
||||
++res_offset;
|
||||
|
Loading…
Reference in New Issue
Block a user