From 1d451082187fd21a52dfba4c79cd5c847554557f Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Mon, 6 Apr 2020 13:27:31 +0300 Subject: [PATCH] Fixed builds, implementation and tests * Builds shouldn't fail on platforms that do not support SSE2 and SSE4.2 and do not have corresponding headers. * Updated tests to include malicious padding * Fixed reporting tokens that cross or outside of data boundaries. --- .../MergeTree/MergeTreeIndexFullText.cpp | 27 +++++++++++-------- .../tests/gtest_SplitTokenExtractor.cpp | 8 +++++- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/dbms/Storages/MergeTree/MergeTreeIndexFullText.cpp b/dbms/Storages/MergeTree/MergeTreeIndexFullText.cpp index af979010dc0..93553e0619e 100644 --- a/dbms/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/dbms/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -19,9 +19,14 @@ #include +#if defined(__SSE2__) #include + +#if defined(__SSE4_2__) #include -#include +#endif + +#endif namespace DB @@ -620,19 +625,19 @@ bool SplitTokenExtractor::next(const char * data, size_t len, size_t * pos, size #if defined(__SSE4_2__) // With the help of https://www.strchr.com/strcmp_and_strlen_using_sse_4.2 - static const auto alnum_chars_ranges = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, + const auto alnum_chars_ranges = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, '\xFF', '\x80', 'z', 'a', 'Z', 'A', '9', '0'); // Every bit represents if `haystack` character is in the ranges (1) or not(0) const int result_bitmask = _mm_cvtsi128_si32(_mm_cmpestrm(alnum_chars_ranges, 8, haystack, haystack_length, _SIDD_CMP_RANGES)); #else // NOTE: -1 and +1 required since SSE2 has no `>=` and `<=` instructions on packed 8-bit integers (epi8). - static const auto number_begin = _mm_set1_epi8('0' - 1); - static const auto number_end = _mm_set1_epi8('9' + 1); - static const auto alpha_lower_begin = _mm_set1_epi8('a' - 1); - static const auto alpha_lower_end = _mm_set1_epi8('z' + 1); - static const auto alpha_upper_begin = _mm_set1_epi8('A' - 1); - static const auto alpha_upper_end = _mm_set1_epi8('Z' + 1); - static const auto zero = _mm_set1_epi8(0); + const auto number_begin = _mm_set1_epi8('0' - 1); + const auto number_end = _mm_set1_epi8('9' + 1); + const auto alpha_lower_begin = _mm_set1_epi8('a' - 1); + const auto alpha_lower_end = _mm_set1_epi8('z' + 1); + const auto alpha_upper_begin = _mm_set1_epi8('A' - 1); + const auto alpha_upper_end = _mm_set1_epi8('Z' + 1); + const auto zero = _mm_set1_epi8(0); // every bit represents if `haystack` character `c` statisfies condition: // (c < 0) || (c > '0' - 1 && c < '9' + 1) || (c > 'a' - 1 && c < 'z' + 1) || (c > 'A' - 1 && c < 'Z' + 1) @@ -669,7 +674,7 @@ bool SplitTokenExtractor::next(const char * data, size_t len, size_t * pos, size // check if there are leftovers in next `haystack` continue; - return true; + break; #else if (isASCII(data[*pos]) && !isAlphaNumericASCII(data[*pos])) { @@ -691,7 +696,7 @@ bool SplitTokenExtractor::next(const char * data, size_t len, size_t * pos, size // Could happen only if string is not padded with zeroes, and we accidentally hopped over end of data. if (*token_start > len) return false; - *token_len = len - *token_start; + *token_len = std::min(len - *token_start, *token_len); #endif return *token_len > 0; diff --git a/dbms/src/Storages/tests/gtest_SplitTokenExtractor.cpp b/dbms/src/Storages/tests/gtest_SplitTokenExtractor.cpp index b8686f962bc..e2229792020 100644 --- a/dbms/src/Storages/tests/gtest_SplitTokenExtractor.cpp +++ b/dbms/src/Storages/tests/gtest_SplitTokenExtractor.cpp @@ -17,7 +17,7 @@ using namespace DB; struct SplitTokenExtractorTestCase { - const char * description; + const std::string_view description; const std::string source; const std::vector tokens; }; @@ -35,6 +35,12 @@ public: const auto & param = GetParam(); const auto & source = param.source; data = std::make_unique>(source.data(), source.data() + source.size()); + + // add predefined padding that forms tokens to ensure no reads past end of buffer. + const char extra_padding[] = "this is the end \xd1\x8d\xd1\x82\xd0\xbe\xd0\xba\xd0\xbe \xd0\xbd\xd0\xb5\xd1\x86"; + data->insert(data->end(), std::begin(extra_padding), std::end(extra_padding)); + + data->resize(data->size() - sizeof(extra_padding)); } std::unique_ptr> data;