Merge pull request #1882 from zhang2014/support/simd_for_lengthUTF-8

ISSUES-1885 UTF8 countCodePoints use SIMD
This commit is contained in:
KochetovNicolai 2018-03-26 18:27:07 +03:00 committed by GitHub
commit d25c244946
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 63 additions and 2 deletions

View File

@ -3,6 +3,9 @@
#include <Core/Types.h>
#include <Common/BitHelpers.h>
#if __SSE2__
#include <emmintrin.h>
#endif
namespace DB
{
@ -49,9 +52,29 @@ inline size_t seqLength(const UInt8 first_octet)
inline size_t countCodePoints(const UInt8 * data, size_t size)
{
size_t res = 0;
const auto end = data + size;
/// TODO SIMD implementation looks quite simple.
for (auto end = data + size; data < end; ++data) /// Skip UTF-8 continuation bytes.
#if __SSE2__
    /// Vectorized pass: handle the input in whole 16-byte blocks.
    /// The scalar loop after this section counts whatever tail is left (fewer than 16 bytes).
    const auto bytes_sse = sizeof(__m128i);
    const auto src_end_sse = (data + size) - (size % bytes_sse);

    /// A byte starts a code point unless it is a UTF-8 continuation byte (0x80..0xBF).
    /// Read as signed 8-bit values, continuation bytes are exactly -128..-65, whereas ASCII bytes (0..127)
    /// and leading bytes 0xC0..0xFF (-64..-1) are all greater than 0xBF (-65). A single signed
    /// "greater than" comparison is therefore equivalent to the scalar test
    /// (*data <= 0x7F || *data >= 0xC0).
    /// NOTE: do not fold the two ranges with a saturating unsigned add (_mm_adds_epu8 with 0x40):
    /// it clamps every byte >= 0xC0 to 0xFF, so leading bytes of multi-byte sequences are never counted.
    const auto threshold = _mm_set1_epi8(static_cast<char>(0xBF));

    for (; data < src_end_sse; data += bytes_sse)
    {
        const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data));

        /// 0xFF in every lane holding a non-continuation byte, 0x00 elsewhere.
        const auto is_first_byte = _mm_cmpgt_epi8(chars, threshold);

        /// movemask packs the top bit of each of the 16 lanes into an int; popcount gives the number of matches.
        res += __builtin_popcount(_mm_movemask_epi8(is_first_byte));
    }
#endif
for (; data < end; ++data) /// Skip UTF-8 continuation bytes.
res += (*data <= 0x7F || *data >= 0xC0);
return res;

View File

@ -0,0 +1,38 @@
<test>
    <!-- Performance test measuring lengthUTF8 over several kinds of input strings. -->
    <name>functions_length</name>
    <type>once</type>

    <stop_conditions>
        <all_of>
            <total_time_ms>10000</total_time_ms>
        </all_of>
        <any_of>
            <average_speed_not_changing_for_ms>5000</average_speed_not_changing_for_ms>
            <total_time_ms>20000</total_time_ms>
        </any_of>
    </stop_conditions>

    <main_metric>
        <avg_rows_per_second/>
    </main_metric>

    <!-- Expressions plugged into the {string} placeholder of the query below. -->
    <substitutions>
        <substitution>
            <name>string</name>
            <values>
                <!-- Empty, short ASCII and numeric-derived strings. -->
                <value>materialize('')</value>
                <value>materialize('Hello, world')</value>
                <value>toString(number)</value>
                <value>reinterpretAsString(number)</value>
                <!-- Multi-byte text and longer ASCII strings (more than 16 bytes). -->
                <value>materialize('中文测试字符串')</value>
                <value>materialize('https://github.com/yandex/ClickHouse/pull/1882')</value>
                <value>materialize('https://zh.wikipedia.org/wiki/%E4%B8%AD%E6%97%A5%E9%9F%93%E7%B5%B1%E4%B8%80%E8%A1%A8%E6%84%8F%E6%96%87%E5%AD%97%E6%93%B4%E5%B1%95%E5%8D%80F')</value>
                <!-- Mixed-script strings, including a tenfold repetition. -->
                <value>concat('中文测试字符串 ', toString(number), ' Привет, мир!')</value>
                <value>concat(concat('中文测试字符串 ', toString(number), ' Привет, мир!') AS x, x, x, x, x, x, x, x, x, x)</value>
                <!-- Random bytes converted from UTF-16 to UTF-8. -->
                <value>convertCharset(concat(reinterpretAsString(rand64(1)), reinterpretAsString(rand64(2)), reinterpretAsString(rand64(3)), reinterpretAsString(rand64(4)), reinterpretAsString(rand64(5)), reinterpretAsString(rand64(6)), reinterpretAsString(rand64(7)), reinterpretAsString(rand64(8)), reinterpretAsString(rand64(9)), reinterpretAsString(rand64(10))), 'UTF-16', 'UTF-8')</value>
            </values>
        </substitution>
    </substitutions>

    <query>SELECT count() FROM system.numbers WHERE NOT ignore(lengthUTF8({string}))</query>
</test>