mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 16:50:48 +00:00
Merge pull request #1882 from zhang2014/support/simd_for_lengthUTF-8
ISSUES-1885 UTF8 countCodePoints use SIMD
This commit is contained in:
commit
d25c244946
@ -3,6 +3,9 @@
|
||||
#include <Core/Types.h>
|
||||
#include <Common/BitHelpers.h>
|
||||
|
||||
#if __SSE2__
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -49,9 +52,29 @@ inline size_t seqLength(const UInt8 first_octet)
|
||||
/** Count the code points in a UTF-8 encoded byte sequence.
  *
  * A code point is counted for every byte that is not a UTF-8 continuation byte
  * (continuation bytes are 0x80..0xBF), i.e. for every byte that is <= 0x7F or >= 0xC0.
  * The input is not validated: bytes are classified one by one.
  *
  * @param data  pointer to the first byte of the sequence
  * @param size  number of bytes to scan
  * @return      number of non-continuation bytes in [data, data + size)
  */
inline size_t countCodePoints(const UInt8 * data, size_t size)
{
    size_t res = 0;
    const auto end = data + size;

#if __SSE2__
    constexpr auto bytes_sse = sizeof(__m128i);

    /// End of the part of the input that consists of whole 16-byte blocks.
    const auto src_end_sse = end - (size % bytes_sse);

    /// Interpreted as signed 8-bit values, the continuation bytes 0x80..0xBF are exactly -128..-65,
    /// so "signed byte > (signed) 0xBF" is true precisely for bytes <= 0x7F or >= 0xC0.
    /// A saturating unsigned add of 0x40 must not be used for this check: it clamps bytes >= 0xC0
    /// to 0xFF, so lead bytes of multibyte sequences would not be counted, unlike in the scalar loop below.
    const auto threshold = _mm_set1_epi8(static_cast<char>(0xBF));

    for (; data < src_end_sse; data += bytes_sse)
    {
        const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data));
        const auto non_continuation = _mm_cmpgt_epi8(chars, threshold);

        /// movemask yields one bit per byte lane; popcount is the number of matching bytes in the block.
        res += __builtin_popcount(_mm_movemask_epi8(non_continuation));
    }
#endif

    /// Remaining bytes (the whole input when SSE2 is unavailable). Skip UTF-8 continuation bytes.
    for (; data < end; ++data)
        res += (*data <= 0x7F || *data >= 0xC0);

    return res;
}
|
||||
|
38
dbms/tests/performance/functions_length/functions_length.xml
Normal file
38
dbms/tests/performance/functions_length/functions_length.xml
Normal file
@ -0,0 +1,38 @@
|
||||
<test>
|
||||
<name>functions_length</name>
|
||||
<type>once</type>
|
||||
|
||||
<stop_conditions>
|
||||
<all_of>
|
||||
<total_time_ms>10000</total_time_ms>
|
||||
</all_of>
|
||||
<any_of>
|
||||
<average_speed_not_changing_for_ms>5000</average_speed_not_changing_for_ms>
|
||||
<total_time_ms>20000</total_time_ms>
|
||||
</any_of>
|
||||
</stop_conditions>
|
||||
|
||||
<main_metric>
|
||||
<avg_rows_per_second/>
|
||||
</main_metric>
|
||||
|
||||
<substitutions>
|
||||
<substitution>
|
||||
<name>string</name>
|
||||
<values>
|
||||
<value>materialize('')</value>
|
||||
<value>materialize('Hello, world')</value>
|
||||
<value>toString(number)</value>
|
||||
<value>reinterpretAsString(number)</value>
|
||||
<value>materialize('中文测试字符串')</value>
|
||||
<value>materialize('https://github.com/yandex/ClickHouse/pull/1882')</value>
|
||||
<value>materialize('https://zh.wikipedia.org/wiki/%E4%B8%AD%E6%97%A5%E9%9F%93%E7%B5%B1%E4%B8%80%E8%A1%A8%E6%84%8F%E6%96%87%E5%AD%97%E6%93%B4%E5%B1%95%E5%8D%80F')</value>
|
||||
<value>concat('中文测试字符串 ', toString(number), ' Привет, мир!')</value>
|
||||
<value>concat(concat('中文测试字符串 ', toString(number), ' Привет, мир!') AS x, x, x, x, x, x, x, x, x, x)</value>
|
||||
<value>convertCharset(concat(reinterpretAsString(rand64(1)), reinterpretAsString(rand64(2)), reinterpretAsString(rand64(3)), reinterpretAsString(rand64(4)), reinterpretAsString(rand64(5)), reinterpretAsString(rand64(6)), reinterpretAsString(rand64(7)), reinterpretAsString(rand64(8)), reinterpretAsString(rand64(9)), reinterpretAsString(rand64(10))), 'UTF-16', 'UTF-8')</value>
|
||||
</values>
|
||||
</substitution>
|
||||
</substitutions>
|
||||
|
||||
<query>SELECT count() FROM system.numbers WHERE NOT ignore(lengthUTF8({string}))</query>
|
||||
</test>
|
Loading…
Reference in New Issue
Block a user