Accurate tail handling

This commit is contained in:
Danila Kutenin 2019-04-07 23:24:08 +03:00
parent 304c3fcc9b
commit 3519598ae9
3 changed files with 29 additions and 7 deletions

View File

@ -252,13 +252,13 @@ struct ValidUTF8Impl
while (len >= 16)
checkPacked(_mm_loadu_si128(reinterpret_cast<const __m128i *>(data)));
if (len)
{
alignas(16) char buf[32];
_mm_store_si128(reinterpret_cast<__m128i *>(buf), _mm_loadu_si128(reinterpret_cast<const __m128i *>(data)));
memset(reinterpret_cast<char *>(&buf) + len, 0, 16);
checkPacked(_mm_load_si128(reinterpret_cast<__m128i *>(buf)));
}
/// 0 <= len <= 15 for now. Reading data from data - 1 because of right padding of 15 and left padding
/// Then zero some bytes from the unknown memory and check again.
alignas(16) char buf[32];
_mm_store_si128(reinterpret_cast<__m128i *>(buf), _mm_loadu_si128(reinterpret_cast<const __m128i *>(data - 1)));
memset(buf + len + 1, 0, 16);
checkPacked(_mm_loadu_si128(reinterpret_cast<__m128i *>(buf + 1)));
/* Reduce error vector, error_reduced = 0xFFFF if error == 0 */
return _mm_movemask_epi8(_mm_cmpeq_epi8(error, _mm_set1_epi8(0))) == 0xFFFF;
}

View File

@ -1178,3 +1178,23 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

View File

@ -33,7 +33,9 @@ select 0 = isValidUTF8('\xc0\x9f') from system.numbers limit 10;
select 0 = isValidUTF8('\xf5\xff\xff\xff') from system.numbers limit 10;
select 0 = isValidUTF8('\xed\xa0\x81') from system.numbers limit 10;
select 0 = isValidUTF8('\xf8\x90\x80\x80\x80') from system.numbers limit 10;
select 0 = isValidUTF8('12345678901234\xed') from system.numbers limit 10;
select 0 = isValidUTF8('123456789012345\xed') from system.numbers limit 10;
select 0 = isValidUTF8('123456789012345\xed123456789012345\xed') from system.numbers limit 10;
select 0 = isValidUTF8('123456789012345\xf1') from system.numbers limit 10;
select 0 = isValidUTF8('123456789012345\xc2') from system.numbers limit 10;
select 0 = isValidUTF8('\xC2\x7F') from system.numbers limit 10;