Fixed error in lowerUTF8 and upperUTF8 functions [#CLICKHOUSE-2]

This commit is contained in:
Alexey Milovidov 2018-11-26 01:26:36 +03:00
parent 68e0a687c8
commit 4cb7f2896c
4 changed files with 31 additions and 15 deletions

View File

@ -1,5 +1,6 @@
#include <Columns/ColumnString.h>
#include <Poco/UTF8Encoding.h>
#include <Common/UTF8Helpers.h>
#if __SSE2__
#include <emmintrin.h>
@ -9,6 +10,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
namespace
{
/// xor or do nothing
@ -90,10 +96,9 @@ struct LowerUpperUTF8Impl
array(data.data(), data.data() + data.size(), res_data.data());
}
static void vector_fixed(const ColumnString::Chars & data, size_t /*n*/, ColumnString::Chars & res_data)
static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
{
res_data.resize(data.size());
array(data.data(), data.data() + data.size(), res_data.data());
throw Exception("Functions lowerUTF8 and upperUTF8 cannot work with FixedString argument", ErrorCodes::BAD_ARGUMENTS);
}
/** Converts a single code point starting at `src` to desired case, storing result starting at `dst`.
@ -129,16 +134,28 @@ struct LowerUpperUTF8Impl
{
static const Poco::UTF8Encoding utf8;
if (const auto chars = utf8.convert(to_case(utf8.convert(src)), dst, src_end - src))
int src_sequence_length = UTF8::seqLength(*src);
int src_code_point = utf8.queryConvert(src, src_end - src);
if (src_code_point > 0)
{
src += chars;
dst += chars;
}
else
{
++src;
++dst;
int dst_code_point = to_case(src_code_point);
if (dst_code_point > 0)
{
int dst_sequence_length = utf8.convert(dst_code_point, dst, src_end - src);
/// We don't support cases when lowercase and uppercase characters occupy a different number of bytes in UTF-8.
/// As an example, this happens for ß and ẞ.
if (dst_sequence_length == src_sequence_length)
{
src += dst_sequence_length;
dst += dst_sequence_length;
return;
}
}
}
*dst++ = *src++;
}
}
@ -149,7 +166,7 @@ private:
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
{
#if __SSE2__
const auto bytes_sse = sizeof(__m128i);
static constexpr auto bytes_sse = sizeof(__m128i);
auto src_end_sse = src + (src_end - src) / bytes_sse * bytes_sse;
/// SSE2 packed comparisons operate on signed types, hence compare (c < 0) instead of (c > 0x7f)

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
SELECT lowerUTF8('\xF0') = lowerUTF8('\xF0');

View File

@ -6,9 +6,6 @@ SELECT globalNotIn(['"wh'], [NULL]);
SELECT globalIn([''], [NULL])
SELECT ( SELECT toDecimal128([], rowNumberInBlock()) ) , lcm('', [[(CAST(('>A') AS String))]]);
SELECT truncate(895, -16);
SELECT (CAST((lowerUTF8('a7\xwK>-')) AS String)), [6935];
SELECT upperUTF8(sipHash128('\0')), [], ['xD2jG'];
SELECT upperUTF8(SHA256(''));
SELECT arrayEnumerateUniq(anyHeavy([]), []);
SELECT notIn([['']], [[NULL]]);
SELECT subtractDays((CAST((-5263074.47) AS DateTime)), -737895);