mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Fixed error in lowerUTF8 and upperUTF8 functions [#CLICKHOUSE-2]
This commit is contained in:
parent
68e0a687c8
commit
4cb7f2896c
@ -1,5 +1,6 @@
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Poco/UTF8Encoding.h>
|
||||
#include <Common/UTF8Helpers.h>
|
||||
|
||||
#if __SSE2__
|
||||
#include <emmintrin.h>
|
||||
@ -9,6 +10,11 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
/// xor or do nothing
|
||||
@ -90,10 +96,9 @@ struct LowerUpperUTF8Impl
|
||||
array(data.data(), data.data() + data.size(), res_data.data());
|
||||
}
|
||||
|
||||
static void vector_fixed(const ColumnString::Chars & data, size_t /*n*/, ColumnString::Chars & res_data)
|
||||
static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
||||
{
|
||||
res_data.resize(data.size());
|
||||
array(data.data(), data.data() + data.size(), res_data.data());
|
||||
throw Exception("Functions lowerUTF8 and upperUTF8 cannot work with FixedString argument", ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
|
||||
/** Converts a single code point starting at `src` to the desired case, storing the result starting at `dst`.
|
||||
@ -129,16 +134,28 @@ struct LowerUpperUTF8Impl
|
||||
{
|
||||
static const Poco::UTF8Encoding utf8;
|
||||
|
||||
if (const auto chars = utf8.convert(to_case(utf8.convert(src)), dst, src_end - src))
|
||||
int src_sequence_length = UTF8::seqLength(*src);
|
||||
|
||||
int src_code_point = utf8.queryConvert(src, src_end - src);
|
||||
if (src_code_point > 0)
|
||||
{
|
||||
src += chars;
|
||||
dst += chars;
|
||||
}
|
||||
else
|
||||
{
|
||||
++src;
|
||||
++dst;
|
||||
int dst_code_point = to_case(src_code_point);
|
||||
if (dst_code_point > 0)
|
||||
{
|
||||
int dst_sequence_length = utf8.convert(dst_code_point, dst, src_end - src);
|
||||
|
||||
/// We don't support cases where lowercase and uppercase characters occupy a different number of bytes in UTF-8.
|
||||
/// As an example, this happens for ß and ẞ.
|
||||
if (dst_sequence_length == src_sequence_length)
|
||||
{
|
||||
src += dst_sequence_length;
|
||||
dst += dst_sequence_length;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*dst++ = *src++;
|
||||
}
|
||||
}
|
||||
|
||||
@ -149,7 +166,7 @@ private:
|
||||
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
|
||||
{
|
||||
#if __SSE2__
|
||||
const auto bytes_sse = sizeof(__m128i);
|
||||
static constexpr auto bytes_sse = sizeof(__m128i);
|
||||
auto src_end_sse = src + (src_end - src) / bytes_sse * bytes_sse;
|
||||
|
||||
/// SSE2 packed comparisons operate on signed types, hence compare (c < 0) instead of (c > 0x7f)
|
||||
|
@ -0,0 +1 @@
|
||||
1
|
1
dbms/tests/queries/0_stateless/00761_lower_utf8_bug.sql
Normal file
1
dbms/tests/queries/0_stateless/00761_lower_utf8_bug.sql
Normal file
@ -0,0 +1 @@
|
||||
SELECT lowerUTF8('\xF0') = lowerUTF8('\xF0');
|
@ -6,9 +6,6 @@ SELECT globalNotIn(['"wh'], [NULL]);
|
||||
SELECT globalIn([''], [NULL])
|
||||
SELECT ( SELECT toDecimal128([], rowNumberInBlock()) ) , lcm('', [[(CAST(('>A') AS String))]]);
|
||||
SELECT truncate(895, -16);
|
||||
SELECT (CAST((lowerUTF8('a7\xwK>-')) AS String)), [6935];
|
||||
SELECT upperUTF8(sipHash128('\0')), [], ['xD2jG'];
|
||||
SELECT upperUTF8(SHA256(''));
|
||||
SELECT arrayEnumerateUniq(anyHeavy([]), []);
|
||||
SELECT notIn([['']], [[NULL]]);
|
||||
SELECT subtractDays((CAST((-5263074.47) AS DateTime)), -737895);
|
||||
|
Loading…
Reference in New Issue
Block a user