mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-29 02:52:13 +00:00
Improve accuracy of ICU-correcting code by 12.6%
This commit is contained in:
parent
b8027e5566
commit
d350f7bc1a
@ -6,7 +6,6 @@
|
|||||||
|
|
||||||
#include <Columns/ColumnString.h>
|
#include <Columns/ColumnString.h>
|
||||||
#include <Functions/LowerUpperImpl.h>
|
#include <Functions/LowerUpperImpl.h>
|
||||||
#include <base/find_symbols.h>
|
|
||||||
#include <unicode/unistr.h>
|
#include <unicode/unistr.h>
|
||||||
#include <Common/StringUtils.h>
|
#include <Common/StringUtils.h>
|
||||||
|
|
||||||
@ -43,7 +42,7 @@ struct LowerUpperUTF8Impl
|
|||||||
|
|
||||||
String output;
|
String output;
|
||||||
size_t curr_offset = 0;
|
size_t curr_offset = 0;
|
||||||
for (size_t i = 0; i < offsets.size(); ++i)
|
for (size_t i = 0; i < input_rows_count; ++i)
|
||||||
{
|
{
|
||||||
const auto * data_start = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
|
const auto * data_start = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
|
||||||
size_t size = offsets[i] - offsets[i - 1];
|
size_t size = offsets[i] - offsets[i - 1];
|
||||||
@ -57,13 +56,15 @@ struct LowerUpperUTF8Impl
|
|||||||
output.clear();
|
output.clear();
|
||||||
input.toUTF8String(output);
|
input.toUTF8String(output);
|
||||||
|
|
||||||
/// For valid UTF-8 input strings, ICU sometimes produces output with extra '\0's at the end. Only the data before the first
|
/// For valid UTF-8 input strings, ICU sometimes produces output with an extra '\0' at the end. Only the data before that
|
||||||
/// '\0' is valid. If the input is not valid UTF-8, then the behavior of lower/upperUTF8 is undefined by definition. In this
|
/// '\0' is valid. If the input is not valid UTF-8, then the behavior of lower/upperUTF8 is undefined by definition. In this
|
||||||
/// case, the behavior is also reasonable.
|
/// case, the behavior is also reasonable.
|
||||||
const char * res_end = find_last_not_symbols_or_null<'\0'>(output.data(), output.data() + output.size());
|
size_t valid_size = output.size();
|
||||||
size_t valid_size = res_end ? res_end - output.data() + 1 : 0;
|
if (!output.empty() && output.back() == '\0')
|
||||||
|
--valid_size;
|
||||||
|
|
||||||
res_data.resize(curr_offset + valid_size + 1);
|
res_data.resize(curr_offset + valid_size + 1);
|
||||||
|
|
||||||
memcpy(&res_data[curr_offset], output.data(), valid_size);
|
memcpy(&res_data[curr_offset], output.data(), valid_size);
|
||||||
res_data[curr_offset + valid_size] = 0;
|
res_data[curr_offset + valid_size] = 0;
|
||||||
|
|
||||||
|
@ -26,3 +26,4 @@
|
|||||||
1
|
1
|
||||||
1
|
1
|
||||||
1
|
1
|
||||||
|
2
|
||||||
|
@ -38,3 +38,6 @@ select lowerUTF8('ır') = 'ır';
|
|||||||
-- German language
|
-- German language
|
||||||
select upper('öäüß') = 'öäüß';
|
select upper('öäüß') = 'öäüß';
|
||||||
select lower('ÖÄÜẞ') = 'ÖÄÜẞ';
|
select lower('ÖÄÜẞ') = 'ÖÄÜẞ';
|
||||||
|
|
||||||
|
-- Bug 68680
|
||||||
|
SELECT lengthUTF8(lowerUTF8('Ä\0'));
|
||||||
|
Loading…
Reference in New Issue
Block a user