Fix failed unit tests

This commit is contained in:
taiyang-li 2024-08-26 16:28:56 +08:00
parent 9bcafbc90f
commit 6c0102e971
3 changed files with 28 additions and 39 deletions

View File

@ -31,7 +31,7 @@ struct LowerUpperUTF8Impl
ColumnString::Offsets & res_offsets,
size_t input_rows_count)
{
if (data.empty())
if (input_rows_count == 0)
return;
bool all_ascii = isAllASCII(data.data(), data.size());
@ -41,67 +41,56 @@ struct LowerUpperUTF8Impl
return;
}
res_data.resize(data.size());
res_offsets.resize_exact(input_rows_count);
UErrorCode error_code = U_ZERO_ERROR;
UCaseMap * csm = ucasemap_open(nullptr, 0, &error_code);
UCaseMap * case_map = ucasemap_open("", U_FOLD_CASE_DEFAULT, &error_code);
if (U_FAILURE(error_code))
throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Call ucasemap_open error:{}", u_errorName(error_code));
throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Error calling ucasemap_open: {}", u_errorName(error_code));
// String output;
size_t curr_offset = 0;
res_data.resize(data.size());
res_offsets.resize_exact(offsets.size());
for (size_t i = 0; i < input_rows_count; ++i)
for (size_t row_i = 0; row_i < input_rows_count; ++row_i)
{
const auto * data_start = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
size_t size = offsets[i] - offsets[i - 1] - 1;
const auto * src = reinterpret_cast<const char *>(&data[offsets[row_i - 1]]);
size_t src_size = offsets[row_i] - offsets[row_i - 1] - 1;
int32_t out_size;
int32_t dst_size;
if constexpr (upper)
out_size = ucasemap_utf8ToUpper(
csm, reinterpret_cast<char *>(&res_data[curr_offset]), res_data.size() - curr_offset, data_start, size, &error_code);
dst_size = ucasemap_utf8ToUpper(
case_map, reinterpret_cast<char *>(&res_data[curr_offset]), res_data.size() - curr_offset, src, src_size, &error_code);
else
out_size = ucasemap_utf8ToLower(
csm, reinterpret_cast<char *>(&res_data[curr_offset]), res_data.size() - curr_offset, data_start, size, &error_code);
dst_size = ucasemap_utf8ToLower(
case_map, reinterpret_cast<char *>(&res_data[curr_offset]), res_data.size() - curr_offset, src, src_size, &error_code);
// std::cout << size << ":" << out_size << ":" << static_cast<size_t>(res_data[curr_offset + out_size - 1]) << ":" << error_code
// << std::endl;
if (error_code == U_BUFFER_OVERFLOW_ERROR)
{
size_t new_size = curr_offset + out_size + 1;
size_t new_size = curr_offset + dst_size + 1;
res_data.resize(new_size);
error_code = U_ZERO_ERROR;
if constexpr (upper)
out_size = ucasemap_utf8ToUpper(
csm,
reinterpret_cast<char *>(&res_data[curr_offset]),
res_data.size() - curr_offset,
data_start,
size,
&error_code);
dst_size = ucasemap_utf8ToUpper(
case_map, reinterpret_cast<char *>(&res_data[curr_offset]), res_data.size() - curr_offset, src, src_size, &error_code);
else
out_size = ucasemap_utf8ToLower(
csm,
reinterpret_cast<char *>(&res_data[curr_offset]),
res_data.size() - curr_offset,
data_start,
size,
&error_code);
dst_size = ucasemap_utf8ToLower(
case_map, reinterpret_cast<char *>(&res_data[curr_offset]), res_data.size() - curr_offset, src, src_size, &error_code);
}
if (error_code != U_ZERO_ERROR)
throw DB::Exception(
ErrorCodes::LOGICAL_ERROR,
"Call {} error:{} input:{} input_size:{}",
"Error calling {}: {} input: {} input_size: {}",
upper ? "ucasemap_utf8ToUpper" : "ucasemap_utf8ToLower",
u_errorName(error_code),
std::string_view(data_start, size),
size);
std::string_view(src, src_size),
src_size);
res_data[curr_offset + out_size] = 0;
curr_offset += out_size + 1;
res_offsets[i] = curr_offset;
res_data[curr_offset + dst_size] = 0;
curr_offset += dst_size + 1;
res_offsets[row_i] = curr_offset;
}
res_data.resize(curr_offset);

View File

@ -1,2 +1,2 @@
EFBFBD
EFBFBD
FF
FF

View File

@ -5,9 +5,9 @@ insert into utf8_overlap values ('\xe2'), ('Foo⚊BarBazBam'), ('\xe2'), ('Foo
-- MONOGRAM FOR YANG
with lowerUTF8(str) as l_, upperUTF8(str) as u_, '0x' || hex(str) as h_
select length(str), if(l_ == '\xe2', h_, l_), if(u_ == '\xe2', h_, u_) from utf8_overlap format CSV;
1,"<EFBFBD>","<22>"
1,"0xE2","0xE2"
15,"foo⚊barbazbam","FOO⚊BARBAZBAM"
1,"<EFBFBD>","<22>"
1,"0xE2","0xE2"
15,"foo⚊barbazbam","FOO⚊BARBAZBAM"
-- NOTE: regression test for introduced bug
-- https://github.com/ClickHouse/ClickHouse/issues/42756