Merge pull request #64756 from Blargian/utf8_compute_fix

`UTF8::computeWidth` should skip ANSI escape sequences (fix)
This commit is contained in:
Alexey Milovidov 2024-06-06 21:13:08 +02:00 committed by GitHub
commit e421c741bf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 40 additions and 16 deletions

View File

@ -140,6 +140,18 @@ inline bool isPrintableASCII(char c)
return uc >= 32 && uc <= 126; /// 127 is ASCII DEL. return uc >= 32 && uc <= 126; /// 127 is ASCII DEL.
} }
/// Tells whether `c` falls in the CSI "parameter byte" range 0x30..0x3F,
/// i.e. one of the ASCII characters 0-9 : ; < = > ?
inline bool isCSIParameterByte(char c)
{
    /// Go through an unsigned byte so that chars with the high bit set never compare as negative.
    const auto byte = static_cast<uint8_t>(c);
    /// Every value in 0x30..0x3F (and no other) has 0x3 as its upper nibble.
    return (byte & 0xF0) == 0x30;
}
/// Tells whether `c` falls in the CSI "intermediate byte" range 0x20..0x2F,
/// i.e. the space character or one of the ASCII characters !"#$%&'()*+,-./
inline bool isCSIIntermediateByte(char c)
{
    /// Go through an unsigned byte so that chars with the high bit set never compare as negative.
    const auto byte = static_cast<uint8_t>(c);
    /// Every value in 0x20..0x2F (and no other) has 0x2 as its upper nibble.
    return (byte & 0xF0) == 0x20;
}
inline bool isCSIFinalByte(char c) inline bool isCSIFinalByte(char c)
{ {
uint8_t uc = c; uint8_t uc = c;

View File

@ -103,7 +103,7 @@ template <ComputeWidthMode mode>
size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t limit) noexcept size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t limit) noexcept
{ {
UTF8Decoder decoder; UTF8Decoder decoder;
int isEscapeSequence = false; bool is_escape_sequence = false;
size_t width = 0; size_t width = 0;
size_t rollback = 0; size_t rollback = 0;
for (size_t i = 0; i < size; ++i) for (size_t i = 0; i < size; ++i)
@ -116,6 +116,9 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l
while (i + 15 < size) while (i + 15 < size)
{ {
if (is_escape_sequence)
break;
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i])); __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i]));
const uint16_t non_regular_width_mask = _mm_movemask_epi8( const uint16_t non_regular_width_mask = _mm_movemask_epi8(
@ -131,26 +134,29 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l
break; break;
} }
else else
{
if (isEscapeSequence)
{
break;
}
else
{ {
i += 16; i += 16;
width += 16; width += 16;
} }
} }
}
#endif #endif
while (i < size && isPrintableASCII(data[i])) while (i < size && isPrintableASCII(data[i]))
{ {
if (!isEscapeSequence) bool ignore_width = is_escape_sequence && (isCSIParameterByte(data[i]) || isCSIIntermediateByte(data[i]));
if (ignore_width || (data[i] == '[' && is_escape_sequence))
{
/// don't count the width
}
else if (is_escape_sequence && isCSIFinalByte(data[i]))
{
is_escape_sequence = false;
}
else
{
++width; ++width;
else if (isCSIFinalByte(data[i]) && data[i - 1] != '\x1b') }
isEscapeSequence = false; /// end of CSI escape sequence reached
++i; ++i;
} }
@ -178,7 +184,7 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l
// special treatment for '\t' and for ESC // special treatment for '\t' and for ESC
size_t next_width = width; size_t next_width = width;
if (decoder.codepoint == '\x1b') if (decoder.codepoint == '\x1b')
isEscapeSequence = true; is_escape_sequence = true;
else if (decoder.codepoint == '\t') else if (decoder.codepoint == '\t')
next_width += 8 - (prefix + width) % 8; next_width += 8 - (prefix + width) % 8;
else else

View File

@ -1,5 +1,10 @@
┏━━━┓ ┏━━━┓
┃ x ┃ ┃ x ┃
┡━━━┩ ┡━━━┩
1. │ █ │ 1. │ █ │
└───┘ └───┘
┏━━━━━━━━━┳━━━━━━━━━━┓
┃ 'Hello' ┃ x ┃
┡━━━━━━━━━╇━━━━━━━━━━┩
1. │ Hello │ █ test █ │
└─────────┴──────────┘

View File

@ -1 +1,2 @@
SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 0) AS x FORMAT Pretty; SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x FORMAT Pretty;
SELECT 'Hello', format('\x1b[38;2;{0};{1};{2}m█\x1b[0m test \x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x FORMAT Pretty;