Merge pull request #64756 from Blargian/utf8_compute_fix

`UTF8::computeWidth` should skip ANSI escape sequences (fix)
This commit is contained in:
Alexey Milovidov 2024-06-06 21:13:08 +02:00 committed by GitHub
commit e421c741bf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 40 additions and 16 deletions

View File

@ -140,6 +140,18 @@ inline bool isPrintableASCII(char c)
return uc >= 32 && uc <= 126; /// 127 is ASCII DEL.
}
/// Checks whether `c` lies in the byte range 0x30..0x3F inclusive,
/// i.e. the ASCII characters '0'-'9' followed by ':' ';' '<' '=' '>' '?'.
/// These are the "parameter bytes" of an ANSI CSI escape sequence.
inline bool isCSIParameterByte(char c)
{
    /// Compare as an unsigned value so that bytes >= 0x80 never look negative.
    const auto code = static_cast<unsigned char>(c);
    return 0x30 <= code && code <= 0x3F;
}
/// Checks whether `c` lies in the byte range 0x20..0x2F inclusive,
/// i.e. the ASCII space character followed by !"#$%&'()*+,-./
/// These are the "intermediate bytes" of an ANSI CSI escape sequence.
inline bool isCSIIntermediateByte(char c)
{
    /// Compare as an unsigned value so that bytes >= 0x80 never look negative.
    const auto code = static_cast<unsigned char>(c);
    return 0x20 <= code && code <= 0x2F;
}
inline bool isCSIFinalByte(char c)
{
uint8_t uc = c;

View File

@ -103,7 +103,7 @@ template <ComputeWidthMode mode>
size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t limit) noexcept
{
UTF8Decoder decoder;
int isEscapeSequence = false;
bool is_escape_sequence = false;
size_t width = 0;
size_t rollback = 0;
for (size_t i = 0; i < size; ++i)
@ -116,6 +116,9 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l
while (i + 15 < size)
{
if (is_escape_sequence)
break;
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i]));
const uint16_t non_regular_width_mask = _mm_movemask_epi8(
@ -131,26 +134,29 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l
break;
}
else
{
if (isEscapeSequence)
{
break;
}
else
{
i += 16;
width += 16;
}
}
}
#endif
while (i < size && isPrintableASCII(data[i]))
{
if (!isEscapeSequence)
bool ignore_width = is_escape_sequence && (isCSIParameterByte(data[i]) || isCSIIntermediateByte(data[i]));
if (ignore_width || (data[i] == '[' && is_escape_sequence))
{
/// don't count the width
}
else if (is_escape_sequence && isCSIFinalByte(data[i]))
{
is_escape_sequence = false;
}
else
{
++width;
else if (isCSIFinalByte(data[i]) && data[i - 1] != '\x1b')
isEscapeSequence = false; /// end of CSI escape sequence reached
}
++i;
}
@ -178,7 +184,7 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l
// special treatment for '\t' and for ESC
size_t next_width = width;
if (decoder.codepoint == '\x1b')
isEscapeSequence = true;
is_escape_sequence = true;
else if (decoder.codepoint == '\t')
next_width += 8 - (prefix + width) % 8;
else

View File

@ -1,5 +1,10 @@
┏━━━┓
┃ x ┃
┡━━━┩
1. │ █ │
1. │ █ │
└───┘
┏━━━━━━━━━┳━━━━━━━━━━┓
┃ 'Hello' ┃ x ┃
┡━━━━━━━━━╇━━━━━━━━━━┩
1. │ Hello │ █ test █ │
└─────────┴──────────┘

View File

@ -1 +1,2 @@
SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 0) AS x FORMAT Pretty;
SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x FORMAT Pretty;
SELECT 'Hello', format('\x1b[38;2;{0};{1};{2}m█\x1b[0m test \x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x FORMAT Pretty;