mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Merge pull request #3257 from amosbird/master
Correct wcwidth computation for pretty outputs.
This commit is contained in:
commit
147a2a13c2
@ -162,6 +162,7 @@ endif()
|
||||
target_link_libraries (clickhouse_common_io
|
||||
common
|
||||
string_utils
|
||||
widechar_width
|
||||
${LINK_LIBRARIES_ONLY_ON_X86_64}
|
||||
${LZ4_LIBRARY}
|
||||
${ZSTD_LIBRARY}
|
||||
|
134
dbms/src/Common/UTF8Helpers.cpp
Normal file
134
dbms/src/Common/UTF8Helpers.cpp
Normal file
@ -0,0 +1,134 @@
|
||||
#include <Common/UTF8Helpers.h>
|
||||
|
||||
#include <widechar_width.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace UTF8
|
||||
{
|
||||
|
||||
// based on https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
||||
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions: The above copyright
|
||||
// notice and this permission notice shall be included in all copies or
|
||||
// substantial portions of the Software.
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
// Lookup table for the UTF-8 decoding automaton used by UTF8Decoder.
// Entries 0..255 map an input byte to a character class (read as TABLE[byte]).
// Entries from 256 onward hold the state transitions, 16 entries per state
// (read as TABLE[256 + state * 16 + class]).
static const UInt8 TABLE[] =
|
||||
{
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
|
||||
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
|
||||
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
|
||||
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
|
||||
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
|
||||
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
|
||||
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
|
||||
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
|
||||
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
|
||||
};
|
||||
|
||||
struct UTF8Decoder
|
||||
{
|
||||
enum
|
||||
{
|
||||
ACCEPT = 0,
|
||||
REJECT = 1
|
||||
};
|
||||
|
||||
UInt32 decode(UInt8 byte)
|
||||
{
|
||||
UInt32 type = TABLE[byte];
|
||||
codepoint = (state != ACCEPT) ? (byte & 0x3fu) | (codepoint << 6) : (0xff >> type) & (byte);
|
||||
state = TABLE[256 + state * 16 + type];
|
||||
return state;
|
||||
}
|
||||
|
||||
void reset()
|
||||
{
|
||||
state = ACCEPT;
|
||||
codepoint = 0xfffdU;
|
||||
}
|
||||
|
||||
UInt8 state {ACCEPT};
|
||||
UInt32 codepoint {0};
|
||||
};
|
||||
|
||||
static int wcwidth(wchar_t wc)
|
||||
{
|
||||
int width = widechar_wcwidth(wc);
|
||||
switch (width)
|
||||
{
|
||||
case widechar_nonprint:
|
||||
[[fallthrough]];
|
||||
case widechar_combining:
|
||||
[[fallthrough]];
|
||||
case widechar_unassigned:
|
||||
return 0;
|
||||
case widechar_ambiguous:
|
||||
[[fallthrough]];
|
||||
case widechar_private_use:
|
||||
[[fallthrough]];
|
||||
case widechar_widened_in_9:
|
||||
return 1;
|
||||
default:
|
||||
return width;
|
||||
}
|
||||
}
|
||||
|
||||
size_t computeWidth(const UInt8 * data, size_t size, size_t prefix) noexcept
|
||||
{
|
||||
UTF8Decoder decoder;
|
||||
size_t width = 0;
|
||||
size_t rollback = 0;
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
switch (decoder.decode(data[i]))
|
||||
{
|
||||
case UTF8Decoder::REJECT:
|
||||
decoder.reset();
|
||||
// invalid sequences seem to have zero width in modern terminals
|
||||
// tested in libvte-based, alacritty, urxvt and xterm
|
||||
i -= rollback;
|
||||
rollback = 0;
|
||||
break;
|
||||
case UTF8Decoder::ACCEPT:
|
||||
// there are special control characters that manipulate the terminal output.
|
||||
// (`0x08`, `0x09`, `0x0a`, `0x0b`, `0x0c`, `0x0d`, `0x1b`)
|
||||
// Since we don't touch the original column data, there is no easy way to escape them.
|
||||
// TODO: escape control characters
|
||||
// TODO: multiline support for '\n'
|
||||
|
||||
// special treatment for '\t'
|
||||
if (decoder.codepoint == '\t')
|
||||
width += 8 - (prefix + width) % 8;
|
||||
else
|
||||
width += wcwidth(decoder.codepoint);
|
||||
rollback = 0;
|
||||
break;
|
||||
// continue if we meet other values here
|
||||
default:
|
||||
++rollback;
|
||||
}
|
||||
}
|
||||
|
||||
// no need to handle trailing sequence as they have zero width
|
||||
return width;
|
||||
}
|
||||
}
|
||||
}
|
@ -72,6 +72,11 @@ inline size_t countCodePoints(const UInt8 * data, size_t size)
|
||||
return res;
|
||||
}
|
||||
|
||||
/// Returns the display width of a UTF-8 string (an analogue of wcswidth). An invalid sequence is treated as a zero-width character.
|
||||
/// `prefix` is the width of whatever precedes the string; it is needed to compute the width of `\t`,
|
||||
/// which extends the text before and including the `\t` to the next greater multiple of eight.
|
||||
size_t computeWidth(const UInt8 * data, size_t size, size_t prefix = 0) noexcept;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -47,6 +47,7 @@ void PrettyBlockOutputStream::calculateWidths(
|
||||
|
||||
/// Calculate widths of all values.
|
||||
String serialized_value;
|
||||
size_t prefix = 2; // Tab character adjustment
|
||||
for (size_t i = 0; i < columns; ++i)
|
||||
{
|
||||
const ColumnWithTypeAndName & elem = block.getByPosition(i);
|
||||
@ -61,16 +62,18 @@ void PrettyBlockOutputStream::calculateWidths(
|
||||
}
|
||||
|
||||
widths[i][j] = std::min(format_settings.pretty.max_column_pad_width,
|
||||
UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(serialized_value.data()), serialized_value.size()));
|
||||
UTF8::computeWidth(reinterpret_cast<const UInt8 *>(serialized_value.data()), serialized_value.size(), prefix));
|
||||
max_widths[i] = std::max(max_widths[i], widths[i][j]);
|
||||
}
|
||||
|
||||
/// And also calculate widths for names of columns.
|
||||
{
|
||||
// name string doesn't contain Tab, no need to pass `prefix`
|
||||
name_widths[i] = std::min(format_settings.pretty.max_column_pad_width,
|
||||
UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(elem.name.data()), elem.name.size()));
|
||||
UTF8::computeWidth(reinterpret_cast<const UInt8 *>(elem.name.data()), elem.name.size()));
|
||||
max_widths[i] = std::max(max_widths[i], name_widths[i]);
|
||||
}
|
||||
prefix += max_widths[i] + 3;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -28,7 +28,7 @@ VerticalRowOutputStream::VerticalRowOutputStream(
|
||||
/// Note that number of code points is just a rough approximation of visible string width.
|
||||
const String & name = sample.getByPosition(i).name;
|
||||
|
||||
name_widths[i] = UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(name.data()), name.size());
|
||||
name_widths[i] = UTF8::computeWidth(reinterpret_cast<const UInt8 *>(name.data()), name.size());
|
||||
|
||||
if (name_widths[i] > max_name_width)
|
||||
max_name_width = name_widths[i];
|
||||
@ -43,7 +43,10 @@ VerticalRowOutputStream::VerticalRowOutputStream(
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < columns; ++i)
|
||||
names_and_paddings[i].resize(max_name_width + strlen(": "), ' ');
|
||||
{
|
||||
size_t new_size = max_name_width - name_widths[i] + names_and_paddings[i].size();
|
||||
names_and_paddings[i].resize(new_size, ' ');
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -5,13 +5,13 @@
|
||||
│ Hello │ 0 │
|
||||
│ \ │ 0 │
|
||||
└───────┴───┘
|
||||
┌─[1mx[0m─────┬─[1my[0m─┐
|
||||
│ Hello │ 0 │
|
||||
│ \ │ 0 │
|
||||
│ \t │ 0 │
|
||||
└───────┴───┘
|
||||
┌─[1mx[0m─────┬─[1my[0m─┬─[1mtoInt8(x)[0m─┬─[1ms[0m─────┬─[1mcasted[0m─┐
|
||||
│ Hello │ 0 │ -100 │ Hello │ Hello │
|
||||
│ \ │ 0 │ 0 │ \ │ \ │
|
||||
│ \t │ 0 │ 111 │ \t │ \t │
|
||||
└───────┴───┴───────────┴───────┴────────┘
|
||||
┌─[1mx[0m────────┬─[1my[0m─┐
|
||||
│ Hello │ 0 │
|
||||
│ \ │ 0 │
|
||||
│ \t │ 0 │
|
||||
└──────────┴───┘
|
||||
┌─[1mx[0m────────┬─[1my[0m─┬─[1mtoInt8(x)[0m─┬─[1ms[0m─────┬─[1mcasted[0m─┐
|
||||
│ Hello │ 0 │ -100 │ Hello │ Hello │
|
||||
│ \ │ 0 │ 0 │ \ │ \ │
|
||||
│ \t │ 0 │ 111 │ \t │ \t │
|
||||
└──────────┴───┴───────────┴───────┴────────┘
|
||||
|
@ -0,0 +1,101 @@
|
||||
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Здравствуйте │ Этот код можно отредактировать и запустить! │
|
||||
└──────────────┴─────────────────────────────────────────────┘
|
||||
┏━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ 你好 │ 这段代码是可以编辑并且能够运行的! │
|
||||
└──────┴────────────────────────────────────┘
|
||||
┏━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Hola │ ¡Este código es editable y ejecutable! │
|
||||
└──────┴────────────────────────────────────────┘
|
||||
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Bonjour │ Ce code est modifiable et exécutable ! │
|
||||
└─────────┴────────────────────────────────────────┘
|
||||
┏━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Ciao │ Questo codice è modificabile ed eseguibile! │
|
||||
└──────┴─────────────────────────────────────────────┘
|
||||
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ こんにちは │ このコードは編集して実行出来ます! │
|
||||
└────────────┴────────────────────────────────────┘
|
||||
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ 안녕하세요 │ 여기에서 코드를 수정하고 실행할 수 있습니다! │
|
||||
└────────────┴──────────────────────────────────────────────┘
|
||||
┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Cześć │ Ten kod można edytować oraz uruchomić! │
|
||||
└───────┴────────────────────────────────────────┘
|
||||
┏━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Olá │ Este código é editável e executável! │
|
||||
└─────┴──────────────────────────────────────┘
|
||||
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Chào bạn │ Bạn có thể edit và run code trực tiếp! │
|
||||
└──────────┴────────────────────────────────────────┘
|
||||
┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Hallo │ Dieser Code kann bearbeitet und ausgeführt werden! │
|
||||
└───────┴────────────────────────────────────────────────────┘
|
||||
┏━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Hej │ Den här koden kan redigeras och köras! │
|
||||
└─────┴────────────────────────────────────────┘
|
||||
┏━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Ahoj │ Tento kód můžete upravit a spustit │
|
||||
└──────┴────────────────────────────────────┘
|
||||
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Tabs Tabs │ Non-first Tabs │
|
||||
└─────────────┴───────────────────────┘
|
||||
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Control characters with zero width │ Invalid UTF-8 which eats pending characters <20>, or invalid by itself <20> with zero width │
|
||||
└─────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ c2 ┃
|
||||
┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Russian ё and ё │ Zero bytes |