Allow parsing BOM in TSV if the first column cannot contain BOM in its value

This commit is contained in:
Alexey Milovidov 2020-04-22 02:46:42 +03:00
parent 8df64543b7
commit e8a9c7d518
3 changed files with 17 additions and 1 deletions

View File

@ -126,7 +126,7 @@ void TabSeparatedRowInputFormat::fillUnreadColumnsWithDefaults(MutableColumns &
void TabSeparatedRowInputFormat::readPrefix()
{
if (with_names || with_types)
if (with_names || with_types || data_types.at(0)->textCanContainOnlyValidUTF8())
{
/// In this format, we assume that column name or type cannot contain BOM,
/// so, if format has header,

View File

@ -0,0 +1,2 @@
1 2 3
4 5 6

View File

@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Test: a UTF-8 byte order mark (bytes EF BB BF) at the very beginning of TSV input.
#
# BOM can be parsed if TSV format has first column that cannot contain arbitrary binary data (such as integer).
# In contrast, BOM cannot be parsed if the first column is String, as it can contain arbitrary binary data.
#
# The script inserts one row without a BOM and one row prefixed with a BOM into a table
# whose first column is UInt8, then prints the table contents ordered by the first column.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# The path is quoted so that a checkout directory containing spaces or glob characters
# does not break sourcing of the shared configuration.
# NOTE(review): CLICKHOUSE_CURL and CLICKHOUSE_URL are presumably defined by shell_config.sh - confirm.
. "$CURDIR/../shell_config.sh"

# NOTE(review): ${CLICKHOUSE_CURL} is deliberately left unquoted below, as it may carry
# extra command-line arguments that rely on word splitting - verify against shell_config.sh.

# Start from a clean state and create a table with an integer first column.
echo 'DROP TABLE IF EXISTS bom' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @-
echo 'CREATE TABLE bom (a UInt8, b UInt8, c UInt8) ENGINE = Memory' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @-

# First row: plain TSV without a BOM.
echo -ne '1\t2\t3\n' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+bom+FORMAT+TSV" --data-binary @-

# Second row: the same format, but the data starts with the UTF-8 BOM (\xEF\xBB\xBF).
echo -ne '\xEF\xBB\xBF4\t5\t6\n' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+bom+FORMAT+TSV" --data-binary @-

# Print the inserted rows, then clean up.
echo 'SELECT * FROM bom ORDER BY a' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @-
echo 'DROP TABLE bom' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @-