From e8a9c7d518ca2ecec6ce8693a337f49664d96eff Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 22 Apr 2020 02:46:42 +0300 Subject: [PATCH] Allow to parse BOM in TSV if the first column cannot contain BOM in its value --- .../Formats/Impl/TabSeparatedRowInputFormat.cpp | 2 +- tests/queries/0_stateless/01258_bom_tsv.reference | 2 ++ tests/queries/0_stateless/01258_bom_tsv.sh | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01258_bom_tsv.reference create mode 100755 tests/queries/0_stateless/01258_bom_tsv.sh diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 706941a3dc6..0dd7245b965 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -126,7 +126,7 @@ void TabSeparatedRowInputFormat::fillUnreadColumnsWithDefaults(MutableColumns & void TabSeparatedRowInputFormat::readPrefix() { - if (with_names || with_types) + if (with_names || with_types || data_types.at(0)->textCanContainOnlyValidUTF8()) { /// In this format, we assume that column name or type cannot contain BOM, /// so, if format has header, diff --git a/tests/queries/0_stateless/01258_bom_tsv.reference b/tests/queries/0_stateless/01258_bom_tsv.reference new file mode 100644 index 00000000000..6622044ee6e --- /dev/null +++ b/tests/queries/0_stateless/01258_bom_tsv.reference @@ -0,0 +1,2 @@ +1 2 3 +4 5 6 diff --git a/tests/queries/0_stateless/01258_bom_tsv.sh b/tests/queries/0_stateless/01258_bom_tsv.sh new file mode 100755 index 00000000000..5668b00f069 --- /dev/null +++ b/tests/queries/0_stateless/01258_bom_tsv.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. 
$CURDIR/../shell_config.sh + +# BOM can be parsed if TSV format has a first column that cannot contain arbitrary binary data (such as an integer) +# In contrast, BOM cannot be parsed if the first column is a String, as it can contain arbitrary binary data. + +echo 'DROP TABLE IF EXISTS bom' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @- +echo 'CREATE TABLE bom (a UInt8, b UInt8, c UInt8) ENGINE = Memory' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @- +echo -ne '1\t2\t3\n' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+bom+FORMAT+TSV" --data-binary @- +echo -ne '\xEF\xBB\xBF4\t5\t6\n' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+bom+FORMAT+TSV" --data-binary @- +echo 'SELECT * FROM bom ORDER BY a' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @- +echo 'DROP TABLE bom' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @-