Stop removing UTF-8 BOM in RowBinary* formats

Fixes #37420
This commit is contained in:
Paul Loyd 2022-05-22 20:17:16 +08:00
parent 77c06447d5
commit 32d267ec6c
No known key found for this signature in database
GPG Key ID: 3D22CFBFD585AC8F
9 changed files with 42 additions and 6 deletions

View File

@ -18,6 +18,7 @@ BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Param
header,
in_,
params_,
true,
with_names_,
with_types_,
format_settings_,

View File

@ -41,7 +41,15 @@ CSVRowInputFormat::CSVRowInputFormat(
bool with_types_,
const FormatSettings & format_settings_,
std::unique_ptr<FormatWithNamesAndTypesReader> format_reader_)
: RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_, std::move(format_reader_))
: RowInputFormatWithNamesAndTypes(
header_,
in_,
params_,
false,
with_names_,
with_types_,
format_settings_,
std::move(format_reader_))
{
const String bad_delimiters = " \t\"'.UL";
if (bad_delimiters.find(format_settings.csv.delimiter) != String::npos)

View File

@ -47,6 +47,7 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat(
header_,
*buf_,
params_,
false,
with_names_,
with_types_,
format_settings_,

View File

@ -28,6 +28,7 @@ JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat(
header_,
in_,
params_,
false,
with_names_,
with_types_,
format_settings_,

View File

@ -40,7 +40,15 @@ TabSeparatedRowInputFormat::TabSeparatedRowInputFormat(
bool with_types_,
bool is_raw_,
const FormatSettings & format_settings_)
: RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_, std::make_unique<TabSeparatedFormatReader>(in_, format_settings_, is_raw_))
: RowInputFormatWithNamesAndTypes(
header_,
in_,
params_,
false,
with_names_,
with_types_,
format_settings_,
std::make_unique<TabSeparatedFormatReader>(in_, format_settings_, is_raw_))
{
}

View File

@ -17,6 +17,7 @@ RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes(
const Block & header_,
ReadBuffer & in_,
const Params & params_,
bool is_binary_,
bool with_names_,
bool with_types_,
const FormatSettings & format_settings_,
@ -24,6 +25,7 @@ RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes(
: RowInputFormatWithDiagnosticInfo(header_, in_, params_)
, format_settings(format_settings_)
, data_types(header_.getDataTypes())
, is_binary(is_binary_)
, with_names(with_names_)
, with_types(with_types_)
, format_reader(std::move(format_reader_))
@ -38,10 +40,11 @@ void RowInputFormatWithNamesAndTypes::readPrefix()
if (getCurrentUnitNumber() != 0)
return;
if (with_names || with_types || data_types.at(0)->textCanContainOnlyValidUTF8())
/// Search for and remove a BOM only in textual formats (CSV, TSV, etc.), not in binary ones (RowBinary*).
/// Also, we assume that a column name or type cannot contain a BOM, so, if the format has a header,
/// then a BOM at the beginning of the stream cannot be confused with the name or type of a field, and it is safe to skip it.
if (!is_binary && (with_names || with_types || data_types.at(0)->textCanContainOnlyValidUTF8()))
{
/// We assume that column name or type cannot contain BOM, so, if format has header,
/// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it.
skipBOMIfExists(*in);
}

View File

@ -24,13 +24,15 @@ class FormatWithNamesAndTypesReader;
class RowInputFormatWithNamesAndTypes : public RowInputFormatWithDiagnosticInfo
{
protected:
/** with_names - in the first line the header with column names
/** is_binary - it is a binary format (e.g. don't search for BOM)
* with_names - in the first line the header with column names
* with_types - in the second line the header with column types
*/
RowInputFormatWithNamesAndTypes(
const Block & header_,
ReadBuffer & in_,
const Params & params_,
bool is_binary_,
bool with_names_,
bool with_types_,
const FormatSettings & format_settings_,
@ -51,6 +53,7 @@ private:
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
bool is_binary;
bool with_names;
bool with_types;
std::unique_ptr<FormatWithNamesAndTypesReader> format_reader;

View File

@ -0,0 +1 @@
1651760768976141295

View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Inserts eight raw bytes in RowBinary format into a UInt64 column, where the
# first three bytes are EF BB BF (the same bytes as a UTF-8 BOM), and then
# selects the table contents back.

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

# Send a single SQL statement (passed as $1) as the POST body of a request.
function run_query()
{
    echo "$1" | ${CLICKHOUSE_CURL} -d@- -sS "${CLICKHOUSE_URL}"
}

run_query "DROP TABLE IF EXISTS table_with_uint64"
run_query "CREATE TABLE table_with_uint64(no UInt64) ENGINE = MergeTree ORDER BY no"

# The payload is sent with --data-binary so that curl passes the bytes through unmodified.
printf '\xef\xbb\xbf\x00\xab\x3b\xec\x16' \
    | ${CLICKHOUSE_CURL} --data-binary @- "${CLICKHOUSE_URL}&query=INSERT+INTO+table_with_uint64(no)+FORMAT+RowBinary"

run_query "SELECT * FROM table_with_uint64"