From df538babc8a507f00b00be65b3f7f73821b50250 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Thu, 23 Jun 2016 22:39:20 +0300
Subject: [PATCH] Text formats: detecting and skipping UTF-8 BOM when possible
 [#METR-21817].

---
 dbms/CMakeLists.txt                           |  2 +
 .../DB/DataStreams/verbosePrintString.h       | 12 ++++
 dbms/include/DB/IO/ReadHelpers.h              | 17 +++++
 dbms/src/DataStreams/CSVRowInputStream.cpp    | 64 ++---------------
 .../DataStreams/JSONEachRowRowInputStream.cpp |  3 +
 dbms/src/DataStreams/TSKVRowInputStream.cpp   |  4 ++
 .../TabSeparatedRowInputStream.cpp            | 68 +++----------------
 dbms/src/DataStreams/ValuesRowInputStream.cpp |  2 +
 dbms/src/DataStreams/verbosePrintString.cpp   | 66 ++++++++++++++++++
 9 files changed, 120 insertions(+), 118 deletions(-)
 create mode 100644 dbms/include/DB/DataStreams/verbosePrintString.h
 create mode 100644 dbms/src/DataStreams/verbosePrintString.cpp
diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt
index e63f21b9dc6..8e0694a8170 100644
--- a/dbms/CMakeLists.txt
+++ b/dbms/CMakeLists.txt
@@ -265,6 +265,7 @@ add_library (dbms
 	include/DB/DataStreams/MarkInCompressedFile.h
 	include/DB/DataStreams/CSVRowOutputStream.h
 	include/DB/DataStreams/CSVRowInputStream.h
+	include/DB/DataStreams/verbosePrintString.h
 	include/DB/DataTypes/IDataType.h
 	include/DB/DataTypes/IDataTypeDummy.h
 	include/DB/DataTypes/DataTypeSet.h
@@ -742,6 +743,7 @@ add_library (dbms
 	src/DataStreams/DistinctBlockInputStream.cpp
 	src/DataStreams/RemoteBlockInputStream.cpp
 	src/DataStreams/BlockIO.cpp
+	src/DataStreams/verbosePrintString.cpp
 
 	src/DataTypes/DataTypeString.cpp
 	src/DataTypes/DataTypeFixedString.cpp
diff --git a/dbms/include/DB/DataStreams/verbosePrintString.h b/dbms/include/DB/DataStreams/verbosePrintString.h
new file mode 100644
index 00000000000..e006d8f022d
--- /dev/null
+++ b/dbms/include/DB/DataStreams/verbosePrintString.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <DB/IO/WriteBuffer.h>
+
+namespace DB
+{
+
+/** Print the string in double quotes, with control characters, quotes and backslash in "<NAME>" form (an empty string is printed as <EMPTY>) - for showing diagnostic info to the user.
+  */
+void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out);
+
+}
diff --git a/dbms/include/DB/IO/ReadHelpers.h b/dbms/include/DB/IO/ReadHelpers.h
index 66bc99e1c59..5671dbe9ca5 100644
--- a/dbms/include/DB/IO/ReadHelpers.h
+++ b/dbms/include/DB/IO/ReadHelpers.h
@@ -879,4 +879,21 @@ inline T parse(const String & s)
 	return parse<T>(s.data(), s.size());
 }
 
+
+/** Skip UTF-8 BOM (bytes EF BB BF) if it is under the cursor.
+  * As BOM is usually located at the start of the stream, and the buffer is usually larger than three bytes,
+  *  the function expects all three bytes of the BOM to be in the buffer at once (otherwise it doesn't skip anything).
+  */
+inline void skipBOMIfExists(ReadBuffer & buf)
+{
+	if (!buf.eof()
+		&& buf.buffer().end() - buf.position() >= 3	/// Exactly three remaining bytes are enough.
+		&& buf.position()[0] == '\xEF'
+		&& buf.position()[1] == '\xBB'
+		&& buf.position()[2] == '\xBF')
+	{
+		buf.position() += 3;
+	}
+}
+
 }
diff --git a/dbms/src/DataStreams/CSVRowInputStream.cpp b/dbms/src/DataStreams/CSVRowInputStream.cpp
index 58183121bfb..bdd514abfaa 100644
--- a/dbms/src/DataStreams/CSVRowInputStream.cpp
+++ b/dbms/src/DataStreams/CSVRowInputStream.cpp
@@ -1,6 +1,7 @@
 #include <DB/IO/ReadHelpers.h>
 #include <DB/IO/Operators.h>
 
+#include <DB/DataStreams/verbosePrintString.h>
 #include <DB/DataStreams/CSVRowInputStream.h>
 #include <DB/DataTypes/DataTypesNumberFixed.h>
 
@@ -96,6 +97,10 @@ static void skipRow(ReadBuffer & istr, const char delimiter, size_t columns)
 
 void CSVRowInputStream::readPrefix()
 {
+	/// In this format, we assume that if the first string field contains BOM as its value, it will be written in quotes,
+	///  so BOM at the beginning of the stream cannot be confused with BOM in the first string value, and it is safe to skip it.
+	skipBOMIfExists(istr);
+
 	size_t columns = sample.columns();
 	String tmp;
 
@@ -190,65 +195,6 @@ void CSVRowInputStream::printDiagnosticInfo(Block & block, WriteBuffer & out)
 }
 
 
-static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
-{
-	if (end == begin)
-	{
-		out << "<EMPTY>";
-		return;
-	}
-
-	out << "\"";
-
-	for (auto pos = begin; pos < end; ++pos)
-	{
-		switch (*pos)
-		{
-			case '\0':
-				out << "<ASCII NUL>";
-				break;
-			case '\b':
-				out << "<BACKSPACE>";
-				break;
-			case '\f':
-				out << "<FORM FEED>";
-				break;
-			case '\n':
-				out << "<LINE FEED>";
-				break;
-			case '\r':
-				out << "<CARRIAGE RETURN>";
-				break;
-			case '\t':
-				out << "<TAB>";
-				break;
-			case '\\':
-				out << "<BACKSLASH>";
-				break;
-			case '"':
-				out << "<DOUBLE QUOTE>";
-				break;
-			case '\'':
-				out << "<SINGLE QUOTE>";
-				break;
-
-			default:
-			{
-				if (*pos >= 0 && *pos < 32)
-				{
-					static const char * hex = "0123456789ABCDEF";
-					out << "<0x" << hex[*pos / 16] << hex[*pos % 16] << ">";
-				}
-				else
-					out << *pos;
-			}
-		}
-	}
-
-	out << "\"";
-}
-
-
 bool CSVRowInputStream::parseRowAndPrintDiagnosticInfo(Block & block,
 	WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
 {
diff --git a/dbms/src/DataStreams/JSONEachRowRowInputStream.cpp b/dbms/src/DataStreams/JSONEachRowRowInputStream.cpp
index 544af1c76a4..ba530c53355 100644
--- a/dbms/src/DataStreams/JSONEachRowRowInputStream.cpp
+++ b/dbms/src/DataStreams/JSONEachRowRowInputStream.cpp
@@ -15,6 +15,9 @@ namespace ErrorCodes
 JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const Block & sample_)
 	: istr(istr_), sample(sample_), name_map(sample.columns())
 {
+	/// In this format, BOM at the beginning of the stream cannot be confused with a value, so it is safe to skip it.
+	skipBOMIfExists(istr);
+
 	size_t columns = sample.columns();
 	for (size_t i = 0; i < columns; ++i)
 		name_map[sample.getByPosition(i).name] = i;		/// NOTE Можно было бы расположить имена более кэш-локально.
diff --git a/dbms/src/DataStreams/TSKVRowInputStream.cpp b/dbms/src/DataStreams/TSKVRowInputStream.cpp
index f444dee2d47..d8e47ca235b 100644
--- a/dbms/src/DataStreams/TSKVRowInputStream.cpp
+++ b/dbms/src/DataStreams/TSKVRowInputStream.cpp
@@ -16,6 +16,10 @@ namespace ErrorCodes
 TSKVRowInputStream::TSKVRowInputStream(ReadBuffer & istr_, const Block & sample_, bool skip_unknown_)
 	: istr(istr_), sample(sample_), skip_unknown(skip_unknown_), name_map(sample.columns())
 {
+	/// In this format, we assume that a column name cannot contain BOM,
+	///  so BOM at the beginning of the stream cannot be confused with the name of a field, and it is safe to skip it.
+	skipBOMIfExists(istr);
+
 	size_t columns = sample.columns();
 	for (size_t i = 0; i < columns; ++i)
 		name_map[sample.getByPosition(i).name] = i;		/// NOTE Можно было бы расположить имена более кэш-локально.
diff --git a/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp b/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
index 1ed28dab0b4..fbed2fb45d3 100644
--- a/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
+++ b/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
@@ -2,6 +2,7 @@
 #include <DB/IO/Operators.h>
 
 #include <DB/DataStreams/TabSeparatedRowInputStream.h>
+#include <DB/DataStreams/verbosePrintString.h>
 #include <DB/DataTypes/DataTypesNumberFixed.h>
 
 
@@ -29,6 +30,14 @@ void TabSeparatedRowInputStream::readPrefix()
 	size_t columns = sample.columns();
 	String tmp;
 
+	if (with_names || with_types)
+	{
+		/// In this format, we assume that a column name or type cannot contain BOM,
+		///  so, if the format has a header,
+		///  then BOM at the beginning of the stream cannot be confused with the name or type of a field, and it is safe to skip it.
+		skipBOMIfExists(istr);
+	}
+
 	if (with_names)
 	{
 		for (size_t i = 0; i < columns; ++i)
@@ -155,65 +164,6 @@ void TabSeparatedRowInputStream::printDiagnosticInfo(Block & block, WriteBuffer
 }
 
 
-static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
-{
-	if (end == begin)
-	{
-		out << "<EMPTY>";
-		return;
-	}
-
-	out << "\"";
-
-	for (auto pos = begin; pos < end; ++pos)
-	{
-		switch (*pos)
-		{
-			case '\0':
-				out << "<ASCII NUL>";
-				break;
-			case '\b':
-				out << "<BACKSPACE>";
-				break;
-			case '\f':
-				out << "<FORM FEED>";
-				break;
-			case '\n':
-				out << "<LINE FEED>";
-				break;
-			case '\r':
-				out << "<CARRIAGE RETURN>";
-				break;
-			case '\t':
-				out << "<TAB>";
-				break;
-			case '\\':
-				out << "<BACKSLASH>";
-				break;
-			case '"':
-				out << "<DOUBLE QUOTE>";
-				break;
-			case '\'':
-				out << "<SINGLE QUOTE>";
-				break;
-
-			default:
-			{
-				if (*pos >= 0 && *pos < 32)
-				{
-					static const char * hex = "0123456789ABCDEF";
-					out << "<0x" << hex[*pos / 16] << hex[*pos % 16] << ">";
-				}
-				else
-					out << *pos;
-			}
-		}
-	}
-
-	out << "\"";
-}
-
-
 bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(Block & block,
 	WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
 {
diff --git a/dbms/src/DataStreams/ValuesRowInputStream.cpp b/dbms/src/DataStreams/ValuesRowInputStream.cpp
index 387b67470a7..11c707756f7 100644
--- a/dbms/src/DataStreams/ValuesRowInputStream.cpp
+++ b/dbms/src/DataStreams/ValuesRowInputStream.cpp
@@ -26,6 +26,8 @@ namespace ErrorCodes
 ValuesRowInputStream::ValuesRowInputStream(ReadBuffer & istr_, const Context & context_)
 	: istr(istr_), context(context_)
 {
+	/// In this format, BOM at the beginning of the stream cannot be confused with a value, so it is safe to skip it.
+	skipBOMIfExists(istr);
 }
 
 
diff --git a/dbms/src/DataStreams/verbosePrintString.cpp b/dbms/src/DataStreams/verbosePrintString.cpp
new file mode 100644
index 00000000000..29b2e770e4e
--- /dev/null
+++ b/dbms/src/DataStreams/verbosePrintString.cpp
@@ -0,0 +1,66 @@
+#include <DB/DataStreams/verbosePrintString.h>
+#include <DB/IO/Operators.h>
+
+
+namespace DB
+{
+
+/** Writes the bytes of [begin, end) to out in a form suitable for diagnostic messages.
+  * The text is wrapped in double quotes; an empty range is written as <EMPTY> instead.
+  * Control characters, quotes and backslash are replaced with a name or hex code in angle brackets; other bytes are written unchanged.
+  */
+void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
+{
+	/// An empty range is reported with a dedicated marker instead of a pair of quotes.
+	if (begin == end)
+	{
+		out << "<EMPTY>";
+		return;
+	}
+
+	/// Digits for control characters below 32 that have no symbolic name of their own.
+	static const char * hex_digits = "0123456789ABCDEF";
+
+	/// Opening quote.
+	out << "\"";
+
+	for (auto cursor = begin; cursor < end; ++cursor)
+	{
+		const auto c = *cursor;
+
+		/// Characters that are invisible or ambiguous in a message are replaced with readable names.
+		if (c == '\0')
+			out << "<ASCII NUL>";
+		else if (c == '\b')
+			out << "<BACKSPACE>";
+		else if (c == '\f')
+			out << "<FORM FEED>";
+		else if (c == '\n')
+			out << "<LINE FEED>";
+		else if (c == '\r')
+			out << "<CARRIAGE RETURN>";
+		else if (c == '\t')
+			out << "<TAB>";
+		else if (c == '\\')
+			out << "<BACKSLASH>";
+		else if (c == '"')
+			out << "<DOUBLE QUOTE>";
+		else if (c == '\'')
+			out << "<SINGLE QUOTE>";
+		else if (c >= 0 && c < 32)
+		{
+			/// Remaining control characters are shown as two hexadecimal digits.
+			out << "<0x" << hex_digits[c / 16] << hex_digits[c % 16] << ">";
+		}
+		else
+		{
+			/// Everything else is copied as is.
+			out << c;
+		}
+	}
+
+	/// Closing quote.
+	out << "\"";
+}
+
+}