ClickHouse/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp

#include <DB/IO/ReadHelpers.h>
#include <DB/IO/Operators.h>

#include <DB/DataStreams/TabSeparatedRowInputStream.h>
#include <DB/DataTypes/DataTypesNumberFixed.h>


namespace DB
{

using Poco::SharedPtr;

TabSeparatedRowInputStream::TabSeparatedRowInputStream(ReadBuffer & istr_, const Block & sample_, bool with_names_, bool with_types_)
	: istr(istr_), sample(sample_), with_names(with_names_), with_types(with_types_)
{
	size_t columns = sample.columns();
	data_types.resize(columns);
	for (size_t i = 0; i < columns; ++i)
		data_types[i] = sample.getByPosition(i).type;
}


void TabSeparatedRowInputStream::readPrefix()
{
	size_t columns = sample.columns();
	String tmp;

	if (with_names)
	{
		for (size_t i = 0; i < columns; ++i)
		{
			readEscapedString(tmp, istr);
			assertString(i == columns - 1 ? "\n" : "\t", istr);
		}
	}

	if (with_types)
	{
		for (size_t i = 0; i < columns; ++i)
		{
			readEscapedString(tmp, istr);
			assertString(i == columns - 1 ? "\n" : "\t", istr);
		}
	}
}


/** Проверка на распространённый случай ошибки - использование Windows перевода строки.
  */
static void checkForCarriageReturn(ReadBuffer & istr)
{
	if (istr.position()[0] == '\r' || (istr.position() != istr.buffer().begin() && istr.position()[-1] == '\r'))
		throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
			"\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format."
			" You must transform your file to Unix format."
			"\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r.",
			ErrorCodes::INCORRECT_DATA);
}


bool TabSeparatedRowInputStream::read(Row & row)
{
	updateDiagnosticInfo();

	size_t size = data_types.size();
	row.resize(size);

	try
	{
		for (size_t i = 0; i < size; ++i)
		{
			if (i == 0 && istr.eof())
			{
				row.clear();
				return false;
			}

			data_types[i]->deserializeTextEscaped(row[i], istr);

			/// пропускаем разделители
			if (i + 1 == size)
			{
				if (!istr.eof())
				{
					if (unlikely(row_num == 1))
						checkForCarriageReturn(istr);

					assertString("\n", istr);
				}
			}
			else
				assertString("\t", istr);
		}
	}
	catch (Exception & e)
	{
		String verbose_diagnostic;
		{
			WriteBufferFromString diagnostic_out(verbose_diagnostic);
			printDiagnosticInfo(diagnostic_out);
		}

		e.addMessage("\n" + verbose_diagnostic);
		throw;
	}

	return true;
}


void TabSeparatedRowInputStream::printDiagnosticInfo(WriteBuffer & out)
{
	/// Вывести подробную диагностику возможно лишь если последняя и предпоследняя строка ещё находятся в буфере для чтения.
	size_t bytes_read_at_start_of_buffer = istr.count() - istr.offset();
	if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
	{
		out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
		return;
	}

	size_t max_length_of_column_name = 0;
	for (size_t i = 0; i < sample.columns(); ++i)
		if (sample.getByPosition(i).name.size() > max_length_of_column_name)
			max_length_of_column_name = sample.getByPosition(i).name.size();

	size_t max_length_of_data_type_name = 0;
	for (size_t i = 0; i < sample.columns(); ++i)
		if (sample.getByPosition(i).type->getName().size() > max_length_of_data_type_name)
			max_length_of_data_type_name = sample.getByPosition(i).type->getName().size();

	/// Откатываем курсор для чтения на начало предыдущей или текущей строки и парсим всё заново. Но теперь выводим подробную информацию.

	if (pos_of_prev_row)
	{
		istr.position() = pos_of_prev_row;

		out << "\nRow " << (row_num - 1) << ":\n";
		if (!parseRowAndPrintDiagnosticInfo(out, max_length_of_column_name, max_length_of_data_type_name))
			return;
	}
	else
	{
		if (!pos_of_current_row)
		{
			out << "Could not print diagnostic info because parsing of data hasn't started.\n";
			return;
		}

		istr.position() = pos_of_current_row;
	}

	out << "\nRow " << row_num << ":\n";
	parseRowAndPrintDiagnosticInfo(out, max_length_of_column_name, max_length_of_data_type_name);
	out << "\n";
}


static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
{
	if (end == begin)
	{
		out << "<EMPTY>";
		return;
	}

	out << "\"";

	for (auto pos = begin; pos < end; ++pos)
	{
		switch (*pos)
		{
			case '\0':
				out << "<ASCII NUL>";
				break;
			case '\b':
				out << "<BACKSPACE>";
				break;
			case '\f':
				out << "<FORM FEED>";
				break;
			case '\n':
				out << "<LINE FEED>";
				break;
			case '\r':
				out << "<CARRIAGE RETURN>";
				break;
			case '\t':
				out << "<TAB>";
				break;
			case '\\':
				out << "<BACKSLASH>";
				break;
			case '"':
				out << "<DOUBLE QUOTE>";
				break;
			case '\'':
				out << "<SINGLE QUOTE>";
				break;

			default:
			{
				if (*pos >= 0 && *pos < 32)
				{
					static const char * hex = "0123456789ABCDEF";
					out << "<0x" << hex[*pos / 16] << hex[*pos % 16] << ">";
				}
				else
					out << *pos;
			}
		}
	}

	out << "\"";
}


bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(
	WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{
	size_t size = data_types.size();
	for (size_t i = 0; i < size; ++i)
	{
		if (i == 0 && istr.eof())
		{
			out << "<End of stream>\n";
			return false;
		}

		out << "Column " << i << ", " << std::string((i < 10 ? 2 : i < 100 ? 1 : 0), ' ')
			<< "name: " << sample.getByPosition(i).name << ", " << std::string(max_length_of_column_name - sample.getByPosition(i).name.size(), ' ')
			<< "type: " << data_types[i]->getName() << ", " << std::string(max_length_of_data_type_name - data_types[i]->getName().size(), ' ');

		auto prev_position = istr.position();
		std::exception_ptr exception;

		Field field;
		try
		{
			data_types[i]->deserializeTextEscaped(field, istr);
		}
		catch (...)
		{
			exception = std::current_exception();
		}

		auto curr_position = istr.position();

		if (curr_position < prev_position)
			throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);

		if (data_types[i]->isNumeric())
		{
			/// Пустая строка вместо числа.
			if (curr_position == prev_position)
			{
				out << "ERROR: text ";
				verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
				out << " is not like " << data_types[i]->getName() << "\n";
				return false;
			}
		}

		out << "parsed text: ";
		verbosePrintString(prev_position, curr_position, out);

		if (exception)
		{
			if (data_types[i]->getName() == "DateTime")
				out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss format.\n";
			else if (data_types[i]->getName() == "Date")
				out << "ERROR: Date must be in YYYY-MM-DD format.\n";
			else
				out << "ERROR\n";
			return false;
		}

		out << "\n";

		if (data_types[i]->isNumeric())
		{
			if (*curr_position != '\n' && *curr_position != '\t')
			{
				out << "ERROR: garbage after " << data_types[i]->getName() << ": ";
				verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
				out << "\n";

				if (data_types[i]->getName() == "DateTime")
					out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss format.\n";
				else if (data_types[i]->getName() == "Date")
					out << "ERROR: Date must be in YYYY-MM-DD format.\n";

				return false;
			}
		}

		if (   (typeid_cast<const DataTypeUInt8  *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt8>::max())
			|| (typeid_cast<const DataTypeUInt16 *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt16>::max())
			|| (typeid_cast<const DataTypeUInt32 *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt32>::max())
			|| (typeid_cast<const DataTypeInt8 *>(data_types[i].get())
				&& (field.get<Int64>() > std::numeric_limits<Int8>::max() || field.get<Int64>() < std::numeric_limits<Int8>::min()))
			|| (typeid_cast<const DataTypeInt16 *>(data_types[i].get())
				&& (field.get<Int64>() > std::numeric_limits<Int16>::max() || field.get<Int64>() < std::numeric_limits<Int16>::min()))
			|| (typeid_cast<const DataTypeInt32 *>(data_types[i].get())
				&& (field.get<Int64>() > std::numeric_limits<Int32>::max() || field.get<Int64>() < std::numeric_limits<Int32>::min())))
		{
			out << "ERROR: parsed number is out of range of data type.\n";
			return false;
		}

		/// Разделители
		if (i + 1 == size)
		{
			if (!istr.eof())
			{
				try
				{
					assertString("\n", istr);
				}
				catch (const DB::Exception &)
				{
					if (*istr.position() == '\t')
					{
						out << "ERROR: Tab found where line feed is expected."
							" It's like your file has more columns than expected.\n"
							"And if your file have right number of columns, maybe it have unescaped tab in value.\n";
					}
					else if (*istr.position() == '\r')
					{
						out << "ERROR: Carriage return found where line feed is expected."
							" It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n";
					}
					else
					{
						out << "ERROR: There is no line feed. ";
						verbosePrintString(istr.position(), istr.position() + 1, out);
						out << " found instead.\n";
					}
					return false;
				}
			}
		}
		else
		{
			try
			{
				assertString("\t", istr);
			}
			catch (const DB::Exception &)
			{
				if (*istr.position() == '\n')
				{
					out << "ERROR: Line feed found where tab is expected."
						" It's like your file has less columns than expected.\n"
						"And if your file have right number of columns, maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n";
				}
				else if (*istr.position() == '\r')
				{
					out << "ERROR: Carriage return found where tab is expected.\n";
				}
				else
				{
					out << "ERROR: There is no tab. ";
					verbosePrintString(istr.position(), istr.position() + 1, out);
					out << " found instead.\n";
				}
				return false;
			}
		}
	}

	return true;
}

}
-												dbms: development



											
										
										
											2010-06-04 18:25:25 +00:00
+								#include <DB/IO/ReadHelpers.h>
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+								#include <DB/IO/Operators.h>
-												dbms: development



											
										
										
											2010-06-04 18:25:25 +00:00
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
+								#include <DB/DataStreams/TabSeparatedRowInputStream.h>
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+								#include <DB/DataTypes/DataTypesNumberFixed.h>
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
 								namespace DB
 								{
 								using Poco::SharedPtr;
-												dbms: development [#CONV-2944].



											
										
										
											2011-11-06 06:22:52 +00:00
+								TabSeparatedRowInputStream::TabSeparatedRowInputStream(ReadBuffer & istr_, const Block & sample_, bool with_names_, bool with_types_)
 									: istr(istr_), sample(sample_), with_names(with_names_), with_types(with_types_)
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
+								{
-												dbms: development [#CONV-2944].



											
										
										
											2011-11-06 05:01:42 +00:00
+									size_t columns = sample.columns();
 									data_types.resize(columns);
 									for (size_t i = 0; i < columns; ++i)
 										data_types[i] = sample.getByPosition(i).type;
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
+								}
-												dbms: development [#CONV-2944].



											
										
										
											2011-11-06 06:22:52 +00:00
+								void TabSeparatedRowInputStream::readPrefix()
 								{
 									size_t columns = sample.columns();
 									String tmp;
 									if (with_names)
 									{
 										for (size_t i = 0; i < columns; ++i)
 										{
 											readEscapedString(tmp, istr);
 											assertString(i == columns - 1 ? "\n" : "\t", istr);
 										}
 									}
 									if (with_types)
 									{
 										for (size_t i = 0; i < columns; ++i)
 										{
 											readEscapedString(tmp, istr);
 											assertString(i == columns - 1 ? "\n" : "\t", istr);
 										}
 									}
 								}
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+								/** Проверка на распространённый случай ошибки - использование Windows перевода строки.
 								  */
 								static void checkForCarriageReturn(ReadBuffer & istr)
 								{
 									if (istr.position()[0] == '\r' || (istr.position() != istr.buffer().begin() && istr.position()[-1] == '\r'))
-												dbms: better diagnostics of errors in TabSeparated format [#METR-15574].

											
										
										
											2015-03-29 09:02:24 +00:00
+										throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
 											"\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format."
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+											" You must transform your file to Unix format."
-												dbms: better diagnostics of errors in TabSeparated format [#METR-15574].

											
										
										
											2015-03-29 09:02:24 +00:00
+											"\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r.",
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+											ErrorCodes::INCORRECT_DATA);
 								}
-												dbms: using special array for Row and instead of std::vector<AggregateFunctionPlainPtrs> (experimental) [#CONV-2944].



											
										
										
											2013-01-07 00:57:43 +00:00
+								bool TabSeparatedRowInputStream::read(Row & row)
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
+								{
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+									updateDiagnosticInfo();
-												dbms: development [#CONV-2944].



											
										
										
											2011-11-06 05:01:42 +00:00
+									size_t size = data_types.size();
-												dbms: using special array for Row and instead of std::vector<AggregateFunctionPlainPtrs> (experimental) [#CONV-2944].



											
										
										
											2013-01-07 00:57:43 +00:00
+									row.resize(size);
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
-												dbms: better diagnostics of errors in TabSeparated format [#METR-15574].

											
										
										
											2015-03-29 09:02:24 +00:00
+									try
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
+									{
-												dbms: better diagnostics of errors in TabSeparated format [#METR-15574].

											
										
										
											2015-03-29 09:02:24 +00:00
+										for (size_t i = 0; i < size; ++i)
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
+										{
-												dbms: better diagnostics of errors in TabSeparated format [#METR-15574].

											
										
										
											2015-03-29 09:02:24 +00:00
+											if (i == 0 && istr.eof())
 											{
 												row.clear();
 												return false;
 											}
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
-												dbms: better diagnostics of errors in TabSeparated format [#METR-15574].

											
										
										
											2015-03-29 09:02:24 +00:00
+											data_types[i]->deserializeTextEscaped(row[i], istr);
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
-												dbms: better diagnostics of errors in TabSeparated format [#METR-15574].

											
										
										
											2015-03-29 09:02:24 +00:00
+											/// пропускаем разделители
 											if (i + 1 == size)
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+											{
-												dbms: better diagnostics of errors in TabSeparated format [#METR-15574].

											
										
										
											2015-03-29 09:02:24 +00:00
+												if (!istr.eof())
 												{
 													if (unlikely(row_num == 1))
 														checkForCarriageReturn(istr);
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
-												dbms: better diagnostics of errors in TabSeparated format [#METR-15574].

											
										
										
											2015-03-29 09:02:24 +00:00
+													assertString("\n", istr);
 												}
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+											}
-												dbms: better diagnostics of errors in TabSeparated format [#METR-15574].

											
										
										
											2015-03-29 09:02:24 +00:00
+											else
 												assertString("\t", istr);
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
+										}
-												dbms: better diagnostics of errors in TabSeparated format [#METR-15574].

											
										
										
											2015-03-29 09:02:24 +00:00
+									}
 									catch (Exception & e)
 									{
 										String verbose_diagnostic;
 										{
 											WriteBufferFromString diagnostic_out(verbose_diagnostic);
 											printDiagnosticInfo(diagnostic_out);
 										}
 										e.addMessage("\n" + verbose_diagnostic);
 										throw;
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
+									}
-												dbms: using special array for Row and instead of std::vector<AggregateFunctionPlainPtrs> (experimental) [#CONV-2944].



											
										
										
											2013-01-07 00:57:43 +00:00
+									return true;
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
+								}
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
 								void TabSeparatedRowInputStream::printDiagnosticInfo(WriteBuffer & out)
 								{
 									/// Вывести подробную диагностику возможно лишь если последняя и предпоследняя строка ещё находятся в буфере для чтения.
 									size_t bytes_read_at_start_of_buffer = istr.count() - istr.offset();
 									if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
 									{
 										out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
 										return;
 									}
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+									size_t max_length_of_column_name = 0;
 									for (size_t i = 0; i < sample.columns(); ++i)
 										if (sample.getByPosition(i).name.size() > max_length_of_column_name)
 											max_length_of_column_name = sample.getByPosition(i).name.size();
 									size_t max_length_of_data_type_name = 0;
 									for (size_t i = 0; i < sample.columns(); ++i)
 										if (sample.getByPosition(i).type->getName().size() > max_length_of_data_type_name)
 											max_length_of_data_type_name = sample.getByPosition(i).type->getName().size();
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+									/// Откатываем курсор для чтения на начало предыдущей или текущей строки и парсим всё заново. Но теперь выводим подробную информацию.
 									if (pos_of_prev_row)
 									{
 										istr.position() = pos_of_prev_row;
 										out << "\nRow " << (row_num - 1) << ":\n";
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+										if (!parseRowAndPrintDiagnosticInfo(out, max_length_of_column_name, max_length_of_data_type_name))
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+											return;
 									}
 									else
 									{
 										if (!pos_of_current_row)
 										{
 											out << "Could not print diagnostic info because parsing of data hasn't started.\n";
 											return;
 										}
 										istr.position() = pos_of_current_row;
 									}
 									out << "\nRow " << row_num << ":\n";
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+									parseRowAndPrintDiagnosticInfo(out, max_length_of_column_name, max_length_of_data_type_name);
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+									out << "\n";
 								}
 								static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
 								{
 									if (end == begin)
 									{
 										out << "<EMPTY>";
 										return;
 									}
 									out << "\"";
 									for (auto pos = begin; pos < end; ++pos)
 									{
 										switch (*pos)
 										{
 											case '\0':
 												out << "<ASCII NUL>";
 												break;
 											case '\b':
 												out << "<BACKSPACE>";
 												break;
 											case '\f':
 												out << "<FORM FEED>";
 												break;
 											case '\n':
 												out << "<LINE FEED>";
 												break;
 											case '\r':
 												out << "<CARRIAGE RETURN>";
 												break;
 											case '\t':
 												out << "<TAB>";
 												break;
 											case '\\':
 												out << "<BACKSLASH>";
 												break;
 											case '"':
 												out << "<DOUBLE QUOTE>";
 												break;
 											case '\'':
 												out << "<SINGLE QUOTE>";
 												break;
 											default:
 											{
 												if (*pos >= 0 && *pos < 32)
 												{
 													static const char * hex = "0123456789ABCDEF";
 													out << "<0x" << hex[*pos / 16] << hex[*pos % 16] << ">";
 												}
 												else
 													out << *pos;
 											}
 										}
 									}
 									out << "\"";
 								}
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+								bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(
 									WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+								{
 									size_t size = data_types.size();
 									for (size_t i = 0; i < size; ++i)
 									{
 										if (i == 0 && istr.eof())
 										{
 											out << "<End of stream>\n";
 											return false;
 										}
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+										out << "Column " << i << ", " << std::string((i < 10 ? 2 : i < 100 ? 1 : 0), ' ')
 											<< "name: " << sample.getByPosition(i).name << ", " << std::string(max_length_of_column_name - sample.getByPosition(i).name.size(), ' ')
 											<< "type: " << data_types[i]->getName() << ", " << std::string(max_length_of_data_type_name - data_types[i]->getName().size(), ' ');
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
 										auto prev_position = istr.position();
 										std::exception_ptr exception;
 										Field field;
 										try
 										{
 											data_types[i]->deserializeTextEscaped(field, istr);
 										}
 										catch (...)
 										{
 											exception = std::current_exception();
 										}
 										auto curr_position = istr.position();
 										if (curr_position < prev_position)
 											throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
 										if (data_types[i]->isNumeric())
 										{
 											/// Пустая строка вместо числа.
 											if (curr_position == prev_position)
 											{
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+												out << "ERROR: text ";
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+												verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+												out << " is not like " << data_types[i]->getName() << "\n";
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+												return false;
 											}
 										}
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+										out << "parsed text: ";
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+										verbosePrintString(prev_position, curr_position, out);
 										if (exception)
 										{
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+											if (data_types[i]->getName() == "DateTime")
 												out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss format.\n";
 											else if (data_types[i]->getName() == "Date")
 												out << "ERROR: Date must be in YYYY-MM-DD format.\n";
 											else
 												out << "ERROR\n";
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+											return false;
 										}
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+										out << "\n";
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
 										if (data_types[i]->isNumeric())
 										{
 											if (*curr_position != '\n' && *curr_position != '\t')
 											{
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
+												out << "ERROR: garbage after " << data_types[i]->getName() << ": ";
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+												verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
 												out << "\n";
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 08:44:04 +00:00
 												if (data_types[i]->getName() == "DateTime")
 													out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss format.\n";
 												else if (data_types[i]->getName() == "Date")
 													out << "ERROR: Date must be in YYYY-MM-DD format.\n";
-												dbms: better diagnostics of errors in TabSeparated format (development) [#METR-15574].

											
										
										
											2015-03-29 07:13:38 +00:00
+												return false;
 											}
 										}
 										if (   (typeid_cast<const DataTypeUInt8  *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt8>::max())
 											|| (typeid_cast<const DataTypeUInt16 *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt16>::max())
 											|| (typeid_cast<const DataTypeUInt32 *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt32>::max())
 											|| (typeid_cast<const DataTypeInt8 *>(data_types[i].get())
 												&& (field.get<Int64>() > std::numeric_limits<Int8>::max() || field.get<Int64>() < std::numeric_limits<Int8>::min()))
 											|| (typeid_cast<const DataTypeInt16 *>(data_types[i].get())
 												&& (field.get<Int64>() > std::numeric_limits<Int16>::max() || field.get<Int64>() < std::numeric_limits<Int16>::min()))
 											|| (typeid_cast<const DataTypeInt32 *>(data_types[i].get())
 												&& (field.get<Int64>() > std::numeric_limits<Int32>::max() || field.get<Int64>() < std::numeric_limits<Int32>::min())))
 										{
 											out << "ERROR: parsed number is out of range of data type.\n";
 											return false;
 										}
 										/// Разделители
 										if (i + 1 == size)
 										{
 											if (!istr.eof())
 											{
 												try
 												{
 													assertString("\n", istr);
 												}
 												catch (const DB::Exception &)
 												{
 													if (*istr.position() == '\t')
 													{
 														out << "ERROR: Tab found where line feed is expected."
 															" It's like your file has more columns than expected.\n"
 															"And if your file have right number of columns, maybe it have unescaped tab in value.\n";
 													}
 													else if (*istr.position() == '\r')
 													{
 														out << "ERROR: Carriage return found where line feed is expected."
 															" It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n";
 													}
 													else
 													{
 														out << "ERROR: There is no line feed. ";
 														verbosePrintString(istr.position(), istr.position() + 1, out);
 														out << " found instead.\n";
 													}
 													return false;
 												}
 											}
 										}
 										else
 										{
 											try
 											{
 												assertString("\t", istr);
 											}
 											catch (const DB::Exception &)
 											{
 												if (*istr.position() == '\n')
 												{
 													out << "ERROR: Line feed found where tab is expected."
 														" It's like your file has less columns than expected.\n"
 														"And if your file have right number of columns, maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n";
 												}
 												else if (*istr.position() == '\r')
 												{
 													out << "ERROR: Carriage return found where tab is expected.\n";
 												}
 												else
 												{
 													out << "ERROR: There is no tab. ";
 													verbosePrintString(istr.position(), istr.position() + 1, out);
 													out << " found instead.\n";
 												}
 												return false;
 											}
 										}
 									}
 									return true;
 								}
-												dbms: development.



											
										
										
											2010-05-21 19:52:50 +00:00
+								}