2010-06-04 18:25:25 +00:00
|
|
|
|
#include <DB/IO/ReadHelpers.h>
|
2015-03-29 07:13:38 +00:00
|
|
|
|
#include <DB/IO/Operators.h>
|
2010-06-04 18:25:25 +00:00
|
|
|
|
|
2010-05-21 19:52:50 +00:00
|
|
|
|
#include <DB/DataStreams/TabSeparatedRowInputStream.h>
|
2015-03-29 07:13:38 +00:00
|
|
|
|
#include <DB/DataTypes/DataTypesNumberFixed.h>
|
2010-05-21 19:52:50 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
|
{
|
|
|
|
|
|
2016-01-11 21:46:36 +00:00
|
|
|
|
/// Error codes referenced in this file. They are only declared here; the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int INCORRECT_DATA;    /// thrown when a carriage return is found at the end of the first row
    extern const int LOGICAL_ERROR;     /// thrown when re-parsing a row for diagnostics moves the read position backwards
}
|
|
|
|
|
|
|
|
|
|
|
2011-11-06 06:22:52 +00:00
|
|
|
|
/** Binds the stream to the read buffer istr_ and remembers the sample block and header flags.
  * The data type of every column of the sample block is cached in data_types,
  * in the same order as the columns appear in the sample.
  */
TabSeparatedRowInputStream::TabSeparatedRowInputStream(ReadBuffer & istr_, const Block & sample_, bool with_names_, bool with_types_)
    : istr(istr_), sample(sample_), with_names(with_names_), with_types(with_types_)
{
    const size_t num_columns = sample.columns();

    data_types.resize(num_columns);

    size_t column_idx = 0;
    while (column_idx != num_columns)
    {
        data_types[column_idx] = sample.getByPosition(column_idx).type;
        ++column_idx;
    }
}
|
|
|
|
|
|
|
|
|
|
|
2011-11-06 06:22:52 +00:00
|
|
|
|
void TabSeparatedRowInputStream::readPrefix()
|
|
|
|
|
{
|
|
|
|
|
size_t columns = sample.columns();
|
|
|
|
|
String tmp;
|
|
|
|
|
|
|
|
|
|
if (with_names)
|
|
|
|
|
{
|
|
|
|
|
for (size_t i = 0; i < columns; ++i)
|
|
|
|
|
{
|
|
|
|
|
readEscapedString(tmp, istr);
|
2016-02-07 08:42:21 +00:00
|
|
|
|
assertChar(i == columns - 1 ? '\n' : '\t', istr);
|
2011-11-06 06:22:52 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (with_types)
|
|
|
|
|
{
|
|
|
|
|
for (size_t i = 0; i < columns; ++i)
|
|
|
|
|
{
|
|
|
|
|
readEscapedString(tmp, istr);
|
2016-02-07 08:42:21 +00:00
|
|
|
|
assertChar(i == columns - 1 ? '\n' : '\t', istr);
|
2011-11-06 06:22:52 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2015-03-29 08:44:04 +00:00
|
|
|
|
/** Проверка на распространённый случай ошибки - использование Windows перевода строки.
|
|
|
|
|
*/
|
|
|
|
|
static void checkForCarriageReturn(ReadBuffer & istr)
|
|
|
|
|
{
|
|
|
|
|
if (istr.position()[0] == '\r' || (istr.position() != istr.buffer().begin() && istr.position()[-1] == '\r'))
|
2015-03-29 09:02:24 +00:00
|
|
|
|
throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
|
|
|
|
|
"\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format."
|
2015-03-29 08:44:04 +00:00
|
|
|
|
" You must transform your file to Unix format."
|
2015-03-29 09:02:24 +00:00
|
|
|
|
"\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r.",
|
2015-03-29 08:44:04 +00:00
|
|
|
|
ErrorCodes::INCORRECT_DATA);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2016-02-16 16:39:39 +00:00
|
|
|
|
/** Reads one row from istr and appends the parsed values to the columns of block.
  * Returns false if the stream is already at its end before the row starts, true otherwise.
  * Fields must be separated by tabs; the last field must be followed by a line feed
  * unless the stream ends right after it.
  * If a DB::Exception is thrown while parsing, the output of printDiagnosticInfo()
  * is appended to its message and the exception is rethrown.
  */
bool TabSeparatedRowInputStream::read(Block & block)
{
    /// Defined elsewhere; presumably records the positions that printDiagnosticInfo() relies on.
    updateDiagnosticInfo();

    const size_t num_columns = data_types.size();

    try
    {
        if (istr.eof())
            return false;

        for (size_t col = 0; col < num_columns; ++col)
        {
            auto & column = *block.unsafeGetByPosition(col).column.get();
            data_types[col].get()->deserializeTextEscaped(column, istr);

            /// Skip the delimiter that follows the value.
            const bool is_last_column = (col + 1 == num_columns);

            if (!is_last_column)
            {
                assertChar('\t', istr);
                continue;
            }

            /// The last row is allowed to end without a trailing line feed.
            if (istr.eof())
                continue;

            /// On the first row, report Windows line endings with a dedicated message.
            if (unlikely(row_num == 1))
                checkForCarriageReturn(istr);

            assertChar('\n', istr);
        }
    }
    catch (Exception & e)
    {
        String diagnostic_text;

        {
            /// The buffer lives in its own scope so that it is destroyed before the string is used.
            WriteBufferFromString diagnostic_buf(diagnostic_text);
            printDiagnosticInfo(block, diagnostic_buf);
        }

        e.addMessage("\n" + diagnostic_text);
        throw;
    }

    return true;
}
|
|
|
|
|
|
2015-03-29 07:13:38 +00:00
|
|
|
|
|
2016-02-16 16:39:39 +00:00
|
|
|
|
/** Writes a verbose description of the row that failed to parse (and of the row before it, if known) to out.
  * The read position of istr is moved back to the start of the previous row (or of the current row
  * when there is no previous one) and the rows are parsed again with parseRowAndPrintDiagnosticInfo().
  * If the needed rows are no longer in the read buffer, or parsing has not started yet,
  * only an explanatory message is written.
  */
void TabSeparatedRowInputStream::printDiagnosticInfo(Block & block, WriteBuffer & out)
{
    /// Detailed diagnostics are possible only if the last and the next-to-last rows are still in the read buffer.
    const size_t buffer_start_offset = istr.count() - istr.offset();
    if (buffer_start_offset != bytes_read_at_start_of_buffer_on_prev_row)
    {
        out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
        return;
    }

    /// Widths used to align the per-column report.
    size_t max_length_of_column_name = 0;
    size_t max_length_of_data_type_name = 0;

    const size_t num_columns = sample.columns();
    for (size_t col = 0; col < num_columns; ++col)
    {
        const auto & elem = sample.getByPosition(col);

        max_length_of_column_name = std::max(max_length_of_column_name, elem.name.size());
        max_length_of_data_type_name = std::max(max_length_of_data_type_name, elem.type->getName().size());
    }

    /// Roll the read cursor back to the start of the previous or the current row and parse everything again,
    /// this time printing detailed information.

    if (!pos_of_prev_row && !pos_of_current_row)
    {
        out << "Could not print diagnostic info because parsing of data hasn't started.\n";
        return;
    }

    if (pos_of_prev_row)
    {
        istr.position() = pos_of_prev_row;

        out << "\nRow " << (row_num - 1) << ":\n";

        const bool prev_row_ok = parseRowAndPrintDiagnosticInfo(block, out, max_length_of_column_name, max_length_of_data_type_name);
        if (!prev_row_ok)
            return;
    }
    else
    {
        istr.position() = pos_of_current_row;
    }

    out << "\nRow " << row_num << ":\n";
    parseRowAndPrintDiagnosticInfo(block, out, max_length_of_column_name, max_length_of_data_type_name);
    out << "\n";
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** Writes the bytes of the range [begin, end) to out in a form suitable for error messages.
  * An empty range is written as <EMPTY>. Otherwise the text is wrapped in double quotes;
  * a fixed set of special characters is replaced with names in angle brackets (for example <TAB>),
  * other bytes with values from 0 to 31 are written as <0xNN>, and everything else is written as is.
  */
static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
{
    if (begin == end)
    {
        out << "<EMPTY>";
        return;
    }

    /// Returns the symbolic name of a character that has one, or nullptr for all other characters.
    auto symbolic_name = [](char c) -> const char *
    {
        switch (c)
        {
            case '\0': return "<ASCII NUL>";
            case '\b': return "<BACKSPACE>";
            case '\f': return "<FORM FEED>";
            case '\n': return "<LINE FEED>";
            case '\r': return "<CARRIAGE RETURN>";
            case '\t': return "<TAB>";
            case '\\': return "<BACKSLASH>";
            case '"':  return "<DOUBLE QUOTE>";
            case '\'': return "<SINGLE QUOTE>";
            default:   return nullptr;
        }
    };

    static const char * hex_digits = "0123456789ABCDEF";

    out << "\"";

    for (auto cursor = begin; cursor < end; ++cursor)
    {
        const char c = *cursor;

        if (const char * name = symbolic_name(c))
        {
            out << name;
            continue;
        }

        const bool is_control_char = (c >= 0 && c < 32);
        if (is_control_char)
            out << "<0x" << hex_digits[c / 16] << hex_digits[c % 16] << ">";
        else
            out << c;
    }

    out << "\"";
}
|
|
|
|
|
|
|
|
|
|
|
2016-02-16 16:39:39 +00:00
|
|
|
|
bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(Block & block,
|
2015-03-29 08:44:04 +00:00
|
|
|
|
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
|
2015-03-29 07:13:38 +00:00
|
|
|
|
{
|
|
|
|
|
size_t size = data_types.size();
|
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
|
{
|
|
|
|
|
if (i == 0 && istr.eof())
|
|
|
|
|
{
|
|
|
|
|
out << "<End of stream>\n";
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-29 08:44:04 +00:00
|
|
|
|
out << "Column " << i << ", " << std::string((i < 10 ? 2 : i < 100 ? 1 : 0), ' ')
|
|
|
|
|
<< "name: " << sample.getByPosition(i).name << ", " << std::string(max_length_of_column_name - sample.getByPosition(i).name.size(), ' ')
|
|
|
|
|
<< "type: " << data_types[i]->getName() << ", " << std::string(max_length_of_data_type_name - data_types[i]->getName().size(), ' ');
|
2015-03-29 07:13:38 +00:00
|
|
|
|
|
|
|
|
|
auto prev_position = istr.position();
|
|
|
|
|
std::exception_ptr exception;
|
|
|
|
|
|
|
|
|
|
try
|
|
|
|
|
{
|
2016-02-16 16:39:39 +00:00
|
|
|
|
data_types[i]->deserializeTextEscaped(*block.getByPosition(i).column, istr);
|
2015-03-29 07:13:38 +00:00
|
|
|
|
}
|
|
|
|
|
catch (...)
|
|
|
|
|
{
|
|
|
|
|
exception = std::current_exception();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
auto curr_position = istr.position();
|
|
|
|
|
|
|
|
|
|
if (curr_position < prev_position)
|
|
|
|
|
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
|
|
|
|
|
if (data_types[i]->isNumeric())
|
|
|
|
|
{
|
|
|
|
|
/// Пустая строка вместо числа.
|
|
|
|
|
if (curr_position == prev_position)
|
|
|
|
|
{
|
2015-03-29 08:44:04 +00:00
|
|
|
|
out << "ERROR: text ";
|
2015-03-29 07:13:38 +00:00
|
|
|
|
verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
|
2015-03-29 08:44:04 +00:00
|
|
|
|
out << " is not like " << data_types[i]->getName() << "\n";
|
2015-03-29 07:13:38 +00:00
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-29 08:44:04 +00:00
|
|
|
|
out << "parsed text: ";
|
2015-03-29 07:13:38 +00:00
|
|
|
|
verbosePrintString(prev_position, curr_position, out);
|
|
|
|
|
|
|
|
|
|
if (exception)
|
|
|
|
|
{
|
2015-03-29 08:44:04 +00:00
|
|
|
|
if (data_types[i]->getName() == "DateTime")
|
2015-03-31 21:09:19 +00:00
|
|
|
|
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
2015-03-29 08:44:04 +00:00
|
|
|
|
else if (data_types[i]->getName() == "Date")
|
|
|
|
|
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
|
|
|
|
else
|
|
|
|
|
out << "ERROR\n";
|
2015-03-29 07:13:38 +00:00
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-29 08:44:04 +00:00
|
|
|
|
out << "\n";
|
2015-03-29 07:13:38 +00:00
|
|
|
|
|
|
|
|
|
if (data_types[i]->isNumeric())
|
|
|
|
|
{
|
|
|
|
|
if (*curr_position != '\n' && *curr_position != '\t')
|
|
|
|
|
{
|
2015-03-29 08:44:04 +00:00
|
|
|
|
out << "ERROR: garbage after " << data_types[i]->getName() << ": ";
|
2015-03-29 07:13:38 +00:00
|
|
|
|
verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
|
|
|
|
|
out << "\n";
|
2015-03-29 08:44:04 +00:00
|
|
|
|
|
|
|
|
|
if (data_types[i]->getName() == "DateTime")
|
2015-03-31 21:09:19 +00:00
|
|
|
|
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
2015-03-29 08:44:04 +00:00
|
|
|
|
else if (data_types[i]->getName() == "Date")
|
|
|
|
|
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
|
|
|
|
|
2015-03-29 07:13:38 +00:00
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Разделители
|
|
|
|
|
if (i + 1 == size)
|
|
|
|
|
{
|
|
|
|
|
if (!istr.eof())
|
|
|
|
|
{
|
|
|
|
|
try
|
|
|
|
|
{
|
2016-02-07 08:42:21 +00:00
|
|
|
|
assertChar('\n', istr);
|
2015-03-29 07:13:38 +00:00
|
|
|
|
}
|
|
|
|
|
catch (const DB::Exception &)
|
|
|
|
|
{
|
|
|
|
|
if (*istr.position() == '\t')
|
|
|
|
|
{
|
|
|
|
|
out << "ERROR: Tab found where line feed is expected."
|
|
|
|
|
" It's like your file has more columns than expected.\n"
|
|
|
|
|
"And if your file have right number of columns, maybe it have unescaped tab in value.\n";
|
|
|
|
|
}
|
|
|
|
|
else if (*istr.position() == '\r')
|
|
|
|
|
{
|
|
|
|
|
out << "ERROR: Carriage return found where line feed is expected."
|
|
|
|
|
" It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n";
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
out << "ERROR: There is no line feed. ";
|
|
|
|
|
verbosePrintString(istr.position(), istr.position() + 1, out);
|
|
|
|
|
out << " found instead.\n";
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
try
|
|
|
|
|
{
|
2016-02-07 08:42:21 +00:00
|
|
|
|
assertChar('\t', istr);
|
2015-03-29 07:13:38 +00:00
|
|
|
|
}
|
|
|
|
|
catch (const DB::Exception &)
|
|
|
|
|
{
|
|
|
|
|
if (*istr.position() == '\n')
|
|
|
|
|
{
|
|
|
|
|
out << "ERROR: Line feed found where tab is expected."
|
|
|
|
|
" It's like your file has less columns than expected.\n"
|
|
|
|
|
"And if your file have right number of columns, maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n";
|
|
|
|
|
}
|
|
|
|
|
else if (*istr.position() == '\r')
|
|
|
|
|
{
|
|
|
|
|
out << "ERROR: Carriage return found where tab is expected.\n";
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
out << "ERROR: There is no tab. ";
|
|
|
|
|
verbosePrintString(istr.position(), istr.position() + 1, out);
|
|
|
|
|
out << " found instead.\n";
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2010-05-21 19:52:50 +00:00
|
|
|
|
}
|