2019-08-23 19:47:22 +00:00
|
|
|
#include <Processors/Formats/Impl/TemplateRowInputFormat.h>
|
2019-04-07 21:30:54 +00:00
|
|
|
#include <Formats/FormatFactory.h>
|
2019-04-15 02:45:57 +00:00
|
|
|
#include <Formats/verbosePrintString.h>
|
2021-11-09 13:14:07 +00:00
|
|
|
#include <Formats/EscapingRuleUtils.h>
|
2022-12-07 21:19:27 +00:00
|
|
|
#include <Formats/SchemaInferenceUtils.h>
|
2019-04-15 02:45:57 +00:00
|
|
|
#include <IO/Operators.h>
|
2019-08-27 16:53:26 +00:00
|
|
|
#include <DataTypes/DataTypeNothing.h>
|
2021-03-09 14:46:52 +00:00
|
|
|
#include <DataTypes/Serializations/SerializationNullable.h>
|
2019-04-07 21:30:54 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace ErrorCodes
{
    /// Error codes used by this translation unit; only declared here.
    extern const int ATTEMPT_TO_READ_AFTER_EOF;
    extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
    extern const int CANNOT_PARSE_QUOTED_STRING;
    extern const int CANNOT_READ_ALL_DATA;
    extern const int SYNTAX_ERROR;
}
|
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
[[noreturn]] static void throwUnexpectedEof(size_t row_num)
|
|
|
|
{
|
|
|
|
throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". "
|
|
|
|
"Maybe last row has wrong format or input doesn't contain specified suffix before EOF.",
|
|
|
|
ErrorCodes::CANNOT_READ_ALL_DATA);
|
|
|
|
}
|
2019-04-07 21:30:54 +00:00
|
|
|
|
2022-12-08 20:00:10 +00:00
|
|
|
/// Adjusts the CSV delimiter settings before reading the field at position `file_column`.
/// Does nothing unless the field uses the CSV escaping rule. The delimiter that follows the
/// field in `row_format` decides which setting is used:
///  - empty delimiter: read until `default_csv_delimiter`;
///  - one character: use it as `csv.delimiter` (the efficient path);
///  - longer delimiter: use `csv.custom_delimiter`. Plain CSV reading with only the first character
///    would not work properly (for example, delimiter='||' and data 'abc|d||'); the custom delimiter
///    implementation is less efficient but handles this case.
static void updateFormatSettingsIfNeeded(FormatSettings::EscapingRule escaping_rule, FormatSettings & settings, const ParsedTemplateFormatString & row_format, char default_csv_delimiter, size_t file_column)
{
    if (escaping_rule != FormatSettings::EscapingRule::CSV)
        return;

    /// Drop the custom delimiter that may have been set for the previous column.
    settings.csv.custom_delimiter.clear();

    const auto & delimiter_after_field = row_format.delimiters[file_column + 1];
    switch (delimiter_after_field.size())
    {
        case 0:
            settings.csv.delimiter = default_csv_delimiter;
            break;
        case 1:
            settings.csv.delimiter = delimiter_after_field.front();
            break;
        default:
            settings.csv.custom_delimiter = delimiter_after_field;
            break;
    }
}
|
|
|
|
|
2021-12-10 17:54:08 +00:00
|
|
|
/// Public constructor: wraps `in_` into a PeekableReadBuffer and delegates to the constructor
/// that takes ownership of that buffer.
/// The by-value arguments are sinks, so they are moved (not copied again) into the delegated constructor.
TemplateRowInputFormat::TemplateRowInputFormat(
    const Block & header_,
    ReadBuffer & in_,
    const Params & params_,
    FormatSettings settings_,
    bool ignore_spaces_,
    ParsedTemplateFormatString format_,
    ParsedTemplateFormatString row_format_,
    std::string row_between_delimiter_)
    : TemplateRowInputFormat(
        header_,
        std::make_unique<PeekableReadBuffer>(in_),
        params_,
        std::move(settings_),
        ignore_spaces_,
        std::move(format_),
        std::move(row_format_),
        std::move(row_between_delimiter_))
{
}
|
|
|
|
|
|
|
|
/// Main constructor: takes ownership of the peekable buffer, stores the parsed format strings,
/// creates the format reader and validates the row format against the header.
/// Header columns that do not occur in the row format are remembered in `always_default_columns`.
TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::unique_ptr<PeekableReadBuffer> buf_, const Params & params_,
                                               FormatSettings settings_, bool ignore_spaces_,
                                               ParsedTemplateFormatString format_, ParsedTemplateFormatString row_format_,
                                               std::string row_between_delimiter_)
    : RowInputFormatWithDiagnosticInfo(header_, *buf_, params_), buf(std::move(buf_)), data_types(header_.getDataTypes()),
    settings(std::move(settings_)), ignore_spaces(ignore_spaces_),
    format(std::move(format_)), row_format(std::move(row_format_)),
    default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(row_between_delimiter_),
    format_reader(std::make_unique<TemplateFormatReader>(*buf, ignore_spaces_, format, row_format, row_between_delimiter, settings))
{
    const size_t header_columns = header_.columns();

    /// Marks the header columns that are filled from the row format.
    std::vector<UInt8> column_in_format(header_columns, false);

    /// Validate format string for rows
    for (size_t i = 0; i < row_format.columnsCount(); ++i)
    {
        const auto & column_index = row_format.format_idx_to_column_idx[i];

        /// Skipped field: only the delimiter after it has to be checked.
        if (!column_index)
        {
            checkSupportedDelimiterAfterField(row_format.escaping_rules[i], row_format.delimiters[i + 1], nullptr);
            continue;
        }

        if (header_columns <= *column_index)
            row_format.throwInvalidFormat("Column index " + std::to_string(*column_index) +
                                          " must be less then number of columns (" + std::to_string(header_columns) + ")", i);
        if (row_format.escaping_rules[i] == EscapingRule::None)
            row_format.throwInvalidFormat("Column is not skipped, but deserialization type is None", i);

        const size_t col_idx = *column_index;
        if (column_in_format[col_idx])
            row_format.throwInvalidFormat("Duplicate column", i);
        column_in_format[col_idx] = true;

        checkSupportedDelimiterAfterField(row_format.escaping_rules[i], row_format.delimiters[i + 1], data_types[col_idx]);
    }

    /// Columns absent from the row format get a default value in every row.
    for (size_t column = 0; column < header_columns; ++column)
    {
        if (!column_in_format[column])
            always_default_columns.push_back(column);
    }
}
|
|
|
|
|
2019-08-23 19:47:22 +00:00
|
|
|
void TemplateRowInputFormat::readPrefix()
|
2019-04-07 21:30:54 +00:00
|
|
|
{
|
2021-12-15 11:30:57 +00:00
|
|
|
format_reader->readPrefix();
|
2019-04-07 21:30:54 +00:00
|
|
|
}
|
|
|
|
|
2019-08-23 19:47:22 +00:00
|
|
|
/// Reads one row into `columns`.
/// Returns false when the suffix was found instead of a row (and on every later call), true otherwise.
/// `extra.read_columns` receives, per header column, the value returned by deserializeField();
/// columns that are absent from the row format get a default value inserted.
bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra)
{
    /// The caller may invoke this function again after it has already returned false.
    if (unlikely(end_of_stream))
        return false;

    end_of_stream = format_reader->checkForSuffix();
    if (unlikely(end_of_stream))
        return false;

    updateDiagnosticInfo();

    /// No delimiter between rows is expected in front of the first row.
    if (likely(row_num != 1))
        format_reader->skipRowBetweenDelimiter();

    extra.read_columns.assign(columns.size(), false);

    for (size_t i = 0; i < row_format.columnsCount(); ++i)
    {
        format_reader->skipDelimiter(i);

        const auto & column_index = row_format.format_idx_to_column_idx[i];
        if (!column_index)
        {
            /// The field is not mapped to any column: just skip it.
            format_reader->skipField(row_format.escaping_rules[i]);
            continue;
        }

        const size_t col_idx = *column_index;
        extra.read_columns[col_idx] = deserializeField(data_types[col_idx], serializations[col_idx], *columns[col_idx], i);
    }

    format_reader->skipRowEndDelimiter();

    for (const auto & default_idx : always_default_columns)
        data_types[default_idx]->insertDefaultInto(*columns[default_idx]);

    return true;
}
|
|
|
|
|
2021-03-09 14:46:52 +00:00
|
|
|
bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type,
|
|
|
|
const SerializationPtr & serialization, IColumn & column, size_t file_column)
|
2019-04-07 21:30:54 +00:00
|
|
|
{
|
2021-11-09 13:14:07 +00:00
|
|
|
EscapingRule escaping_rule = row_format.escaping_rules[file_column];
|
2022-12-08 20:00:10 +00:00
|
|
|
updateFormatSettingsIfNeeded(escaping_rule, settings, row_format, default_csv_delimiter, file_column);
|
2022-11-17 15:21:38 +00:00
|
|
|
|
2019-04-21 15:37:04 +00:00
|
|
|
try
|
|
|
|
{
|
2021-12-10 17:54:08 +00:00
|
|
|
return deserializeFieldByEscapingRule(type, serialization, column, *buf, escaping_rule, settings);
|
2019-08-27 16:53:26 +00:00
|
|
|
}
|
|
|
|
catch (Exception & e)
|
|
|
|
{
|
|
|
|
if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
|
2021-12-15 11:30:57 +00:00
|
|
|
throwUnexpectedEof(row_num);
|
2019-04-21 15:37:04 +00:00
|
|
|
throw;
|
2019-04-07 21:30:54 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-08-23 19:47:22 +00:00
|
|
|
/// Writes diagnostic information to `out`.
/// First tries to parse the suffix and reports why it does not match, then (if the buffer position
/// has not moved) parses the next row piece by piece, reporting the first delimiter or field
/// that cannot be parsed.
/// Returns false as soon as a problem is reported; otherwise returns the result of checking
/// the delimiter after the last field.
bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
{
    out << "Suffix does not match: ";

    size_t last_successfully_parsed_idx = format_reader->getFormatDataIdx() + 1;
    const ReadBuffer::Position row_begin_pos = buf->position();

    bool suffix_parsed = true;
    try
    {
        PeekableReadBufferCheckpoint checkpoint{*buf, true};
        format_reader->tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format.columnsCount());
    }
    catch (Exception & e)
    {
        out << e.message() << " Near column " << last_successfully_parsed_idx;
        suffix_parsed = false;
    }

    if (suffix_parsed)
    {
        out << " There is some data after suffix (EOF expected, got ";
        verbosePrintString(buf->position(), std::min(buf->buffer().end(), buf->position() + 16), out);
        out << "). ";
    }
    out << " Format string (from format_schema): \n" << format.dump() << "\n";

    if (row_begin_pos != buf->position())
    {
        /// Pointers to buffer memory were invalidated during checking for suffix
        out << "\nCannot print more diagnostic info.";
        return false;
    }

    out << "\nUsing format string (from format_schema_rows): " << row_format.dump() << "\n";
    out << "\nTrying to parse next row, because suffix does not match:\n";

    /// The delimiter between rows is checked for every row except the first one.
    if (likely(row_num != 1))
    {
        if (!parseDelimiterWithDiagnosticInfo(out, *buf, row_between_delimiter, "delimiter between rows", ignore_spaces))
            return false;
    }

    for (size_t i = 0; i < row_format.columnsCount(); ++i)
    {
        const bool delimiter_ok
            = parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters[i], "delimiter before field " + std::to_string(i), ignore_spaces);
        if (!delimiter_ok)
            return false;

        format_reader->skipSpaces();

        const auto & column_index = row_format.format_idx_to_column_idx[i];
        if (column_index)
        {
            const auto & header = getPort().getHeader();
            const size_t col_idx = *column_index;
            const bool field_ok = deserializeFieldAndPrintDiagnosticInfo(
                header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], out, i);
            if (!field_ok)
            {
                out << "Maybe it's not possible to deserialize field " + std::to_string(i) +
                       " as " + escapingRuleToString(row_format.escaping_rules[i]);
                return false;
            }
        }
        else
        {
            /// Placeholder name, type and column used for fields that are not mapped to a header column.
            static const String skipped_column_str = "<SKIPPED COLUMN>";
            static const DataTypePtr skipped_column_type = std::make_shared<DataTypeNothing>();
            static const MutableColumnPtr skipped_column = skipped_column_type->createColumn();
            if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, i))
                return false;
        }
    }

    return parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters.back(), "delimiter after last field", ignore_spaces);
}
|
|
|
|
|
2021-11-16 14:10:18 +00:00
|
|
|
/// Checks that `delimiter` is next in `buf` (after optional whitespace when `skip_spaces` is set).
/// Returns true if it is. Otherwise writes to `out` an error that names `description` and shows
/// the expected delimiter and what was found instead, and returns false.
bool parseDelimiterWithDiagnosticInfo(WriteBuffer & out, ReadBuffer & buf, const String & delimiter, const String & description, bool skip_spaces)
{
    if (skip_spaces)
        skipWhitespaceIfAny(buf);

    try
    {
        assertString(delimiter, buf);
        return true;
    }
    catch (const DB::Exception &)
    {
        out << "ERROR: There is no " << description << ": expected ";
        verbosePrintString(delimiter.data(), delimiter.data() + delimiter.size(), out);
        out << ", got ";

        if (buf.eof())
        {
            out << "<End of stream>";
        }
        else
        {
            /// Show a little more than the delimiter length, limited by the end of the buffer.
            const auto shown_end = std::min(buf.position() + delimiter.size() + 10, buf.buffer().end());
            verbosePrintString(buf.position(), shown_end, out);
        }

        out << '\n';
        return false;
    }
}
|
|
|
|
|
2020-04-21 20:41:52 +00:00
|
|
|
void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
|
2019-04-15 02:45:57 +00:00
|
|
|
{
|
2021-03-09 14:46:52 +00:00
|
|
|
const auto & index = row_format.format_idx_to_column_idx[file_column];
|
|
|
|
if (index)
|
|
|
|
deserializeField(type, serializations[*index], column, file_column);
|
2019-08-27 16:53:26 +00:00
|
|
|
else
|
2021-12-15 11:30:57 +00:00
|
|
|
format_reader->skipField(row_format.escaping_rules[file_column]);
|
2019-04-15 02:45:57 +00:00
|
|
|
}
|
|
|
|
|
2019-08-23 19:47:22 +00:00
|
|
|
/// Always returns false: unexpected data after a field is reported as a wrong delimiter instead.
bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position)
{
    return false;
}
|
|
|
|
|
2019-08-23 19:47:22 +00:00
|
|
|
bool TemplateRowInputFormat::allowSyncAfterError() const
|
2019-04-17 18:10:24 +00:00
|
|
|
{
|
2019-12-25 19:17:41 +00:00
|
|
|
return !row_format.delimiters.back().empty() || !row_between_delimiter.empty();
|
2019-04-17 18:10:24 +00:00
|
|
|
}
|
|
|
|
|
2019-08-23 19:47:22 +00:00
|
|
|
void TemplateRowInputFormat::syncAfterError()
|
2019-04-17 18:10:24 +00:00
|
|
|
{
|
2021-12-10 17:54:08 +00:00
|
|
|
skipToNextRowOrEof(*buf, row_format.delimiters.back(), row_between_delimiter, ignore_spaces);
|
|
|
|
end_of_stream = buf->eof();
|
|
|
|
/// It can happen that buf->position() is not at the beginning of row
|
2019-08-27 16:53:26 +00:00
|
|
|
/// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter.
|
|
|
|
/// It will cause another parsing error.
|
2019-04-21 15:37:04 +00:00
|
|
|
}
|
|
|
|
|
2019-11-26 23:46:19 +00:00
|
|
|
void TemplateRowInputFormat::resetParser()
|
|
|
|
{
|
|
|
|
RowInputFormatWithDiagnosticInfo::resetParser();
|
|
|
|
end_of_stream = false;
|
2021-12-10 17:54:08 +00:00
|
|
|
buf->reset();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Switches the format to a new underlying read buffer.
/// A fresh PeekableReadBuffer is created over `in_` and registered with the base class.
void TemplateRowInputFormat::setReadBuffer(ReadBuffer & in_)
{
    buf = std::make_unique<PeekableReadBuffer>(in_);
    IInputFormat::setReadBuffer(*buf);

    /// The format reader keeps a pointer to the PeekableReadBuffer it was constructed with.
    /// That buffer has just been destroyed by the assignment above, so the reader is rebuilt
    /// on top of the new buffer with the same arguments the constructor uses.
    /// Note: this also resets the reader's own state (e.g. its row counter).
    format_reader = std::make_unique<TemplateFormatReader>(*buf, ignore_spaces, format, row_format, row_between_delimiter, settings);
}
|
2019-04-07 21:30:54 +00:00
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
/// Stores the buffer pointer and format descriptions, then validates both format strings:
///  - the result set format may reference only the `data` part (index 0), at most once,
///    and with the None escaping rule; its position is remembered in `format_data_idx`;
///  - the XML escaping rule is rejected in both the result set format and the row format.
TemplateFormatReader::TemplateFormatReader(
    PeekableReadBuffer & buf_,
    bool ignore_spaces_,
    const ParsedTemplateFormatString & format_,
    const ParsedTemplateFormatString & row_format_,
    std::string row_between_delimiter_,
    const FormatSettings & format_settings_)
    : buf(&buf_)
    , ignore_spaces(ignore_spaces_)
    , format(format_)
    , row_format(row_format_)
    , row_between_delimiter(row_between_delimiter_)
    , format_settings(format_settings_)
{
    /// Validate format string for result set
    bool data_part_seen = false;
    for (size_t i = 0; i < format.columnsCount(); ++i)
    {
        const auto & part_index = format.format_idx_to_column_idx[i];
        if (!part_index)
        {
            /// A field of the prefix/suffix that is only skipped.
            if (format.escaping_rules[i] == EscapingRule::XML)
                format.throwInvalidFormat("XML deserialization is not supported", i);
            continue;
        }

        if (*part_index != 0)
            format.throwInvalidFormat("Invalid input part", i);
        if (data_part_seen)
            format.throwInvalidFormat("${data} can occur only once", i);
        if (format.escaping_rules[i] != EscapingRule::None)
            format.throwInvalidFormat("${data} must have empty or None deserialization type", i);

        data_part_seen = true;
        format_data_idx = i;
    }

    /// Validate format string for rows
    for (size_t i = 0; i < row_format.columnsCount(); ++i)
    {
        if (row_format.escaping_rules[i] == EscapingRule::XML)
            row_format.throwInvalidFormat("XML deserialization is not supported", i);
    }
}
|
|
|
|
|
|
|
|
void TemplateFormatReader::readPrefix()
|
|
|
|
{
|
|
|
|
size_t last_successfully_parsed_idx = 0;
|
|
|
|
try
|
|
|
|
{
|
|
|
|
tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format_data_idx);
|
|
|
|
}
|
|
|
|
catch (Exception & e)
|
|
|
|
{
|
|
|
|
format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Skips one field written with the given escaping rule.
/// An ATTEMPT_TO_READ_AFTER_EOF exception is converted into the "unexpected EOF" parsing error;
/// any other exception is rethrown unchanged.
void TemplateFormatReader::skipField(EscapingRule escaping_rule)
{
    try
    {
        skipFieldByEscapingRule(*buf, escaping_rule, format_settings);
    }
    catch (Exception & e)
    {
        const bool hit_eof = e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF;
        if (!hit_eof)
            throw;
        throwUnexpectedEof(row_num);
    }
}
|
|
|
|
|
|
|
|
/// Asserts delimiters and skips fields in prefix or suffix.
|
|
|
|
/// tryReadPrefixOrSuffix<bool>(...) is used in checkForSuffix() to avoid throwing an exception after read of each row
|
|
|
|
/// (most likely false will be returned on first call of checkString(...))
|
|
|
|
template <typename ReturnType>
|
|
|
|
ReturnType TemplateFormatReader::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end)
|
|
|
|
{
|
|
|
|
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
|
|
|
|
|
|
|
|
skipSpaces();
|
|
|
|
if constexpr (throw_exception)
|
|
|
|
assertString(format.delimiters[input_part_beg], *buf);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
|
|
|
|
return ReturnType(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
while (input_part_beg < input_part_end)
|
|
|
|
{
|
|
|
|
skipSpaces();
|
|
|
|
if constexpr (throw_exception)
|
|
|
|
skipField(format.escaping_rules[input_part_beg]);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
try
|
|
|
|
{
|
|
|
|
skipField(format.escaping_rules[input_part_beg]);
|
|
|
|
}
|
|
|
|
catch (const Exception & e)
|
|
|
|
{
|
|
|
|
if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
|
|
|
|
e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
|
|
|
|
e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
|
|
|
|
throw;
|
|
|
|
/// If it's parsing error, then suffix is not found
|
|
|
|
return ReturnType(false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
++input_part_beg;
|
|
|
|
|
|
|
|
skipSpaces();
|
|
|
|
if constexpr (throw_exception)
|
|
|
|
assertString(format.delimiters[input_part_beg], *buf);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
|
|
|
|
return ReturnType(false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if constexpr (!throw_exception)
|
|
|
|
return ReturnType(true);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF.
|
|
|
|
/// Otherwise returns false
|
|
|
|
bool TemplateFormatReader::checkForSuffix()
|
|
|
|
{
|
|
|
|
PeekableReadBufferCheckpoint checkpoint{*buf};
|
|
|
|
bool suffix_found = false;
|
|
|
|
size_t last_successfully_parsed_idx = format_data_idx + 1;
|
|
|
|
try
|
|
|
|
{
|
|
|
|
suffix_found = tryReadPrefixOrSuffix<bool>(last_successfully_parsed_idx, format.columnsCount());
|
|
|
|
}
|
|
|
|
catch (const Exception & e)
|
|
|
|
{
|
|
|
|
if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
|
|
|
|
e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
|
|
|
|
e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
|
|
|
|
throw;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(suffix_found))
|
|
|
|
{
|
|
|
|
skipSpaces();
|
|
|
|
if (buf->eof())
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
buf->rollbackToCheckpoint();
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Consumes the delimiter that precedes the row format field `index`,
/// skipping spaces before and after it.
void TemplateFormatReader::skipDelimiter(size_t index)
{
    const auto & delimiter = row_format.delimiters[index];

    skipSpaces();
    assertString(delimiter, *buf);
    skipSpaces();
}
|
|
|
|
|
|
|
|
void TemplateFormatReader::skipRowEndDelimiter()
|
|
|
|
{
|
|
|
|
++row_num;
|
|
|
|
skipSpaces();
|
|
|
|
assertString(row_format.delimiters.back(), *buf);
|
|
|
|
skipSpaces();
|
|
|
|
}
|
|
|
|
|
|
|
|
void TemplateFormatReader::skipRowBetweenDelimiter()
|
|
|
|
{
|
|
|
|
skipSpaces();
|
|
|
|
assertString(row_between_delimiter, *buf);
|
|
|
|
skipSpaces();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a schema reader for the Template format.
/// The base class is given the default data types that correspond to the escaping rules of the row format;
/// column names are taken from the row format.
TemplateSchemaReader::TemplateSchemaReader(
    ReadBuffer & in_,
    bool ignore_spaces_,
    const ParsedTemplateFormatString & format_,
    const ParsedTemplateFormatString & row_format_,
    std::string row_between_delimiter,
    const FormatSettings & format_settings_)
    : IRowSchemaReader(buf, format_settings_, getDefaultDataTypeForEscapingRules(row_format_.escaping_rules))
    , buf(in_)
    , format(format_)
    , row_format(row_format_)
    , format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings)
    /// Remember the configured CSV delimiter: the format settings are adjusted per field while reading.
    , default_csv_delimiter(format_settings_.csv.delimiter)
{
    setColumnNames(row_format.column_names);
}
|
|
|
|
|
|
|
|
/// Reads the next row and returns one inferred data type per field of the row format.
/// Returns an empty list when the suffix is found instead of a row.
/// The prefix is read before the first row; the delimiter between rows is skipped before every later row.
DataTypes TemplateSchemaReader::readRowAndGetDataTypes()
{
    const bool is_first_row = first_row;

    if (is_first_row)
        format_reader.readPrefix();

    if (format_reader.checkForSuffix())
        return {};

    if (is_first_row)
        first_row = false;
    else
        format_reader.skipRowBetweenDelimiter();

    const size_t fields_count = row_format.columnsCount();

    DataTypes inferred_types;
    inferred_types.reserve(fields_count);

    String field;
    for (size_t i = 0; i != fields_count; ++i)
    {
        const auto escaping_rule = row_format.escaping_rules[i];

        format_reader.skipDelimiter(i);
        /// CSV fields need the delimiter settings to match the delimiter that follows this field.
        updateFormatSettingsIfNeeded(escaping_rule, format_settings, row_format, default_csv_delimiter, i);

        field = readFieldByEscapingRule(buf, escaping_rule, format_settings);
        inferred_types.push_back(tryInferDataTypeByEscapingRule(field, format_settings, escaping_rule, &json_inference_info));
    }

    format_reader.skipRowEndDelimiter();
    return inferred_types;
}
|
|
|
|
|
2022-12-07 21:19:27 +00:00
|
|
|
/// Forwards `type` and `new_type` to transformInferredTypesByEscapingRuleIfNeeded(),
/// using the escaping rule of the field at `field_index`.
void TemplateSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
    const auto escaping_rule = row_format.escaping_rules[field_index];
    transformInferredTypesByEscapingRuleIfNeeded(type, new_type, format_settings, escaping_rule, &json_inference_info);
}
|
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
/// Builds the result set format description.
/// If `template_settings.resultset_format` is set, the format string is read from that file and may
/// reference only the `data` part; any other part name throws SYNTAX_ERROR.
/// Otherwise the default format string "${data}" is used.
static ParsedTemplateFormatString fillResultSetFormat(const FormatSettings & settings)
{
    if (!settings.template_settings.resultset_format.empty())
    {
        /// Read format string from file
        return ParsedTemplateFormatString(
            FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false,
                             settings.schema.is_server, settings.schema.format_schema_path),
            [](const String & partName) -> std::optional<size_t>
            {
                if (partName != "data")
                    throw Exception("Unknown input part " + partName,
                                    ErrorCodes::SYNTAX_ERROR);
                return 0;
            });
    }

    /// Default format string: "${data}"
    ParsedTemplateFormatString default_format;
    default_format.delimiters.resize(2);
    default_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None);
    default_format.format_idx_to_column_idx.emplace_back(0);
    default_format.column_names.emplace_back("data");
    return default_format;
}
|
|
|
|
|
|
|
|
/// Builds the row format description from the file named in `template_settings.row_format`.
/// `idx_getter` and `allow_indexes` are passed through to ParsedTemplateFormatString.
static ParsedTemplateFormatString fillRowFormat(const FormatSettings & settings, ParsedTemplateFormatString::ColumnIdxGetter idx_getter, bool allow_indexes)
{
    const auto & template_settings = settings.template_settings;
    const auto & schema_settings = settings.schema;

    return ParsedTemplateFormatString(
        FormatSchemaInfo(template_settings.row_format, "Template", false, schema_settings.is_server, schema_settings.format_schema_path),
        idx_getter,
        allow_indexes);
}
|
|
|
|
|
2021-10-11 16:11:50 +00:00
|
|
|
/// Registers the input formats "Template" and "TemplateIgnoreSpaces" in the factory.
/// The two differ only in the `ignore_spaces` flag passed to TemplateRowInputFormat.
void registerInputFormatTemplate(FormatFactory & factory)
{
    for (bool ignore_spaces : {false, true})
    {
        auto creator = [ignore_spaces](
            ReadBuffer & buf,
            const Block & sample,
            IRowInputFormat::Params params,
            const FormatSettings & settings)
        {
            /// Maps a column name used in the row format to its position in the header.
            auto idx_getter = [&sample](const String & colName) -> std::optional<size_t>
            {
                return sample.getPositionByName(colName);
            };

            return std::make_shared<TemplateRowInputFormat>(
                sample,
                buf,
                params,
                settings,
                ignore_spaces,
                fillResultSetFormat(settings),
                fillRowFormat(settings, idx_getter, true),
                settings.template_settings.row_between_delimiter);
        };

        factory.registerInputFormat(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", creator);
    }
}
|
2019-09-24 14:25:22 +00:00
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
/// Registers the schema readers for "Template" and "TemplateIgnoreSpaces", together with the getter
/// of additional information that identifies the format settings in the schema cache.
void registerTemplateSchemaReader(FormatFactory & factory)
{
    for (bool ignore_spaces : {false, true})
    {
        String format_name = ignore_spaces ? "TemplateIgnoreSpaces" : "Template";
        factory.registerSchemaReader(format_name, [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings)
        {
            /// No header is available during schema inference: fields are numbered in order of appearance.
            size_t index = 0;
            auto idx_getter = [&](const String &) -> std::optional<size_t> { return index++; };
            auto row_format = fillRowFormat(settings, idx_getter, false);
            return std::make_shared<TemplateSchemaReader>(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings);
        });

        factory.registerAdditionalInfoForSchemaCacheGetter(format_name, [](const FormatSettings & settings)
        {
            size_t index = 0;
            auto idx_getter = [&](const String &) -> std::optional<size_t> { return index++; };
            auto row_format = fillRowFormat(settings, idx_getter, false);

            String result = fmt::format("row_format={}, resultset_format={}, row_between_delimiter={}",
                settings.template_settings.row_format,
                settings.template_settings.resultset_format,
                settings.template_settings.row_between_delimiter);

            /// Append the additional info once per distinct escaping rule used by the row format.
            /// The info must be requested for the rule of the template field itself
            /// (not for the escaping rule of the Regexp format settings).
            std::unordered_set<FormatSettings::EscapingRule> visited_escaping_rules;
            for (auto escaping_rule : row_format.escaping_rules)
            {
                const bool first_occurrence = visited_escaping_rules.insert(escaping_rule).second;
                if (first_occurrence)
                    result += ", " + getAdditionalFormatInfoByEscapingRule(settings, escaping_rule);
            }
            return result;
        });
    }
}
|
|
|
|
|
|
|
|
}
|