#include #include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int ATTEMPT_TO_READ_AFTER_EOF; extern const int CANNOT_READ_ALL_DATA; extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; extern const int CANNOT_PARSE_QUOTED_STRING; extern const int SYNTAX_ERROR; } [[noreturn]] static void throwUnexpectedEof(size_t row_num) { throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". " "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.", ErrorCodes::CANNOT_READ_ALL_DATA); } static void updateFormatSettingsIfNeeded(FormatSettings::EscapingRule escaping_rule, FormatSettings & settings, const ParsedTemplateFormatString & row_format, char default_csv_delimiter, size_t file_column) { if (escaping_rule != FormatSettings::EscapingRule::CSV) return; /// Clean custom_delimiter from previous column. settings.csv.custom_delimiter.clear(); /// If field delimiter is empty, we read until default csv delimiter. if (row_format.delimiters[file_column + 1].empty()) settings.csv.delimiter = default_csv_delimiter; /// If field delimiter has length = 1, it will be more efficient to use csv.delimiter. else if (row_format.delimiters[file_column + 1].size() == 1) settings.csv.delimiter = row_format.delimiters[file_column + 1].front(); /// If we have some complex delimiter, normal CSV reading will now work properly if we will /// use the first character of delimiter (for example, if delimiter='||' and we have data 'abc|d||') /// We have special implementation for such case that uses custom delimiter, it's not so efficient, /// but works properly. else settings.csv.custom_delimiter = row_format.delimiters[file_column + 1]; } TemplateRowInputFormat::TemplateRowInputFormat( const Block & header_, ReadBuffer & in_, const Params & params_, FormatSettings settings_, bool ignore_spaces_, ParsedTemplateFormatString format_, ParsedTemplateFormatString row_format_, std::string row_between_delimiter_) : TemplateRowInputFormat( header_, std::make_unique(in_), params_, settings_, ignore_spaces_, format_, row_format_, row_between_delimiter_) { } TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::unique_ptr buf_, const Params & params_, FormatSettings settings_, bool ignore_spaces_, ParsedTemplateFormatString format_, ParsedTemplateFormatString row_format_, std::string row_between_delimiter_) : RowInputFormatWithDiagnosticInfo(header_, *buf_, params_), buf(std::move(buf_)), data_types(header_.getDataTypes()), settings(std::move(settings_)), ignore_spaces(ignore_spaces_), format(std::move(format_)), row_format(std::move(row_format_)), default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(row_between_delimiter_), format_reader(std::make_unique(*buf, ignore_spaces_, format, row_format, row_between_delimiter, settings)) { /// Validate format string for rows std::vector column_in_format(header_.columns(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { const auto & column_index = row_format.format_idx_to_column_idx[i]; if (column_index) { if (header_.columns() <= *column_index) row_format.throwInvalidFormat("Column index " + std::to_string(*column_index) + " must be less then number of columns (" + std::to_string(header_.columns()) + ")", i); if (row_format.escaping_rules[i] == EscapingRule::None) row_format.throwInvalidFormat("Column is not skipped, but deserialization type is None", i); size_t col_idx = *column_index; if (column_in_format[col_idx]) row_format.throwInvalidFormat("Duplicate column", i); column_in_format[col_idx] = true; checkSupportedDelimiterAfterField(row_format.escaping_rules[i], row_format.delimiters[i + 1], data_types[*column_index]); } else { checkSupportedDelimiterAfterField(row_format.escaping_rules[i], row_format.delimiters[i + 1], nullptr); } } for (size_t i = 0; i < header_.columns(); ++i) if (!column_in_format[i]) always_default_columns.push_back(i); } void TemplateRowInputFormat::readPrefix() { format_reader->readPrefix(); } bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra) { /// This function can be called again after it returned false if (unlikely(end_of_stream)) return false; if (unlikely(format_reader->checkForSuffix())) { end_of_stream = true; return false; } updateDiagnosticInfo(); if (likely(row_num != 1)) format_reader->skipRowBetweenDelimiter(); extra.read_columns.assign(columns.size(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { format_reader->skipDelimiter(i); if (row_format.format_idx_to_column_idx[i]) { size_t col_idx = *row_format.format_idx_to_column_idx[i]; extra.read_columns[col_idx] = deserializeField(data_types[col_idx], serializations[col_idx], *columns[col_idx], i); } else format_reader->skipField(row_format.escaping_rules[i]); } format_reader->skipRowEndDelimiter(); for (const auto & idx : always_default_columns) data_types[idx]->insertDefaultInto(*columns[idx]); return true; } bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type, const SerializationPtr & serialization, IColumn & column, size_t file_column) { EscapingRule escaping_rule = row_format.escaping_rules[file_column]; updateFormatSettingsIfNeeded(escaping_rule, settings, row_format, default_csv_delimiter, file_column); try { return deserializeFieldByEscapingRule(type, serialization, column, *buf, escaping_rule, settings); } catch (Exception & e) { if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) throwUnexpectedEof(row_num); throw; } } bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { out << "Suffix does not match: "; size_t last_successfully_parsed_idx = format_reader->getFormatDataIdx() + 1; const ReadBuffer::Position row_begin_pos = buf->position(); bool caught = false; try { PeekableReadBufferCheckpoint checkpoint{*buf, true}; format_reader->tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); } catch (Exception & e) { out << e.message() << " Near column " << last_successfully_parsed_idx; caught = true; } if (!caught) { out << " There is some data after suffix (EOF expected, got "; verbosePrintString(buf->position(), std::min(buf->buffer().end(), buf->position() + 16), out); out << "). "; } out << " Format string (from format_schema): \n" << format.dump() << "\n"; if (row_begin_pos != buf->position()) { /// Pointers to buffer memory were invalidated during checking for suffix out << "\nCannot print more diagnostic info."; return false; } out << "\nUsing format string (from format_schema_rows): " << row_format.dump() << "\n"; out << "\nTrying to parse next row, because suffix does not match:\n"; if (likely(row_num != 1) && !parseDelimiterWithDiagnosticInfo(out, *buf, row_between_delimiter, "delimiter between rows", ignore_spaces)) return false; for (size_t i = 0; i < row_format.columnsCount(); ++i) { if (!parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters[i], "delimiter before field " + std::to_string(i), ignore_spaces)) return false; format_reader->skipSpaces(); if (row_format.format_idx_to_column_idx[i]) { const auto & header = getPort().getHeader(); size_t col_idx = *row_format.format_idx_to_column_idx[i]; if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], out, i)) { out << "Maybe it's not possible to deserialize field " + std::to_string(i) + " as " + escapingRuleToString(row_format.escaping_rules[i]); return false; } } else { static const String skipped_column_str = ""; static const DataTypePtr skipped_column_type = std::make_shared(); static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, i)) return false; } } return parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters.back(), "delimiter after last field", ignore_spaces); } bool parseDelimiterWithDiagnosticInfo(WriteBuffer & out, ReadBuffer & buf, const String & delimiter, const String & description, bool skip_spaces) { if (skip_spaces) skipWhitespaceIfAny(buf); try { assertString(delimiter, buf); } catch (const DB::Exception &) { out << "ERROR: There is no " << description << ": expected "; verbosePrintString(delimiter.data(), delimiter.data() + delimiter.size(), out); out << ", got "; if (buf.eof()) out << ""; else verbosePrintString(buf.position(), std::min(buf.position() + delimiter.size() + 10, buf.buffer().end()), out); out << '\n'; return false; } return true; } void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) { const auto & index = row_format.format_idx_to_column_idx[file_column]; if (index) deserializeField(type, serializations[*index], column, file_column); else format_reader->skipField(row_format.escaping_rules[file_column]); } bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position) { /// Garbage will be considered as wrong delimiter return false; } bool TemplateRowInputFormat::allowSyncAfterError() const { return !row_format.delimiters.back().empty() || !row_between_delimiter.empty(); } void TemplateRowInputFormat::syncAfterError() { skipToNextRowOrEof(*buf, row_format.delimiters.back(), row_between_delimiter, ignore_spaces); end_of_stream = buf->eof(); /// It can happen that buf->position() is not at the beginning of row /// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter. /// It will cause another parsing error. } void TemplateRowInputFormat::resetParser() { RowInputFormatWithDiagnosticInfo::resetParser(); end_of_stream = false; buf->reset(); } void TemplateRowInputFormat::setReadBuffer(ReadBuffer & in_) { buf = std::make_unique(in_); IInputFormat::setReadBuffer(*buf); } TemplateFormatReader::TemplateFormatReader( PeekableReadBuffer & buf_, bool ignore_spaces_, const ParsedTemplateFormatString & format_, const ParsedTemplateFormatString & row_format_, std::string row_between_delimiter_, const FormatSettings & format_settings_) : buf(&buf_) , ignore_spaces(ignore_spaces_) , format(format_) , row_format(row_format_) , row_between_delimiter(row_between_delimiter_) , format_settings(format_settings_) { /// Validate format string for result set bool has_data = false; for (size_t i = 0; i < format.columnsCount(); ++i) { if (format.format_idx_to_column_idx[i]) { if (*format.format_idx_to_column_idx[i] != 0) format.throwInvalidFormat("Invalid input part", i); if (has_data) format.throwInvalidFormat("${data} can occur only once", i); if (format.escaping_rules[i] != EscapingRule::None) format.throwInvalidFormat("${data} must have empty or None deserialization type", i); has_data = true; format_data_idx = i; } else { if (format.escaping_rules[i] == EscapingRule::XML) format.throwInvalidFormat("XML deserialization is not supported", i); } } /// Validate format string for rows for (size_t i = 0; i < row_format.columnsCount(); ++i) { if (row_format.escaping_rules[i] == EscapingRule::XML) row_format.throwInvalidFormat("XML deserialization is not supported", i); } } void TemplateFormatReader::readPrefix() { size_t last_successfully_parsed_idx = 0; try { tryReadPrefixOrSuffix(last_successfully_parsed_idx, format_data_idx); } catch (Exception & e) { format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx); } } void TemplateFormatReader::skipField(EscapingRule escaping_rule) { try { skipFieldByEscapingRule(*buf, escaping_rule, format_settings); } catch (Exception & e) { if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) throwUnexpectedEof(row_num); throw; } } /// Asserts delimiters and skips fields in prefix or suffix. /// tryReadPrefixOrSuffix(...) is used in checkForSuffix() to avoid throwing an exception after read of each row /// (most likely false will be returned on first call of checkString(...)) template ReturnType TemplateFormatReader::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end) { static constexpr bool throw_exception = std::is_same_v; skipSpaces(); if constexpr (throw_exception) assertString(format.delimiters[input_part_beg], *buf); else { if (likely(!checkString(format.delimiters[input_part_beg], *buf))) return ReturnType(false); } while (input_part_beg < input_part_end) { skipSpaces(); if constexpr (throw_exception) skipField(format.escaping_rules[input_part_beg]); else { try { skipField(format.escaping_rules[input_part_beg]); } catch (const Exception & e) { if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) throw; /// If it's parsing error, then suffix is not found return ReturnType(false); } } ++input_part_beg; skipSpaces(); if constexpr (throw_exception) assertString(format.delimiters[input_part_beg], *buf); else { if (likely(!checkString(format.delimiters[input_part_beg], *buf))) return ReturnType(false); } } if constexpr (!throw_exception) return ReturnType(true); } /// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF. /// Otherwise returns false bool TemplateFormatReader::checkForSuffix() { PeekableReadBufferCheckpoint checkpoint{*buf}; bool suffix_found = false; size_t last_successfully_parsed_idx = format_data_idx + 1; try { suffix_found = tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); } catch (const Exception & e) { if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) throw; } if (unlikely(suffix_found)) { skipSpaces(); if (buf->eof()) return true; } buf->rollbackToCheckpoint(); return false; } void TemplateFormatReader::skipDelimiter(size_t index) { skipSpaces(); assertString(row_format.delimiters[index], *buf); skipSpaces(); } void TemplateFormatReader::skipRowEndDelimiter() { ++row_num; skipSpaces(); assertString(row_format.delimiters.back(), *buf); skipSpaces(); } void TemplateFormatReader::skipRowBetweenDelimiter() { skipSpaces(); assertString(row_between_delimiter, *buf); skipSpaces(); } TemplateSchemaReader::TemplateSchemaReader( ReadBuffer & in_, bool ignore_spaces_, const ParsedTemplateFormatString & format_, const ParsedTemplateFormatString & row_format_, std::string row_between_delimiter, const FormatSettings & format_settings_) : IRowSchemaReader(buf, format_settings_, getDefaultDataTypeForEscapingRules(row_format_.escaping_rules)) , buf(in_) , format(format_) , row_format(row_format_) , format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings) , default_csv_delimiter(format_settings_.csv.delimiter) { setColumnNames(row_format.column_names); } DataTypes TemplateSchemaReader::readRowAndGetDataTypes() { if (first_row) format_reader.readPrefix(); if (format_reader.checkForSuffix()) return {}; if (first_row) first_row = false; else format_reader.skipRowBetweenDelimiter(); DataTypes data_types; data_types.reserve(row_format.columnsCount()); String field; for (size_t i = 0; i != row_format.columnsCount(); ++i) { format_reader.skipDelimiter(i); updateFormatSettingsIfNeeded(row_format.escaping_rules[i], format_settings, row_format, default_csv_delimiter, i); field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings); data_types.push_back(tryInferDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], &json_inference_info)); } format_reader.skipRowEndDelimiter(); return data_types; } void TemplateSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) { transformInferredTypesByEscapingRuleIfNeeded(type, new_type, format_settings, row_format.escaping_rules[field_index], &json_inference_info); } static ParsedTemplateFormatString fillResultSetFormat(const FormatSettings & settings) { ParsedTemplateFormatString resultset_format; if (settings.template_settings.resultset_format.empty()) { /// Default format string: "${data}" resultset_format.delimiters.resize(2); resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None); resultset_format.format_idx_to_column_idx.emplace_back(0); resultset_format.column_names.emplace_back("data"); } else { /// Read format string from file resultset_format = ParsedTemplateFormatString( FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), [&](const String & partName) -> std::optional { if (partName == "data") return 0; throw Exception("Unknown input part " + partName, ErrorCodes::SYNTAX_ERROR); }); } return resultset_format; } static ParsedTemplateFormatString fillRowFormat(const FormatSettings & settings, ParsedTemplateFormatString::ColumnIdxGetter idx_getter, bool allow_indexes) { return ParsedTemplateFormatString( FormatSchemaInfo( settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), idx_getter, allow_indexes); } void registerInputFormatTemplate(FormatFactory & factory) { for (bool ignore_spaces : {false, true}) { factory.registerInputFormat(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [=]( ReadBuffer & buf, const Block & sample, IRowInputFormat::Params params, const FormatSettings & settings) { auto idx_getter = [&](const String & colName) -> std::optional { return sample.getPositionByName(colName); }; return std::make_shared( sample, buf, params, settings, ignore_spaces, fillResultSetFormat(settings), fillRowFormat(settings, idx_getter, true), settings.template_settings.row_between_delimiter); }); } } void registerTemplateSchemaReader(FormatFactory & factory) { for (bool ignore_spaces : {false, true}) { String format_name = ignore_spaces ? "TemplateIgnoreSpaces" : "Template"; factory.registerSchemaReader(format_name, [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings) { size_t index = 0; auto idx_getter = [&](const String &) -> std::optional { return index++; }; auto row_format = fillRowFormat(settings, idx_getter, false); return std::make_shared(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings); }); factory.registerAdditionalInfoForSchemaCacheGetter(format_name, [](const FormatSettings & settings) { size_t index = 0; auto idx_getter = [&](const String &) -> std::optional { return index++; }; auto row_format = fillRowFormat(settings, idx_getter, false); std::unordered_set visited_escaping_rules; String result = fmt::format("row_format={}, resultset_format={}, row_between_delimiter={}", settings.template_settings.row_format, settings.template_settings.resultset_format, settings.template_settings.row_between_delimiter); for (auto escaping_rule : row_format.escaping_rules) { if (!visited_escaping_rules.contains(escaping_rule)) result += ", " + getAdditionalFormatInfoByEscapingRule(settings, settings.regexp.escaping_rule); visited_escaping_rules.insert(escaping_rule); } return result; }); } } }