// ClickHouse/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp

#include <Processors/Formats/Impl/TemplateRowInputFormat.h>
#include <Formats/FormatFactory.h>
#include <Formats/verbosePrintString.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/Operators.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/Serializations/SerializationNullable.h>

namespace DB
{

/// Error codes referenced in this file. They are only declared here (`extern`);
/// their definitions are provided elsewhere.
namespace ErrorCodes
{
    extern const int ATTEMPT_TO_READ_AFTER_EOF;
    extern const int CANNOT_READ_ALL_DATA;
    extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
    extern const int CANNOT_PARSE_QUOTED_STRING;
    extern const int SYNTAX_ERROR;
}

/// Reports that the input ended in the middle of row `row_num`.
/// Always throws ParsingException with code CANNOT_READ_ALL_DATA and never returns.
[[noreturn]] static void throwUnexpectedEof(size_t row_num)
{
    std::string message = "Unexpected EOF while parsing row ";
    message += std::to_string(row_num);
    message += ". Maybe last row has wrong format or input doesn't contain specified suffix before EOF.";
    throw ParsingException(std::move(message), ErrorCodes::CANNOT_READ_ALL_DATA);
}

/// Adjusts the CSV part of `settings` before the field at position `file_column` of `row_format` is read.
/// Nothing is changed unless `escaping_rule` is CSV. For a CSV field, the delimiter that follows the field
/// in the row format decides what is stored:
///  - empty delimiter: settings.csv.delimiter is set to `default_csv_delimiter`;
///  - one character:   settings.csv.delimiter is set to that character;
///  - longer string:   settings.csv.custom_delimiter is set to the whole string.
static void updateFormatSettingsIfNeeded(FormatSettings::EscapingRule escaping_rule, FormatSettings & settings, const ParsedTemplateFormatString & row_format, char default_csv_delimiter, size_t file_column)
{
    if (escaping_rule != FormatSettings::EscapingRule::CSV)
        return;

    /// Forget the custom delimiter that may have been set for the previous column.
    settings.csv.custom_delimiter.clear();

    const auto & delimiter_after_field = row_format.delimiters[file_column + 1];
    switch (delimiter_after_field.size())
    {
        case 0:
            /// Nothing follows the field in the template, so read until the default CSV delimiter.
            settings.csv.delimiter = default_csv_delimiter;
            break;
        case 1:
            /// A single character can be handled by the regular (more efficient) csv.delimiter path.
            settings.csv.delimiter = delimiter_after_field.front();
            break;
        default:
            /// With a complex delimiter, normal CSV reading will not work properly if only its first
            /// character is used (for example, delimiter='||' and data 'abc|d||').
            /// The custom-delimiter implementation is less efficient, but handles this case properly.
            settings.csv.custom_delimiter = delimiter_after_field;
            break;
    }
}

/// Convenience constructor: wraps `in_` into a newly created PeekableReadBuffer and delegates to the
/// constructor that takes ownership of the buffer.
/// The by-value arguments are moved (not copied) into the delegated constructor; they are not used afterwards.
TemplateRowInputFormat::TemplateRowInputFormat(
    const Block & header_,
    ReadBuffer & in_,
    const Params & params_,
    FormatSettings settings_,
    bool ignore_spaces_,
    ParsedTemplateFormatString format_,
    ParsedTemplateFormatString row_format_,
    std::string row_between_delimiter_)
    : TemplateRowInputFormat(
        header_,
        std::make_unique<PeekableReadBuffer>(in_),
        params_,
        std::move(settings_),
        ignore_spaces_,
        std::move(format_),
        std::move(row_format_),
        std::move(row_between_delimiter_))
{
}

/// Main constructor: takes ownership of the peekable buffer, creates the format reader on top of it and
/// validates the row format against the header. Invalid formats are reported via row_format.throwInvalidFormat.
TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::unique_ptr<PeekableReadBuffer> buf_, const Params & params_,
                                               FormatSettings settings_, bool ignore_spaces_,
                                               ParsedTemplateFormatString format_, ParsedTemplateFormatString row_format_,
                                               std::string row_between_delimiter_)
    : RowInputFormatWithDiagnosticInfo(header_, *buf_, params_), buf(std::move(buf_)), data_types(header_.getDataTypes()),
      settings(std::move(settings_)), ignore_spaces(ignore_spaces_),
      format(std::move(format_)), row_format(std::move(row_format_)),
      default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(row_between_delimiter_),
      format_reader(std::make_unique<TemplateFormatReader>(*buf, ignore_spaces_, format, row_format, row_between_delimiter, settings))
{
    /// Check every part of the row format string against the header.
    const size_t header_columns = header_.columns();
    std::vector<UInt8> seen_in_format(header_columns, false);

    for (size_t part = 0; part < row_format.columnsCount(); ++part)
    {
        const auto rule = row_format.escaping_rules[part];
        const auto & delimiter_after = row_format.delimiters[part + 1];
        const auto & mapped_column = row_format.format_idx_to_column_idx[part];

        if (!mapped_column)
        {
            /// The part is not bound to a column, so no data type is passed to the check.
            checkSupportedDelimiterAfterField(rule, delimiter_after, nullptr);
            continue;
        }

        const size_t col_idx = *mapped_column;
        if (col_idx >= header_columns)
            row_format.throwInvalidFormat("Column index " + std::to_string(col_idx) +
                                          " must be less then number of columns (" + std::to_string(header_columns) + ")", part);
        if (rule == EscapingRule::None)
            row_format.throwInvalidFormat("Column is not skipped, but deserialization type is None", part);

        /// Each header column may be bound to at most one part of the format.
        if (seen_in_format[col_idx])
            row_format.throwInvalidFormat("Duplicate column", part);
        seen_in_format[col_idx] = true;

        checkSupportedDelimiterAfterField(rule, delimiter_after, data_types[col_idx]);
    }

    /// Remember the header columns that the row format never mentions.
    for (size_t col = 0; col < header_columns; ++col)
        if (!seen_in_format[col])
            always_default_columns.push_back(col);
}

/// Delegates reading of the prefix to the format reader.
void TemplateRowInputFormat::readPrefix()
{
    format_reader->readPrefix();
}

/// Reads one row into `columns`.
/// Returns false when the format reader reports that only the suffix is left (and on every later call);
/// returns true after a row has been read.
/// `extra.read_columns[i]` receives the result of deserializeField for columns bound in the row format and
/// stays false for all other columns; columns listed in `always_default_columns` get a default value.
bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra)
{
    /// The function may be called again after it has already returned false.
    if (unlikely(end_of_stream))
        return false;

    if (unlikely(format_reader->checkForSuffix()))
    {
        end_of_stream = true;
        return false;
    }

    updateDiagnosticInfo();

    /// The delimiter between rows is skipped for every row except the one with row_num == 1.
    const bool is_first_row = (row_num == 1);
    if (likely(!is_first_row))
        format_reader->skipRowBetweenDelimiter();

    extra.read_columns.assign(columns.size(), false);

    for (size_t part = 0; part < row_format.columnsCount(); ++part)
    {
        format_reader->skipDelimiter(part);

        const auto & mapped_column = row_format.format_idx_to_column_idx[part];
        if (!mapped_column)
        {
            /// The part is not bound to a column: consume the value and drop it.
            format_reader->skipField(row_format.escaping_rules[part]);
            continue;
        }

        const size_t col_idx = *mapped_column;
        extra.read_columns[col_idx] = deserializeField(data_types[col_idx], serializations[col_idx], *columns[col_idx], part);
    }

    format_reader->skipRowEndDelimiter();

    for (const size_t col_idx : always_default_columns)
        data_types[col_idx]->insertDefaultInto(*columns[col_idx]);

    return true;
}

bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type,
    const SerializationPtr & serialization, IColumn & column, size_t file_column)
{
    EscapingRule escaping_rule = row_format.escaping_rules[file_column];
    updateFormatSettingsIfNeeded(escaping_rule, settings, row_format, default_csv_delimiter, file_column);

    try
    {
        return deserializeFieldByEscapingRule(type, serialization, column, *buf, escaping_rule, settings);
    }
    catch (Exception & e)
    {
        if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
            throwUnexpectedEof(row_num);
        throw;
    }
}

/// Writes a human-readable description of a parsing failure into `out`.
/// First the suffix of the result-set format is re-checked and the reason it does not match is printed.
/// Then, if the buffer position is the same as before that check, the next row is parsed step by step
/// (delimiter, field, delimiter, ...) and the first problem found is printed.
/// Returns false as soon as a problem has been reported or no more diagnostics can be produced;
/// otherwise returns the result of checking the delimiter after the last field.
bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
{
    out << "Suffix does not match: ";
    /// The suffix starts right after the ${data} part of the result-set format.
    size_t last_successfully_parsed_idx = format_reader->getFormatDataIdx() + 1;
    /// Remembered to detect below whether the buffer position changed during the suffix check.
    const ReadBuffer::Position row_begin_pos = buf->position();
    bool caught = false;
    try
    {
        /// NOTE(review): the `true` argument presumably makes the checkpoint roll the buffer back
        /// when it goes out of scope — confirm in PeekableReadBufferCheckpoint.
        PeekableReadBufferCheckpoint checkpoint{*buf, true};
        format_reader->tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format.columnsCount());
    }
    catch (Exception & e)
    {
        out << e.message() << " Near column " << last_successfully_parsed_idx;
        caught = true;
    }
    if (!caught)
    {
        /// The suffix itself was read without an exception; show up to 16 bytes from the current position.
        out << " There is some data after suffix (EOF expected, got ";
        verbosePrintString(buf->position(), std::min(buf->buffer().end(), buf->position() + 16), out);
        out << "). ";
    }
    out << " Format string (from format_schema): \n" << format.dump() << "\n";

    if (row_begin_pos != buf->position())
    {
        /// Pointers to buffer memory were invalidated during checking for suffix
        out << "\nCannot print more diagnostic info.";
        return false;
    }

    out << "\nUsing format string (from format_schema_rows): " << row_format.dump() << "\n";
    out << "\nTrying to parse next row, because suffix does not match:\n";
    /// The delimiter between rows is checked for every row except the one with row_num == 1.
    if (likely(row_num != 1) && !parseDelimiterWithDiagnosticInfo(out, *buf, row_between_delimiter, "delimiter between rows", ignore_spaces))
        return false;

    for (size_t i = 0; i < row_format.columnsCount(); ++i)
    {
        if (!parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters[i], "delimiter before field " + std::to_string(i), ignore_spaces))
            return false;

        format_reader->skipSpaces();
        if (row_format.format_idx_to_column_idx[i])
        {
            /// The part is bound to a header column: deserialize into that column and report its name.
            const auto & header = getPort().getHeader();
            size_t col_idx = *row_format.format_idx_to_column_idx[i];
            if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx],
                                                        *columns[col_idx], out, i))
            {
                out << "Maybe it's not possible to deserialize field " + std::to_string(i) +
                       " as " + escapingRuleToString(row_format.escaping_rules[i]);
                return false;
            }
        }
        else
        {
            /// The part is not bound to a column: a placeholder name, type and column are used instead.
            static const String skipped_column_str = "<SKIPPED COLUMN>";
            static const DataTypePtr skipped_column_type = std::make_shared<DataTypeNothing>();
            static const MutableColumnPtr skipped_column = skipped_column_type->createColumn();
            if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, i))
                return false;
        }
    }

    return parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters.back(), "delimiter after last field", ignore_spaces);
}

/// Tries to read `delimiter` from `buf`; leading whitespace is skipped first when `skip_spaces` is set.
/// Returns true if the delimiter was read.
/// Otherwise writes an error line to `out` (what was expected, named by `description`, and what was found
/// instead: either "<End of stream>" or a short excerpt of the buffer) and returns false.
bool parseDelimiterWithDiagnosticInfo(WriteBuffer & out, ReadBuffer & buf, const String & delimiter, const String & description, bool skip_spaces)
{
    if (skip_spaces)
        skipWhitespaceIfAny(buf);
    try
    {
        assertString(delimiter, buf);
    }
    catch (const DB::Exception &)
    {
        out << "ERROR: There is no " << description << ": expected ";
        verbosePrintString(delimiter.data(), delimiter.data() + delimiter.size(), out);
        out << ", got ";
        if (buf.eof())
            out << "<End of stream>";
        else
        {
            /// Show at most delimiter.size() + 10 bytes. The length is clamped to what is left in the buffer
            /// before any pointer is computed, so that no pointer beyond the end of the buffer is ever formed.
            const size_t bytes_left = static_cast<size_t>(buf.buffer().end() - buf.position());
            const size_t bytes_to_show = std::min(delimiter.size() + 10, bytes_left);
            verbosePrintString(buf.position(), buf.position() + bytes_to_show, out);
        }
        out << '\n';
        return false;
    }
    return true;
}

/// Reads the value at position `file_column` of the row format:
/// deserializes it into `column` when the position is bound to a header column, otherwise just skips it.
void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
{
    const auto & mapped_column = row_format.format_idx_to_column_idx[file_column];
    if (!mapped_column)
    {
        format_reader->skipField(row_format.escaping_rules[file_column]);
        return;
    }

    deserializeField(type, serializations[*mapped_column], column, file_column);
}

/// Always returns false: both arguments are ignored, because unexpected data after a field
/// is reported as a wrong delimiter instead.
bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position)
{
    /// Garbage will be considered as wrong delimiter
    return false;
}

bool TemplateRowInputFormat::allowSyncAfterError() const
{
    return !row_format.delimiters.back().empty() || !row_between_delimiter.empty();
}

/// Called after a parsing error: skips input up to the next row (or EOF) using the delimiter after
/// the last field and the delimiter between rows, then records whether the end of input was reached.
void TemplateRowInputFormat::syncAfterError()
{
    skipToNextRowOrEof(*buf, row_format.delimiters.back(), row_between_delimiter, ignore_spaces);
    end_of_stream = buf->eof();
    /// It can happen that buf->position() is not at the beginning of a row
    /// if some delimiter is similar to row_format.delimiters.back() or row_between_delimiter.
    /// That will cause another parsing error.
}

/// Resets the parser state: the base-class state first, then the end-of-stream flag and the peekable buffer.
void TemplateRowInputFormat::resetParser()
{
    RowInputFormatWithDiagnosticInfo::resetParser();
    end_of_stream = false;
    buf->reset();
}

/// Replaces the input: wraps `in_` into a new PeekableReadBuffer, stores it in `buf`
/// (the previously owned buffer is destroyed by the assignment) and passes it to IInputFormat::setReadBuffer.
/// NOTE(review): `format_reader` was created with the previous `*buf`, and TemplateFormatReader keeps the
/// address of the buffer it was given. Nothing in this function re-points the reader at the new buffer,
/// so it looks like the reader keeps referring to the destroyed buffer afterwards — confirm, and re-seat
/// or rebuild the reader here if so.
void TemplateRowInputFormat::setReadBuffer(ReadBuffer & in_)
{
    buf = std::make_unique<PeekableReadBuffer>(in_);
    IInputFormat::setReadBuffer(*buf);
}

/// Stores the address of `buf_` together with the formats and settings, then validates both format strings.
/// Violations are reported through throwInvalidFormat of the corresponding format string.
/// As a side effect, `format_data_idx` is set to the position of the ${data} part of the result-set format.
TemplateFormatReader::TemplateFormatReader(
    PeekableReadBuffer & buf_,
    bool ignore_spaces_,
    const ParsedTemplateFormatString & format_,
    const ParsedTemplateFormatString & row_format_,
    std::string row_between_delimiter_,
    const FormatSettings & format_settings_)
    : buf(&buf_)
    , ignore_spaces(ignore_spaces_)
    , format(format_)
    , row_format(row_format_)
    , row_between_delimiter(row_between_delimiter_)
    , format_settings(format_settings_)
{
    /// Result-set format: the only bound part allowed is ${data} (index 0), at most once and without
    /// an escaping rule; unbound parts must not use XML.
    bool data_part_seen = false;
    for (size_t part = 0; part < format.columnsCount(); ++part)
    {
        const auto & mapped = format.format_idx_to_column_idx[part];
        if (!mapped)
        {
            if (format.escaping_rules[part] == EscapingRule::XML)
                format.throwInvalidFormat("XML deserialization is not supported", part);
            continue;
        }

        if (*mapped != 0)
            format.throwInvalidFormat("Invalid input part", part);
        if (data_part_seen)
            format.throwInvalidFormat("${data} can occur only once", part);
        if (format.escaping_rules[part] != EscapingRule::None)
            format.throwInvalidFormat("${data} must have empty or None deserialization type", part);
        data_part_seen = true;
        format_data_idx = part;
    }

    /// Row format: XML cannot be used for reading.
    for (size_t part = 0; part < row_format.columnsCount(); ++part)
        if (row_format.escaping_rules[part] == EscapingRule::XML)
            row_format.throwInvalidFormat("XML deserialization is not supported", part);
}

/// Reads the parts of the result-set format from the beginning up to the ${data} part.
/// Any exception raised on the way is converted into an invalid-format error that carries the original
/// message and the index of the part reached so far.
void TemplateFormatReader::readPrefix()
{
    size_t reached_part_idx = 0;
    try
    {
        tryReadPrefixOrSuffix<void>(reached_part_idx, format_data_idx);
    }
    catch (Exception & e)
    {
        /// `reached_part_idx` has been advanced by tryReadPrefixOrSuffix past every field it skipped.
        format.throwInvalidFormat(e.message() + " While parsing prefix", reached_part_idx);
    }
}

/// Skips one value written with `escaping_rule`.
/// An exception with code ATTEMPT_TO_READ_AFTER_EOF is replaced by the "Unexpected EOF" parsing error;
/// every other exception is rethrown as is.
void TemplateFormatReader::skipField(EscapingRule escaping_rule)
{
    try
    {
        skipFieldByEscapingRule(*buf, escaping_rule, format_settings);
    }
    catch (Exception & e)
    {
        if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
            throw;
        throwUnexpectedEof(row_num);
    }
}

/// Asserts delimiters and skips fields in prefix or suffix.
/// tryReadPrefixOrSuffix<bool>(...) is used in checkForSuffix() to avoid throwing an exception after read of each row
/// (most likely false will be returned on first call of checkString(...))
///
/// Reads delimiter `input_part_beg`, then repeatedly a field and the next delimiter until
/// `input_part_beg` reaches `input_part_end`.
/// `input_part_beg` is passed by reference and incremented after each skipped field, so the caller can see
/// how far reading got, also when an exception leaves the function.
/// With ReturnType = void a mismatching delimiter makes assertString throw and nothing is returned.
/// With ReturnType = bool the function returns false on a mismatching delimiter or when skipping a field
/// fails with one of the error codes listed below, and true when everything has been read.
template <typename ReturnType>
ReturnType TemplateFormatReader::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end)
{
    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;

    /// Delimiter before the first field of the range.
    skipSpaces();
    if constexpr (throw_exception)
        assertString(format.delimiters[input_part_beg], *buf);
    else
    {
        if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
            return ReturnType(false);
    }

    while (input_part_beg < input_part_end)
    {
        skipSpaces();
        if constexpr (throw_exception)
            skipField(format.escaping_rules[input_part_beg]);
        else
        {
            try
            {
                skipField(format.escaping_rules[input_part_beg]);
            }
            catch (const Exception & e)
            {
                if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
                    e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
                    e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
                    throw;
                /// If it's parsing error, then suffix is not found
                return ReturnType(false);
            }
        }
        ++input_part_beg;

        /// Delimiter after the field that has just been skipped.
        skipSpaces();
        if constexpr (throw_exception)
            assertString(format.delimiters[input_part_beg], *buf);
        else
        {
            if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
                return ReturnType(false);
        }
    }

    /// In the void instantiation there is nothing to return.
    if constexpr (!throw_exception)
        return ReturnType(true);
}

/// Returns true if all rows have been read, i.e. only the suffix and spaces (if ignore_spaces == true)
/// are left before EOF. Otherwise rolls the buffer back to where it was on entry and returns false.
/// Exceptions with codes other than ATTEMPT_TO_READ_AFTER_EOF, CANNOT_PARSE_ESCAPE_SEQUENCE and
/// CANNOT_PARSE_QUOTED_STRING are propagated to the caller.
bool TemplateFormatReader::checkForSuffix()
{
    PeekableReadBufferCheckpoint checkpoint{*buf};

    /// The suffix starts right after the ${data} part of the result-set format.
    size_t suffix_part_idx = format_data_idx + 1;
    bool suffix_matched = false;
    try
    {
        suffix_matched = tryReadPrefixOrSuffix<bool>(suffix_part_idx, format.columnsCount());
    }
    catch (const Exception & e)
    {
        /// These codes only mean that the data ahead is not the suffix.
        const auto code = e.code();
        const bool not_a_suffix = code == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF
            || code == ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE
            || code == ErrorCodes::CANNOT_PARSE_QUOTED_STRING;
        if (!not_a_suffix)
            throw;
    }

    if (unlikely(suffix_matched))
    {
        /// The suffix counts only when nothing follows it.
        skipSpaces();
        if (buf->eof())
            return true;
    }

    buf->rollbackToCheckpoint();
    return false;
}

/// Reads the row-format delimiter number `index`; assertString throws if the input does not match.
/// skipSpaces() is called before and after the delimiter.
void TemplateFormatReader::skipDelimiter(size_t index)
{
    skipSpaces();
    assertString(row_format.delimiters[index], *buf);
    skipSpaces();
}

/// Reads the delimiter that follows the last field of a row; assertString throws if the input does not match.
/// The reader's row counter is incremented first, before the delimiter is checked.
void TemplateFormatReader::skipRowEndDelimiter()
{
    ++row_num;
    skipSpaces();
    assertString(row_format.delimiters.back(), *buf);
    skipSpaces();
}

/// Reads the delimiter between two rows; assertString throws if the input does not match.
/// skipSpaces() is called before and after the delimiter.
void TemplateFormatReader::skipRowBetweenDelimiter()
{
    skipSpaces();
    assertString(row_between_delimiter, *buf);
    skipSpaces();
}

/// Creates a schema reader for the Template format on top of `in_`.
/// The default data types handed to the base class are derived from the escaping rules of the row format,
/// and the column names are taken from the row format.
TemplateSchemaReader::TemplateSchemaReader(
    ReadBuffer & in_,
    bool ignore_spaces_,
    const ParsedTemplateFormatString & format_,
    const ParsedTemplateFormatString & row_format_,
    std::string row_between_delimiter,
    const FormatSettings & format_settings_)
    /// The base class is initialized first, so it receives `buf` before the `buf(in_)` initializer below has run.
    /// NOTE(review): presumably the base only stores the reference at this point — confirm.
    : IRowSchemaReader(buf, format_settings_, getDefaultDataTypeForEscapingRules(row_format_.escaping_rules))
    , buf(in_)
    , format(format_)
    , row_format(row_format_)
    /// `format_settings` (without trailing underscore) is not a parameter of this constructor.
    /// NOTE(review): presumably it is a member inherited from the base class — confirm.
    , format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings)
    /// Remembered so that the delimiter can be restored for CSV fields that have no delimiter of their own.
    , default_csv_delimiter(format_settings_.csv.delimiter)
{
    setColumnNames(row_format.column_names);
}

/// Reads the next row and returns one inferred data type per part of the row format.
/// Returns an empty list when the format reader reports that only the suffix is left.
/// The prefix is read before the first row; the delimiter between rows is skipped before every later row.
DataTypes TemplateSchemaReader::readRowAndGetDataTypes()
{
    const bool reading_first_row = first_row;
    if (reading_first_row)
        format_reader.readPrefix();

    if (format_reader.checkForSuffix())
        return {};

    if (reading_first_row)
        first_row = false;
    else
        format_reader.skipRowBetweenDelimiter();

    const size_t fields_count = row_format.columnsCount();
    DataTypes inferred_types;
    inferred_types.reserve(fields_count);

    String field;
    for (size_t i = 0; i != fields_count; ++i)
    {
        const auto rule = row_format.escaping_rules[i];

        format_reader.skipDelimiter(i);
        /// CSV fields need the delimiter settings adjusted to what follows the field in the template.
        updateFormatSettingsIfNeeded(rule, format_settings, row_format, default_csv_delimiter, i);
        field = readFieldByEscapingRule(buf, rule, format_settings);
        inferred_types.push_back(tryInferDataTypeByEscapingRule(field, format_settings, rule, &json_inference_info));
    }

    format_reader.skipRowEndDelimiter();
    return inferred_types;
}

/// Forwards both types to transformInferredTypesByEscapingRuleIfNeeded together with the escaping rule
/// of the field selected by `field_index`.
/// NOTE(review): `field_index` is not set in this file; presumably it is maintained by the base class — confirm.
void TemplateSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
    transformInferredTypesByEscapingRuleIfNeeded(type, new_type, format_settings, row_format.escaping_rules[field_index], &json_inference_info);
}

/// Builds the result-set format string.
/// If settings.template_settings.resultset_format is set, the format is parsed from that schema; the only
/// placeholder accepted there is "data", any other name throws an Exception with code SYNTAX_ERROR.
/// Otherwise the default format "${data}" is assembled by hand.
static ParsedTemplateFormatString fillResultSetFormat(const FormatSettings & settings)
{
    const auto & resultset_schema = settings.template_settings.resultset_format;
    if (!resultset_schema.empty())
    {
        /// Read format string from file
        return ParsedTemplateFormatString(
            FormatSchemaInfo(resultset_schema, "Template", false,
                             settings.schema.is_server, settings.schema.format_schema_path),
            [&](const String & partName) -> std::optional<size_t>
            {
                if (partName == "data")
                    return 0;
                throw Exception("Unknown input part " + partName,
                                ErrorCodes::SYNTAX_ERROR);
            });
    }

    /// Default format string: "${data}" — two empty delimiters around one part bound to index 0.
    ParsedTemplateFormatString default_format;
    default_format.delimiters.resize(2);
    default_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None);
    default_format.format_idx_to_column_idx.emplace_back(0);
    default_format.column_names.emplace_back("data");
    return default_format;
}

/// Parses the row format string taken from settings.template_settings.row_format.
/// `idx_getter` and `allow_indexes` are forwarded unchanged to the ParsedTemplateFormatString constructor.
static ParsedTemplateFormatString fillRowFormat(const FormatSettings & settings, ParsedTemplateFormatString::ColumnIdxGetter idx_getter, bool allow_indexes)
{
    return ParsedTemplateFormatString(
        FormatSchemaInfo(
            settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path),
        idx_getter, allow_indexes);
}

/// Registers two input formats in `factory`: "Template" and "TemplateIgnoreSpaces".
/// They differ only in the `ignore_spaces` flag passed to TemplateRowInputFormat.
void registerInputFormatTemplate(FormatFactory & factory)
{
    auto register_variant = [&factory](bool ignore_spaces)
    {
        factory.registerInputFormat(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](
                ReadBuffer & buf,
                const Block & sample,
                IRowInputFormat::Params params,
                const FormatSettings & settings)
        {
            /// Column names used in the row format are resolved to positions in the sample block.
            auto column_position_by_name = [&sample](const String & colName) -> std::optional<size_t>
            {
                return sample.getPositionByName(colName);
            };

            return std::make_shared<TemplateRowInputFormat>(
                sample,
                buf,
                params,
                settings,
                ignore_spaces,
                fillResultSetFormat(settings),
                fillRowFormat(settings, column_position_by_name, true),
                settings.template_settings.row_between_delimiter);
        });
    };

    register_variant(false);
    register_variant(true);
}

/// Registers, for both "Template" and "TemplateIgnoreSpaces":
///  - the schema reader;
///  - the getter of additional information for the schema cache: a string made of the template settings
///    plus the additional info of every distinct escaping rule used by the row format.
void registerTemplateSchemaReader(FormatFactory & factory)
{
    for (bool ignore_spaces : {false, true})
    {
        String format_name = ignore_spaces ? "TemplateIgnoreSpaces" : "Template";
        factory.registerSchemaReader(format_name, [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings)
        {
            /// Parts of the row format are numbered in the order the getter is called for them.
            size_t index = 0;
            auto idx_getter = [&](const String &) -> std::optional<size_t> { return index++; };
            auto row_format = fillRowFormat(settings, idx_getter, false);
            return std::make_shared<TemplateSchemaReader>(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings);
        });
        factory.registerAdditionalInfoForSchemaCacheGetter(format_name, [](const FormatSettings & settings)
        {
            size_t index = 0;
            auto idx_getter = [&](const String &) -> std::optional<size_t> { return index++; };
            auto row_format = fillRowFormat(settings, idx_getter, false);
            std::unordered_set<FormatSettings::EscapingRule> visited_escaping_rules;
            String result = fmt::format("row_format={}, resultset_format={}, row_between_delimiter={}",
                settings.template_settings.row_format,
                settings.template_settings.resultset_format,
                settings.template_settings.row_between_delimiter);
            for (auto escaping_rule : row_format.escaping_rules)
            {
                /// Add the info of each escaping rule once. The rule used by the row format itself must be
                /// passed here (not settings.regexp.escaping_rule), otherwise the loop variable is ignored.
                if (visited_escaping_rules.insert(escaping_rule).second)
                    result += ", " + getAdditionalFormatInfoByEscapingRule(settings, escaping_rule);
            }
            return result;
        });
    }
}

}