add support_crlf for TSV format

This commit is contained in:
Blargian 2024-02-04 15:29:57 +01:00
parent 31416bc488
commit ab384f8652
8 changed files with 68 additions and 22 deletions

View File

@ -151,9 +151,13 @@ static inline void read(const SerializationFixedString & self, IColumn & column,
}
void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
/// Deserializes escaped text from `istr` into `column` by passing `read` a callback that fills
/// the character buffer with readEscapedStringInto. The FormatSettings parameter is named
/// because the callback reads settings.tsv.crlf_end_of_line_input.
void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); });
// `settings` is captured by reference and only read inside the callback.
read(*this, column, [&istr, &settings](ColumnFixedString::Chars & data)
{
// crlf_end_of_line_input is a runtime value while support_crlf is a template argument,
// so both instantiations are written out and the flag selects which one is called.
settings.tsv.crlf_end_of_line_input ? readEscapedStringInto<ColumnFixedString::Chars,true>(data, istr)
: readEscapedStringInto<ColumnFixedString::Chars,false>(data, istr);
});
}

View File

@ -290,6 +290,7 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col
const SerializationPtr & nested_serialization)
{
const String & null_representation = settings.tsv.null_representation;
const bool supports_crlf = settings.tsv.crlf_end_of_line_input;
/// Some data types can deserialize absence of data (e.g. empty string), so eof is ok.
if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0]))
@ -309,10 +310,10 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col
/// Check if we have enough data in buffer to check if it's a null.
if (istr.available() > null_representation.size())
{
auto check_for_null = [&istr, &null_representation]()
auto check_for_null = [&istr, &null_representation, &supports_crlf]()
{
auto * pos = istr.position();
if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n'))
if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n' || (supports_crlf && *istr.position() == '\r')))
return true;
istr.position() = pos;
return false;

View File

@ -301,9 +301,13 @@ void SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & is
}
void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
/// Deserializes escaped text from `istr` into `column` by passing `read` a callback that fills
/// the character buffer with readEscapedStringInto. The FormatSettings parameter is named
/// because the callback reads settings.tsv.crlf_end_of_line_input.
void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); });
read(column, [&](ColumnString::Chars & data)
{
// The runtime flag chooses between the two compile-time instantiations (support_crlf = true / false).
// The container type is spelled PaddedPODArray<UInt8> here although `data` is declared as
// ColumnString::Chars - presumably the same type; TODO confirm.
settings.tsv.crlf_end_of_line_input ? readEscapedStringInto<PaddedPODArray<UInt8>,true>(data, istr)
: readEscapedStringInto<PaddedPODArray<UInt8>,false>(data, istr);
});
}

View File

@ -76,7 +76,7 @@ void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule esca
/// Empty field, just skip spaces
break;
case FormatSettings::EscapingRule::Escaped:
readEscapedStringInto(out, buf);
readEscapedStringInto<NullOutput,false>(out, buf);
break;
case FormatSettings::EscapingRule::Quoted:
readQuotedFieldInto(out, buf);
@ -236,7 +236,7 @@ String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escapin
if constexpr (read_string)
readEscapedString(result, buf);
else
readTSVField(result, buf);
readTSVField<false>(result, buf);
break;
default:
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule));

View File

@ -496,13 +496,19 @@ static ReturnType parseJSONEscapeSequence(Vector & s, ReadBuffer & buf)
}
template <typename Vector, bool parse_complex_escape_sequence>
/// Reads from `buf` into `s` in a loop until `buf` reaches eof. On each iteration the bytes up to
/// the next stop symbol found by find_first_symbols are appended to `s` and the buffer position
/// is moved to that symbol. With support_crlf = true, '\r' is a stop symbol in addition to
/// tab, newline and backslash.
/// Part of the loop body is not visible in this view, so the handling of each stop symbol is
/// not described here.
template <typename Vector, bool parse_complex_escape_sequence, bool support_crlf>
void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf)
{
while (!buf.eof())
{
char * next_pos = find_first_symbols<'\t', '\n', '\\'>(buf.position(), buf.buffer().end());
char * next_pos;
// The stop-symbol set is a template argument list, so the choice is made at compile time.
if constexpr (support_crlf)
{
next_pos = find_first_symbols<'\t', '\n', '\\','\r'>(buf.position(), buf.buffer().end());
} else {
next_pos = find_first_symbols<'\t', '\n', '\\'>(buf.position(), buf.buffer().end());
}
appendToStringOrVector(s, buf, next_pos);
buf.position() = next_pos;
@ -529,25 +535,41 @@ void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf)
}
}
}
// NOTE(review): in the lines visible here this check is not wrapped in
// `if constexpr (support_crlf)`, so it appears to run for support_crlf = false as well,
// and no end-of-buffer check is visible before the dereference.
// Confirm against the full function body.
if (*buf.position() == '\r')
{
++buf.position(); // advance to \n after \r
}
}
}
template <typename Vector>
/// Thin wrapper over readEscapedStringIntoImpl with parse_complex_escape_sequence fixed to true.
/// `support_crlf` is forwarded unchanged and has no default, so callers must name both
/// template arguments.
template <typename Vector, bool support_crlf>
void readEscapedStringInto(Vector & s, ReadBuffer & buf)
{
readEscapedStringIntoImpl<Vector, true>(s, buf);
readEscapedStringIntoImpl<Vector, true, support_crlf>(s, buf);
}
/// Clears `s`, then reads an escaped string from `buf` into it.
/// This entry point has no CRLF switch: it always uses the support_crlf = false instantiation.
void readEscapedString(String & s, ReadBuffer & buf)
{
s.clear();
readEscapedStringInto(s, buf);
readEscapedStringInto<String,false>(s, buf);
}
/// Explicit instantiations of readEscapedStringInto that name only the container type
/// (PaddedPODArray<UInt8> and NullOutput).
template void readEscapedStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template void readEscapedStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
/// Clears `s`, then reads an escaped string from `buf` into it, forwarding the `support_crlf`
/// template flag to readEscapedStringInto instead of fixing it to a constant.
template<bool support_crlf>
void readEscapedStringCRLF(String & s, ReadBuffer & buf)
{
s.clear();
readEscapedStringInto<String,support_crlf>(s, buf);
}
/// Explicit instantiations for each (container type, support_crlf) pair of readEscapedStringInto
/// and for both values of readEscapedStringCRLF - presumably so that code which only sees the
/// declarations can link against them; confirm against the header.
template void readEscapedStringInto<PaddedPODArray<UInt8>,false>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template void readEscapedStringInto<NullOutput,false>(NullOutput & s, ReadBuffer & buf);
template void readEscapedStringInto<PaddedPODArray<UInt8>,true>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template void readEscapedStringInto<NullOutput,true>(NullOutput & s, ReadBuffer & buf);
template void readEscapedStringCRLF<true>(String & s, ReadBuffer & buf);
template void readEscapedStringCRLF<false>(String & s, ReadBuffer & buf);
/** If enable_sql_style_quoting == true,
* strings like 'abc''def' will be parsed as abc'def.
@ -1761,10 +1783,16 @@ void readJSONField(String & s, ReadBuffer & buf)
readParsedValueInto(s, buf, parse_func);
}
/// Clears `s`, then reads a field from `buf` into it via readEscapedStringIntoImpl with
/// parse_complex_escape_sequence = false. `support_crlf` is forwarded as the third template argument.
template<bool support_crlf>
void readTSVField(String & s, ReadBuffer & buf)
{
s.clear();
readEscapedStringIntoImpl<String, false>(s, buf);
readEscapedStringIntoImpl<String, false, support_crlf>(s, buf);
}
/// Explicit instantiations of readTSVField for both values of support_crlf.
template void readTSVField<true>(String & s, ReadBuffer & buf);
template void readTSVField<false>(String & s, ReadBuffer & buf);
}

View File

@ -548,6 +548,9 @@ void readString(String & s, ReadBuffer & buf);
void readEscapedString(String & s, ReadBuffer & buf);
/// Variant of readEscapedString parameterized by `support_crlf`; the flag has no default, so it
/// must be given explicitly (readEscapedStringCRLF<true> / readEscapedStringCRLF<false>).
template<bool support_crlf>
void readEscapedStringCRLF(String & s, ReadBuffer & buf);
void readQuotedString(String & s, ReadBuffer & buf);
void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf);
@ -601,7 +604,7 @@ void readStringInto(Vector & s, ReadBuffer & buf);
template <typename Vector>
void readNullTerminated(Vector & s, ReadBuffer & buf);
template <typename Vector>
/// `support_crlf` has no default value, so every call must spell out both template arguments,
/// e.g. readEscapedStringInto<NullOutput,false>(out, buf).
template <typename Vector, bool support_crlf>
void readEscapedStringInto(Vector & s, ReadBuffer & buf);
template <bool enable_sql_style_quoting, typename Vector>
@ -1757,6 +1760,7 @@ void readQuotedField(String & s, ReadBuffer & buf);
void readJSONField(String & s, ReadBuffer & buf);
/// `support_crlf` must be given explicitly: readTSVField<true> or readTSVField<false>.
template<bool support_crlf>
void readTSVField(String & s, ReadBuffer & buf);
/** Parse the escape sequence, which can be simple (one character after backslash) or more complex (multiple characters).

View File

@ -134,7 +134,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex
/// If the key is not found, skip the value.
NullOutput sink;
readEscapedStringInto(sink, *in);
readEscapedStringInto<NullOutput,false>(sink, *in);
}
else
{

View File

@ -10,6 +10,7 @@
#include <Formats/verbosePrintString.h>
#include <Formats/EscapingRuleUtils.h>
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
#include "Formats/FormatSettings.h"
namespace DB
{
@ -105,14 +106,17 @@ template <bool read_string>
/// Reads one field from `buf` into a String and returns it.
/// When `is_raw` is set the field is read with readString. Otherwise `read_string` selects
/// between readEscapedStringCRLF and readTSVField.
String TabSeparatedFormatReader::readFieldIntoString()
{
String field;
// Runtime setting. The readers take support_crlf as a template argument, so each branch
// below picks the <true> or <false> instantiation explicitly.
bool support_crlf = format_settings.tsv.crlf_end_of_line_input;
if (is_raw)
readString(field, *buf);
else
{
if constexpr (read_string)
readEscapedString(field, *buf);
support_crlf ? readEscapedStringCRLF<true>(field, *buf)
: readEscapedStringCRLF<false>(field, *buf);
else
readTSVField(field, *buf);
support_crlf ? readTSVField<true>(field, *buf)
: readTSVField<false>(field, *buf);
}
return field;
}
@ -123,7 +127,8 @@ void TabSeparatedFormatReader::skipField()
if (is_raw)
readStringInto(out, *buf);
else
readEscapedStringInto(out, *buf);
format_settings.tsv.crlf_end_of_line_input ? readEscapedStringInto<NullOutput,true>(out, *buf)
: readEscapedStringInto<NullOutput,false>(out, *buf);
}
void TabSeparatedFormatReader::skipHeaderRow()