mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-12 09:22:05 +00:00
add support_crlf for TSV format
This commit is contained in:
parent
31416bc488
commit
ab384f8652
@ -151,9 +151,13 @@ static inline void read(const SerializationFixedString & self, IColumn & column,
|
||||
}
|
||||
|
||||
|
||||
/// Deserializes one escaped-text value into a fixed-string column by delegating to read(...)
/// with a callback that fills the column's character buffer.
/// NOTE(review): this span is a rendered diff. The signature and the read(...) call each appear
/// twice, presumably the pre-change form followed by the post-change form; no +/- markers survive.
/// The second form names the FormatSettings parameter so that the callback can consult
/// settings.tsv.crlf_end_of_line_input.
void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
|
||||
void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
|
||||
{
|
||||
read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); });
|
||||
read(*this, column, [&istr, &settings](ColumnFixedString::Chars & data)
|
||||
{
|
||||
/// The runtime setting picks between the two instantiations of readEscapedStringInto:
/// <..., true> when crlf_end_of_line_input is set, <..., false> otherwise.
settings.tsv.crlf_end_of_line_input ? readEscapedStringInto<ColumnFixedString::Chars,true>(data, istr)
|
||||
: readEscapedStringInto<ColumnFixedString::Chars,false>(data, istr);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
@ -290,6 +290,7 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col
|
||||
const SerializationPtr & nested_serialization)
|
||||
{
|
||||
const String & null_representation = settings.tsv.null_representation;
|
||||
const bool supports_crlf = settings.tsv.crlf_end_of_line_input;
|
||||
|
||||
/// Some data types can deserialize absence of data (e.g. empty string), so eof is ok.
|
||||
if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0]))
|
||||
@ -309,10 +310,10 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col
|
||||
/// Check if we have enough data in buffer to check if it's a null.
|
||||
if (istr.available() > null_representation.size())
|
||||
{
|
||||
auto check_for_null = [&istr, &null_representation]()
|
||||
auto check_for_null = [&istr, &null_representation, &supports_crlf]()
|
||||
{
|
||||
auto * pos = istr.position();
|
||||
if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n'))
|
||||
if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n' || (supports_crlf && *istr.position() == '\r')))
|
||||
return true;
|
||||
istr.position() = pos;
|
||||
return false;
|
||||
|
@ -301,9 +301,13 @@ void SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & is
|
||||
}
|
||||
|
||||
|
||||
/// Deserializes one escaped-text value into a string column by delegating to read(...)
/// with a callback that fills the column's character buffer.
/// NOTE(review): this span is a rendered diff. The signature and the read(...) call each appear
/// twice, presumably the pre-change form followed by the post-change form; no +/- markers survive.
/// The second form names the FormatSettings parameter so that the callback can consult
/// settings.tsv.crlf_end_of_line_input.
void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
|
||||
void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
|
||||
{
|
||||
read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); });
|
||||
read(column, [&](ColumnString::Chars & data)
|
||||
{
|
||||
/// The runtime setting picks between the <..., true> and <..., false> instantiations.
/// NOTE(review): the template argument is spelled PaddedPODArray<UInt8> while the lambda
/// parameter is ColumnString::Chars; presumably these are the same type. Confirm.
settings.tsv.crlf_end_of_line_input ? readEscapedStringInto<PaddedPODArray<UInt8>,true>(data, istr)
|
||||
: readEscapedStringInto<PaddedPODArray<UInt8>,false>(data, istr);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
@ -76,7 +76,7 @@ void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule esca
|
||||
/// Empty field, just skip spaces
|
||||
break;
|
||||
case FormatSettings::EscapingRule::Escaped:
|
||||
readEscapedStringInto(out, buf);
|
||||
readEscapedStringInto<NullOutput,false>(out, buf);
|
||||
break;
|
||||
case FormatSettings::EscapingRule::Quoted:
|
||||
readQuotedFieldInto(out, buf);
|
||||
@ -236,7 +236,7 @@ String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escapin
|
||||
if constexpr (read_string)
|
||||
readEscapedString(result, buf);
|
||||
else
|
||||
readTSVField(result, buf);
|
||||
readTSVField<false>(result, buf);
|
||||
break;
|
||||
default:
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule));
|
||||
|
@ -496,13 +496,19 @@ static ReturnType parseJSONEscapeSequence(Vector & s, ReadBuffer & buf)
|
||||
}
|
||||
|
||||
|
||||
template <typename Vector, bool parse_complex_escape_sequence>
|
||||
template <typename Vector, bool parse_complex_escape_sequence, bool support_crlf>
|
||||
void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf)
|
||||
{
|
||||
while (!buf.eof())
|
||||
{
|
||||
char * next_pos = find_first_symbols<'\t', '\n', '\\'>(buf.position(), buf.buffer().end());
|
||||
|
||||
char * next_pos;
|
||||
if constexpr (support_crlf)
|
||||
{
|
||||
next_pos = find_first_symbols<'\t', '\n', '\\','\r'>(buf.position(), buf.buffer().end());
|
||||
} else {
|
||||
next_pos = find_first_symbols<'\t', '\n', '\\'>(buf.position(), buf.buffer().end());
|
||||
}
|
||||
|
||||
appendToStringOrVector(s, buf, next_pos);
|
||||
buf.position() = next_pos;
|
||||
|
||||
@ -529,25 +535,41 @@ void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (*buf.position() == '\r')
|
||||
{
|
||||
++buf.position(); // advance to \n after \r
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Thin wrapper over readEscapedStringIntoImpl that fixes its second template argument to true.
/// NOTE(review): this span is a rendered diff. The template header and the forwarding call each
/// appear twice, presumably pre-change then post-change. The second form adds a support_crlf
/// template parameter and forwards it as the third template argument of the Impl.
template <typename Vector>
|
||||
template <typename Vector, bool support_crlf>
|
||||
void readEscapedStringInto(Vector & s, ReadBuffer & buf)
|
||||
{
|
||||
readEscapedStringIntoImpl<Vector, true>(s, buf);
|
||||
readEscapedStringIntoImpl<Vector, true, support_crlf>(s, buf);
|
||||
}
|
||||
|
||||
|
||||
/// Clears `s`, then reads into it via readEscapedStringInto.
/// NOTE(review): this span is a rendered diff. The readEscapedStringInto call appears twice,
/// presumably pre-change then post-change. The second form spells the template arguments
/// explicitly and passes false for support_crlf.
void readEscapedString(String & s, ReadBuffer & buf)
|
||||
{
|
||||
s.clear();
|
||||
readEscapedStringInto(s, buf);
|
||||
readEscapedStringInto<String,false>(s, buf);
|
||||
}
|
||||
|
||||
template void readEscapedStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
||||
template void readEscapedStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
|
||||
template<bool support_crlf>
|
||||
void readEscapedStringCRLF(String & s, ReadBuffer & buf)
|
||||
{
|
||||
s.clear();
|
||||
readEscapedStringInto<String,support_crlf>(s, buf);
|
||||
}
|
||||
|
||||
template void readEscapedStringInto<PaddedPODArray<UInt8>,false>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
||||
template void readEscapedStringInto<NullOutput,false>(NullOutput & s, ReadBuffer & buf);
|
||||
template void readEscapedStringInto<PaddedPODArray<UInt8>,true>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
|
||||
template void readEscapedStringInto<NullOutput,true>(NullOutput & s, ReadBuffer & buf);
|
||||
|
||||
template void readEscapedStringCRLF<true>(String & s, ReadBuffer & buf);
|
||||
template void readEscapedStringCRLF<false>(String & s, ReadBuffer & buf);
|
||||
|
||||
/** If enable_sql_style_quoting == true,
|
||||
* strings like 'abc''def' will be parsed as abc'def.
|
||||
@ -1761,10 +1783,16 @@ void readJSONField(String & s, ReadBuffer & buf)
|
||||
readParsedValueInto(s, buf, parse_func);
|
||||
}
|
||||
|
||||
/// Clears `s`, then reads into it via readEscapedStringIntoImpl with its second template
/// argument (parse_complex_escape_sequence in the Impl's template header) set to false.
/// NOTE(review): this span is a rendered diff. The forwarding call appears twice, presumably
/// pre-change then post-change. The second form also forwards support_crlf.
template<bool support_crlf>
|
||||
void readTSVField(String & s, ReadBuffer & buf)
|
||||
{
|
||||
s.clear();
|
||||
readEscapedStringIntoImpl<String, false>(s, buf);
|
||||
readEscapedStringIntoImpl<String, false, support_crlf>(s, buf);
|
||||
}
|
||||
|
||||
/// Explicit instantiations of readTSVField for both values of support_crlf.
template void readTSVField<true>(String & s, ReadBuffer & buf);
|
||||
template void readTSVField<false>(String & s, ReadBuffer & buf);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -548,6 +548,9 @@ void readString(String & s, ReadBuffer & buf);
|
||||
|
||||
void readEscapedString(String & s, ReadBuffer & buf);
|
||||
|
||||
template<bool support_crlf>
|
||||
void readEscapedStringCRLF(String & s, ReadBuffer & buf);
|
||||
|
||||
void readQuotedString(String & s, ReadBuffer & buf);
|
||||
void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf);
|
||||
|
||||
@ -601,7 +604,7 @@ void readStringInto(Vector & s, ReadBuffer & buf);
|
||||
template <typename Vector>
|
||||
void readNullTerminated(Vector & s, ReadBuffer & buf);
|
||||
|
||||
template <typename Vector>
|
||||
template <typename Vector, bool support_crlf>
|
||||
void readEscapedStringInto(Vector & s, ReadBuffer & buf);
|
||||
|
||||
template <bool enable_sql_style_quoting, typename Vector>
|
||||
@ -1757,6 +1760,7 @@ void readQuotedField(String & s, ReadBuffer & buf);
|
||||
|
||||
void readJSONField(String & s, ReadBuffer & buf);
|
||||
|
||||
template<bool support_crlf>
|
||||
void readTSVField(String & s, ReadBuffer & buf);
|
||||
|
||||
/** Parse the escape sequence, which can be simple (one character after backslash) or more complex (multiple characters).
|
||||
|
@ -134,7 +134,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex
|
||||
|
||||
/// If the key is not found, skip the value.
|
||||
NullOutput sink;
|
||||
readEscapedStringInto(sink, *in);
|
||||
readEscapedStringInto<NullOutput,false>(sink, *in);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include <Formats/verbosePrintString.h>
|
||||
#include <Formats/EscapingRuleUtils.h>
|
||||
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
|
||||
#include "Formats/FormatSettings.h"
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -105,14 +106,17 @@ template <bool read_string>
|
||||
/// Reads one field from *buf into a fresh String and returns it.
/// When is_raw is set the field is read with readString. Otherwise the compile-time flag
/// read_string selects between the escaped-string reader and the TSV field reader.
/// NOTE(review): this span is a rendered diff. Each reader call in the non-raw branch appears
/// twice, presumably pre-change then post-change. The second forms pick the <true> or <false>
/// instantiation at runtime from format_settings.tsv.crlf_end_of_line_input.
String TabSeparatedFormatReader::readFieldIntoString()
|
||||
{
|
||||
String field;
|
||||
bool support_crlf = format_settings.tsv.crlf_end_of_line_input;
|
||||
if (is_raw)
|
||||
readString(field, *buf);
|
||||
else
|
||||
{
|
||||
if constexpr (read_string)
|
||||
readEscapedString(field, *buf);
|
||||
support_crlf ? readEscapedStringCRLF<true>(field, *buf)
|
||||
: readEscapedStringCRLF<false>(field, *buf);
|
||||
else
|
||||
readTSVField(field, *buf);
|
||||
support_crlf ? readTSVField<true>(field, *buf)
|
||||
: readTSVField<false>(field, *buf);
|
||||
}
|
||||
return field;
|
||||
}
|
||||
@ -123,7 +127,8 @@ void TabSeparatedFormatReader::skipField()
|
||||
if (is_raw)
|
||||
readStringInto(out, *buf);
|
||||
else
|
||||
readEscapedStringInto(out, *buf);
|
||||
format_settings.tsv.crlf_end_of_line_input ? readEscapedStringInto<NullOutput,true>(out, *buf)
|
||||
: readEscapedStringInto<NullOutput,false>(out, *buf);
|
||||
}
|
||||
|
||||
void TabSeparatedFormatReader::skipHeaderRow()
|
||||
|
Loading…
Reference in New Issue
Block a user