Improve reading CSV field in CustomSeparated/Template format

This commit is contained in:
avogar 2022-11-17 15:21:38 +00:00
parent 10cc6fa415
commit fcfdd73d17
10 changed files with 214 additions and 26 deletions

View File

@ -117,6 +117,7 @@ struct FormatSettings
char tuple_delimiter = ',';
bool use_best_effort_in_schema_inference = true;
UInt64 skip_first_lines = 0;
String custom_delimiter;
} csv;
struct HiveText

View File

@ -642,9 +642,10 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV &
const char delimiter = settings.delimiter;
const char maybe_quote = *buf.position();
const String & custom_delimiter = settings.custom_delimiter;
/// Emptiness and not even in quotation marks.
if (maybe_quote == delimiter)
if (custom_delimiter.empty() && maybe_quote == delimiter)
return;
if ((settings.allow_single_quotes && maybe_quote == '\'') || (settings.allow_double_quotes && maybe_quote == '"'))
@ -682,6 +683,42 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV &
}
else
{
/// If custom_delimiter is specified, we should read until first occurrences of
/// custom_delimiter in buffer.
if (!custom_delimiter.empty())
{
PeekableReadBuffer * peekable_buf = dynamic_cast<PeekableReadBuffer *>(&buf);
if (!peekable_buf)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Reading CSV string with custom delimiter is allowed only when using PeekableReadBuffer");
while (true)
{
if (peekable_buf->eof())
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading CSV string, expected custom delimiter \"{}\"", custom_delimiter);
char * next_pos = reinterpret_cast<char *>(memchr(peekable_buf->position(), custom_delimiter[0], peekable_buf->available()));
if (!next_pos)
next_pos = peekable_buf->buffer().end();
appendToStringOrVector(s, *peekable_buf, next_pos);
peekable_buf->position() = next_pos;
if (!buf.hasPendingData())
continue;
{
PeekableReadBufferCheckpoint checkpoint{*peekable_buf, true};
if (checkString(custom_delimiter, *peekable_buf))
return;
}
s.push_back(*peekable_buf->position());
++peekable_buf->position();
}
return;
}
/// Unquoted case. Look for delimiter or \r or \n.
while (!buf.eof())
{
@ -776,6 +813,68 @@ void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & sett
s.push_back(quote);
}
/// Reads an unquoted CSV value from `buf` and appends it to `s`, stopping at the first
/// occurrence of either `first_delimiter` or `second_delimiter`.
///
/// Throws INCORRECT_DATA if the end of the stream is reached before any of the two
/// delimiters has been found.
///
/// NOTE(review): both delimiters are indexed with [0], so they are presumably expected
/// to be non-empty - confirm against the callers.
/// NOTE(review): the `true` passed to PeekableReadBufferCheckpoint looks like the
/// auto-rollback flag, in which case a partially matched (or fully matched) delimiter is
/// not consumed from `buf` - verify against PeekableReadBuffer.
void readCSVWithTwoPossibleDelimitersImpl(String & s, PeekableReadBuffer & buf, const String & first_delimiter, const String & second_delimiter)
{
    /// Read all data until first_delimiter or second_delimiter
    while (true)
    {
        if (buf.eof())
            throw Exception(ErrorCodes::INCORRECT_DATA, R"(Unexpected EOF while reading CSV string, expected one of delimiters "{}" or "{}")", first_delimiter, second_delimiter);

        /// Fast path: skip bytes that cannot start any of the two delimiters
        /// and append them to the result in a single call.
        char * next_pos = buf.position();
        while (next_pos != buf.buffer().end() && *next_pos != first_delimiter[0] && *next_pos != second_delimiter[0])
            ++next_pos;

        appendToStringOrVector(s, buf, next_pos);
        buf.position() = next_pos;

        /// The current chunk is exhausted; the eof() call at the top of the loop decides what happens next.
        if (!buf.hasPendingData())
            continue;

        /// The current byte may start a delimiter: try to match each candidate in full.
        if (*buf.position() == first_delimiter[0])
        {
            PeekableReadBufferCheckpoint checkpoint(buf, true);
            if (checkString(first_delimiter, buf))
                return;
        }

        if (*buf.position() == second_delimiter[0])
        {
            PeekableReadBufferCheckpoint checkpoint(buf, true);
            if (checkString(second_delimiter, buf))
                return;
        }

        /// Neither delimiter matched, so the byte is ordinary data.
        s.push_back(*buf.position());
        ++buf.position();
    }
}
/// Reads one CSV value from `buf` when it is not known which of the two delimiters follows it.
/// A value that starts with an enabled quote character is read with readCSVStringInto;
/// anything else (including an exhausted buffer) is handed to
/// readCSVWithTwoPossibleDelimitersImpl together with both delimiters.
String readCSVStringWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter)
{
    String value;

    /// Only look at the first character when there is one to look at.
    bool starts_with_quote = false;
    if (!buf.eof())
    {
        const char first_char = *buf.position();
        starts_with_quote = (first_char == '\'' && settings.allow_single_quotes)
            || (first_char == '"' && settings.allow_double_quotes);
    }

    if (starts_with_quote)
        readCSVStringInto(value, buf, settings);
    else
        readCSVWithTwoPossibleDelimitersImpl(value, buf, first_delimiter, second_delimiter);

    return value;
}
/// Reads one CSV field from `buf` when it is not known which of the two delimiters follows it.
/// A field that starts with an enabled quote character is read with readCSVField;
/// anything else (including an exhausted buffer) is handed to
/// readCSVWithTwoPossibleDelimitersImpl together with both delimiters.
String readCSVFieldWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter)
{
    String field;

    /// Only look at the first character when there is one to look at.
    bool starts_with_quote = false;
    if (!buf.eof())
    {
        const char first_char = *buf.position();
        starts_with_quote = (first_char == '\'' && settings.allow_single_quotes)
            || (first_char == '"' && settings.allow_double_quotes);
    }

    if (starts_with_quote)
        readCSVField(field, buf, settings);
    else
        readCSVWithTwoPossibleDelimitersImpl(field, buf, first_delimiter, second_delimiter);

    return field;
}
template void readCSVStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
template void readCSVStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf, const FormatSettings::CSV & settings);

View File

@ -558,9 +558,10 @@ void readStringUntilWhitespace(String & s, ReadBuffer & buf);
* - string could be placed in quotes; quotes could be single: ' if FormatSettings::CSV::allow_single_quotes is true
* or double: " if FormatSettings::CSV::allow_double_quotes is true;
* - or string could be unquoted - this is determined by first character;
* - if string is unquoted, then it is read until next delimiter,
* either until end of line (CR or LF),
* or until end of stream;
* - if string is unquoted, then:
* - if settings.custom_delimiter is not specified, it is read until the next settings.delimiter, until end of line (CR or LF), or until end of stream;
* - if settings.custom_delimiter is specified, it is read until the first occurrence of settings.custom_delimiter in the buffer.
* This works only if the provided buffer is a PeekableReadBuffer.
* but spaces and tabs at begin and end of unquoted string are consumed but ignored (note that this behaviour differs from RFC).
* - if string is in quotes, then it will be read until closing quote,
* but sequences of two consecutive quotes are parsed as single quote inside string;
@ -570,6 +571,13 @@ void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & set
/// Differ from readCSVString in that it doesn't remove quotes around field if any.
void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
/// Read a string in CSV format until the first occurrence of first_delimiter or second_delimiter.
/// As in readCSVString, if the string is in quotes, only the data inside the quotes is read.
String readCSVStringWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter);
/// Same as above but includes quotes in the result if any.
String readCSVFieldWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter);
/// Read and append result to array of characters.
template <typename Vector>
void readStringInto(Vector & s, ReadBuffer & buf);

View File

@ -12,16 +12,6 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
}
/// Returns a copy of `settings`. When the custom escaping rule is CSV and the custom
/// field delimiter is not empty, the copy's csv.delimiter is set to the first character
/// of that field delimiter; otherwise the copy is returned unchanged.
static FormatSettings updateFormatSettings(const FormatSettings & settings)
{
    FormatSettings result = settings;

    const bool uses_csv_escaping = settings.custom.escaping_rule == FormatSettings::EscapingRule::CSV;
    if (uses_csv_escaping && !settings.custom.field_delimiter.empty())
        result.csv.delimiter = settings.custom.field_delimiter.front();

    return result;
}
CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat(
const Block & header_,
ReadBuffer & in_buf_,
@ -31,7 +21,7 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat(
bool ignore_spaces_,
const FormatSettings & format_settings_)
: CustomSeparatedRowInputFormat(
header_, std::make_unique<PeekableReadBuffer>(in_buf_), params_, with_names_, with_types_, ignore_spaces_, updateFormatSettings(format_settings_))
header_, std::make_unique<PeekableReadBuffer>(in_buf_), params_, with_names_, with_types_, ignore_spaces_, format_settings_)
{
}
@ -171,15 +161,31 @@ bool CustomSeparatedFormatReader::checkEndOfRow()
}
template <bool is_header>
String CustomSeparatedFormatReader::readFieldIntoString(bool is_first)
String CustomSeparatedFormatReader::readFieldIntoString(bool is_first, bool is_last, bool is_unknown)
{
    /// Every value except the first one in a row is preceded by the field delimiter.
    if (!is_first)
        skipFieldDelimiter();

    skipSpaces();
    updateFormatSettings(is_last);

    const auto & custom = format_settings.custom;

    /// If the number of columns is unknown and the CSV escaping rule is used,
    /// it is not known which delimiter follows the value, so the value has to be read
    /// until either field_delimiter or row_after_delimiter is met.
    const bool next_delimiter_is_unknown = is_unknown && custom.escaping_rule == FormatSettings::EscapingRule::CSV;

    if (next_delimiter_is_unknown)
    {
        if constexpr (is_header)
            return readCSVStringWithTwoPossibleDelimiters(*buf, format_settings.csv, custom.field_delimiter, custom.row_after_delimiter);
        else
            return readCSVFieldWithTwoPossibleDelimiters(*buf, format_settings.csv, custom.field_delimiter, custom.row_after_delimiter);
    }

    if constexpr (is_header)
        return readStringByEscapingRule(*buf, custom.escaping_rule, format_settings);
    else
        return readFieldByEscapingRule(*buf, custom.escaping_rule, format_settings);
}
template <bool is_header>
@ -192,14 +198,14 @@ std::vector<String> CustomSeparatedFormatReader::readRowImpl()
{
do
{
values.push_back(readFieldIntoString<is_header>(values.empty()));
values.push_back(readFieldIntoString<is_header>(values.empty(), false, true));
} while (!checkEndOfRow());
columns = values.size();
}
else
{
for (size_t i = 0; i != columns; ++i)
values.push_back(readFieldIntoString<is_header>(i == 0));
values.push_back(readFieldIntoString<is_header>(i == 0, i + 1 == columns, false));
}
skipRowEndDelimiter();
@ -223,9 +229,33 @@ void CustomSeparatedFormatReader::skipHeaderRow()
skipRowEndDelimiter();
}
bool CustomSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool, const String &)
void CustomSeparatedFormatReader::updateFormatSettings(bool is_last_column)
{
if (format_settings.custom.escaping_rule != FormatSettings::EscapingRule::CSV)
return;
format_settings.csv.custom_delimiter.clear();
if (is_last_column)
{
if (format_settings.custom.row_after_delimiter.size() == 1)
format_settings.csv.delimiter = format_settings.custom.row_after_delimiter.front();
else
format_settings.csv.custom_delimiter = format_settings.custom.row_after_delimiter;
}
else
{
if (format_settings.custom.field_delimiter.size() == 1)
format_settings.csv.delimiter = format_settings.custom.field_delimiter.front();
else
format_settings.csv.custom_delimiter = format_settings.custom.field_delimiter;
}
}
/// Deserializes one value into `column` using the custom escaping rule.
/// Leading spaces are skipped first, and the CSV delimiter settings are refreshed
/// according to whether this is the last column of the file.
/// Returns the result of deserializeFieldByEscapingRule.
bool CustomSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String &)
{
    skipSpaces();
    updateFormatSettings(is_last_file_column);

    const auto escaping_rule = format_settings.custom.escaping_rule;
    return deserializeFieldByEscapingRule(type, serialization, column, *buf, escaping_rule, format_settings);
}
@ -237,6 +267,8 @@ bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof)
if (!check_eof)
return false;
/// Allow optional \n before eof.
checkChar('\n', *buf);
return buf->eof();
}
@ -246,6 +278,8 @@ bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof)
if (!check_eof)
return true;
/// Allow optional \n before eof.
checkChar('\n', *buf);
if (buf->eof())
return true;
}
@ -312,7 +346,7 @@ CustomSeparatedSchemaReader::CustomSeparatedSchemaReader(
&reader,
getDefaultDataTypeForEscapingRule(format_setting_.custom.escaping_rule))
, buf(in_)
, reader(buf, ignore_spaces_, updateFormatSettings(format_setting_))
, reader(buf, ignore_spaces_, format_setting_)
{
}

View File

@ -83,7 +83,9 @@ private:
std::vector<String> readRowImpl();
template <bool read_string>
String readFieldIntoString(bool is_first);
String readFieldIntoString(bool is_first, bool is_last, bool is_unknown);
void updateFormatSettings(bool is_last_column);
PeekableReadBuffer * buf;
bool ignore_spaces;

View File

@ -130,9 +130,16 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type,
{
EscapingRule escaping_rule = row_format.escaping_rules[file_column];
if (escaping_rule == EscapingRule::CSV)
/// Will read unquoted string until settings.csv.delimiter
settings.csv.delimiter = row_format.delimiters[file_column + 1].empty() ? default_csv_delimiter :
row_format.delimiters[file_column + 1].front();
{
settings.csv.custom_delimiter.clear();
if (row_format.delimiters[file_column + 1].empty())
settings.csv.delimiter = default_csv_delimiter;
else if (row_format.delimiters[file_column + 1].size() == 1)
settings.csv.delimiter = row_format.delimiters[file_column + 1].front();
else
settings.csv.custom_delimiter = row_format.delimiters[file_column + 1];
}
try
{
return deserializeFieldByEscapingRule(type, serialization, column, *buf, escaping_rule, settings);
@ -466,6 +473,7 @@ TemplateSchemaReader::TemplateSchemaReader(
, format(format_)
, row_format(row_format_)
, format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings)
, default_csv_delimiter(format_settings_.csv.delimiter)
{
setColumnNames(row_format.column_names);
}
@ -490,7 +498,15 @@ DataTypes TemplateSchemaReader::readRowAndGetDataTypes()
{
format_reader.skipDelimiter(i);
if (row_format.escaping_rules[i] == FormatSettings::EscapingRule::CSV)
format_settings.csv.delimiter = row_format.delimiters[i + 1].empty() ? format_settings.csv.delimiter : row_format.delimiters[i + 1].front();
{
format_settings.csv.custom_delimiter.clear();
if (row_format.delimiters[i + 1].empty())
format_settings.csv.delimiter = default_csv_delimiter;
else if (row_format.delimiters[i + 1].size() == 1)
format_settings.csv.delimiter = row_format.delimiters[i + 1].front();
else
format_settings.csv.custom_delimiter = row_format.delimiters[i + 1];
}
field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings);
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i]));

View File

@ -128,6 +128,7 @@ private:
const ParsedTemplateFormatString row_format;
TemplateFormatReader format_reader;
bool first_row = true;
const char default_csv_delimiter;
};
bool parseDelimiterWithDiagnosticInfo(WriteBuffer & out, ReadBuffer & buf, const String & delimiter, const String & description, bool skip_spaces);

View File

@ -111,7 +111,7 @@ public:
protected:
ReadBuffer * in;
const FormatSettings format_settings;
FormatSettings format_settings;
};
/// Base class for schema inference for formats with -WithNames and -WithNamesAndTypes suffixes.

View File

@ -0,0 +1,7 @@
1 2
3 4
1 2
3 4
ab|c de&f
ab|c de*f gh&k
|av *ad &ad

View File

@ -0,0 +1,20 @@
#!/usr/bin/env bash

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# CustomSeparated with CSV escaping: multi-character field and row delimiters.
echo "1||2&&3||4&&" | $CLICKHOUSE_LOCAL --input-format=CustomSeparated --format_custom_field_delimiter='||' --format_custom_row_after_delimiter='&&' --format_custom_escaping_rule='CSV' -q "select * from table"

# The field delimiter is a prefix of the row delimiter.
echo "1||2|||3||4|||" | $CLICKHOUSE_LOCAL --input-format=CustomSeparated --format_custom_field_delimiter='||' --format_custom_row_after_delimiter='|||' --format_custom_escaping_rule='CSV' -q "select * from table"

# Values contain single characters taken from the delimiters.
echo "ab|c||de&f&&" | $CLICKHOUSE_LOCAL --input-format=CustomSeparated --format_custom_field_delimiter='||' --format_custom_row_after_delimiter='&&' --format_custom_escaping_rule='CSV' -q "select * from table"

# Template format: every CSV column is followed by a different multi-character delimiter.
echo -e "\${column_1:CSV}||\${column_2:CSV}**\${column_3:CSV}&&" > row_format_02481

echo -e "ab|c||de*f**gh&k&&\n|av||*ad**&ad&&" | $CLICKHOUSE_LOCAL -q "select * from table" --input-format=Template --format_template_row='row_format_02481' --format_template_rows_between_delimiter ""

rm row_format_02481