review fix

This commit is contained in:
kevinyhzou 2023-06-14 10:48:21 +08:00
parent 911f8ad8dc
commit f3b99156ac
7 changed files with 31 additions and 27 deletions

View File

@ -468,7 +468,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
- [input_format_csv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_first_lines) - skip the specified number of lines at the beginning of data. Default value - `0`.
- [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`.
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
- [input_format_csv_use_whitespace_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_use_whitespace_tab_as_delimiter) - use whitespace or tab as field delimiter in CSV strings. Default value - `false`.
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - allow using whitespace or tab as the field delimiter in CSV strings. Default value - `false`.
## CSVWithNames {#csvwithnames}

View File

@ -914,9 +914,9 @@ Result
" string "
```
### input_format_csv_use_whitespace_tab_as_delimiter {#input_format_csv_use_whitespace_tab_as_delimiter}
### input_format_csv_allow_whitespace_or_tab_as_delimiter {#input_format_csv_allow_whitespace_or_tab_as_delimiter}
Use whitespace or tab as field delimiter in CSV strings.
Allow using whitespace or tab as the field delimiter in CSV strings.
Default value: `false`.
@ -925,7 +925,7 @@ Default value: `false`.
Query
```bash
echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_use_whitespace_tab_as_delimiter=true --format_csv_delimiter=' '
echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_allow_whitespace_or_tab_as_delimiter=true --format_csv_delimiter=' '
```
Result
@ -937,7 +937,7 @@ a b
Query
```bash
echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_use_whitespace_tab_as_delimiter=true --format_csv_delimiter='\t'
echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_allow_whitespace_or_tab_as_delimiter=true --format_csv_delimiter='\t'
```
Result

View File

@ -850,7 +850,7 @@ class IColumn;
M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
M(Bool, input_format_csv_use_whitespace_tab_as_delimiter, false, "Use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \
M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \
M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \
M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \

View File

@ -70,7 +70,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines;
format_settings.csv.try_detect_header = settings.input_format_csv_detect_header;
format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
format_settings.csv.use_whitespace_tab_as_delimiter = settings.input_format_csv_use_whitespace_tab_as_delimiter;
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;

View File

@ -137,7 +137,7 @@ struct FormatSettings
String custom_delimiter;
bool try_detect_header = true;
bool trim_whitespaces = true;
bool use_whitespace_tab_as_delimiter = false;
bool allow_whitespace_or_tab_as_delimiter = false;
} csv;
struct HiveText

View File

@ -25,10 +25,14 @@ namespace ErrorCodes
namespace
{
void checkBadDelimiter(char delimiter, bool use_whitespace_tab_as_delimiter)
void checkBadDelimiter(char delimiter, bool allow_whitespace_or_tab_as_delimiter)
{
if ((delimiter == ' ' || delimiter == '\t') && allow_whitespace_or_tab_as_delimiter)
{
return;
}
constexpr std::string_view bad_delimiters = " \t\"'.UL";
if (bad_delimiters.find(delimiter) != std::string_view::npos && !use_whitespace_tab_as_delimiter)
if (bad_delimiters.find(delimiter) != std::string_view::npos)
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"CSV format may not work correctly with delimiter '{}'. Try use CustomSeparated format instead",
@ -68,7 +72,7 @@ CSVRowInputFormat::CSVRowInputFormat(
format_settings_.csv.try_detect_header),
buf(std::move(in_))
{
checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.use_whitespace_tab_as_delimiter);
checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.allow_whitespace_or_tab_as_delimiter);
}
CSVRowInputFormat::CSVRowInputFormat(
@ -90,7 +94,7 @@ CSVRowInputFormat::CSVRowInputFormat(
format_settings_.csv.try_detect_header),
buf(std::move(in_))
{
checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.use_whitespace_tab_as_delimiter);
checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.allow_whitespace_or_tab_as_delimiter);
}
void CSVRowInputFormat::syncAfterError()
@ -134,9 +138,9 @@ static void skipEndOfLine(ReadBuffer & in)
}
/// Skip `whitespace` symbols allowed in CSV.
static inline void skipWhitespacesAndTabs(ReadBuffer & in, const bool & use_whitespace_tab_as_delimiter)
static inline void skipWhitespacesAndTabs(ReadBuffer & in, const bool & allow_whitespace_or_tab_as_delimiter)
{
if (use_whitespace_tab_as_delimiter)
if (allow_whitespace_or_tab_as_delimiter)
{
return;
}
@ -150,7 +154,7 @@ CSVFormatReader::CSVFormatReader(PeekableReadBuffer & buf_, const FormatSettings
/// Consumes the separator between two fields of a CSV row.
/// First calls skipWhitespacesAndTabs (which returns immediately when the
/// whitespace/tab-as-delimiter flag is set, so the delimiter itself is not eaten),
/// then requires the next character to be the configured CSV delimiter via assertChar.
void CSVFormatReader::skipFieldDelimiter()
{
skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter);
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
assertChar(format_settings.csv.delimiter, *buf);
}
@ -158,7 +162,7 @@ template <bool read_string>
String CSVFormatReader::readCSVFieldIntoString()
{
if (format_settings.csv.trim_whitespaces) [[likely]]
skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter);
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
String field;
if constexpr (read_string)
@ -170,14 +174,14 @@ String CSVFormatReader::readCSVFieldIntoString()
/// Skips a single CSV field without keeping its value.
/// Leading whitespace is skipped first (a no-op when the whitespace/tab-as-delimiter
/// flag is set), then the field is read with readCSVStringInto into a NullOutput
/// sink (presumably a write target that discards its input — confirm at its definition).
void CSVFormatReader::skipField()
{
skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter);
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
NullOutput out;
readCSVStringInto(out, *buf, format_settings.csv);
}
void CSVFormatReader::skipRowEndDelimiter()
{
skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter);
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
if (buf->eof())
return;
@ -186,7 +190,7 @@ void CSVFormatReader::skipRowEndDelimiter()
if (*buf->position() == format_settings.csv.delimiter)
++buf->position();
skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter);
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
if (buf->eof())
return;
@ -198,7 +202,7 @@ void CSVFormatReader::skipHeaderRow()
do
{
skipField();
skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter);
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
} while (checkChar(format_settings.csv.delimiter, *buf));
skipRowEndDelimiter();
@ -211,7 +215,7 @@ std::vector<String> CSVFormatReader::readRowImpl()
do
{
fields.push_back(readCSVFieldIntoString<is_header>());
skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter);
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
} while (checkChar(format_settings.csv.delimiter, *buf));
skipRowEndDelimiter();
@ -224,7 +228,7 @@ bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
try
{
skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter);
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
assertChar(delimiter, *buf);
}
catch (const DB::Exception &)
@ -250,7 +254,7 @@ bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
{
skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter);
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
if (buf->eof())
return true;
@ -259,7 +263,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
if (*buf->position() == format_settings.csv.delimiter)
{
++buf->position();
skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter);
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
if (buf->eof())
return true;
}
@ -287,7 +291,7 @@ bool CSVFormatReader::readField(
const String & /*column_name*/)
{
if (format_settings.csv.trim_whitespaces || !isStringOrFixedString(removeNullable(type))) [[likely]]
skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter);
skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter);
const bool at_delimiter = !buf->eof() && *buf->position() == format_settings.csv.delimiter;
const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n' || *buf->position() == '\r');

View File

@ -10,8 +10,8 @@ $CLICKHOUSE_CLIENT -q "drop table if exists test_whitespace"
$CLICKHOUSE_CLIENT -q "drop table if exists test_tab"
$CLICKHOUSE_CLIENT -q "create table test_whitespace (x UInt32, y String, z String) engine=MergeTree order by x"
$CLICKHOUSE_CLIENT -q "create table test_tab (x UInt32, y String, z String) engine=MergeTree order by x"
cat $CURDIR/data_csv/csv_with_space_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_whitespace SETTINGS format_csv_delimiter=' ', input_format_csv_use_whitespace_tab_as_delimiter=true FORMAT CSV"
cat $CURDIR/data_csv/csv_with_tab_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tab SETTINGS format_csv_delimiter='\t', input_format_csv_use_whitespace_tab_as_delimiter=true FORMAT CSV"
cat $CURDIR/data_csv/csv_with_space_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_whitespace SETTINGS format_csv_delimiter=' ', input_format_csv_allow_whitespace_or_tab_as_delimiter=true FORMAT CSV"
cat $CURDIR/data_csv/csv_with_tab_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tab SETTINGS format_csv_delimiter='\t', input_format_csv_allow_whitespace_or_tab_as_delimiter=true FORMAT CSV"
$CLICKHOUSE_CLIENT -q "select * from test_whitespace"
$CLICKHOUSE_CLIENT -q "select * from test_tab"
$CLICKHOUSE_CLIENT -q "drop table test_whitespace"