Merge pull request #2263 from luc1ph3r/arbitrary-csv-delimiter

Support an arbitrary CSV delimiter
This commit is contained in:
alexey-milovidov 2018-04-27 13:13:09 -07:00 committed by GitHub
commit 093c054b1f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 178 additions and 25 deletions

View File

@ -7,8 +7,8 @@ namespace DB
{
CSVRowOutputStream::CSVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_, bool with_types_)
: ostr(ostr_), sample(sample_), with_names(with_names_), with_types(with_types_)
CSVRowOutputStream::CSVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const char delimiter_, bool with_names_, bool with_types_)
: ostr(ostr_), sample(sample_), delimiter(delimiter_), with_names(with_names_), with_types(with_types_)
{
size_t columns = sample.columns();
data_types.resize(columns);
@ -32,7 +32,7 @@ void CSVRowOutputStream::writePrefix()
for (size_t i = 0; i < columns; ++i)
{
writeCSVString(sample.safeGetByPosition(i).name, ostr);
writeChar(i == columns - 1 ? '\n' : ',', ostr);
writeChar(i == columns - 1 ? '\n' : delimiter, ostr);
}
}
@ -41,7 +41,7 @@ void CSVRowOutputStream::writePrefix()
for (size_t i = 0; i < columns; ++i)
{
writeCSVString(sample.safeGetByPosition(i).type->getName(), ostr);
writeChar(i == columns - 1 ? '\n' : ',', ostr);
writeChar(i == columns - 1 ? '\n' : delimiter, ostr);
}
}
}
@ -55,7 +55,7 @@ void CSVRowOutputStream::writeField(const IColumn & column, const IDataType & ty
void CSVRowOutputStream::writeFieldDelimiter()
{
writeChar(',', ostr);
writeChar(delimiter, ostr);
}

View File

@ -19,7 +19,7 @@ public:
/** with_names - output in the first line a header with column names
* with_types - output in the next line header with the names of the types
*/
CSVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_ = false, bool with_types_ = false);
CSVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const char delimiter_, bool with_names_ = false, bool with_types_ = false);
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
void writeFieldDelimiter() override;
@ -44,6 +44,7 @@ protected:
WriteBuffer & ostr;
const Block sample;
const char delimiter;
bool with_names;
bool with_types;
DataTypes data_types;

View File

@ -81,13 +81,12 @@ BlockInputStreamPtr FormatFactory::getInput(const String & name, ReadBuffer & bu
{
return wrap_row_stream(std::make_shared<ValuesRowInputStream>(buf, sample, context, settings.input_format_values_interpret_expressions));
}
else if (name == "CSV")
else if (name == "CSV" || name == "CSVWithNames")
{
return wrap_row_stream(std::make_shared<CSVRowInputStream>(buf, sample, ','));
}
else if (name == "CSVWithNames")
{
return wrap_row_stream(std::make_shared<CSVRowInputStream>(buf, sample, ',', true));
char csv_delimiter = settings.format_csv_delimiter;
bool with_names = name == "CSVWithNames";
return wrap_row_stream(std::make_shared<CSVRowInputStream>(buf, sample, csv_delimiter, with_names));
}
else if (name == "TSKV")
{
@ -152,10 +151,13 @@ static BlockOutputStreamPtr getOutputImpl(const String & name, WriteBuffer & buf
return std::make_shared<BlockOutputStreamFromRowOutputStream>(std::make_shared<TabSeparatedRowOutputStream>(buf, sample, true, true), sample);
else if (name == "TabSeparatedRaw" || name == "TSVRaw")
return std::make_shared<BlockOutputStreamFromRowOutputStream>(std::make_shared<TabSeparatedRawRowOutputStream>(buf, sample), sample);
else if (name == "CSV")
return std::make_shared<BlockOutputStreamFromRowOutputStream>(std::make_shared<CSVRowOutputStream>(buf, sample), sample);
else if (name == "CSVWithNames")
return std::make_shared<BlockOutputStreamFromRowOutputStream>(std::make_shared<CSVRowOutputStream>(buf, sample, true), sample);
else if (name == "CSV" || name == "CSVWithNames")
{
char csv_delimiter = settings.format_csv_delimiter;
bool with_names = name == "CSVWithNames";
return std::make_shared<BlockOutputStreamFromRowOutputStream>(std::make_shared<CSVRowOutputStream>(buf, sample, csv_delimiter, with_names), sample);
}
else if (name == "Pretty")
return std::make_shared<PrettyBlockOutputStream>(buf, sample, false, settings.output_format_pretty_max_rows, context);
else if (name == "PrettyCompact")

View File

@ -194,9 +194,9 @@ void DataTypeFixedString::serializeTextCSV(const IColumn & column, size_t row_nu
}
void DataTypeFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const char /*delimiter*/) const
void DataTypeFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const char delimiter) const
{
read(*this, column, [&istr](ColumnFixedString::Chars_t & data) { readCSVStringInto(data, istr); });
read(*this, column, [&istr, delimiter](ColumnFixedString::Chars_t & data) { readCSVStringInto(data, istr, delimiter); });
}

View File

@ -285,9 +285,9 @@ void DataTypeString::serializeTextCSV(const IColumn & column, size_t row_num, Wr
}
void DataTypeString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const char /*delimiter*/) const
void DataTypeString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const char delimiter) const
{
read(column, [&](ColumnString::Chars_t & data) { readCSVStringInto(data, istr); });
read(column, [&](ColumnString::Chars_t & data) { readCSVStringInto(data, istr, delimiter); });
}

View File

@ -247,7 +247,8 @@ struct Settings
M(SettingUInt64, max_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for a query. Zero means unlimited.") \
M(SettingUInt64, max_network_bytes, 0, "The maximum number of bytes (compressed) to receive or transmit over the network for execution of the query.") \
M(SettingUInt64, max_network_bandwidth_for_user, 0, "The maximum speed of data exchange over the network in bytes per second for all concurrently running user queries. Zero means unlimited.")\
M(SettingUInt64, max_network_bandwidth_for_all_users, 0, "The maximum speed of data exchange over the network in bytes per second for all concurrently running queries. Zero means unlimited.")
M(SettingUInt64, max_network_bandwidth_for_all_users, 0, "The maximum speed of data exchange over the network in bytes per second for all concurrently running queries. Zero means unlimited.") \
M(SettingChar, format_csv_delimiter, ',', "The character to be considered as a delimiter in CSV data. If setting with a string, a string has to have a length of 1.") \
#define DECLARE(TYPE, NAME, DEFAULT, DESCRIPTION) \
TYPE NAME {DEFAULT};

View File

@ -26,6 +26,7 @@ namespace ErrorCodes
extern const int UNKNOWN_COMPRESSION_METHOD;
extern const int UNKNOWN_DISTRIBUTED_PRODUCT_MODE;
extern const int UNKNOWN_GLOBAL_SUBQUERIES_METHOD;
extern const int SIZE_OF_FIXED_STRING_DOESNT_MATCH;
}
@ -706,4 +707,58 @@ struct SettingString
}
};
struct SettingChar
{
private:
void checkStringIsACharacter(const String & x) const
{
if (x.size() != 1)
throw Exception("A setting's value string has to be an exactly one character long", ErrorCodes::SIZE_OF_FIXED_STRING_DOESNT_MATCH);
}
public:
char value;
bool changed = false;
SettingChar(char x = '\0') : value(x) {}
operator char() const { return value; }
SettingChar & operator= (char x) { set(x); return *this; }
String toString() const
{
return String(1, value);
}
void set(char x) {
value = x;
changed = true;
}
void set(const String & x)
{
checkStringIsACharacter(x);
value = x[0];
changed = true;
}
void set(const Field & x)
{
const String & s = safeGet<const String &>(x);
set(s);
}
void set(ReadBuffer & buf)
{
String x;
readBinary(x, buf);
checkStringIsACharacter(x);
set(x);
}
void write(WriteBuffer & buf) const
{
writeBinary(toString(), buf);
}
};
}

View File

@ -0,0 +1,30 @@
Hello, world 123 2016-01-01
Hello, "world" 456 2016-01-02
Hello "world" 789 2016-01-03
Hello\n world 100 2016-01-04
Hello, world 123 2016-01-01
Hello, "world" 456 2016-01-02
Hello "world" 789 2016-01-03
Hello\n world 100 2016-01-04
"Hello, world";123;"2016-01-01"
"Hello, ""world""";456;"2016-01-02"
"Hello ""world""";789;"2016-01-03"
"Hello
world";100;"2016-01-04"
"Hello, world"/123/"2016-01-01"
"Hello, ""world"""/456/"2016-01-02"
"Hello ""world"""/789/"2016-01-03"
"Hello
world"/100/"2016-01-04"
abc,def hello
hello world
hello "world" abc,def
"abc,def";"hello"
"hello";"world"
"hello ""world""";"abc,def"
"abc,def","hello"
"hello","world"
"hello ""world""","abc,def"
"abc,def"/"hello"
"hello"/"world"
"hello ""world"""/"abc,def"

View File

@ -0,0 +1,52 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.csv";
$CLICKHOUSE_CLIENT --query="CREATE TABLE test.csv (s String, n UInt64, d Date) ENGINE = Memory";
echo '"Hello, world"| 123| "2016-01-01"
"Hello, ""world"""| "456"| 2016-01-02|
Hello "world"| 789 |2016-01-03
"Hello
world"| 100| 2016-01-04|' | $CLICKHOUSE_CLIENT --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV";
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.csv ORDER BY d";
$CLICKHOUSE_CLIENT --query="DROP TABLE test.csv";
$CLICKHOUSE_CLIENT --query="CREATE TABLE test.csv (s String, n UInt64, d Date) ENGINE = Memory";
echo '"Hello, world"; 123; "2016-01-01"
"Hello, ""world"""; "456"; 2016-01-02;
Hello "world"; 789 ;2016-01-03
"Hello
world"; 100; 2016-01-04;' | $CLICKHOUSE_CLIENT --multiquery --query="SET format_csv_delimiter=';'; INSERT INTO test.csv FORMAT CSV";
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.csv ORDER BY d";
$CLICKHOUSE_CLIENT --format_csv_delimiter=";" --query="SELECT * FROM test.csv ORDER BY d FORMAT CSV";
$CLICKHOUSE_CLIENT --format_csv_delimiter="/" --query="SELECT * FROM test.csv ORDER BY d FORMAT CSV";
$CLICKHOUSE_CLIENT --query="DROP TABLE test.csv";
$CLICKHOUSE_CLIENT --query="CREATE TABLE test.csv (s1 String, s2 String) ENGINE = Memory";
echo 'abc,def;hello;
hello; world;
"hello ""world""";abc,def;' | $CLICKHOUSE_CLIENT --multiquery --query="SET format_csv_delimiter=';'; INSERT INTO test.csv FORMAT CSV";
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.csv";
$CLICKHOUSE_CLIENT --query="DROP TABLE test.csv";
$CLICKHOUSE_CLIENT --query="CREATE TABLE test.csv (s1 String, s2 String) ENGINE = Memory";
echo '"s1";"s2"
abc,def;hello;
hello; world;
"hello ""world""";abc,def;' | $CLICKHOUSE_CLIENT --multiquery --query="SET format_csv_delimiter=';'; INSERT INTO test.csv FORMAT CSVWithNames";
$CLICKHOUSE_CLIENT --format_csv_delimiter=";" --query="SELECT * FROM test.csv FORMAT CSV";
$CLICKHOUSE_CLIENT --format_csv_delimiter="," --query="SELECT * FROM test.csv FORMAT CSV";
$CLICKHOUSE_CLIENT --format_csv_delimiter="/" --query="SELECT * FROM test.csv FORMAT CSV";
$CLICKHOUSE_CLIENT --query="DROP TABLE test.csv";

View File

@ -2,9 +2,11 @@
Comma Separated Values format ([RFC](https://tools.ietf.org/html/rfc4180)).
When formatting, rows are enclosed in double quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double quotes. Numbers are output without quotes. Values are separated by commas. Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost).
When formatting, rows are enclosed in double quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double quotes. Numbers are output without quotes. Values are separated by a delimiter&ast;. Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost).
When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Rows can also be arranged without quotes. In this case, they are parsed up to a comma or line feed (CR or LF). In violation of the RFC, when parsing rows without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR LF) are all supported.
&ast;By default — `,`. See a [format_csv_delimiter](/docs/en/operations/settings/settings/#format_csv_delimiter) setting for additional info.
When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Rows can also be arranged without quotes. In this case, they are parsed up to a delimiter or line feed (CR or LF). In violation of the RFC, when parsing rows without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR LF) are all supported.
The CSV format supports the output of totals and extremes the same way as `TabSeparated`.

View File

@ -338,3 +338,7 @@ It works for JSONEachRow and TSKV formats.
## output_format_json_quote_64bit_integers
If the value is true, integers appear in quotes when using JSON\* Int64 and UInt64 formats (for compatibility with most JavaScript implementations); otherwise, integers are output without the quotes.
## format_csv_delimiter
The character to be considered as a delimiter in CSV data. By default, `,`.

View File

@ -2,8 +2,10 @@
Формат comma separated values ([RFC](https://tools.ietf.org/html/rfc4180)).
При форматировании, строки выводятся в двойных кавычках. Двойная кавычка внутри строки выводится как две двойные кавычки подряд. Других правил экранирования нет. Даты и даты-с-временем выводятся в двойных кавычках. Числа выводятся без кавычек. Значения разделяются запятыми. Строки разделяются unix переводом строки (LF). Массивы сериализуются в CSV следующим образом: сначала массив сериализуется в строку, как в формате TabSeparated, а затем полученная строка выводится в CSV в двойных кавычках. Кортежи в формате CSV сериализуются, как отдельные столбцы (то есть, теряется их вложенность в кортеж).
При форматировании, строки выводятся в двойных кавычках. Двойная кавычка внутри строки выводится как две двойные кавычки подряд. Других правил экранирования нет. Даты и даты-с-временем выводятся в двойных кавычках. Числа выводятся без кавычек. Значения разделяются символом-разделителем&ast;. Строки разделяются unix переводом строки (LF). Массивы сериализуются в CSV следующим образом: сначала массив сериализуется в строку, как в формате TabSeparated, а затем полученная строка выводится в CSV в двойных кавычках. Кортежи в формате CSV сериализуются, как отдельные столбцы (то есть, теряется их вложенность в кортеж).
При парсинге, все значения могут парситься как в кавычках, так и без кавычек. Поддерживаются как двойные, так и одинарные кавычки. В том числе, строки могут быть расположены без кавычек - тогда они парсятся до запятой или перевода строки (CR или LF). В нарушение RFC, в случае парсинга строк не в кавычках, начальные и конечные пробелы и табы игнорируются. В качестве перевода строки, поддерживаются как Unix (LF), так и Windows (CR LF) и Mac OS Classic (LF CR) варианты.
&ast;По умолчанию — `,`. См. настройку [format_csv_delimiter](/docs/ru/operations/settings/settings/#format_csv_delimiter) для дополнительной информации.
При парсинге, все значения могут парситься как в кавычках, так и без кавычек. Поддерживаются как двойные, так и одинарные кавычки. В том числе, строки могут быть расположены без кавычек - тогда они парсятся до символа-разделителя или перевода строки (CR или LF). В нарушение RFC, в случае парсинга строк не в кавычках, начальные и конечные пробелы и табы игнорируются. В качестве перевода строки, поддерживаются как Unix (LF), так и Windows (CR LF) и Mac OS Classic (LF CR) варианты.
Формат CSV поддерживает вывод totals и extremes аналогично `TabSeparated`.

View File

@ -335,3 +335,7 @@ ClickHouse применяет настройку в том случае, ког
## output_format_json_quote_64bit_integers
Если значение истинно, то при использовании JSON\* форматов UInt64 и Int64 числа выводятся в кавычках (из соображений совместимости с большинством реализаций JavaScript), иначе - без кавычек.
## format_csv_delimiter
Символ, интерпретируемый как разделитель в данных формата CSV. По умолчанию — `,`.