Merge pull request #2263 from luc1ph3r/arbitrary-csv-delimiter

Support an arbitrary CSV delimiter
2024-09-21 01:00:48 +00:00 · 2018-04-27 13:13:09 -07:00 · 2018-04-27 13:13:09 -07:00 · 093c054b1f
commit 093c054b1f
parent b43e02cf4a 96e2dfa790
13 changed files with 178 additions and 25 deletions
--- a/dbms/src/DataStreams/CSVRowOutputStream.cpp
+++ b/dbms/src/DataStreams/CSVRowOutputStream.cpp
@ -7,8 +7,8 @@ namespace DB
 {


-CSVRowOutputStream::CSVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_, bool with_types_)
-    : ostr(ostr_), sample(sample_), with_names(with_names_), with_types(with_types_)
+CSVRowOutputStream::CSVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const char delimiter_, bool with_names_, bool with_types_)
+    : ostr(ostr_), sample(sample_), delimiter(delimiter_), with_names(with_names_), with_types(with_types_)
 {
    size_t columns = sample.columns();
    data_types.resize(columns);
@ -32,7 +32,7 @@ void CSVRowOutputStream::writePrefix()
        for (size_t i = 0; i < columns; ++i)
        {
            writeCSVString(sample.safeGetByPosition(i).name, ostr);
-            writeChar(i == columns - 1 ? '\n' : ',', ostr);
+            writeChar(i == columns - 1 ? '\n' : delimiter, ostr);
        }
    }

@ -41,7 +41,7 @@ void CSVRowOutputStream::writePrefix()
        for (size_t i = 0; i < columns; ++i)
        {
            writeCSVString(sample.safeGetByPosition(i).type->getName(), ostr);
-            writeChar(i == columns - 1 ? '\n' : ',', ostr);
+            writeChar(i == columns - 1 ? '\n' : delimiter, ostr);
        }
    }
 }
@ -55,7 +55,7 @@ void CSVRowOutputStream::writeField(const IColumn & column, const IDataType & ty

 void CSVRowOutputStream::writeFieldDelimiter()
 {
-    writeChar(',', ostr);
+    writeChar(delimiter, ostr);
 }


--- a/dbms/src/DataStreams/CSVRowOutputStream.h
+++ b/dbms/src/DataStreams/CSVRowOutputStream.h
@ -19,7 +19,7 @@ public:
    /** with_names - output in the first line a header with column names
      * with_types - output in the next line header with the names of the types
      */
-    CSVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_ = false, bool with_types_ = false);
+    CSVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const char delimiter_, bool with_names_ = false, bool with_types_ = false);

    void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
    void writeFieldDelimiter() override;
@ -44,6 +44,7 @@ protected:

    WriteBuffer & ostr;
    const Block sample;
+    const char delimiter;
    bool with_names;
    bool with_types;
    DataTypes data_types;
--- a/dbms/src/DataStreams/FormatFactory.cpp
+++ b/dbms/src/DataStreams/FormatFactory.cpp
@ -81,13 +81,12 @@ BlockInputStreamPtr FormatFactory::getInput(const String & name, ReadBuffer & bu
    {
        return wrap_row_stream(std::make_shared<ValuesRowInputStream>(buf, sample, context, settings.input_format_values_interpret_expressions));
    }
-    else if (name == "CSV")
+    else if (name == "CSV" || name == "CSVWithNames")
    {
-        return wrap_row_stream(std::make_shared<CSVRowInputStream>(buf, sample, ','));
-    }
-    else if (name == "CSVWithNames")
-    {
-        return wrap_row_stream(std::make_shared<CSVRowInputStream>(buf, sample, ',', true));
+        char csv_delimiter = settings.format_csv_delimiter;
+        bool with_names = name == "CSVWithNames";
+
+        return wrap_row_stream(std::make_shared<CSVRowInputStream>(buf, sample, csv_delimiter, with_names));
    }
    else if (name == "TSKV")
    {
@ -152,10 +151,13 @@ static BlockOutputStreamPtr getOutputImpl(const String & name, WriteBuffer & buf
        return std::make_shared<BlockOutputStreamFromRowOutputStream>(std::make_shared<TabSeparatedRowOutputStream>(buf, sample, true, true), sample);
    else if (name == "TabSeparatedRaw" || name == "TSVRaw")
        return std::make_shared<BlockOutputStreamFromRowOutputStream>(std::make_shared<TabSeparatedRawRowOutputStream>(buf, sample), sample);
-    else if (name == "CSV")
-        return std::make_shared<BlockOutputStreamFromRowOutputStream>(std::make_shared<CSVRowOutputStream>(buf, sample), sample);
-    else if (name == "CSVWithNames")
-        return std::make_shared<BlockOutputStreamFromRowOutputStream>(std::make_shared<CSVRowOutputStream>(buf, sample, true), sample);
+    else if (name == "CSV" || name == "CSVWithNames")
+    {
+        char csv_delimiter = settings.format_csv_delimiter;
+        bool with_names = name == "CSVWithNames";
+
+        return std::make_shared<BlockOutputStreamFromRowOutputStream>(std::make_shared<CSVRowOutputStream>(buf, sample, csv_delimiter, with_names), sample);
+    }
    else if (name == "Pretty")
        return std::make_shared<PrettyBlockOutputStream>(buf, sample, false, settings.output_format_pretty_max_rows, context);
    else if (name == "PrettyCompact")
--- a/dbms/src/DataTypes/DataTypeFixedString.cpp
+++ b/dbms/src/DataTypes/DataTypeFixedString.cpp
@ -194,9 +194,9 @@ void DataTypeFixedString::serializeTextCSV(const IColumn & column, size_t row_nu
 }


-void DataTypeFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const char /*delimiter*/) const
+void DataTypeFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const char delimiter) const
 {
-    read(*this, column, [&istr](ColumnFixedString::Chars_t & data) { readCSVStringInto(data, istr); });
+    read(*this, column, [&istr, delimiter](ColumnFixedString::Chars_t & data) { readCSVStringInto(data, istr, delimiter); });
 }


--- a/dbms/src/DataTypes/DataTypeString.cpp
+++ b/dbms/src/DataTypes/DataTypeString.cpp
@ -285,9 +285,9 @@ void DataTypeString::serializeTextCSV(const IColumn & column, size_t row_num, Wr
 }


-void DataTypeString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const char /*delimiter*/) const
+void DataTypeString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const char delimiter) const
 {
-    read(column, [&](ColumnString::Chars_t & data) { readCSVStringInto(data, istr); });
+    read(column, [&](ColumnString::Chars_t & data) { readCSVStringInto(data, istr, delimiter); });
 }


--- a/dbms/src/Interpreters/Settings.h
+++ b/dbms/src/Interpreters/Settings.h
@ -247,7 +247,8 @@ struct Settings
    M(SettingUInt64, max_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for a query. Zero means unlimited.") \
    M(SettingUInt64, max_network_bytes, 0, "The maximum number of bytes (compressed) to receive or transmit over the network for execution of the query.") \
    M(SettingUInt64, max_network_bandwidth_for_user, 0, "The maximum speed of data exchange over the network in bytes per second for all concurrently running user queries. Zero means unlimited.")\
-    M(SettingUInt64, max_network_bandwidth_for_all_users, 0, "The maximum speed of data exchange over the network in bytes per second for all concurrently running queries. Zero means unlimited.")
+    M(SettingUInt64, max_network_bandwidth_for_all_users, 0, "The maximum speed of data exchange over the network in bytes per second for all concurrently running queries. Zero means unlimited.") \
+    M(SettingChar, format_csv_delimiter, ',', "The character to be considered as a delimiter in CSV data. If setting with a string, a string has to have a length of 1.") \

 #define DECLARE(TYPE, NAME, DEFAULT, DESCRIPTION) \
    TYPE NAME {DEFAULT};
--- a/dbms/src/Interpreters/SettingsCommon.h
+++ b/dbms/src/Interpreters/SettingsCommon.h
@ -26,6 +26,7 @@ namespace ErrorCodes
    extern const int UNKNOWN_COMPRESSION_METHOD;
    extern const int UNKNOWN_DISTRIBUTED_PRODUCT_MODE;
    extern const int UNKNOWN_GLOBAL_SUBQUERIES_METHOD;
+    extern const int SIZE_OF_FIXED_STRING_DOESNT_MATCH;
 }


@ -706,4 +707,58 @@ struct SettingString
    }
 };

+
+struct SettingChar
+{
+private:
+    void checkStringIsACharacter(const String & x) const
+    {
+        if (x.size() != 1)
+            throw Exception("A setting's value string has to be an exactly one character long", ErrorCodes::SIZE_OF_FIXED_STRING_DOESNT_MATCH);
+    }
+public:
+    char value;
+    bool changed = false;
+
+    SettingChar(char x = '\0') : value(x) {}
+
+    operator char() const { return value; }
+    SettingChar & operator= (char x) { set(x); return *this; }
+
+    String toString() const
+    {
+        return String(1, value);
+    }
+
+    void set(char x) {
+        value = x;
+        changed = true;
+    }
+
+    void set(const String & x)
+    {
+        checkStringIsACharacter(x);
+        value = x[0];
+        changed = true;
+    }
+
+    void set(const Field & x)
+    {
+        const String & s = safeGet<const String &>(x);
+        set(s);
+    }
+
+    void set(ReadBuffer & buf)
+    {
+        String x;
+        readBinary(x, buf);
+        checkStringIsACharacter(x);
+        set(x);
+    }
+
+    void write(WriteBuffer & buf) const
+    {
+        writeBinary(toString(), buf);
+    }
+};
 }
--- a/dbms/tests/queries/0_stateless/00630_arbitrary_csv_delimiter.reference
+++ b/dbms/tests/queries/0_stateless/00630_arbitrary_csv_delimiter.reference
@ -0,0 +1,30 @@
+Hello, world	123	2016-01-01
+Hello, "world"	456	2016-01-02
+Hello "world"	789	2016-01-03
+Hello\n world	100	2016-01-04
+Hello, world	123	2016-01-01
+Hello, "world"	456	2016-01-02
+Hello "world"	789	2016-01-03
+Hello\n world	100	2016-01-04
+"Hello, world";123;"2016-01-01"
+"Hello, ""world""";456;"2016-01-02"
+"Hello ""world""";789;"2016-01-03"
+"Hello
+ world";100;"2016-01-04"
+"Hello, world"/123/"2016-01-01"
+"Hello, ""world"""/456/"2016-01-02"
+"Hello ""world"""/789/"2016-01-03"
+"Hello
+ world"/100/"2016-01-04"
+abc,def	hello
+hello	world
+hello "world"	abc,def
+"abc,def";"hello"
+"hello";"world"
+"hello ""world""";"abc,def"
+"abc,def","hello"
+"hello","world"
+"hello ""world""","abc,def"
+"abc,def"/"hello"
+"hello"/"world"
+"hello ""world"""/"abc,def"
--- a/dbms/tests/queries/0_stateless/00630_arbitrary_csv_delimiter.sh
+++ b/dbms/tests/queries/0_stateless/00630_arbitrary_csv_delimiter.sh
@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+. $CURDIR/../shell_config.sh
+
+$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.csv";
+$CLICKHOUSE_CLIENT --query="CREATE TABLE test.csv (s String, n UInt64, d Date) ENGINE = Memory";
+
+echo '"Hello, world"| 123| "2016-01-01"
+"Hello, ""world"""| "456"| 2016-01-02|
+Hello "world"| 789 |2016-01-03
+"Hello
+ world"| 100| 2016-01-04|' | $CLICKHOUSE_CLIENT --format_csv_delimiter="|"  --query="INSERT INTO test.csv FORMAT CSV";
+
+$CLICKHOUSE_CLIENT --query="SELECT * FROM test.csv ORDER BY d";
+
+$CLICKHOUSE_CLIENT --query="DROP TABLE test.csv";
+$CLICKHOUSE_CLIENT --query="CREATE TABLE test.csv (s String, n UInt64, d Date) ENGINE = Memory";
+
+echo '"Hello, world"; 123; "2016-01-01"
+"Hello, ""world"""; "456"; 2016-01-02;
+Hello "world"; 789 ;2016-01-03
+"Hello
+ world"; 100; 2016-01-04;' | $CLICKHOUSE_CLIENT --multiquery --query="SET format_csv_delimiter=';'; INSERT INTO test.csv FORMAT CSV";
+
+$CLICKHOUSE_CLIENT --query="SELECT * FROM test.csv ORDER BY d";
+$CLICKHOUSE_CLIENT --format_csv_delimiter=";" --query="SELECT * FROM test.csv ORDER BY d FORMAT CSV";
+$CLICKHOUSE_CLIENT --format_csv_delimiter="/" --query="SELECT * FROM test.csv ORDER BY d FORMAT CSV";
+
+$CLICKHOUSE_CLIENT --query="DROP TABLE test.csv";
+$CLICKHOUSE_CLIENT --query="CREATE TABLE test.csv (s1 String, s2 String) ENGINE = Memory";
+
+echo 'abc,def;hello;
+hello; world;
+"hello ""world""";abc,def;' | $CLICKHOUSE_CLIENT --multiquery --query="SET format_csv_delimiter=';'; INSERT INTO test.csv FORMAT CSV";
+
+
+$CLICKHOUSE_CLIENT --query="SELECT * FROM test.csv";
+
+$CLICKHOUSE_CLIENT --query="DROP TABLE test.csv";
+$CLICKHOUSE_CLIENT --query="CREATE TABLE test.csv (s1 String, s2 String) ENGINE = Memory";
+
+echo '"s1";"s2"
+abc,def;hello;
+hello; world;
+"hello ""world""";abc,def;' | $CLICKHOUSE_CLIENT --multiquery --query="SET format_csv_delimiter=';'; INSERT INTO test.csv FORMAT CSVWithNames";
+
+$CLICKHOUSE_CLIENT --format_csv_delimiter=";" --query="SELECT * FROM test.csv FORMAT CSV";
+$CLICKHOUSE_CLIENT --format_csv_delimiter="," --query="SELECT * FROM test.csv FORMAT CSV";
+$CLICKHOUSE_CLIENT --format_csv_delimiter="/" --query="SELECT * FROM test.csv FORMAT CSV";
+
+$CLICKHOUSE_CLIENT --query="DROP TABLE test.csv";
--- a/docs/en/formats/csv.md
+++ b/docs/en/formats/csv.md
@ -2,9 +2,11 @@

 Comma Separated Values format ([RFC](https://tools.ietf.org/html/rfc4180)).

-When formatting, rows are enclosed in double quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double quotes. Numbers are output without quotes. Values are separated by commas. Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost).
+When formatting, strings are enclosed in double quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Dates and date-times are enclosed in double quotes. Numbers are output without quotes. Values are separated by a delimiter&ast;. Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost).

-When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Rows can also be arranged without quotes. In this case, they are parsed up to a comma or line feed (CR or LF). In violation of the RFC, when parsing rows without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR LF) are all supported.
+&ast;By default, the delimiter is `,`. See the [format_csv_delimiter](/docs/en/operations/settings/settings/#format_csv_delimiter) setting for more information.
+
+When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Strings can also be written without quotes. In this case, they are parsed up to the delimiter or a line feed (CR or LF). In violation of the RFC, when parsing strings without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (LF CR) variants are all supported.

 The CSV format supports the output of totals and extremes the same way as `TabSeparated`.

--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -338,3 +338,7 @@ It works for JSONEachRow and TSKV formats.
 ## output_format_json_quote_64bit_integers

 If the value is true, integers appear in quotes when using JSON\* Int64 and UInt64 formats  (for compatibility with most JavaScript implementations); otherwise, integers are output without the quotes.
+
+## format_csv_delimiter
+
+The character interpreted as a delimiter in CSV data. The value must be exactly one character long. The default is `,`.
--- a/docs/ru/formats/csv.md
+++ b/docs/ru/formats/csv.md
@ -2,8 +2,10 @@

 Формат comma separated values ([RFC](https://tools.ietf.org/html/rfc4180)).

-При форматировании, строки выводятся в двойных кавычках. Двойная кавычка внутри строки выводится как две двойные кавычки подряд. Других правил экранирования нет. Даты и даты-с-временем выводятся в двойных кавычках. Числа выводятся без кавычек. Значения разделяются запятыми. Строки разделяются unix переводом строки (LF). Массивы сериализуются в CSV следующим образом: сначала массив сериализуется в строку, как в формате TabSeparated, а затем полученная строка выводится в CSV в двойных кавычках. Кортежи в формате CSV сериализуются, как отдельные столбцы (то есть, теряется их вложенность в кортеж).
+При форматировании, строки выводятся в двойных кавычках. Двойная кавычка внутри строки выводится как две двойные кавычки подряд. Других правил экранирования нет. Даты и даты-с-временем выводятся в двойных кавычках. Числа выводятся без кавычек. Значения разделяются символом-разделителем&ast;. Строки разделяются unix переводом строки (LF). Массивы сериализуются в CSV следующим образом: сначала массив сериализуется в строку, как в формате TabSeparated, а затем полученная строка выводится в CSV в двойных кавычках. Кортежи в формате CSV сериализуются, как отдельные столбцы (то есть, теряется их вложенность в кортеж).

-При парсинге, все значения могут парситься как в кавычках, так и без кавычек. Поддерживаются как двойные, так и одинарные кавычки. В том числе, строки могут быть расположены без кавычек - тогда они парсятся до запятой или перевода строки (CR или LF). В нарушение RFC, в случае парсинга строк не в кавычках, начальные и конечные пробелы и табы игнорируются. В качестве перевода строки, поддерживаются как Unix (LF), так и Windows (CR LF) и Mac OS Classic (LF CR) варианты.
+&ast;По умолчанию — `,`. См. настройку [format_csv_delimiter](/docs/ru/operations/settings/settings/#format_csv_delimiter) для дополнительной информации.
+
+При парсинге, все значения могут парситься как в кавычках, так и без кавычек. Поддерживаются как двойные, так и одинарные кавычки. В том числе, строки могут быть расположены без кавычек - тогда они парсятся до символа-разделителя или перевода строки (CR или LF). В нарушение RFC, в случае парсинга строк не в кавычках, начальные и конечные пробелы и табы игнорируются. В качестве перевода строки, поддерживаются как Unix (LF), так и Windows (CR LF) и Mac OS Classic (LF CR) варианты.

 Формат CSV поддерживает вывод totals и extremes аналогично `TabSeparated`.
--- a/docs/ru/operations/settings/settings.md
+++ b/docs/ru/operations/settings/settings.md
@ -335,3 +335,7 @@ ClickHouse применяет настройку в том случае, ког
 ## output_format_json_quote_64bit_integers

 Если значение истинно, то при использовании JSON\* форматов UInt64 и Int64 числа выводятся в кавычках (из соображений совместимости с большинством реализаций JavaScript), иначе - без кавычек.
+
+## format_csv_delimiter
+
+Символ, интерпретируемый как разделитель в данных формата CSV. По умолчанию — `,`.