mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
Merge pull request #30497 from Avogar/null-deserialization
Add custom null representation support for TSV/CSV input formats, fix Nullable(String) deserializing in some formats
This commit is contained in:
commit
327a34e9da
@ -130,7 +130,8 @@ Only a small set of symbols are escaped. You can easily stumble onto a string va
|
||||
|
||||
Arrays are written as a list of comma-separated values in square brackets. Numeric items in the array are formatted as usual. `Date` and `DateTime` types are written in single quotes. Strings are written in single quotes with the same escaping rules as above.
|
||||
|
||||
[NULL](../sql-reference/syntax.md) is formatted as `\N`.
|
||||
[NULL](../sql-reference/syntax.md) is formatted according to the setting [format_tsv_null_representation](../operations/settings/settings.md#settings-format_tsv_null_representation) (default value is `\N`).
|
||||
|
||||
|
||||
If the setting [input_format_tsv_empty_as_default](../operations/settings/settings.md#settings-input_format_tsv_empty_as_default) is enabled,
|
||||
empty input fields are replaced with default values. For complex default expressions [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#settings-input_format_defaults_for_omitted_fields) must be enabled too.
|
||||
@ -405,7 +406,7 @@ When parsing, all values can be parsed either with or without quotes. Both doubl
|
||||
If the setting [input_format_csv_empty_as_default](../operations/settings/settings.md#settings-input_format_csv_empty_as_default) is enabled,
|
||||
empty unquoted input values are replaced with default values. For complex default expressions [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#settings-input_format_defaults_for_omitted_fields) must be enabled too.
|
||||
|
||||
`NULL` is formatted as `\N` or `NULL` or an empty unquoted string (see settings [input_format_csv_unquoted_null_literal_as_null](../operations/settings/settings.md#settings-input_format_csv_unquoted_null_literal_as_null) and [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields)).
|
||||
`NULL` is formatted according to the setting [format_csv_null_representation](../operations/settings/settings.md#settings-format_csv_null_representation) (default value is `\N`).
|
||||
|
||||
The CSV format supports the output of totals and extremes the same way as `TabSeparated`.
|
||||
|
||||
|
@ -1576,10 +1576,6 @@ When `output_format_json_quote_denormals = 1`, the query returns:
|
||||
|
||||
The character is interpreted as a delimiter in the CSV data. By default, the delimiter is `,`.
|
||||
|
||||
## input_format_csv_unquoted_null_literal_as_null {#settings-input_format_csv_unquoted_null_literal_as_null}
|
||||
|
||||
For CSV input format enables or disables parsing of unquoted `NULL` as literal (synonym for `\N`).
|
||||
|
||||
## input_format_csv_enum_as_number {#settings-input_format_csv_enum_as_number}
|
||||
|
||||
Enables or disables parsing enum values as enum ids for CSV input format.
|
||||
@ -2940,9 +2936,9 @@ Possible values:
|
||||
|
||||
Default value: `1`.
|
||||
|
||||
## output_format_csv_null_representation {#output_format_csv_null_representation}
|
||||
## format_csv_null_representation {#format_csv_null_representation}
|
||||
|
||||
Defines the representation of `NULL` for [CSV](../../interfaces/formats.md#csv) output format. User can set any string as a value, for example, `My NULL`.
|
||||
Defines the representation of `NULL` for [CSV](../../interfaces/formats.md#csv) output and input formats. The user can set any string as the value, for example, `My NULL`.
|
||||
|
||||
Default value: `\N`.
|
||||
|
||||
@ -2965,7 +2961,7 @@ Result
|
||||
Query
|
||||
|
||||
```sql
|
||||
SET output_format_csv_null_representation = 'My NULL';
|
||||
SET format_csv_null_representation = 'My NULL';
|
||||
SELECT * FROM csv_custom_null FORMAT CSV;
|
||||
```
|
||||
|
||||
@ -2977,9 +2973,9 @@ My NULL
|
||||
My NULL
|
||||
```
|
||||
|
||||
## output_format_tsv_null_representation {#output_format_tsv_null_representation}
|
||||
## format_tsv_null_representation {#format_tsv_null_representation}
|
||||
|
||||
Defines the representation of `NULL` for [TSV](../../interfaces/formats.md#tabseparated) output format. User can set any string as a value, for example, `My NULL`.
|
||||
Defines the representation of `NULL` for [TSV](../../interfaces/formats.md#tabseparated) output and input formats. The user can set any string as the value, for example, `My NULL`.
|
||||
|
||||
Default value: `\N`.
|
||||
|
||||
@ -3002,7 +2998,7 @@ Result
|
||||
Query
|
||||
|
||||
```sql
|
||||
SET output_format_tsv_null_representation = 'My NULL';
|
||||
SET format_tsv_null_representation = 'My NULL';
|
||||
SELECT * FROM tsv_custom_null FORMAT TSV;
|
||||
```
|
||||
|
||||
|
@ -555,7 +555,6 @@ class IColumn;
|
||||
M(Bool, format_csv_allow_single_quotes, true, "If it is set to true, allow strings in single quotes.", 0) \
|
||||
M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \
|
||||
M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \
|
||||
M(Bool, input_format_csv_unquoted_null_literal_as_null, false, "Consider unquoted NULL literal as \\N", 0) \
|
||||
M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices \\N", 0) \
|
||||
M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". Braces around array can be omitted.)", 0) \
|
||||
M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \
|
||||
@ -598,8 +597,8 @@ class IColumn;
|
||||
M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \
|
||||
M(UInt64, output_format_avro_rows_in_file, 1, "Max rows in a file (if permitted by storage)", 0) \
|
||||
M(Bool, output_format_tsv_crlf_end_of_line, false, "If it is set true, end of line in TSV format will be \\r\\n instead of \\n.", 0) \
|
||||
M(String, output_format_csv_null_representation, "\\N", "Custom NULL representation in CSV format", 0) \
|
||||
M(String, output_format_tsv_null_representation, "\\N", "Custom NULL representation in TSV format", 0) \
|
||||
M(String, format_csv_null_representation, "\\N", "Custom NULL representation in CSV format", 0) \
|
||||
M(String, format_tsv_null_representation, "\\N", "Custom NULL representation in TSV format", 0) \
|
||||
M(Bool, output_format_decimal_trailing_zeros, false, "Output trailing zeros when printing Decimal values. E.g. 1.230000 instead of 1.23.", 0) \
|
||||
\
|
||||
M(UInt64, input_format_allow_errors_num, 0, "Maximum absolute amount of errors while reading text formats (like CSV, TSV). In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \
|
||||
|
@ -7,12 +7,12 @@
|
||||
#include <Columns/ColumnNullable.h>
|
||||
#include <Core/Field.h>
|
||||
#include <IO/ReadBuffer.h>
|
||||
#include <IO/ReadBufferFromMemory.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteBuffer.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <IO/ConcatReadBuffer.h>
|
||||
#include <IO/PeekableReadBuffer.h>
|
||||
#include <Common/assert_cast.h>
|
||||
#include <base/scope_guard.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -285,70 +285,93 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R
|
||||
|
||||
template<typename ReturnType, bool escaped>
|
||||
ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
|
||||
const SerializationPtr & nested)
|
||||
const SerializationPtr & nested_serialization)
|
||||
{
|
||||
/// Little tricky, because we cannot discriminate null from first character.
|
||||
const String & null_representation = settings.tsv.null_representation;
|
||||
|
||||
if (istr.eof() || *istr.position() != '\\') /// Some data types can deserialize absence of data (e.g. empty string), so eof is ok.
|
||||
/// Some data types can deserialize absence of data (e.g. empty string), so eof is ok.
|
||||
if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0]))
|
||||
{
|
||||
/// This is not null, surely.
|
||||
return safeDeserialize<ReturnType>(column, *nested,
|
||||
return safeDeserialize<ReturnType>(column, *nested_serialization,
|
||||
[] { return false; },
|
||||
[&nested, &istr, &settings] (IColumn & nested_column)
|
||||
[&nested_serialization, &istr, &settings] (IColumn & nested_column)
|
||||
{
|
||||
if constexpr (escaped)
|
||||
nested->deserializeTextEscaped(nested_column, istr, settings);
|
||||
nested_serialization->deserializeTextEscaped(nested_column, istr, settings);
|
||||
else
|
||||
nested->deserializeTextRaw(nested_column, istr, settings);
|
||||
nested_serialization->deserializeTextRaw(nested_column, istr, settings);
|
||||
});
|
||||
}
|
||||
else
|
||||
|
||||
/// Check if we have enough data in buffer to check if it's a null.
|
||||
if (istr.available() > null_representation.size())
|
||||
{
|
||||
/// Now we know, that data in buffer starts with backslash.
|
||||
++istr.position();
|
||||
|
||||
if (istr.eof())
|
||||
throw ParsingException("Unexpected end of stream, while parsing value of Nullable type, after backslash", ErrorCodes::CANNOT_READ_ALL_DATA);
|
||||
|
||||
return safeDeserialize<ReturnType>(column, *nested,
|
||||
[&istr]
|
||||
{
|
||||
if (*istr.position() == 'N')
|
||||
{
|
||||
++istr.position();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
},
|
||||
[&nested, &istr, &settings] (IColumn & nested_column)
|
||||
{
|
||||
if (istr.position() != istr.buffer().begin())
|
||||
{
|
||||
/// We could step back to consume backslash again.
|
||||
--istr.position();
|
||||
if constexpr (escaped)
|
||||
nested->deserializeTextEscaped(nested_column, istr, settings);
|
||||
else
|
||||
nested->deserializeTextRaw(nested_column, istr, settings);
|
||||
}
|
||||
else
|
||||
{
|
||||
/// Otherwise, we need to place backslash back in front of istr.
|
||||
ReadBufferFromMemory prefix("\\", 1);
|
||||
ConcatReadBuffer prepended_istr(prefix, istr);
|
||||
|
||||
if constexpr (escaped)
|
||||
nested->deserializeTextEscaped(nested_column, prepended_istr, settings);
|
||||
else
|
||||
nested->deserializeTextRaw(nested_column, prepended_istr, settings);
|
||||
|
||||
/// Synchronise cursor position in original buffer.
|
||||
|
||||
if (prepended_istr.count() > 1)
|
||||
istr.position() = prepended_istr.position();
|
||||
}
|
||||
});
|
||||
auto check_for_null = [&istr, &null_representation]()
|
||||
{
|
||||
auto * pos = istr.position();
|
||||
if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n'))
|
||||
return true;
|
||||
istr.position() = pos;
|
||||
return false;
|
||||
};
|
||||
auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column)
|
||||
{
|
||||
if constexpr (escaped)
|
||||
nested_serialization->deserializeTextEscaped(nested_column, istr, settings);
|
||||
else
|
||||
nested_serialization->deserializeTextRaw(nested_column, istr, settings);
|
||||
};
|
||||
return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested);
|
||||
}
|
||||
|
||||
/// We don't have enough data in buffer to check if it's a null.
|
||||
/// Use PeekableReadBuffer to make a checkpoint before checking null
|
||||
/// representation and rollback if check was failed.
|
||||
PeekableReadBuffer buf(istr, true);
|
||||
auto check_for_null = [&buf, &null_representation]()
|
||||
{
|
||||
buf.setCheckpoint();
|
||||
SCOPE_EXIT(buf.dropCheckpoint());
|
||||
if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'))
|
||||
return true;
|
||||
|
||||
buf.rollbackToCheckpoint();
|
||||
return false;
|
||||
};
|
||||
|
||||
auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column)
|
||||
{
|
||||
auto * pos = buf.position();
|
||||
if constexpr (escaped)
|
||||
nested_serialization->deserializeTextEscaped(nested_column, buf, settings);
|
||||
else
|
||||
nested_serialization->deserializeTextRaw(nested_column, buf, settings);
|
||||
/// Check that we don't have any unread data in PeekableReadBuffer own memory.
|
||||
if (likely(!buf.hasUnreadData()))
|
||||
return;
|
||||
|
||||
/// We have some unread data in PeekableReadBuffer own memory.
|
||||
/// It can happen only if there is a string instead of a number
|
||||
/// or if someone uses tab or LF in TSV null_representation.
|
||||
/// In the first case we cannot continue reading anyway. The second case seems to be unlikely.
|
||||
if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos)
|
||||
throw DB::ParsingException("TSV custom null representation containing '\\t' or '\\n' may not work correctly "
|
||||
"for large input.", ErrorCodes::CANNOT_READ_ALL_DATA);
|
||||
|
||||
WriteBufferFromOwnString parsed_value;
|
||||
if constexpr (escaped)
|
||||
nested_serialization->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings);
|
||||
else
|
||||
nested_serialization->serializeTextRaw(nested_column, nested_column.size() - 1, parsed_value, settings);
|
||||
throw DB::ParsingException("Error while parsing \"" + std::string(pos, buf.buffer().end()) + std::string(istr.position(), std::min(size_t(10), istr.available())) + "\" as Nullable"
|
||||
+ " at position " + std::to_string(istr.count()) + ": got \"" + std::string(pos, buf.position() - pos)
|
||||
+ "\", which was deserialized as \""
|
||||
+ parsed_value.str() + "\". It seems that input data is ill-formatted.",
|
||||
ErrorCodes::CANNOT_READ_ALL_DATA);
|
||||
};
|
||||
|
||||
return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested);
|
||||
}
|
||||
|
||||
void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
||||
@ -389,13 +412,30 @@ template <typename ReturnType>
|
||||
ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
|
||||
const SerializationPtr & nested)
|
||||
{
|
||||
return safeDeserialize<ReturnType>(column, *nested,
|
||||
[&istr]
|
||||
{
|
||||
return checkStringByFirstCharacterAndAssertTheRestCaseInsensitive("NULL", istr)
|
||||
|| checkStringByFirstCharacterAndAssertTheRest("ᴺᵁᴸᴸ", istr);
|
||||
},
|
||||
[&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeWholeText(nested_column, istr, settings); });
|
||||
PeekableReadBuffer buf(istr, true);
|
||||
auto check_for_null = [&buf]()
|
||||
{
|
||||
buf.setCheckpoint();
|
||||
SCOPE_EXIT(buf.dropCheckpoint());
|
||||
|
||||
if (checkStringCaseInsensitive("NULL", buf) && buf.eof())
|
||||
return true;
|
||||
|
||||
buf.rollbackToCheckpoint();
|
||||
if (checkStringCaseInsensitive("ᴺᵁᴸᴸ", buf) && buf.eof())
|
||||
return true;
|
||||
|
||||
buf.rollbackToCheckpoint();
|
||||
return false;
|
||||
};
|
||||
|
||||
auto deserialize_nested = [&nested, &settings, &buf] (IColumn & nested_column)
|
||||
{
|
||||
nested->deserializeWholeText(nested_column, buf, settings);
|
||||
assert(!buf.hasUnreadData());
|
||||
};
|
||||
|
||||
return safeDeserialize<ReturnType>(column, *nested, check_for_null, deserialize_nested);
|
||||
}
|
||||
|
||||
|
||||
@ -416,74 +456,77 @@ void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & is
|
||||
|
||||
template<typename ReturnType>
|
||||
ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
|
||||
const SerializationPtr & nested)
|
||||
const SerializationPtr & nested_serialization)
|
||||
{
|
||||
constexpr char const * null_literal = "NULL";
|
||||
constexpr size_t len = 4;
|
||||
size_t null_prefix_len = 0;
|
||||
|
||||
auto check_for_null = [&istr, &settings, &null_prefix_len]
|
||||
const String & null_representation = settings.csv.null_representation;
|
||||
if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0]))
|
||||
{
|
||||
if (checkStringByFirstCharacterAndAssertTheRest("\\N", istr))
|
||||
return true;
|
||||
if (!settings.csv.unquoted_null_literal_as_null)
|
||||
/// This is not null, surely.
|
||||
return safeDeserialize<ReturnType>(column, *nested_serialization,
|
||||
[] { return false; },
|
||||
[&nested_serialization, &istr, &settings] (IColumn & nested_column) { nested_serialization->deserializeTextCSV(nested_column, istr, settings); });
|
||||
}
|
||||
|
||||
/// Check if we have enough data in buffer to check if it's a null.
|
||||
if (istr.available() > null_representation.size())
|
||||
{
|
||||
auto check_for_null = [&istr, &null_representation, &settings]()
|
||||
{
|
||||
auto * pos = istr.position();
|
||||
if (checkString(null_representation, istr) && (*istr.position() == settings.csv.delimiter || *istr.position() == '\r' || *istr.position() == '\n'))
|
||||
return true;
|
||||
istr.position() = pos;
|
||||
return false;
|
||||
|
||||
/// Check for unquoted NULL
|
||||
while (!istr.eof() && null_prefix_len < len && null_literal[null_prefix_len] == *istr.position())
|
||||
};
|
||||
auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column)
|
||||
{
|
||||
++null_prefix_len;
|
||||
++istr.position();
|
||||
}
|
||||
if (null_prefix_len == len)
|
||||
nested_serialization->deserializeTextCSV(nested_column, istr, settings);
|
||||
};
|
||||
return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested);
|
||||
}
|
||||
|
||||
/// We don't have enough data in buffer to check if it's a null.
|
||||
/// Use PeekableReadBuffer to make a checkpoint before checking null
|
||||
/// representation and rollback if the check was failed.
|
||||
PeekableReadBuffer buf(istr, true);
|
||||
auto check_for_null = [&buf, &null_representation, &settings]()
|
||||
{
|
||||
buf.setCheckpoint();
|
||||
SCOPE_EXIT(buf.dropCheckpoint());
|
||||
if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\r' || *buf.position() == '\n'))
|
||||
return true;
|
||||
|
||||
/// Value and "NULL" have common prefix, but value is not "NULL".
|
||||
/// Restore previous buffer position if possible.
|
||||
if (null_prefix_len <= istr.offset())
|
||||
{
|
||||
istr.position() -= null_prefix_len;
|
||||
null_prefix_len = 0;
|
||||
}
|
||||
buf.rollbackToCheckpoint();
|
||||
return false;
|
||||
};
|
||||
|
||||
auto deserialize_nested = [&nested, &settings, &istr, &null_prefix_len] (IColumn & nested_column)
|
||||
auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column)
|
||||
{
|
||||
if (likely(!null_prefix_len))
|
||||
nested->deserializeTextCSV(nested_column, istr, settings);
|
||||
else
|
||||
{
|
||||
/// Previous buffer position was not restored,
|
||||
/// so we need to prepend extracted characters (rare case)
|
||||
ReadBufferFromMemory prepend(null_literal, null_prefix_len);
|
||||
ConcatReadBuffer buf(prepend, istr);
|
||||
nested->deserializeTextCSV(nested_column, buf, settings);
|
||||
auto * pos = buf.position();
|
||||
nested_serialization->deserializeTextCSV(nested_column, buf, settings);
|
||||
/// Check that we don't have any unread data in PeekableReadBuffer own memory.
|
||||
if (likely(!buf.hasUnreadData()))
|
||||
return;
|
||||
|
||||
/// Check if all extracted characters were read by nested parser and update buffer position
|
||||
if (null_prefix_len < buf.count())
|
||||
istr.position() = buf.position();
|
||||
else if (null_prefix_len > buf.count())
|
||||
{
|
||||
/// It can happen only if there is an unquoted string instead of a number
|
||||
/// or if someone uses 'U' or 'L' as delimiter in CSV.
|
||||
/// In the first case we cannot continue reading anyway. The second case seems to be unlikely.
|
||||
if (settings.csv.delimiter == 'U' || settings.csv.delimiter == 'L')
|
||||
throw DB::ParsingException("Enabled setting input_format_csv_unquoted_null_literal_as_null may not work correctly "
|
||||
"with format_csv_delimiter = 'U' or 'L' for large input.", ErrorCodes::CANNOT_READ_ALL_DATA);
|
||||
WriteBufferFromOwnString parsed_value;
|
||||
nested->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings);
|
||||
throw DB::ParsingException("Error while parsing \"" + std::string(null_literal, null_prefix_len)
|
||||
+ std::string(istr.position(), std::min(size_t{10}, istr.available())) + "\" as Nullable"
|
||||
+ " at position " + std::to_string(istr.count()) + ": got \"" + std::string(null_literal, buf.count())
|
||||
+ "\", which was deserialized as \""
|
||||
+ parsed_value.str() + "\". It seems that input data is ill-formatted.",
|
||||
ErrorCodes::CANNOT_READ_ALL_DATA);
|
||||
}
|
||||
}
|
||||
/// We have some unread data in PeekableReadBuffer own memory.
|
||||
/// It can happen only if there is an unquoted string instead of a number
|
||||
/// or if someone uses csv delimiter, LF or CR in CSV null representation.
|
||||
/// In the first case we cannot continue reading anyway. The second case seems to be unlikely.
|
||||
if (null_representation.find(settings.csv.delimiter) != std::string::npos || null_representation.find('\r') != std::string::npos
|
||||
|| null_representation.find('\n') != std::string::npos)
|
||||
throw DB::ParsingException("CSV custom null representation containing format_csv_delimiter, '\\r' or '\\n' may not work correctly "
|
||||
"for large input.", ErrorCodes::CANNOT_READ_ALL_DATA);
|
||||
|
||||
WriteBufferFromOwnString parsed_value;
|
||||
nested_serialization->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings);
|
||||
throw DB::ParsingException("Error while parsing \"" + std::string(pos, buf.buffer().end()) + std::string(istr.position(), std::min(size_t(10), istr.available())) + "\" as Nullable"
|
||||
+ " at position " + std::to_string(istr.count()) + ": got \"" + std::string(pos, buf.position() - pos)
|
||||
+ "\", which was deserialized as \""
|
||||
+ parsed_value.str() + "\". It seems that input data is ill-formatted.",
|
||||
ErrorCodes::CANNOT_READ_ALL_DATA);
|
||||
};
|
||||
|
||||
return safeDeserialize<ReturnType>(column, *nested, check_for_null, deserialize_nested);
|
||||
return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested);
|
||||
}
|
||||
|
||||
void SerializationNullable::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
||||
|
@ -1,7 +1,6 @@
|
||||
#include <DataTypes/Serializations/SerializationString.h>
|
||||
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnConst.h>
|
||||
|
||||
#include <Common/typeid_cast.h>
|
||||
#include <Common/assert_cast.h>
|
||||
@ -9,8 +8,6 @@
|
||||
#include <Core/Field.h>
|
||||
|
||||
#include <Formats/FormatSettings.h>
|
||||
#include <Formats/ProtobufReader.h>
|
||||
#include <Formats/ProtobufWriter.h>
|
||||
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
|
@ -59,8 +59,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.csv.delimiter = settings.format_csv_delimiter;
|
||||
format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default;
|
||||
format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number;
|
||||
format_settings.csv.null_representation = settings.output_format_csv_null_representation;
|
||||
format_settings.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null;
|
||||
format_settings.csv.null_representation = settings.format_csv_null_representation;
|
||||
format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
|
||||
format_settings.custom.escaping_rule = settings.format_custom_escaping_rule;
|
||||
format_settings.custom.field_delimiter = settings.format_custom_field_delimiter;
|
||||
@ -103,7 +102,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line;
|
||||
format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default;
|
||||
format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number;
|
||||
format_settings.tsv.null_representation = settings.output_format_tsv_null_representation;
|
||||
format_settings.tsv.null_representation = settings.format_tsv_null_representation;
|
||||
format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
|
||||
format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
|
||||
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
|
||||
|
@ -74,7 +74,6 @@ struct FormatSettings
|
||||
char delimiter = ',';
|
||||
bool allow_single_quotes = true;
|
||||
bool allow_double_quotes = true;
|
||||
bool unquoted_null_literal_as_null = false;
|
||||
bool empty_as_default = false;
|
||||
bool crlf_end_of_line = false;
|
||||
bool input_format_enum_as_number = false;
|
||||
|
@ -9,7 +9,7 @@ namespace ErrorCodes
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ /*= DBMS_DEFAULT_BUFFER_SIZE*/)
|
||||
PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ /*= 0*/)
|
||||
: BufferWithOwnMemory(start_size_), sub_buf(sub_buf_)
|
||||
{
|
||||
padded &= sub_buf.isPadded();
|
||||
@ -27,6 +27,7 @@ void PeekableReadBuffer::reset()
|
||||
peeked_size = 0;
|
||||
checkpoint = std::nullopt;
|
||||
checkpoint_in_own_memory = false;
|
||||
use_stack_memory = true;
|
||||
|
||||
if (!currentlyReadFromOwnMemory())
|
||||
sub_buf.position() = pos;
|
||||
@ -72,21 +73,23 @@ bool PeekableReadBuffer::peekNext()
|
||||
sub_buf.position() = copy_from;
|
||||
}
|
||||
|
||||
char * memory_data = getMemoryData();
|
||||
|
||||
/// Save unread data from sub-buffer to own memory
|
||||
memcpy(memory.data() + peeked_size, sub_buf.position(), bytes_to_copy);
|
||||
memcpy(memory_data + peeked_size, sub_buf.position(), bytes_to_copy);
|
||||
|
||||
/// If useSubbufferOnly() is false, then checkpoint is in own memory and it was updated in resizeOwnMemoryIfNecessary
|
||||
/// Otherwise, checkpoint now at the beginning of own memory
|
||||
if (checkpoint && useSubbufferOnly())
|
||||
{
|
||||
checkpoint.emplace(memory.data());
|
||||
checkpoint.emplace(memory_data);
|
||||
checkpoint_in_own_memory = true;
|
||||
}
|
||||
|
||||
if (currentlyReadFromOwnMemory())
|
||||
{
|
||||
/// Update buffer size
|
||||
BufferBase::set(memory.data(), peeked_size + bytes_to_copy, offset());
|
||||
BufferBase::set(memory_data, peeked_size + bytes_to_copy, offset());
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -99,7 +102,7 @@ bool PeekableReadBuffer::peekNext()
|
||||
else
|
||||
pos_offset = 0;
|
||||
}
|
||||
BufferBase::set(memory.data(), peeked_size + bytes_to_copy, pos_offset);
|
||||
BufferBase::set(memory_data, peeked_size + bytes_to_copy, pos_offset);
|
||||
}
|
||||
|
||||
peeked_size += bytes_to_copy;
|
||||
@ -125,8 +128,9 @@ void PeekableReadBuffer::rollbackToCheckpoint(bool drop)
|
||||
/// Checkpoint is in own memory and position is not.
|
||||
assert(checkpointInOwnMemory());
|
||||
|
||||
char * memory_data = getMemoryData();
|
||||
/// Switch to reading from own memory.
|
||||
BufferBase::set(memory.data(), peeked_size, *checkpoint - memory.data());
|
||||
BufferBase::set(memory_data, peeked_size, *checkpoint - memory_data);
|
||||
}
|
||||
|
||||
if (drop)
|
||||
@ -224,12 +228,31 @@ void PeekableReadBuffer::resizeOwnMemoryIfNecessary(size_t bytes_to_append)
|
||||
bool need_update_pos = currentlyReadFromOwnMemory();
|
||||
size_t offset = 0;
|
||||
if (need_update_checkpoint)
|
||||
offset = *checkpoint - memory.data();
|
||||
{
|
||||
char * memory_data = getMemoryData();
|
||||
offset = *checkpoint - memory_data;
|
||||
}
|
||||
else if (need_update_pos)
|
||||
offset = this->offset();
|
||||
|
||||
size_t new_size = peeked_size + bytes_to_append;
|
||||
if (memory.size() < new_size)
|
||||
|
||||
if (use_stack_memory)
|
||||
{
|
||||
/// If stack memory is still enough, do nothing.
|
||||
if (sizeof(stack_memory) >= new_size)
|
||||
return;
|
||||
|
||||
/// Stack memory is not enough, allocate larger buffer.
|
||||
use_stack_memory = false;
|
||||
memory.resize(std::max(size_t(DBMS_DEFAULT_BUFFER_SIZE), new_size));
|
||||
memcpy(memory.data(), stack_memory, sizeof(stack_memory));
|
||||
if (need_update_checkpoint)
|
||||
checkpoint.emplace(memory.data() + offset);
|
||||
if (need_update_pos)
|
||||
BufferBase::set(memory.data(), peeked_size, pos - stack_memory);
|
||||
}
|
||||
else if (memory.size() < new_size)
|
||||
{
|
||||
if (bytes_to_append < offset && 2 * (peeked_size - offset) <= memory.size())
|
||||
{
|
||||
@ -273,10 +296,11 @@ void PeekableReadBuffer::makeContinuousMemoryFromCheckpointToPos()
|
||||
|
||||
size_t bytes_to_append = pos - sub_buf.position();
|
||||
resizeOwnMemoryIfNecessary(bytes_to_append);
|
||||
memcpy(memory.data() + peeked_size, sub_buf.position(), bytes_to_append);
|
||||
char * memory_data = getMemoryData();
|
||||
memcpy(memory_data + peeked_size, sub_buf.position(), bytes_to_append);
|
||||
sub_buf.position() = pos;
|
||||
peeked_size += bytes_to_append;
|
||||
BufferBase::set(memory.data(), peeked_size, peeked_size);
|
||||
BufferBase::set(memory_data, peeked_size, peeked_size);
|
||||
}
|
||||
|
||||
PeekableReadBuffer::~PeekableReadBuffer()
|
||||
@ -287,7 +311,7 @@ PeekableReadBuffer::~PeekableReadBuffer()
|
||||
|
||||
bool PeekableReadBuffer::hasUnreadData() const
|
||||
{
|
||||
return peeked_size && pos != memory.data() + peeked_size;
|
||||
return peeked_size && pos != getMemoryData() + peeked_size;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -20,7 +20,7 @@ class PeekableReadBuffer : public BufferWithOwnMemory<ReadBuffer>
|
||||
{
|
||||
friend class PeekableReadBufferCheckpoint;
|
||||
public:
|
||||
explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ = DBMS_DEFAULT_BUFFER_SIZE);
|
||||
explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ = 0);
|
||||
|
||||
~PeekableReadBuffer() override;
|
||||
|
||||
@ -84,11 +84,21 @@ private:
|
||||
/// Updates all invalidated pointers and sizes.
|
||||
void resizeOwnMemoryIfNecessary(size_t bytes_to_append);
|
||||
|
||||
char * getMemoryData() { return use_stack_memory ? stack_memory : memory.data(); }
|
||||
const char * getMemoryData() const { return use_stack_memory ? stack_memory : memory.data(); }
|
||||
|
||||
|
||||
ReadBuffer & sub_buf;
|
||||
size_t peeked_size = 0;
|
||||
std::optional<Position> checkpoint = std::nullopt;
|
||||
bool checkpoint_in_own_memory = false;
|
||||
|
||||
/// To avoid expensive and, in some cases, unnecessary memory allocations when a PeekableReadBuffer
|
||||
/// is created (for example, if PeekableReadBuffers are created often or if only a small amount of
|
||||
/// data has to be remembered after the checkpoint), we start with a small amount of memory on the stack and allocate
|
||||
/// a larger buffer only if this reserved memory is not enough.
|
||||
char stack_memory[16];
|
||||
bool use_stack_memory = true;
|
||||
};
|
||||
|
||||
|
||||
|
15
tests/performance/tsv_csv_nullable_parsing.xml
Normal file
15
tests/performance/tsv_csv_nullable_parsing.xml
Normal file
@ -0,0 +1,15 @@
|
||||
<test>
|
||||
|
||||
<create_query>CREATE TABLE IF NOT EXISTS table_tsv (s Nullable(String)) ENGINE = File('TSV')</create_query>
|
||||
<create_query>CREATE TABLE IF NOT EXISTS table_csv (s Nullable(String)) ENGINE = File('CSV')</create_query>
|
||||
|
||||
<fill_query>INSERT INTO table_tsv SELECT number % 2 ? 'Some text' : NULL FROM numbers(1000000) FORMAT TSV</fill_query>
|
||||
<fill_query>INSERT INTO table_csv SELECT number % 2 ? 'Some text' : NULL FROM numbers(1000000) FORMAT CSV</fill_query>
|
||||
|
||||
<query>SELECT * FROM table_tsv FORMAT Null</query>
|
||||
<query>SELECT * FROM table_csv FORMAT Null</query>
|
||||
|
||||
<drop_query>DROP TABLE IF EXISTS table_tsv</drop_query>
|
||||
<drop_query>DROP TABLE IF EXISTS table_csv</drop_query>
|
||||
|
||||
</test>
|
@ -33,7 +33,7 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (t Nullable(DateTime('Europe/Moscow
|
||||
|
||||
echo 'NULL, NULL
|
||||
"2016-01-01 01:02:03",NUL
|
||||
"2016-01-02 01:02:03",Nhello' | $CLICKHOUSE_CLIENT --input_format_csv_unquoted_null_literal_as_null=1 --input_format_csv_empty_as_default=1 --query="INSERT INTO csv FORMAT CSV";
|
||||
"2016-01-02 01:02:03",Nhello' | $CLICKHOUSE_CLIENT --format_csv_null_representation='NULL' --input_format_csv_empty_as_default=1 --query="INSERT INTO csv FORMAT CSV";
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST";
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
|
||||
|
@ -9,7 +9,7 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE tsv_custom_null (id Nullable(UInt32)) E
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="INSERT INTO tsv_custom_null VALUES (NULL)";
|
||||
|
||||
$CLICKHOUSE_CLIENT --output_format_tsv_null_representation='MyNull' --query="SELECT * FROM tsv_custom_null FORMAT TSV";
|
||||
$CLICKHOUSE_CLIENT --format_tsv_null_representation='MyNull' --query="SELECT * FROM tsv_custom_null FORMAT TSV";
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE tsv_custom_null";
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
# output_format_csv_null_representation should initially be \\N
|
||||
# format_csv_null_representation should initially be \\N
|
||||
"val1",\N,"val3"
|
||||
# Changing output_format_csv_null_representation
|
||||
# Changing format_csv_null_representation
|
||||
"val1",∅,"val3"
|
||||
|
@ -7,10 +7,10 @@ CREATE TABLE test_data (
|
||||
|
||||
INSERT INTO test_data VALUES ('val1', NULL, 'val3');
|
||||
|
||||
SELECT '# output_format_csv_null_representation should initially be \\N';
|
||||
SELECT '# format_csv_null_representation should initially be \\N';
|
||||
SELECT * FROM test_data FORMAT CSV;
|
||||
|
||||
SELECT '# Changing output_format_csv_null_representation';
|
||||
SET output_format_csv_null_representation = '∅';
|
||||
SELECT '# Changing format_csv_null_representation';
|
||||
SET format_csv_null_representation = '∅';
|
||||
SELECT * FROM test_data FORMAT CSV;
|
||||
SET output_format_csv_null_representation = '\\N';
|
||||
SET format_csv_null_representation = '\\N';
|
||||
|
@ -0,0 +1,76 @@
|
||||
TSV
|
||||
\N
|
||||
\N
|
||||
Some text
|
||||
\N
|
||||
Some text
|
||||
\N
|
||||
Some more text
|
||||
\N
|
||||
\N
|
||||
Some more text
|
||||
1 Some text 1
|
||||
1 \N 1
|
||||
CustomNullSome text
|
||||
CustomNullSome text
|
||||
\N
|
||||
Some more text
|
||||
\N
|
||||
\N
|
||||
Some more text
|
||||
1 \N 1
|
||||
1 \N 1
|
||||
CSV
|
||||
\N
|
||||
\N
|
||||
\\NSome text
|
||||
\N
|
||||
\\NSome text
|
||||
\N
|
||||
Some more text
|
||||
\N
|
||||
\N
|
||||
Some more text
|
||||
1 \\NSome text 1
|
||||
1 \N 1
|
||||
CustomNullSome text
|
||||
CustomNullSome text
|
||||
\N
|
||||
Some more text
|
||||
\N
|
||||
\N
|
||||
Some more text
|
||||
1 \N 1
|
||||
1 \N 1
|
||||
Corner cases
|
||||
TSV
|
||||
Some text \N
|
||||
Some text CustomNull Some text
|
||||
OK
|
||||
OK
|
||||
CSV
|
||||
Some text \N
|
||||
Some text CustomNull Some text
|
||||
OK
|
||||
OK
|
||||
Large custom NULL
|
||||
\N
|
||||
\N
|
||||
\N
|
||||
\N
|
||||
\N
|
||||
\N
|
||||
\N
|
||||
\N
|
||||
\N
|
||||
\N
|
||||
0000000000Custom NULL representation0000000000
|
||||
0000000000Custom NULL representation0000000000
|
||||
0000000000Custom NULL representation0000000000
|
||||
0000000000Custom NULL representation0000000000
|
||||
0000000000Custom NULL representation0000000000
|
||||
0000000000Custom NULL representation0000000000
|
||||
0000000000Custom NULL representation0000000000
|
||||
0000000000Custom NULL representation0000000000
|
||||
0000000000Custom NULL representation0000000000
|
||||
0000000000Custom NULL representation0000000000
|
133
tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh
Executable file
133
tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh
Executable file
@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-parallel
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
# Derive USER_FILES_PATH from an error: query a file that does not exist ('nonexist.txt'), keep the
# Exception line, and print its 9th whitespace-separated field with the "/nonexist.txt" suffix removed.
# NOTE(review): relies on the path being the 9th field of the exception message - confirm if the message format changes.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
|
||||
|
||||
DATA_FILE=$USER_FILES_PATH/test_02103_null.data
|
||||
|
||||
echo "TSV"
|
||||
|
||||
echo 'Custom NULL representation' > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS format_tsv_null_representation='Custom NULL representation'"
|
||||
|
||||
echo -e 'N\tU\tL\tL' > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS format_tsv_null_representation='N\tU\tL\tL'"
|
||||
|
||||
echo -e "\\NSome text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)')"
|
||||
|
||||
echo -e "\\N" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)')"
|
||||
|
||||
echo -e "\\NSome text\n\\N\nSome more text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)')"
|
||||
|
||||
echo -e "\\N\n\\N\nSome more text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)')"
|
||||
|
||||
echo -e "1\t\\NSome text\t1" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 'x Int32, s Nullable(String), y Int32')"
|
||||
|
||||
echo -e "1\t\\N\t1" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 'x Int32, s Nullable(String), y Int32')"
|
||||
|
||||
echo -e "CustomNullSome text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS format_tsv_null_representation='CustomNull'"
|
||||
|
||||
echo -e "CustomNullSome text\nCustomNull\nSome more text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS format_tsv_null_representation='CustomNull'"
|
||||
|
||||
echo -e "CustomNull\nCustomNull\nSome more text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS format_tsv_null_representation='CustomNull'"
|
||||
|
||||
echo -e "1\tCustomNull\t1" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 'x Int32, s Nullable(String), y Int32') SETTINGS format_tsv_null_representation='CustomNull'"
|
||||
|
||||
echo -e "1\tCustomNull\t1" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 'x Int32, s Nullable(String), y Int32') SETTINGS format_tsv_null_representation='CustomNull'"
|
||||
|
||||
|
||||
echo "CSV"
|
||||
|
||||
echo 'Custom NULL representation' > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)') SETTINGS format_csv_null_representation='Custom NULL representation'"
|
||||
|
||||
echo -e 'N,U,L,L' > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)') SETTINGS format_csv_null_representation='N,U,L,L'"
|
||||
|
||||
echo -e "\\NSome text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)')"
|
||||
|
||||
echo -e "\\N" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)')"
|
||||
|
||||
echo -e "\\NSome text\n\\N\nSome more text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)')"
|
||||
|
||||
echo -e "\\N\n\\N\nSome more text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)')"
|
||||
|
||||
echo -e "1,\\NSome text,1" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 'x Int32, s Nullable(String), y Int32')"
|
||||
|
||||
echo -e "1,\\N,1" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 'x Int32, s Nullable(String), y Int32')"
|
||||
|
||||
echo -e "CustomNullSome text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)') SETTINGS format_csv_null_representation='CustomNull'"
|
||||
|
||||
echo -e "CustomNullSome text\nCustomNull\nSome more text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)') SETTINGS format_csv_null_representation='CustomNull'"
|
||||
|
||||
echo -e "CustomNull\nCustomNull\nSome more text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)') SETTINGS format_csv_null_representation='CustomNull'"
|
||||
|
||||
echo -e "1,CustomNull,1" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 'x Int32, s Nullable(String), y Int32') SETTINGS format_csv_null_representation='CustomNull'"
|
||||
|
||||
echo -e "1,CustomNull,1" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 'x Int32, s Nullable(String), y Int32') SETTINGS format_csv_null_representation='CustomNull'"
|
||||
|
||||
|
||||
echo 'Corner cases'
|
||||
echo 'TSV'
|
||||
|
||||
echo -e "Some text\tCustomNull" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's String, n Nullable(String)') settings max_read_buffer_size=15, format_tsv_null_representation='CustomNull', input_format_parallel_parsing=0"
|
||||
|
||||
echo -e "Some text\tCustomNull Some text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's String, n Nullable(String)') settings max_read_buffer_size=15, format_tsv_null_representation='CustomNull', input_format_parallel_parsing=0"
|
||||
|
||||
echo -e "Some text\t123NNN" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's String, n Nullable(Int32)') settings max_read_buffer_size=14, format_tsv_null_representation='123NN', input_format_parallel_parsing=0" 2>&1 | grep -F -q "CANNOT_READ_ALL_DATA" && echo 'OK' || echo 'FAIL'
|
||||
|
||||
echo -e "Some text\tNU\tLL" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's String, n Nullable(String)') settings max_read_buffer_size=13, format_tsv_null_representation='NU\tL', input_format_parallel_parsing=0" 2>&1 | grep -F -q "CANNOT_READ_ALL_DATA" && echo 'OK' || echo 'FAIL'
|
||||
|
||||
echo 'CSV'
|
||||
|
||||
echo -e "Some text,CustomNull" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's String, n Nullable(String)') settings max_read_buffer_size=15, format_csv_null_representation='CustomNull', input_format_parallel_parsing=0"
|
||||
|
||||
echo -e "Some text,CustomNull Some text" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's String, n Nullable(String)') settings max_read_buffer_size=15, format_csv_null_representation='CustomNull', input_format_parallel_parsing=0"
|
||||
|
||||
echo -e "Some text,123NNN" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's String, n Nullable(Int32)') settings max_read_buffer_size=14, format_csv_null_representation='123NN', input_format_parallel_parsing=0" 2>&1 | grep -F -q "CANNOT_READ_ALL_DATA" && echo 'OK' || echo 'FAIL'
|
||||
|
||||
echo -e "Some text,NU,LL" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's String, n Nullable(String)') settings max_read_buffer_size=13, format_csv_null_representation='NU,L', input_format_parallel_parsing=0" 2>&1 | grep -F -q "CANNOT_READ_ALL_DATA" && echo 'OK' || echo 'FAIL'
|
||||
|
||||
|
||||
echo 'Large custom NULL'
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select '0000000000Custom NULL representation0000000000' FROM numbers(10)" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS max_read_buffer_size=5, input_format_parallel_parsing=0, format_tsv_null_representation='0000000000Custom NULL representation0000000000'"
|
||||
# Same data file, but the NULL representation below ends with nine '0' characters instead of ten, i.e. it is
# one character shorter than every data row; the reference output lists these rows as plain strings, not \N.
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS max_read_buffer_size=5, input_format_parallel_parsing=0, format_tsv_null_representation='0000000000Custom NULL representation000000000'"
|
||||
|
||||
rm $DATA_FILE
|
||||
|
@ -0,0 +1,2 @@
|
||||
NULLSome string
|
||||
NULLSome string
|
18
tests/queries/0_stateless/02104_json_strings_nullable_string.sh
Executable file
18
tests/queries/0_stateless/02104_json_strings_nullable_string.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-parallel
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
# Derive USER_FILES_PATH from an error: query a file that does not exist ('nonexist.txt'), keep the
# Exception line, and print its 9th whitespace-separated field with the "/nonexist.txt" suffix removed.
# NOTE(review): relies on the path being the 9th field of the exception message - confirm if the message format changes.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
|
||||
DATA_FILE=$USER_FILES_PATH/test_02104_null.data
|
||||
|
||||
echo -e '{"s" : "NULLSome string"}' > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02104_null.data', 'JSONStringsEachRow', 's Nullable(String)')"
|
||||
|
||||
echo -e '["NULLSome string"]' > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02104_null.data', 'JSONCompactStringsEachRow', 's Nullable(String)')"
|
||||
|
||||
rm $DATA_FILE
|
||||
|
Loading…
Reference in New Issue
Block a user