From d36176ad85a0a51a63bd91849878a045b83008e1 Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Wed, 17 Jul 2024 03:19:07 +0200 Subject: [PATCH 01/13] Support for the `input_format_json_empty_as_default` setting --- docs/en/interfaces/formats.md | 1 + .../operations/settings/settings-formats.md | 11 ++++ src/Core/Settings.h | 1 + src/DataTypes/DataTypeAggregateFunction.h | 2 + src/DataTypes/DataTypeDate.h | 1 + src/DataTypes/DataTypeDate32.h | 1 + src/DataTypes/DataTypeDateTime.h | 1 + src/DataTypes/DataTypeDateTime64.h | 2 + src/DataTypes/DataTypeIPv4andIPv6.h | 2 + src/DataTypes/DataTypeUUID.h | 1 + src/DataTypes/IDataType.h | 3 + src/DataTypes/Serializations/ISerialization.h | 10 +++- .../SerializationAggregateFunction.cpp | 2 +- .../SerializationAggregateFunction.h | 5 +- .../SerializationAsStringNonTrivialJSON.h | 56 +++++++++++++++++++ .../Serializations/SerializationDate.cpp | 4 +- .../Serializations/SerializationDate.h | 7 ++- .../Serializations/SerializationDate32.cpp | 4 +- .../Serializations/SerializationDate32.h | 7 ++- .../Serializations/SerializationDateTime.cpp | 4 +- .../Serializations/SerializationDateTime.h | 7 ++- .../SerializationDateTime64.cpp | 6 +- .../Serializations/SerializationDateTime64.h | 7 ++- .../SerializationIPv4andIPv6.cpp | 4 +- .../Serializations/SerializationIPv4andIPv6.h | 7 ++- .../Serializations/SerializationNullable.cpp | 52 ++++++++++++++--- .../Serializations/SerializationNullable.h | 1 + .../Serializations/SerializationUUID.cpp | 4 +- .../Serializations/SerializationUUID.h | 7 ++- src/Formats/FormatFactory.cpp | 2 + src/Formats/FormatSettings.h | 2 + src/Formats/JSONUtils.cpp | 30 ++++++++-- src/IO/ReadHelpers.cpp | 17 ++++++ src/IO/ReadHelpers.h | 4 ++ .../03203_json_empty_as_default.reference | 20 +++++++ .../03203_json_empty_as_default.sql | 53 ++++++++++++++++++ 36 files changed, 300 insertions(+), 48 deletions(-) create mode 100644 src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h create mode 100644 
tests/queries/0_stateless/03203_json_empty_as_default.reference create mode 100644 tests/queries/0_stateless/03203_json_empty_as_default.sql diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 8892c6d8d3f..8fc66bef5b1 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1342,6 +1342,7 @@ SELECT * FROM json_each_row_nested - [input_format_json_ignore_unknown_keys_in_named_tuple](/docs/en/operations/settings/settings-formats.md/#input_format_json_ignore_unknown_keys_in_named_tuple) - ignore unknown keys in json object for named tuples. Default value - `false`. - [input_format_json_compact_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_json_compact_allow_variable_number_of_columns) - allow variable number of columns in JSONCompact/JSONCompactEachRow format, ignore extra columns and use default values on missing columns. Default value - `false`. - [input_format_json_throw_on_bad_escape_sequence](/docs/en/operations/settings/settings-formats.md/#input_format_json_throw_on_bad_escape_sequence) - throw an exception if JSON string contains bad escape sequence. If disabled, bad escape sequences will remain as is in the data. Default value - `true`. +- [input_format_json_empty_as_default](/docs/en/operations/settings/settings-formats.md/#input_format_json_empty_as_default) - treat empty fields in JSON input as default values. Default value - `false`. For complex default expressions [input_format_defaults_for_omitted_fields](/docs/en/operations/settings/settings-formats.md/#input_format_defaults_for_omitted_fields) must be enabled too. - [output_format_json_quote_64bit_integers](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`. 
- [output_format_json_quote_64bit_floats](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`. - [output_format_json_quote_denormals](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`. diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index f8b40cd81ac..758d5c1ab49 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -741,6 +741,17 @@ Possible values: Default value: 0. +### input_format_json_empty_as_default {#input_format_json_empty_as_default} + +When enabled, replace empty input fields in JSON with default values. For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. + +Possible values: + ++ 0 — Disable. ++ 1 — Enable. + +Default value: 0. + ## TSV format settings {#tsv-format-settings} ### input_format_tsv_empty_as_default {#input_format_tsv_empty_as_default} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 0808e8eb49f..90045b9a8ff 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1133,6 +1133,7 @@ class IColumn; M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \ M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \ M(UInt64, input_format_json_max_depth, 1000, "Maximum depth of a field in JSON. 
This is not a strict limit, it does not have to be applied precisely.", 0) \ + M(Bool, input_format_json_empty_as_default, false, "Treat empty fields in JSON input as default values.", 0) \ M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \ diff --git a/src/DataTypes/DataTypeAggregateFunction.h b/src/DataTypes/DataTypeAggregateFunction.h index e3a4f9726d9..2ab2e53100b 100644 --- a/src/DataTypes/DataTypeAggregateFunction.h +++ b/src/DataTypes/DataTypeAggregateFunction.h @@ -64,6 +64,8 @@ public: SerializationPtr doGetDefaultSerialization() const override; bool supportsSparseSerialization() const override { return false; } + bool isNonTriviallySerializedAsStringJSON() const override { return true; } + bool isVersioned() const; /// Version is not empty only if it was parsed from AST or implicitly cast to 0 or version according diff --git a/src/DataTypes/DataTypeDate.h b/src/DataTypes/DataTypeDate.h index 0e08b9ba2ca..cb7a603705d 100644 --- a/src/DataTypes/DataTypeDate.h +++ b/src/DataTypes/DataTypeDate.h @@ -17,6 +17,7 @@ public: bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } + bool isNonTriviallySerializedAsStringJSON() const override { return true; } bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/DataTypeDate32.h b/src/DataTypes/DataTypeDate32.h index 65633e7a228..53bd010b7cf 100644 --- a/src/DataTypes/DataTypeDate32.h +++ b/src/DataTypes/DataTypeDate32.h @@ -18,6 +18,7 @@ public: bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } + bool 
isNonTriviallySerializedAsStringJSON() const override { return true; } bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/DataTypeDateTime.h b/src/DataTypes/DataTypeDateTime.h index 5519240dee1..11b579920ba 100644 --- a/src/DataTypes/DataTypeDateTime.h +++ b/src/DataTypes/DataTypeDateTime.h @@ -44,6 +44,7 @@ public: bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } + bool isNonTriviallySerializedAsStringJSON() const override { return true; } bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/DataTypeDateTime64.h b/src/DataTypes/DataTypeDateTime64.h index 64cedd798d1..dd5ff7e6550 100644 --- a/src/DataTypes/DataTypeDateTime64.h +++ b/src/DataTypes/DataTypeDateTime64.h @@ -39,6 +39,8 @@ public: bool isSummable() const override { return false; } + bool isNonTriviallySerializedAsStringJSON() const override { return true; } + protected: SerializationPtr doGetDefaultSerialization() const override; }; diff --git a/src/DataTypes/DataTypeIPv4andIPv6.h b/src/DataTypes/DataTypeIPv4andIPv6.h index 5aea55751a7..520af4f21e0 100644 --- a/src/DataTypes/DataTypeIPv4andIPv6.h +++ b/src/DataTypes/DataTypeIPv4andIPv6.h @@ -46,6 +46,7 @@ public: size_t getSizeOfValueInMemory() const override { return sizeof(IPv4); } bool isCategorial() const override { return true; } bool canBeInsideLowCardinality() const override { return true; } + bool isNonTriviallySerializedAsStringJSON() const override { return true; } SerializationPtr doGetDefaultSerialization() const override { return std::make_shared>(); } }; @@ -84,6 +85,7 @@ public: size_t getSizeOfValueInMemory() const override { return sizeof(IPv6); } bool isCategorial() const override { return true; } bool canBeInsideLowCardinality() const override { return true; } + bool isNonTriviallySerializedAsStringJSON() const override { return true; } SerializationPtr doGetDefaultSerialization() const override { return 
std::make_shared>(); } }; diff --git a/src/DataTypes/DataTypeUUID.h b/src/DataTypes/DataTypeUUID.h index 90cdd90d68d..1aeab1b78ba 100644 --- a/src/DataTypes/DataTypeUUID.h +++ b/src/DataTypes/DataTypeUUID.h @@ -42,6 +42,7 @@ public: size_t getSizeOfValueInMemory() const override { return sizeof(UUID); } bool isCategorial() const override { return true; } bool canBeInsideLowCardinality() const override { return true; } + bool isNonTriviallySerializedAsStringJSON() const override { return true; } SerializationPtr doGetDefaultSerialization() const override; }; diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 397ae3d8be9..8033e82d1bc 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -328,6 +328,9 @@ public: /// Updates avg_value_size_hint for newly read column. Uses to optimize deserialization. Zero expected for first column. static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint); + /// non-numeric non-string data type serialized as JSON string + virtual bool isNonTriviallySerializedAsStringJSON() const { return false; } + protected: friend class DataTypeFactory; friend class AggregateFunctionSimpleState; diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 5d0bf60c59f..76c88fe7522 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -398,12 +398,20 @@ public: virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; virtual bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; + /// The following two methods are implemented only for non-numeric non-string simple data types. 
+ virtual void deserializeTextNoEmptyCheckJSON(IColumn & /*column*/, ReadBuffer & /*istr*/, const FormatSettings &) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method deserializeTextNoEmptyCheckJSON is not supported"); + } + virtual bool tryDeserializeTextNoEmptyCheckJSON(IColumn & /*column*/, ReadBuffer & /*istr*/, const FormatSettings &) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method tryDeserializeTextNoEmptyCheckJSON is not supported"); + } virtual void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t /*indent*/) const { serializeTextJSON(column, row_num, ostr, settings); } - /** Text serialization for putting into the XML format. */ virtual void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp index 41b198890e4..39c3f389619 100644 --- a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp @@ -182,7 +182,7 @@ void SerializationAggregateFunction::serializeTextJSON(const IColumn & column, s } -void SerializationAggregateFunction::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationAggregateFunction::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String s; readJSONString(s, istr, settings.json); diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.h b/src/DataTypes/Serializations/SerializationAggregateFunction.h index c45fc79f714..6afa1cd4e97 100644 --- a/src/DataTypes/Serializations/SerializationAggregateFunction.h +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.h @@ -3,12 +3,13 @@ #include #include +#include 
namespace DB { -class SerializationAggregateFunction final : public ISerialization +class SerializationAggregateFunction final : public SerializationAsStringNonTrivialJSON { private: AggregateFunctionPtr function; @@ -37,7 +38,7 @@ public: void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; diff --git a/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h b/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h new file mode 100644 index 00000000000..7d8375368a7 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +/** Serialization for non-numeric non-string data types serialized as JSON strings + * For these data types, we support an option, input_format_json_empty_as_default, which, when set to 1, + * allows for JSON deserialization to treat an encountered empty string as a default value for the specified type. + * Derived classes must implement the following methods: + * deserializeTextNoEmptyCheckJSON() and tryDeserializeTextNoEmptyCheckJSON() + * instead of deserializeTextJSON() and tryDeserializeTextJSON() respectively. 
+ */ +template +requires std::derived_from +class SerializationAsStringNonTrivialJSON : public T +{ +public: + using T::T; + + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & format_settings) const override + { + if (format_settings.json.empty_as_default && tryMatchEmptyString(istr)) + column.insertDefault(); + else + deserializeTextNoEmptyCheckJSON(column, istr, format_settings); + } + + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & format_settings) const override + { + if (format_settings.json.empty_as_default && tryMatchEmptyString(istr)) + { + column.insertDefault(); + return true; + } + else + return tryDeserializeTextNoEmptyCheckJSON(column, istr, format_settings); + } + + virtual void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override = 0; + + virtual bool tryDeserializeTextNoEmptyCheckJSON(IColumn & /*column*/, ReadBuffer & /*istr*/, const FormatSettings &) const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method tryDeserializeTextNoEmptyCheckJSON is not supported"); + } +}; + +} diff --git a/src/DataTypes/Serializations/SerializationDate.cpp b/src/DataTypes/Serializations/SerializationDate.cpp index 38e1bb87b6d..3f122189c22 100644 --- a/src/DataTypes/Serializations/SerializationDate.cpp +++ b/src/DataTypes/Serializations/SerializationDate.cpp @@ -85,7 +85,7 @@ void SerializationDate::serializeTextJSON(const IColumn & column, size_t row_num writeChar('"', ostr); } -void SerializationDate::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationDate::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { DayNum x; assertChar('"', istr); @@ -94,7 +94,7 @@ void SerializationDate::deserializeTextJSON(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(x); } -bool 
SerializationDate::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +bool SerializationDate::tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { DayNum x; if (!checkChar('"', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationDate.h b/src/DataTypes/Serializations/SerializationDate.h index dcf79eb49da..10c83171527 100644 --- a/src/DataTypes/Serializations/SerializationDate.h +++ b/src/DataTypes/Serializations/SerializationDate.h @@ -1,12 +1,13 @@ #pragma once #include +#include #include namespace DB { -class SerializationDate final : public SerializationNumber +class SerializationDate final : public SerializationAsStringNonTrivialJSON> { public: explicit SerializationDate(const DateLUTImpl & time_zone_ = DateLUT::instance()); @@ -21,8 +22,8 @@ public: void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & 
istr, const FormatSettings & settings) const override; diff --git a/src/DataTypes/Serializations/SerializationDate32.cpp b/src/DataTypes/Serializations/SerializationDate32.cpp index 70a22d59e42..8ad07b534ce 100644 --- a/src/DataTypes/Serializations/SerializationDate32.cpp +++ b/src/DataTypes/Serializations/SerializationDate32.cpp @@ -83,7 +83,7 @@ void SerializationDate32::serializeTextJSON(const IColumn & column, size_t row_n writeChar('"', ostr); } -void SerializationDate32::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationDate32::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { ExtendedDayNum x; assertChar('"', istr); @@ -92,7 +92,7 @@ void SerializationDate32::deserializeTextJSON(IColumn & column, ReadBuffer & ist assert_cast(column).getData().push_back(x); } -bool SerializationDate32::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +bool SerializationDate32::tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { ExtendedDayNum x; if (!checkChar('"', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationDate32.h b/src/DataTypes/Serializations/SerializationDate32.h index be2e2b76c1d..ac6239fbc2b 100644 --- a/src/DataTypes/Serializations/SerializationDate32.h +++ b/src/DataTypes/Serializations/SerializationDate32.h @@ -1,11 +1,12 @@ #pragma once #include +#include #include namespace DB { -class SerializationDate32 final : public SerializationNumber +class SerializationDate32 final : public SerializationAsStringNonTrivialJSON> { public: explicit SerializationDate32(const DateLUTImpl & time_zone_ = DateLUT::instance()); @@ -20,8 +21,8 @@ public: void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; bool tryDeserializeTextQuoted(IColumn & column, 
ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; diff --git a/src/DataTypes/Serializations/SerializationDateTime.cpp b/src/DataTypes/Serializations/SerializationDateTime.cpp index c5c819ce7fa..80b2d51140d 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime.cpp @@ -180,7 +180,7 @@ void SerializationDateTime::serializeTextJSON(const IColumn & column, size_t row writeChar('"', ostr); } -void SerializationDateTime::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationDateTime::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { time_t x = 0; if (checkChar('"', istr)) @@ -196,7 +196,7 @@ void SerializationDateTime::deserializeTextJSON(IColumn & column, ReadBuffer & i assert_cast(column).getData().push_back(static_cast(x)); } -bool SerializationDateTime::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +bool 
SerializationDateTime::tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { time_t x = 0; if (checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationDateTime.h b/src/DataTypes/Serializations/SerializationDateTime.h index 584b0c4116b..0041f221ccf 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.h +++ b/src/DataTypes/Serializations/SerializationDateTime.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include class DateLUTImpl; @@ -8,7 +9,7 @@ class DateLUTImpl; namespace DB { -class SerializationDateTime final : public SerializationNumber, public TimezoneMixin +class SerializationDateTime final : public SerializationAsStringNonTrivialJSON>, public TimezoneMixin { public: explicit SerializationDateTime(const TimezoneMixin & time_zone_); @@ -23,8 +24,8 @@ public: void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; diff 
--git a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp index 442e29edd52..cdac7f785d1 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -15,7 +15,7 @@ namespace DB SerializationDateTime64::SerializationDateTime64( UInt32 scale_, const TimezoneMixin & time_zone_) - : SerializationDecimalBase(DecimalUtils::max_precision, scale_) + : SerializationAsStringNonTrivialJSON>(DecimalUtils::max_precision, scale_) , TimezoneMixin(time_zone_) { } @@ -170,7 +170,7 @@ void SerializationDateTime64::serializeTextJSON(const IColumn & column, size_t r writeChar('"', ostr); } -void SerializationDateTime64::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationDateTime64::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { DateTime64 x = 0; if (checkChar('"', istr)) @@ -185,7 +185,7 @@ void SerializationDateTime64::deserializeTextJSON(IColumn & column, ReadBuffer & assert_cast(column).getData().push_back(x); } -bool SerializationDateTime64::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +bool SerializationDateTime64::tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { DateTime64 x = 0; if (checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationDateTime64.h b/src/DataTypes/Serializations/SerializationDateTime64.h index b49bd1e9098..a3bc4d1ad4e 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.h +++ b/src/DataTypes/Serializations/SerializationDateTime64.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include class DateLUTImpl; @@ -8,7 +9,7 @@ class DateLUTImpl; namespace DB { -class SerializationDateTime64 final : public SerializationDecimalBase, public TimezoneMixin 
+class SerializationDateTime64 final : public SerializationAsStringNonTrivialJSON>, public TimezoneMixin { public: SerializationDateTime64(UInt32 scale_, const TimezoneMixin & time_zone_); @@ -25,8 +26,8 @@ public: void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp index c1beceb4533..ecd50a0b9b8 100644 --- a/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp @@ -69,7 +69,7 @@ void SerializationIP::serializeTextJSON(const DB::IColumn & column, size_t } template -void SerializationIP::deserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +void SerializationIP::deserializeTextNoEmptyCheckJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & 
settings) const { IPv x; assertChar('"', istr); @@ -84,7 +84,7 @@ void SerializationIP::deserializeTextJSON(DB::IColumn & column, DB::ReadBuf } template -bool SerializationIP::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +bool SerializationIP::tryDeserializeTextNoEmptyCheckJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const { IPv x; if (!checkChar('"', istr) || !tryReadText(x, istr) || !checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h index a53f257646b..44f36252741 100644 --- a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h @@ -4,13 +4,14 @@ #include #include #include +#include #include namespace DB { template -class SerializationIP : public SimpleTextSerialization +class SerializationIP : public SerializationAsStringNonTrivialJSON { public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -22,8 +23,8 @@ public: bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & 
settings) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override; diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index e72dd3a42f5..72c6a661dde 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -844,25 +844,52 @@ bool SerializationNullable::tryDeserializeNullJSON(DB::ReadBuffer & istr) return checkString("null", istr); } -template -ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null) +namespace +{ + +enum class Strategy : uint8_t +{ + Deserialize, + DeserializeNoEmptyCheck, + TryDeserialize +}; + +template struct ReturnTypeImpl; +template <> struct ReturnTypeImpl { using Type = void; }; +template <> struct ReturnTypeImpl { using Type = bool; }; +template <> struct ReturnTypeImpl { using Type = void; }; + +template +using ReturnType = typename ReturnTypeImpl::Type; + +template struct AlwaysFalse : std::false_type {}; + +template +ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null) { auto check_for_null = [](ReadBuffer & buf){ return checkStringByFirstCharacterAndAssertTheRest("null", buf); }; auto deserialize_nested = [&nested, &settings](IColumn & nested_column, ReadBuffer & buf) { - if constexpr (std::is_same_v) + if constexpr (strategy == Strategy::TryDeserialize) return nested->tryDeserializeTextJSON(nested_column, buf, settings); - nested->deserializeTextJSON(nested_column, buf, settings); + else if constexpr (strategy == Strategy::Deserialize) + nested->deserializeTextJSON(nested_column, buf, settings); + else if constexpr (strategy == Strategy::DeserializeNoEmptyCheck) + nested->deserializeTextNoEmptyCheckJSON(nested_column, buf, 
settings); + else + static_assert(AlwaysFalse::value); }; - return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); + return deserializeImpl>(column, istr, check_for_null, deserialize_nested, is_null); +} + } void SerializationNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { ColumnNullable & col = assert_cast(column); bool is_null; - deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null); + deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null); safeAppendToNullMap(col, is_null); } @@ -870,20 +897,27 @@ bool SerializationNullable::tryDeserializeTextJSON(IColumn & column, ReadBuffer { ColumnNullable & col = assert_cast(column); bool is_null; - return deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); + return deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); } bool SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) { bool is_null; - deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); + deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); + return !is_null; +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedTextNoEmptyCheckJSON(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); return !is_null; } bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const 
DB::SerializationPtr & nested_serialization) { bool is_null; - return deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); + return deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); } void SerializationNullable::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index f7d2d2eadf0..c5215e2a39f 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -88,6 +88,7 @@ public: static bool deserializeNullAsDefaultOrNestedTextQuoted(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); static bool deserializeNullAsDefaultOrNestedTextCSV(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); static bool deserializeNullAsDefaultOrNestedTextJSON(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); + static bool deserializeNullAsDefaultOrNestedTextNoEmptyCheckJSON(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); static bool deserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); /// If Check for NULL and deserialize value into non-nullable column or insert default value of nested type. 
diff --git a/src/DataTypes/Serializations/SerializationUUID.cpp b/src/DataTypes/Serializations/SerializationUUID.cpp index f18466ad8ad..21a0ccf676c 100644 --- a/src/DataTypes/Serializations/SerializationUUID.cpp +++ b/src/DataTypes/Serializations/SerializationUUID.cpp @@ -94,7 +94,7 @@ void SerializationUUID::serializeTextJSON(const IColumn & column, size_t row_num writeChar('"', ostr); } -void SerializationUUID::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationUUID::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { UUID x; assertChar('"', istr); @@ -103,7 +103,7 @@ void SerializationUUID::deserializeTextJSON(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(x); } -bool SerializationUUID::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +bool SerializationUUID::tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { UUID x; if (!checkChar('"', istr) || !tryReadText(x, istr) || !checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationUUID.h b/src/DataTypes/Serializations/SerializationUUID.h index 458504f8f42..185d1d44c16 100644 --- a/src/DataTypes/Serializations/SerializationUUID.h +++ b/src/DataTypes/Serializations/SerializationUUID.h @@ -1,11 +1,12 @@ #pragma once #include +#include namespace DB { -class SerializationUUID : public SimpleTextSerialization +class SerializationUUID : public SerializationAsStringNonTrivialJSON { public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -15,8 +16,8 @@ public: void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & 
column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index a78836ff63c..59131f34697 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -151,6 +151,8 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.json.try_infer_objects_as_tuples = settings.input_format_json_try_infer_named_tuples_from_objects; format_settings.json.throw_on_bad_escape_sequence = settings.input_format_json_throw_on_bad_escape_sequence; format_settings.json.ignore_unnecessary_fields = settings.input_format_json_ignore_unnecessary_fields; + format_settings.json.case_insensitive_column_matching = settings.input_format_json_case_insensitive_column_matching; + format_settings.json.empty_as_default = settings.input_format_json_empty_as_default; format_settings.null_as_default = settings.input_format_null_as_default; format_settings.force_null_for_omitted_fields = settings.input_format_force_null_for_omitted_fields; format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; diff --git a/src/Formats/FormatSettings.h 
b/src/Formats/FormatSettings.h index f0359218775..8c247bb960c 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -234,6 +234,8 @@ struct FormatSettings bool infer_incomplete_types_as_strings = true; bool throw_on_bad_escape_sequence = true; bool ignore_unnecessary_fields = true; + bool case_insensitive_column_matching = false; + bool empty_as_default = false; } json{}; struct diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 017befe5b0e..73189e81f97 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -286,11 +286,33 @@ namespace JSONUtils return true; } - if (as_nullable) - return SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column, in, format_settings, serialization); + if (format_settings.json.empty_as_default && type->isNonTriviallySerializedAsStringJSON()) + { + /// We have a non-numeric non-string data type at the top level. + /// At first glance, it looks like we sort of duplicate the work done in + /// SerializationAsStringNonTrivialJSON. Actually we need to proceed as + /// done here because we want to return false if we inserted a default + /// value on purpose, which the ISerialization interface does not allow for. 
+ if (tryMatchEmptyString(in)) + { + column.insertDefault(); + return false; + } - serialization->deserializeTextJSON(column, in, format_settings); - return true; + if (as_nullable) + return SerializationNullable::deserializeNullAsDefaultOrNestedTextNoEmptyCheckJSON(column, in, format_settings, serialization); + + serialization->deserializeTextNoEmptyCheckJSON(column, in, format_settings); + return true; + } + else + { + if (as_nullable) + return SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column, in, format_settings, serialization); + + serialization->deserializeTextJSON(column, in, format_settings); + return true; + } } catch (Exception & e) { diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 9559462e62b..e8ef667a4e3 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -1137,6 +1137,23 @@ template void readCSVStringInto(String & s, ReadBuffer & b template void readCSVStringInto(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); template void readCSVStringInto, false, false>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +bool tryMatchEmptyString(ReadBuffer & buf) +{ + if (buf.eof() || *buf.position() != '"') + return false; + + ++buf.position(); + + if (buf.eof() || *buf.position() != '"') + { + --buf.position(); + return false; + } + + ++buf.position(); + + return true; +} template ReturnType readJSONStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::JSON & settings) diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index ffba4fafb5c..59c1923e02a 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -665,6 +665,10 @@ void readStringUntilEOFInto(Vector & s, ReadBuffer & buf); template void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +/// Consumes the current token if it is an empty string, i.e. two consecutive double quotes, +/// Returns true if consumed. 
+bool tryMatchEmptyString(ReadBuffer & buf); + /// ReturnType is either bool or void. If bool, the function will return false instead of throwing an exception. template ReturnType readJSONStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::JSON & settings); diff --git a/tests/queries/0_stateless/03203_json_empty_as_default.reference b/tests/queries/0_stateless/03203_json_empty_as_default.reference new file mode 100644 index 00000000000..53d7014e452 --- /dev/null +++ b/tests/queries/0_stateless/03203_json_empty_as_default.reference @@ -0,0 +1,20 @@ +1970-01-01 +1970-01-01 +1970-01-01 00:00:00 +1970-01-01 00:00:00.000 +0.0.0.0 +:: +00000000-0000-0000-0000-000000000000 +1 +:: +2001:db8:3333:4444:5555:6666:7777:8888 +:: +['00000000-0000-0000-0000-000000000000','b15f852c-c41a-4fd6-9247-1929c841715e','00000000-0000-0000-0000-000000000000'] +['::','::'] +('1970-01-01','0.0.0.0','abc') +{'abc':'::'} +00000000-0000-0000-0000-000000000000 +[['2001:db8:3333:4444:cccc:dddd:eeee:ffff','::'],['::','2001:db8:3333:4444:5555:6666:7777:8888']] +['00000000-0000-0000-0000-000000000000','b15f852c-c41a-4fd6-9247-1929c841715e'] +(['00000000-0000-0000-0000-000000000000'],('00000000-0000-0000-0000-000000000000',{'abc':'::'})) +{('1970-01-01','0.0.0.0'):'00000000-0000-0000-0000-000000000000'} diff --git a/tests/queries/0_stateless/03203_json_empty_as_default.sql b/tests/queries/0_stateless/03203_json_empty_as_default.sql new file mode 100644 index 00000000000..35652f4c751 --- /dev/null +++ b/tests/queries/0_stateless/03203_json_empty_as_default.sql @@ -0,0 +1,53 @@ +set input_format_json_empty_as_default = 1; +set allow_experimental_variant_type = 1; + +# Simple types +SELECT x FROM format(JSONEachRow, 'x Date', '{"x":""}'); +SELECT x FROM format(JSONEachRow, 'x Date32', '{"x":""}'); +SELECT toTimeZone(x, 'UTC') FROM format(JSONEachRow, 'x DateTime', '{"x":""}'); +SELECT toTimeZone(x, 'UTC') FROM format(JSONEachRow, 'x DateTime64', '{"x":""}'); +SELECT x FROM format(JSONEachRow, 
'x IPv4', '{"x":""}'); +SELECT x FROM format(JSONEachRow, 'x IPv6', '{"x":""}'); +SELECT x FROM format(JSONEachRow, 'x UUID', '{"x":""}'); + +# Simple type AggregateFunction +DROP TABLE IF EXISTS table1; +CREATE TABLE table1(col AggregateFunction(uniq, UInt64)) ENGINE=Memory(); +DROP TABLE IF EXISTS table2; +CREATE TABLE table2(UserID UInt64) ENGINE=Memory(); + +INSERT INTO table1 SELECT uniqState(UserID) FROM table2; +INSERT INTO table1 SELECT x FROM format(JSONEachRow, 'x AggregateFunction(uniq, UInt64)' AS T, '{"x":""}'); +SELECT COUNT(DISTINCT col) FROM table1; + +DROP TABLE table1; +DROP TABLE table2; + +# The setting input_format_defaults_for_omitted_fields determines the default value if enabled. +CREATE TABLE table1(address IPv6 DEFAULT toIPv6('2001:db8:3333:4444:5555:6666:7777:8888')) ENGINE=Memory(); + +set input_format_defaults_for_omitted_fields = 0; +INSERT INTO table1 FORMAT JSONEachRow {"address":""}; + +set input_format_defaults_for_omitted_fields = 1; +INSERT INTO table1 FORMAT JSONEachRow {"address":""}; + +SELECT * FROM table1 ORDER BY address ASC; + +DROP TABLE table1; + +# Nullable +SELECT x FROM format(JSONEachRow, 'x Nullable(IPv6)', '{"x":""}'); + +# Compound types +SELECT x FROM format(JSONEachRow, 'x Array(UUID)', '{"x":["00000000-0000-0000-0000-000000000000","b15f852c-c41a-4fd6-9247-1929c841715e",""]}'); +SELECT x FROM format(JSONEachRow, 'x Array(Nullable(IPv6))', '{"x":["",""]}'); +SELECT x FROM format(JSONEachRow, 'x Tuple(Date, IPv4, String)', '{"x":["", "", "abc"]}'); +SELECT x FROM format(JSONEachRow, 'x Map(String, IPv6)', '{"x":{"abc": ""}}'); +SELECT x FROM format(JSONEachRow, 'x Variant(Date, UUID)', '{"x":""}'); + +# Deep composition +SELECT x FROM format(JSONEachRow, 'x Array(Array(IPv6))', '{"x":[["2001:db8:3333:4444:CCCC:DDDD:EEEE:FFFF", ""], ["", "2001:db8:3333:4444:5555:6666:7777:8888"]]}'); +SELECT x FROM format(JSONEachRow, 'x Variant(Date, Array(UUID))', '{"x":["", "b15f852c-c41a-4fd6-9247-1929c841715e"]}'); +SELECT x 
FROM format(JSONEachRow, 'x Tuple(Array(UUID), Tuple(UUID, Map(String, IPv6)))', '{"x":[[""], ["",{"abc":""}]]}'); +SELECT x FROM format(JSONEachRow, 'x Map(Tuple(Date,IPv4), Variant(UUID,IPv6))', '{"x":{["",""]:""}}'); From ad24989b31d1517757591d712959ea64ee3a900d Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Sun, 21 Jul 2024 16:39:55 +0200 Subject: [PATCH 02/13] slightly better comment --- src/DataTypes/IDataType.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 8033e82d1bc..174531cad27 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -328,7 +328,7 @@ public: /// Updates avg_value_size_hint for newly read column. Uses to optimize deserialization. Zero expected for first column. static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint); - /// non-numeric non-string data type serialized as JSON string + /// Checks if this is a non-numeric non-string type which is serialized as a JSON string. 
virtual bool isNonTriviallySerializedAsStringJSON() const { return false; } protected: From d5bea37c96c1ccdeacccb980295cb02765c94197 Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Tue, 23 Jul 2024 00:43:15 +0200 Subject: [PATCH 03/13] post-review changes --- src/DataTypes/Serializations/ISerialization.h | 1 + .../SerializationAsStringNonTrivialJSON.h | 5 ++++ .../Serializations/SerializationNullable.cpp | 2 +- .../03203_json_empty_as_default.reference | 27 +++++++++++++++++++ .../03203_json_empty_as_default.sql | 27 ++++++++++++------- 5 files changed, 51 insertions(+), 11 deletions(-) diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 76c88fe7522..3f4f4b8f875 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -17,6 +17,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; } class IDataType; diff --git a/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h b/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h index 7d8375368a7..c6d6b864bed 100644 --- a/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h +++ b/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h @@ -12,6 +12,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + /** Serialization for non-numeric non-string data types serialized as JSON strings * For these data types, we support an option, input_format_json_empty_as_default, which, when set to 1, * allows for JSON deserialization to treat an encountered empty string as a default value for the specified type. 
diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 72c6a661dde..3cd767e58eb 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -868,7 +868,7 @@ template ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null) { auto check_for_null = [](ReadBuffer & buf){ return checkStringByFirstCharacterAndAssertTheRest("null", buf); }; - auto deserialize_nested = [&nested, &settings](IColumn & nested_column, ReadBuffer & buf) + auto deserialize_nested = [&nested, &settings](IColumn & nested_column, ReadBuffer & buf) -> ReturnType { if constexpr (strategy == Strategy::TryDeserialize) return nested->tryDeserializeTextJSON(nested_column, buf, settings); diff --git a/tests/queries/0_stateless/03203_json_empty_as_default.reference b/tests/queries/0_stateless/03203_json_empty_as_default.reference index 53d7014e452..1c689228cdf 100644 --- a/tests/queries/0_stateless/03203_json_empty_as_default.reference +++ b/tests/queries/0_stateless/03203_json_empty_as_default.reference @@ -1,20 +1,47 @@ +-- Simple types +-- { echoOn } +SELECT x FROM format(JSONEachRow, 'x Date', '{"x":""}'); 1970-01-01 +SELECT x FROM format(JSONEachRow, 'x Date32', '{"x":""}'); 1970-01-01 +SELECT toTimeZone(x, 'UTC') FROM format(JSONEachRow, 'x DateTime', '{"x":""}'); 1970-01-01 00:00:00 +SELECT toTimeZone(x, 'UTC') FROM format(JSONEachRow, 'x DateTime64', '{"x":""}'); 1970-01-01 00:00:00.000 +SELECT x FROM format(JSONEachRow, 'x IPv4', '{"x":""}'); 0.0.0.0 +SELECT x FROM format(JSONEachRow, 'x IPv6', '{"x":""}'); :: +SELECT x FROM format(JSONEachRow, 'x UUID', '{"x":""}'); 00000000-0000-0000-0000-000000000000 +-- { echoOn } +SELECT COUNT(DISTINCT col) FROM table1; 1 +-- { echoOn } +SELECT * FROM table1 ORDER BY address ASC; :: 
2001:db8:3333:4444:5555:6666:7777:8888 +-- Nullable +-- { echoOn } +SELECT x FROM format(JSONEachRow, 'x Nullable(IPv6)', '{"x":""}'); :: +-- Compound types +SELECT x FROM format(JSONEachRow, 'x Array(UUID)', '{"x":["00000000-0000-0000-0000-000000000000","b15f852c-c41a-4fd6-9247-1929c841715e",""]}'); ['00000000-0000-0000-0000-000000000000','b15f852c-c41a-4fd6-9247-1929c841715e','00000000-0000-0000-0000-000000000000'] +SELECT x FROM format(JSONEachRow, 'x Array(Nullable(IPv6))', '{"x":["",""]}'); ['::','::'] +SELECT x FROM format(JSONEachRow, 'x Tuple(Date, IPv4, String)', '{"x":["", "", "abc"]}'); ('1970-01-01','0.0.0.0','abc') +SELECT x FROM format(JSONEachRow, 'x Map(String, IPv6)', '{"x":{"abc": ""}}'); {'abc':'::'} +SELECT x FROM format(JSONEachRow, 'x Variant(Date, UUID)', '{"x":""}'); 00000000-0000-0000-0000-000000000000 +-- Deep composition +SELECT x FROM format(JSONEachRow, 'x Array(Array(IPv6))', '{"x":[["2001:db8:3333:4444:CCCC:DDDD:EEEE:FFFF", ""], ["", "2001:db8:3333:4444:5555:6666:7777:8888"]]}'); [['2001:db8:3333:4444:cccc:dddd:eeee:ffff','::'],['::','2001:db8:3333:4444:5555:6666:7777:8888']] +SELECT x FROM format(JSONEachRow, 'x Variant(Date, Array(UUID))', '{"x":["", "b15f852c-c41a-4fd6-9247-1929c841715e"]}'); ['00000000-0000-0000-0000-000000000000','b15f852c-c41a-4fd6-9247-1929c841715e'] +SELECT x FROM format(JSONEachRow, 'x Tuple(Array(UUID), Tuple(UUID, Map(String, IPv6)))', '{"x":[[""], ["",{"abc":""}]]}'); (['00000000-0000-0000-0000-000000000000'],('00000000-0000-0000-0000-000000000000',{'abc':'::'})) +SELECT x FROM format(JSONEachRow, 'x Map(Tuple(Date,IPv4), Variant(UUID,IPv6))', '{"x":{["",""]:""}}'); {('1970-01-01','0.0.0.0'):'00000000-0000-0000-0000-000000000000'} diff --git a/tests/queries/0_stateless/03203_json_empty_as_default.sql b/tests/queries/0_stateless/03203_json_empty_as_default.sql index 35652f4c751..1243d450c2e 100644 --- a/tests/queries/0_stateless/03203_json_empty_as_default.sql +++ 
b/tests/queries/0_stateless/03203_json_empty_as_default.sql @@ -1,7 +1,7 @@ -set input_format_json_empty_as_default = 1; -set allow_experimental_variant_type = 1; +SET input_format_json_empty_as_default = 1, allow_experimental_variant_type = 1; -# Simple types +-- Simple types +-- { echoOn } SELECT x FROM format(JSONEachRow, 'x Date', '{"x":""}'); SELECT x FROM format(JSONEachRow, 'x Date32', '{"x":""}'); SELECT toTimeZone(x, 'UTC') FROM format(JSONEachRow, 'x DateTime', '{"x":""}'); @@ -9,8 +9,9 @@ SELECT toTimeZone(x, 'UTC') FROM format(JSONEachRow, 'x DateTime64', '{"x":""}') SELECT x FROM format(JSONEachRow, 'x IPv4', '{"x":""}'); SELECT x FROM format(JSONEachRow, 'x IPv6', '{"x":""}'); SELECT x FROM format(JSONEachRow, 'x UUID', '{"x":""}'); +-- { echoOff } -# Simple type AggregateFunction +-- Simple type AggregateFunction DROP TABLE IF EXISTS table1; CREATE TABLE table1(col AggregateFunction(uniq, UInt64)) ENGINE=Memory(); DROP TABLE IF EXISTS table2; @@ -18,35 +19,41 @@ CREATE TABLE table2(UserID UInt64) ENGINE=Memory(); INSERT INTO table1 SELECT uniqState(UserID) FROM table2; INSERT INTO table1 SELECT x FROM format(JSONEachRow, 'x AggregateFunction(uniq, UInt64)' AS T, '{"x":""}'); + +-- { echoOn } SELECT COUNT(DISTINCT col) FROM table1; +-- { echoOff } DROP TABLE table1; DROP TABLE table2; -# The setting input_format_defaults_for_omitted_fields determines the default value if enabled. +-- The setting input_format_defaults_for_omitted_fields determines the default value if enabled. 
CREATE TABLE table1(address IPv6 DEFAULT toIPv6('2001:db8:3333:4444:5555:6666:7777:8888')) ENGINE=Memory(); -set input_format_defaults_for_omitted_fields = 0; +SET input_format_defaults_for_omitted_fields = 0; INSERT INTO table1 FORMAT JSONEachRow {"address":""}; -set input_format_defaults_for_omitted_fields = 1; +SET input_format_defaults_for_omitted_fields = 1; INSERT INTO table1 FORMAT JSONEachRow {"address":""}; +-- { echoOn } SELECT * FROM table1 ORDER BY address ASC; +-- { echoOff } DROP TABLE table1; -# Nullable +-- Nullable +-- { echoOn } SELECT x FROM format(JSONEachRow, 'x Nullable(IPv6)', '{"x":""}'); -# Compound types +-- Compound types SELECT x FROM format(JSONEachRow, 'x Array(UUID)', '{"x":["00000000-0000-0000-0000-000000000000","b15f852c-c41a-4fd6-9247-1929c841715e",""]}'); SELECT x FROM format(JSONEachRow, 'x Array(Nullable(IPv6))', '{"x":["",""]}'); SELECT x FROM format(JSONEachRow, 'x Tuple(Date, IPv4, String)', '{"x":["", "", "abc"]}'); SELECT x FROM format(JSONEachRow, 'x Map(String, IPv6)', '{"x":{"abc": ""}}'); SELECT x FROM format(JSONEachRow, 'x Variant(Date, UUID)', '{"x":""}'); -# Deep composition +-- Deep composition SELECT x FROM format(JSONEachRow, 'x Array(Array(IPv6))', '{"x":[["2001:db8:3333:4444:CCCC:DDDD:EEEE:FFFF", ""], ["", "2001:db8:3333:4444:5555:6666:7777:8888"]]}'); SELECT x FROM format(JSONEachRow, 'x Variant(Date, Array(UUID))', '{"x":["", "b15f852c-c41a-4fd6-9247-1929c841715e"]}'); SELECT x FROM format(JSONEachRow, 'x Tuple(Array(UUID), Tuple(UUID, Map(String, IPv6)))', '{"x":[[""], ["",{"abc":""}]]}'); From 7b09ec9ccbf80f6d156f0029c2e6aefa32d7159a Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Tue, 23 Jul 2024 01:40:11 +0200 Subject: [PATCH 04/13] added input_format_json_empty_as_default to setting changes history --- src/Core/SettingsChangesHistory.cpp | 260 ++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 
20a8721c10e..6a3b32f0d1b 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -509,6 +509,266 @@ static std::initializer_list col >= '2023-01-01' AND col <= '2023-12-31')"}, + {"extract_key_value_pairs_max_pairs_per_row", 0, 0, "Max number of pairs that can be produced by the `extractKeyValuePairs` function. Used as a safeguard against consuming too much memory."}, + {"default_view_definer", "CURRENT_USER", "CURRENT_USER", "Allows to set default `DEFINER` option while creating a view"}, + {"default_materialized_view_sql_security", "DEFINER", "DEFINER", "Allows to set a default value for SQL SECURITY option when creating a materialized view"}, + {"default_normal_view_sql_security", "INVOKER", "INVOKER", "Allows to set default `SQL SECURITY` option while creating a normal view"}, + {"mysql_map_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, + {"mysql_map_fixed_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, + }}, + {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, + {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, + {"output_format_arrow_use_signed_indexes_for_dictionary", false, true, "Use signed indexes type for Arrow dictionaries by default as it's recommended"}, + {"allow_experimental_variant_type", false, false, "Add new experimental Variant type"}, + {"use_variant_as_common_type", false, false, "Allow to use Variant in if/multiIf if there is no common type"}, + {"output_format_arrow_use_64_bit_indexes_for_dictionary", false, false, "Allow to use 64 bit indexes type in Arrow dictionaries"}, + {"parallel_replicas_mark_segment_size", 128, 128, "Add new setting to control segment size in new parallel replicas coordinator implementation"}, + 
{"ignore_materialized_views_with_dropped_target_table", false, false, "Add new setting to allow to ignore materialized views with dropped target table"}, + {"output_format_compression_level", 3, 3, "Allow to change compression level in the query output"}, + {"output_format_compression_zstd_window_log", 0, 0, "Allow to change zstd window log in the query output when zstd compression is used"}, + {"enable_zstd_qat_codec", false, false, "Add new ZSTD_QAT codec"}, + {"enable_vertical_final", false, true, "Use vertical final by default"}, + {"output_format_arrow_use_64_bit_indexes_for_dictionary", false, false, "Allow to use 64 bit indexes type in Arrow dictionaries"}, + {"max_rows_in_set_to_optimize_join", 100000, 0, "Disable join optimization as it prevents from read in order optimization"}, + {"output_format_pretty_color", true, "auto", "Setting is changed to allow also for auto value, disabling ANSI escapes if output is not a tty"}, + {"function_visible_width_behavior", 0, 1, "We changed the default behavior of `visibleWidth` to be more precise"}, + {"max_estimated_execution_time", 0, 0, "Separate max_execution_time and max_estimated_execution_time"}, + {"iceberg_engine_ignore_schema_evolution", false, false, "Allow to ignore schema evolution in Iceberg table engine"}, + {"optimize_injective_functions_in_group_by", false, true, "Replace injective functions by it's arguments in GROUP BY section in analyzer"}, + {"update_insert_deduplication_token_in_dependent_materialized_views", false, false, "Allow to update insert deduplication token with table identifier during insert in dependent materialized views"}, + {"azure_max_unexpected_write_error_retries", 4, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write"}, + {"split_parts_ranges_into_intersecting_and_non_intersecting_final", false, true, "Allow to split parts ranges into intersecting and non intersecting during FINAL optimization"}, + 
{"split_intersecting_parts_ranges_into_layers_final", true, true, "Allow to split intersecting parts ranges into layers during FINAL optimization"}}}, + {"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."}, + {"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"}, + {"input_format_orc_allow_missing_columns", false, true, "Allow missing columns in ORC files by default"}, + {"input_format_arrow_allow_missing_columns", false, true, "Allow missing columns in Arrow files by default"}}}, + {"23.11", {{"parsedatetime_parse_without_leading_zeros", false, true, "Improved compatibility with MySQL DATE_FORMAT/STR_TO_DATE"}}}, + {"23.9", {{"optimize_group_by_constant_keys", false, true, "Optimize group by constant keys by default"}, + {"input_format_json_try_infer_named_tuples_from_objects", false, true, "Try to infer named Tuples from JSON objects by default"}, + {"input_format_json_read_numbers_as_strings", false, true, "Allow to read numbers as strings in JSON formats by default"}, + {"input_format_json_read_arrays_as_strings", false, true, "Allow to read arrays as strings in JSON formats by default"}, + {"input_format_json_infer_incomplete_types_as_strings", false, true, "Allow to infer incomplete types as Strings in JSON formats by default"}, + {"input_format_json_try_infer_numbers_from_strings", true, false, "Don't infer numbers from strings in JSON formats by default to prevent possible parsing errors"}, + {"http_write_exception_in_output_format", false, true, "Output valid JSON/XML on exception in HTTP streaming."}}}, + {"23.8", {{"rewrite_count_distinct_if_with_count_distinct_implementation", false, true, "Rewrite countDistinctIf with count_distinct_implementation configuration"}}}, + {"23.7", {{"function_sleep_max_microseconds_per_block", 0, 3000000, "In previous versions, the maximum sleep time of 3 seconds was 
applied only for `sleep`, but not for `sleepEachRow` function. In the new version, we introduce this setting. If you set compatibility with the previous versions, we will disable the limit altogether."}}}, + {"23.6", {{"http_send_timeout", 180, 30, "3 minutes seems crazy long. Note that this is timeout for a single network write call, not for the whole upload operation."}, + {"http_receive_timeout", 180, 30, "See http_send_timeout."}}}, + {"23.5", {{"input_format_parquet_preserve_order", true, false, "Allow Parquet reader to reorder rows for better parallelism."}, + {"parallelize_output_from_storages", false, true, "Allow parallelism when executing queries that read from file/url/s3/etc. This may reorder rows."}, + {"use_with_fill_by_sorting_prefix", false, true, "Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently"}, + {"output_format_parquet_compliant_nested_types", false, true, "Change an internal field name in output Parquet file schema."}}}, + {"23.4", {{"allow_suspicious_indices", true, false, "If true, index can defined with identical expressions"}, + {"allow_nonconst_timezone_arguments", true, false, "Allow non-const timezone arguments in certain time-related functions like toTimeZone(), fromUnixTimestamp*(), snowflakeToDateTime*()."}, + {"connect_timeout_with_failover_ms", 50, 1000, "Increase default connect timeout because of async connect"}, + {"connect_timeout_with_failover_secure_ms", 100, 1000, "Increase default secure connect timeout because of async connect"}, + {"hedged_connection_timeout_ms", 100, 50, "Start new connection in hedged requests after 50 ms instead of 100 to correspond with previous connect timeout"}, + {"formatdatetime_f_prints_single_zero", true, false, "Improved compatibility with MySQL DATE_FORMAT()/STR_TO_DATE()"}, + {"formatdatetime_parsedatetime_m_is_month_name", false, true, "Improved compatibility with MySQL 
DATE_FORMAT/STR_TO_DATE"}}}, + {"23.3", {{"output_format_parquet_version", "1.0", "2.latest", "Use latest Parquet format version for output format"}, + {"input_format_json_ignore_unknown_keys_in_named_tuple", false, true, "Improve parsing JSON objects as named tuples"}, + {"input_format_native_allow_types_conversion", false, true, "Allow types conversion in Native input forma"}, + {"output_format_arrow_compression_method", "none", "lz4_frame", "Use lz4 compression in Arrow output format by default"}, + {"output_format_parquet_compression_method", "snappy", "lz4", "Use lz4 compression in Parquet output format by default"}, + {"output_format_orc_compression_method", "none", "lz4_frame", "Use lz4 compression in ORC output format by default"}, + {"async_query_sending_for_remote", false, true, "Create connections and send query async across shards"}}}, + {"23.2", {{"output_format_parquet_fixed_string_as_fixed_byte_array", false, true, "Use Parquet FIXED_LENGTH_BYTE_ARRAY type for FixedString by default"}, + {"output_format_arrow_fixed_string_as_fixed_byte_array", false, true, "Use Arrow FIXED_SIZE_BINARY type for FixedString by default"}, + {"query_plan_remove_redundant_distinct", false, true, "Remove redundant Distinct step in query plan"}, + {"optimize_duplicate_order_by_and_distinct", true, false, "Remove duplicate ORDER BY and DISTINCT if it's possible"}, + {"insert_keeper_max_retries", 0, 20, "Enable reconnections to Keeper on INSERT, improve reliability"}}}, + {"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"}, + {"input_format_json_defaults_for_missing_elements_in_named_tuple", false, true, "Allow missing elements in JSON objects while reading named tuples by default"}, + {"input_format_csv_detect_header", false, true, "Detect header in CSV format by default"}, + {"input_format_tsv_detect_header", false, true, "Detect header in TSV format by default"}, + 
{"input_format_custom_detect_header", false, true, "Detect header in CustomSeparated format by default"}, + {"query_plan_remove_redundant_sorting", false, true, "Remove redundant sorting in query plan. For example, sorting steps related to ORDER BY clauses in subqueries"}}}, + {"22.12", {{"max_size_to_preallocate_for_aggregation", 10'000'000, 100'000'000, "This optimizes performance"}, + {"query_plan_aggregation_in_order", 0, 1, "Enable some refactoring around query plan"}, + {"format_binary_max_string_size", 0, 1_GiB, "Prevent allocating large amount of memory"}}}, + {"22.11", {{"use_structure_from_insertion_table_in_table_functions", 0, 2, "Improve using structure from insertion table in table functions"}}}, + {"22.9", {{"force_grouping_standard_compatibility", false, true, "Make GROUPING function output the same as in SQL standard and other DBMS"}}}, + {"22.7", {{"cross_to_inner_join_rewrite", 1, 2, "Force rewrite comma join to inner"}, + {"enable_positional_arguments", false, true, "Enable positional arguments feature by default"}, + {"format_csv_allow_single_quotes", true, false, "Most tools don't treat single quote in CSV specially, don't do it by default too"}}}, + {"22.6", {{"output_format_json_named_tuples_as_objects", false, true, "Allow to serialize named tuples as JSON objects in JSON formats by default"}, + {"input_format_skip_unknown_fields", false, true, "Optimize reading subset of columns for some input formats"}}}, + {"22.5", {{"memory_overcommit_ratio_denominator", 0, 1073741824, "Enable memory overcommit feature by default"}, + {"memory_overcommit_ratio_denominator_for_user", 0, 1073741824, "Enable memory overcommit feature by default"}}}, + {"22.4", {{"allow_settings_after_format_in_insert", true, false, "Do not allow SETTINGS after FORMAT for INSERT queries because ClickHouse interpret SETTINGS as some values, which is misleading"}}}, + {"22.3", {{"cast_ipv4_ipv6_default_on_conversion_error", true, false, "Make functions cast(value, 'IPv4') and 
cast(value, 'IPv6') behave same as toIPv4 and toIPv6 functions"}}}, + {"21.12", {{"stream_like_engine_allow_direct_select", true, false, "Do not allow direct select for Kafka/RabbitMQ/FileLog by default"}}}, + {"21.9", {{"output_format_decimal_trailing_zeros", true, false, "Do not output trailing zeros in text representation of Decimal types by default for better looking output"}, + {"use_hedged_requests", false, true, "Enable Hedged Requests feature by default"}}}, + {"21.7", {{"legacy_column_name_of_tuple_literal", true, false, "Add this setting only for compatibility reasons. It makes sense to set to 'true', while doing rolling update of cluster from version lower than 21.7 to higher"}}}, + {"21.5", {{"async_socket_for_remote", false, true, "Fix all problems and turn on asynchronous reads from socket for remote queries by default again"}}}, + {"21.3", {{"async_socket_for_remote", true, false, "Turn off asynchronous reads from socket for remote queries because of some problems"}, + {"optimize_normalize_count_variants", false, true, "Rewrite aggregate functions that semantically equals to count() as count() by default"}, + {"normalize_function_names", false, true, "Normalize function names to their canonical names, this was needed for projection query routing"}}}, + {"21.2", {{"enable_global_with_statement", false, true, "Propagate WITH statements to UNION queries and all subqueries by default"}}}, + {"21.1", {{"insert_quorum_parallel", false, true, "Use parallel quorum inserts by default. 
It is significantly more convenient to use than sequential quorum inserts"}, + {"input_format_null_as_default", false, true, "Allow to insert NULL as default for input formats by default"}, + {"optimize_on_insert", false, true, "Enable data optimization on INSERT by default for better user experience"}, + {"use_compact_format_in_distributed_parts_names", false, true, "Use compact format for async INSERT into Distributed tables by default"}}}, + {"20.10", {{"format_regexp_escaping_rule", "Escaped", "Raw", "Use Raw as default escaping rule for Regexp format to male the behaviour more like to what users expect"}}}, + {"20.7", {{"show_table_uuid_in_table_create_query_if_not_nil", true, false, "Stop showing UID of the table in its CREATE query for Engine=Atomic"}}}, + {"20.5", {{"input_format_with_names_use_header", false, true, "Enable using header with names for formats with WithNames/WithNamesAndTypes suffixes"}, + {"allow_suspicious_codecs", true, false, "Don't allow to specify meaningless compression codecs"}}}, + {"20.4", {{"validate_polygons", false, true, "Throw exception if polygon is invalid in function pointInPolygon by default instead of returning possibly wrong results"}}}, + {"19.18", {{"enable_scalar_subquery_optimization", false, true, "Prevent scalar subqueries from (de)serializing large scalar values and possibly avoid running the same subquery more than once"}}}, + {"19.14", {{"any_join_distinct_right_table_keys", true, false, "Disable ANY RIGHT and ANY FULL JOINs by default to avoid inconsistency"}}}, + {"19.12", {{"input_format_defaults_for_omitted_fields", false, true, "Enable calculation of complex default expressions for omitted fields for some input formats, because it should be the expected behaviour"}}}, + {"19.5", {{"max_partitions_per_insert_block", 0, 100, "Add a limit for the number of partitions in one block"}}}, + {"18.12.17", {{"enable_optimize_predicate_expression", 0, 1, "Optimize predicates to subqueries by default"}}}, }; From 
906a181b97253355193e51e99ebc3320eb6d8907 Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Tue, 23 Jul 2024 12:28:23 +0200 Subject: [PATCH 05/13] fix for clang-tidy --- .../Serializations/SerializationAsStringNonTrivialJSON.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h b/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h index c6d6b864bed..f52e1982830 100644 --- a/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h +++ b/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h @@ -50,9 +50,9 @@ public: return tryDeserializeTextNoEmptyCheckJSON(column, istr, format_settings); } - virtual void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override = 0; + void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override = 0; - virtual bool tryDeserializeTextNoEmptyCheckJSON(IColumn & /*column*/, ReadBuffer & /*istr*/, const FormatSettings &) const override + bool tryDeserializeTextNoEmptyCheckJSON(IColumn & /*column*/, ReadBuffer & /*istr*/, const FormatSettings &) const override { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method tryDeserializeTextNoEmptyCheckJSON is not supported"); } From 14dad25adce2492957c10f147812308a92587b26 Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Tue, 23 Jul 2024 21:56:40 +0200 Subject: [PATCH 06/13] trigger build From 5d6d378f2470d3147f561f34bebe77200ff2181c Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Wed, 24 Jul 2024 15:40:27 +0200 Subject: [PATCH 07/13] renumbered tests --- ...as_default.reference => 03210_json_empty_as_default.reference} | 0 ..._json_empty_as_default.sql => 03210_json_empty_as_default.sql} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{03203_json_empty_as_default.reference => 03210_json_empty_as_default.reference} (100%) rename 
tests/queries/0_stateless/{03203_json_empty_as_default.sql => 03210_json_empty_as_default.sql} (100%) diff --git a/tests/queries/0_stateless/03203_json_empty_as_default.reference b/tests/queries/0_stateless/03210_json_empty_as_default.reference similarity index 100% rename from tests/queries/0_stateless/03203_json_empty_as_default.reference rename to tests/queries/0_stateless/03210_json_empty_as_default.reference diff --git a/tests/queries/0_stateless/03203_json_empty_as_default.sql b/tests/queries/0_stateless/03210_json_empty_as_default.sql similarity index 100% rename from tests/queries/0_stateless/03203_json_empty_as_default.sql rename to tests/queries/0_stateless/03210_json_empty_as_default.sql From 0ff90e4a5c222caa93863dc5e4dcfa2816a64d20 Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Thu, 1 Aug 2024 01:31:31 +0200 Subject: [PATCH 08/13] post-review changes --- src/DataTypes/DataTypeAggregateFunction.h | 2 - src/DataTypes/DataTypeDate.h | 1 - src/DataTypes/DataTypeDate32.h | 1 - src/DataTypes/DataTypeDateTime.h | 1 - src/DataTypes/DataTypeDateTime64.h | 2 - src/DataTypes/DataTypeIPv4andIPv6.h | 2 - src/DataTypes/DataTypeUUID.h | 1 - src/DataTypes/IDataType.h | 3 - src/DataTypes/Serializations/ISerialization.h | 11 +- .../SerializationAggregateFunction.cpp | 2 +- .../SerializationAggregateFunction.h | 5 +- .../Serializations/SerializationArray.cpp | 128 +++++++++++++++--- .../SerializationAsStringNonTrivialJSON.h | 61 --------- .../Serializations/SerializationDate.cpp | 4 +- .../Serializations/SerializationDate.h | 7 +- .../Serializations/SerializationDate32.cpp | 4 +- .../Serializations/SerializationDate32.h | 7 +- .../Serializations/SerializationDateTime.cpp | 4 +- .../Serializations/SerializationDateTime.h | 7 +- .../SerializationDateTime64.cpp | 6 +- .../Serializations/SerializationDateTime64.h | 7 +- .../SerializationIPv4andIPv6.cpp | 4 +- .../Serializations/SerializationIPv4andIPv6.h | 7 +- .../Serializations/SerializationMap.cpp | 114 
++++++++++++++-- .../Serializations/SerializationMap.h | 3 + .../Serializations/SerializationNullable.cpp | 54 ++------ .../Serializations/SerializationNullable.h | 1 - .../Serializations/SerializationTuple.cpp | 89 +++++++++++- .../Serializations/SerializationUUID.cpp | 4 +- .../Serializations/SerializationUUID.h | 7 +- src/Formats/JSONUtils.cpp | 84 +++++++++--- src/IO/ReadHelpers.cpp | 17 --- src/IO/ReadHelpers.h | 4 - ... => 03215_json_empty_as_default.reference} | 8 +- ...lt.sql => 03215_json_empty_as_default.sql} | 0 35 files changed, 415 insertions(+), 247 deletions(-) delete mode 100644 src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h rename tests/queries/0_stateless/{03210_json_empty_as_default.reference => 03215_json_empty_as_default.reference} (95%) rename tests/queries/0_stateless/{03210_json_empty_as_default.sql => 03215_json_empty_as_default.sql} (100%) diff --git a/src/DataTypes/DataTypeAggregateFunction.h b/src/DataTypes/DataTypeAggregateFunction.h index 2ab2e53100b..e3a4f9726d9 100644 --- a/src/DataTypes/DataTypeAggregateFunction.h +++ b/src/DataTypes/DataTypeAggregateFunction.h @@ -64,8 +64,6 @@ public: SerializationPtr doGetDefaultSerialization() const override; bool supportsSparseSerialization() const override { return false; } - bool isNonTriviallySerializedAsStringJSON() const override { return true; } - bool isVersioned() const; /// Version is not empty only if it was parsed from AST or implicitly cast to 0 or version according diff --git a/src/DataTypes/DataTypeDate.h b/src/DataTypes/DataTypeDate.h index cb7a603705d..0e08b9ba2ca 100644 --- a/src/DataTypes/DataTypeDate.h +++ b/src/DataTypes/DataTypeDate.h @@ -17,7 +17,6 @@ public: bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } - bool isNonTriviallySerializedAsStringJSON() const override { return true; } bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/DataTypeDate32.h 
b/src/DataTypes/DataTypeDate32.h index 53bd010b7cf..65633e7a228 100644 --- a/src/DataTypes/DataTypeDate32.h +++ b/src/DataTypes/DataTypeDate32.h @@ -18,7 +18,6 @@ public: bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } - bool isNonTriviallySerializedAsStringJSON() const override { return true; } bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/DataTypeDateTime.h b/src/DataTypes/DataTypeDateTime.h index 11b579920ba..5519240dee1 100644 --- a/src/DataTypes/DataTypeDateTime.h +++ b/src/DataTypes/DataTypeDateTime.h @@ -44,7 +44,6 @@ public: bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } - bool isNonTriviallySerializedAsStringJSON() const override { return true; } bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/DataTypeDateTime64.h b/src/DataTypes/DataTypeDateTime64.h index dd5ff7e6550..64cedd798d1 100644 --- a/src/DataTypes/DataTypeDateTime64.h +++ b/src/DataTypes/DataTypeDateTime64.h @@ -39,8 +39,6 @@ public: bool isSummable() const override { return false; } - bool isNonTriviallySerializedAsStringJSON() const override { return true; } - protected: SerializationPtr doGetDefaultSerialization() const override; }; diff --git a/src/DataTypes/DataTypeIPv4andIPv6.h b/src/DataTypes/DataTypeIPv4andIPv6.h index 520af4f21e0..5aea55751a7 100644 --- a/src/DataTypes/DataTypeIPv4andIPv6.h +++ b/src/DataTypes/DataTypeIPv4andIPv6.h @@ -46,7 +46,6 @@ public: size_t getSizeOfValueInMemory() const override { return sizeof(IPv4); } bool isCategorial() const override { return true; } bool canBeInsideLowCardinality() const override { return true; } - bool isNonTriviallySerializedAsStringJSON() const override { return true; } SerializationPtr doGetDefaultSerialization() const override { return std::make_shared>(); } }; @@ -85,7 +84,6 @@ public: size_t getSizeOfValueInMemory() const override { 
return sizeof(IPv6); } bool isCategorial() const override { return true; } bool canBeInsideLowCardinality() const override { return true; } - bool isNonTriviallySerializedAsStringJSON() const override { return true; } SerializationPtr doGetDefaultSerialization() const override { return std::make_shared>(); } }; diff --git a/src/DataTypes/DataTypeUUID.h b/src/DataTypes/DataTypeUUID.h index 1aeab1b78ba..90cdd90d68d 100644 --- a/src/DataTypes/DataTypeUUID.h +++ b/src/DataTypes/DataTypeUUID.h @@ -42,7 +42,6 @@ public: size_t getSizeOfValueInMemory() const override { return sizeof(UUID); } bool isCategorial() const override { return true; } bool canBeInsideLowCardinality() const override { return true; } - bool isNonTriviallySerializedAsStringJSON() const override { return true; } SerializationPtr doGetDefaultSerialization() const override; }; diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 174531cad27..397ae3d8be9 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -328,9 +328,6 @@ public: /// Updates avg_value_size_hint for newly read column. Uses to optimize deserialization. Zero expected for first column. static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint); - /// Checks if this is a non-numeric non-string type which is serialized as a JSON string. 
- virtual bool isNonTriviallySerializedAsStringJSON() const { return false; } - protected: friend class DataTypeFactory; friend class AggregateFunctionSimpleState; diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 3f4f4b8f875..5d0bf60c59f 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -17,7 +17,6 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; - extern const int NOT_IMPLEMENTED; } class IDataType; @@ -399,20 +398,12 @@ public: virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; virtual bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; - /// The following two methods are implemented only for non-numeric non-string simple data types. - virtual void deserializeTextNoEmptyCheckJSON(IColumn & /*column*/, ReadBuffer & /*istr*/, const FormatSettings &) const - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method deserializeTextNoEmptyCheckJSON is not supported"); - } - virtual bool tryDeserializeTextNoEmptyCheckJSON(IColumn & /*column*/, ReadBuffer & /*istr*/, const FormatSettings &) const - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method tryDeserializeTextNoEmptyCheckJSON is not supported"); - } virtual void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t /*indent*/) const { serializeTextJSON(column, row_num, ostr, settings); } + /** Text serialization for putting into the XML format. 
*/ virtual void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp index 39c3f389619..41b198890e4 100644 --- a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp @@ -182,7 +182,7 @@ void SerializationAggregateFunction::serializeTextJSON(const IColumn & column, s } -void SerializationAggregateFunction::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationAggregateFunction::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String s; readJSONString(s, istr, settings.json); diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.h b/src/DataTypes/Serializations/SerializationAggregateFunction.h index 6afa1cd4e97..c45fc79f714 100644 --- a/src/DataTypes/Serializations/SerializationAggregateFunction.h +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.h @@ -3,13 +3,12 @@ #include #include -#include namespace DB { -class SerializationAggregateFunction final : public SerializationAsStringNonTrivialJSON +class SerializationAggregateFunction final : public ISerialization { private: AggregateFunctionPtr function; @@ -38,7 +37,7 @@ public: void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, 
WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index 0a9c4529e23..eb24307d9e9 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -614,29 +615,124 @@ void SerializationArray::serializeTextJSONPretty(const IColumn & column, size_t writeChar(']', ostr); } +namespace +{ +template +ReturnType deserializeTextJSONArrayImpl(IColumn & column, ReadBuffer & istr, const SerializationPtr & nested, const FormatSettings & settings) +{ + static constexpr auto throw_exception = std::is_same_v; + + static constexpr auto EMPTY_STRING = "\"\""; + static constexpr auto EMPTY_STRING_LENGTH = std::string_view(EMPTY_STRING).length(); + + auto do_deserialize_nested = [](IColumn & nested_column, ReadBuffer & buf, auto && check_for_empty_string, auto && deserialize) -> ReturnType + { + if (check_for_empty_string(buf)) + { + nested_column.insertDefault(); + return ReturnType(true); + } + return deserialize(nested_column, buf); + }; + + auto deserialize_nested_impl = [&settings, &nested](IColumn & nested_column, ReadBuffer & buf) -> ReturnType + { + if constexpr (throw_exception) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested); + else + nested->deserializeTextJSON(nested_column, buf, settings); + return; + } + else + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return 
SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested); + else + return nested->tryDeserializeTextJSON(nested_column, buf, settings); + } + }; + + auto deserialize_nested = [&settings, &do_deserialize_nested, &deserialize_nested_impl](IColumn & nested_column, ReadBuffer & buf) -> ReturnType + { + if (!settings.json.empty_as_default || buf.eof() || *buf.position() != EMPTY_STRING[0]) + return deserialize_nested_impl(nested_column, buf); + + if (buf.available() >= EMPTY_STRING_LENGTH) + { + /// We have enough data in buffer to check if we have an empty string. + auto check_for_empty_string = [](ReadBuffer & buf_) -> bool + { + auto * pos = buf_.position(); + if (checkString(EMPTY_STRING, buf_)) + return true; + else + { + buf_.position() = pos; + return false; + } + }; + + return do_deserialize_nested(nested_column, buf, check_for_empty_string, deserialize_nested_impl); + } + + /// We don't have enough data in buffer to check if we have an empty string. + /// Use PeekableReadBuffer to make a checkpoint before checking for an + /// empty string and rollback if check was failed. 
+ + auto check_for_empty_string = [](ReadBuffer & buf_) -> bool + { + auto & peekable_buf = assert_cast(buf_); + peekable_buf.setCheckpoint(); + SCOPE_EXIT(peekable_buf.dropCheckpoint()); + if (checkString(EMPTY_STRING, peekable_buf)) + return true; + else + { + peekable_buf.rollbackToCheckpoint(); + return false; + } + }; + + auto deserialize_nested_impl_with_check = [&deserialize_nested_impl](IColumn & nested_column_, ReadBuffer & buf_) -> ReturnType + { + auto & peekable_buf = assert_cast(buf_); + if constexpr (throw_exception) + { + deserialize_nested_impl(nested_column_, peekable_buf); + assert(!peekable_buf.hasUnreadData()); + } + else + { + if (!deserialize_nested_impl(nested_column_, peekable_buf)) + return false; + if (likely(!peekable_buf.hasUnreadData())) + return true; + return false; + } + }; + + PeekableReadBuffer peekable_buf(buf, true); + return do_deserialize_nested(nested_column, peekable_buf, check_for_empty_string, deserialize_nested_impl_with_check); + }; + + return deserializeTextImpl(column, istr, + [&deserialize_nested, &istr](IColumn & nested_column) -> ReturnType + { + return deserialize_nested(nested_column, istr); + }, false); +} +} void SerializationArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - deserializeTextImpl(column, istr, - [&](IColumn & nested_column) - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) - SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); - else - nested->deserializeTextJSON(nested_column, istr, settings); - }, false); + deserializeTextJSONArrayImpl(column, istr, nested, settings); } bool SerializationArray::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - auto read_nested = [&](IColumn & nested_column) - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) - return 
SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); - return nested->tryDeserializeTextJSON(nested_column, istr, settings); - }; - - return deserializeTextImpl(column, istr, std::move(read_nested), false); + return deserializeTextJSONArrayImpl(column, istr, nested, settings); } diff --git a/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h b/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h deleted file mode 100644 index f52e1982830..00000000000 --- a/src/DataTypes/Serializations/SerializationAsStringNonTrivialJSON.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - -/** Serialization for non-numeric non-string data types serialized as JSON strings - * For these data types, we support an option, input_format_json_empty_as_default, which, when set to 1, - * allows for JSON deserialization to treat an encountered empty string as a default value for the specified type. - * Derived classes must implement the following methods: - * deserializeTextNoEmptyCheckJSON() and tryDeserializeTextNoEmptyCheckJSON() - * instead of deserializeTextJSON() and tryDeserializeTextJSON() respectively. 
- */ -template -requires std::derived_from -class SerializationAsStringNonTrivialJSON : public T -{ -public: - using T::T; - - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & format_settings) const override - { - if (format_settings.json.empty_as_default && tryMatchEmptyString(istr)) - column.insertDefault(); - else - deserializeTextNoEmptyCheckJSON(column, istr, format_settings); - } - - bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & format_settings) const override - { - if (format_settings.json.empty_as_default && tryMatchEmptyString(istr)) - { - column.insertDefault(); - return true; - } - else - return tryDeserializeTextNoEmptyCheckJSON(column, istr, format_settings); - } - - void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override = 0; - - bool tryDeserializeTextNoEmptyCheckJSON(IColumn & /*column*/, ReadBuffer & /*istr*/, const FormatSettings &) const override - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method tryDeserializeTextNoEmptyCheckJSON is not supported"); - } -}; - -} diff --git a/src/DataTypes/Serializations/SerializationDate.cpp b/src/DataTypes/Serializations/SerializationDate.cpp index 3f122189c22..38e1bb87b6d 100644 --- a/src/DataTypes/Serializations/SerializationDate.cpp +++ b/src/DataTypes/Serializations/SerializationDate.cpp @@ -85,7 +85,7 @@ void SerializationDate::serializeTextJSON(const IColumn & column, size_t row_num writeChar('"', ostr); } -void SerializationDate::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationDate::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { DayNum x; assertChar('"', istr); @@ -94,7 +94,7 @@ void SerializationDate::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBu assert_cast(column).getData().push_back(x); } -bool 
SerializationDate::tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +bool SerializationDate::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { DayNum x; if (!checkChar('"', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationDate.h b/src/DataTypes/Serializations/SerializationDate.h index 10c83171527..dcf79eb49da 100644 --- a/src/DataTypes/Serializations/SerializationDate.h +++ b/src/DataTypes/Serializations/SerializationDate.h @@ -1,13 +1,12 @@ #pragma once #include -#include #include namespace DB { -class SerializationDate final : public SerializationAsStringNonTrivialJSON> +class SerializationDate final : public SerializationNumber { public: explicit SerializationDate(const DateLUTImpl & time_zone_ = DateLUT::instance()); @@ -22,8 +21,8 @@ public: void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & 
istr, const FormatSettings & settings) const override; diff --git a/src/DataTypes/Serializations/SerializationDate32.cpp b/src/DataTypes/Serializations/SerializationDate32.cpp index 8ad07b534ce..70a22d59e42 100644 --- a/src/DataTypes/Serializations/SerializationDate32.cpp +++ b/src/DataTypes/Serializations/SerializationDate32.cpp @@ -83,7 +83,7 @@ void SerializationDate32::serializeTextJSON(const IColumn & column, size_t row_n writeChar('"', ostr); } -void SerializationDate32::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationDate32::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { ExtendedDayNum x; assertChar('"', istr); @@ -92,7 +92,7 @@ void SerializationDate32::deserializeTextNoEmptyCheckJSON(IColumn & column, Read assert_cast(column).getData().push_back(x); } -bool SerializationDate32::tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +bool SerializationDate32::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { ExtendedDayNum x; if (!checkChar('"', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationDate32.h b/src/DataTypes/Serializations/SerializationDate32.h index ac6239fbc2b..be2e2b76c1d 100644 --- a/src/DataTypes/Serializations/SerializationDate32.h +++ b/src/DataTypes/Serializations/SerializationDate32.h @@ -1,12 +1,11 @@ #pragma once #include -#include #include namespace DB { -class SerializationDate32 final : public SerializationAsStringNonTrivialJSON> +class SerializationDate32 final : public SerializationNumber { public: explicit SerializationDate32(const DateLUTImpl & time_zone_ = DateLUT::instance()); @@ -21,8 +20,8 @@ public: void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; bool tryDeserializeTextQuoted(IColumn & column, 
ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; diff --git a/src/DataTypes/Serializations/SerializationDateTime.cpp b/src/DataTypes/Serializations/SerializationDateTime.cpp index 80b2d51140d..c5c819ce7fa 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime.cpp @@ -180,7 +180,7 @@ void SerializationDateTime::serializeTextJSON(const IColumn & column, size_t row writeChar('"', ostr); } -void SerializationDateTime::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationDateTime::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { time_t x = 0; if (checkChar('"', istr)) @@ -196,7 +196,7 @@ void SerializationDateTime::deserializeTextNoEmptyCheckJSON(IColumn & column, Re assert_cast(column).getData().push_back(static_cast(x)); } -bool SerializationDateTime::tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +bool 
SerializationDateTime::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { time_t x = 0; if (checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationDateTime.h b/src/DataTypes/Serializations/SerializationDateTime.h index 0041f221ccf..584b0c4116b 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.h +++ b/src/DataTypes/Serializations/SerializationDateTime.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include class DateLUTImpl; @@ -9,7 +8,7 @@ class DateLUTImpl; namespace DB { -class SerializationDateTime final : public SerializationAsStringNonTrivialJSON>, public TimezoneMixin +class SerializationDateTime final : public SerializationNumber, public TimezoneMixin { public: explicit SerializationDateTime(const TimezoneMixin & time_zone_); @@ -24,8 +23,8 @@ public: void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; diff --git 
a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp index cdac7f785d1..442e29edd52 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -15,7 +15,7 @@ namespace DB SerializationDateTime64::SerializationDateTime64( UInt32 scale_, const TimezoneMixin & time_zone_) - : SerializationAsStringNonTrivialJSON>(DecimalUtils::max_precision, scale_) + : SerializationDecimalBase(DecimalUtils::max_precision, scale_) , TimezoneMixin(time_zone_) { } @@ -170,7 +170,7 @@ void SerializationDateTime64::serializeTextJSON(const IColumn & column, size_t r writeChar('"', ostr); } -void SerializationDateTime64::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationDateTime64::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { DateTime64 x = 0; if (checkChar('"', istr)) @@ -185,7 +185,7 @@ void SerializationDateTime64::deserializeTextNoEmptyCheckJSON(IColumn & column, assert_cast(column).getData().push_back(x); } -bool SerializationDateTime64::tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +bool SerializationDateTime64::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { DateTime64 x = 0; if (checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationDateTime64.h b/src/DataTypes/Serializations/SerializationDateTime64.h index a3bc4d1ad4e..b49bd1e9098 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.h +++ b/src/DataTypes/Serializations/SerializationDateTime64.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include class DateLUTImpl; @@ -9,7 +8,7 @@ class DateLUTImpl; namespace DB { -class SerializationDateTime64 final : public SerializationAsStringNonTrivialJSON>, public 
TimezoneMixin +class SerializationDateTime64 final : public SerializationDecimalBase, public TimezoneMixin { public: SerializationDateTime64(UInt32 scale_, const TimezoneMixin & time_zone_); @@ -26,8 +25,8 @@ public: void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp index ecd50a0b9b8..c1beceb4533 100644 --- a/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp @@ -69,7 +69,7 @@ void SerializationIP::serializeTextJSON(const DB::IColumn & column, size_t } template -void SerializationIP::deserializeTextNoEmptyCheckJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +void SerializationIP::deserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings 
& settings) const { IPv x; assertChar('"', istr); @@ -84,7 +84,7 @@ void SerializationIP::deserializeTextNoEmptyCheckJSON(DB::IColumn & column, } template -bool SerializationIP::tryDeserializeTextNoEmptyCheckJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +bool SerializationIP::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const { IPv x; if (!checkChar('"', istr) || !tryReadText(x, istr) || !checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h index 44f36252741..a53f257646b 100644 --- a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h @@ -4,14 +4,13 @@ #include #include #include -#include #include namespace DB { template -class SerializationIP : public SerializationAsStringNonTrivialJSON +class SerializationIP : public SimpleTextSerialization { public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -23,8 +22,8 @@ public: bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & 
settings) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override; diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index c722b3ac7a1..4a9f701c5c1 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -315,29 +316,116 @@ void SerializationMap::serializeTextJSONPretty(const IColumn & column, size_t ro writeChar('}', ostr); } - -void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +template +ReturnType SerializationMap::deserializeTextJSONMapImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - deserializeTextImpl(column, istr, - [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) + static constexpr auto throw_exception = std::is_same_v; + + static constexpr auto EMPTY_STRING = "\"\""; + static constexpr auto EMPTY_STRING_LENGTH = std::string_view(EMPTY_STRING).length(); + + auto do_deserialize_subcolumn = [](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn, auto && check_for_empty_string, auto && deserialize) -> ReturnType + { + if (check_for_empty_string(buf)) + { + subcolumn.insertDefault(); + return ReturnType(true); + } + return deserialize(buf, subcolumn_serialization, subcolumn); + }; + + auto deserialize_subcolumn_impl = [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) -> ReturnType + { + if constexpr (throw_exception) { if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); else 
subcolumn_serialization->deserializeTextJSON(subcolumn, buf, settings); - }); + } + else + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); + else + return subcolumn_serialization->tryDeserializeTextJSON(subcolumn, buf, settings); + } + }; + + auto deserialize_subcolumn = [&settings, &do_deserialize_subcolumn, &deserialize_subcolumn_impl](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) -> ReturnType + { + if (!settings.json.empty_as_default || buf.eof() || *buf.position() != EMPTY_STRING[0]) + return do_deserialize_subcolumn(buf, subcolumn_serialization, subcolumn, [](ReadBuffer &){ return false; }, deserialize_subcolumn_impl); + + if (buf.available() >= EMPTY_STRING_LENGTH) + { + /// We have enough data in buffer to check if we have an empty string. + auto check_for_empty_string = [](ReadBuffer & buf_) -> bool + { + auto * pos = buf_.position(); + if (checkString(EMPTY_STRING, buf_)) + return true; + else + { + buf_.position() = pos; + return false; + } + }; + + return do_deserialize_subcolumn(buf, subcolumn_serialization, subcolumn, check_for_empty_string, deserialize_subcolumn_impl); + } + + /// We don't have enough data in buffer to check if we have an empty string. + /// Use PeekableReadBuffer to make a checkpoint before checking for an + /// empty string and rollback if check was failed. 
+ + auto check_for_empty_string = [](ReadBuffer & buf_) -> bool + { + auto & peekable_buf = assert_cast(buf_); + peekable_buf.setCheckpoint(); + SCOPE_EXIT(peekable_buf.dropCheckpoint()); + if (checkString(EMPTY_STRING, peekable_buf)) + return true; + else + { + peekable_buf.rollbackToCheckpoint(); + return false; + } + }; + + auto deserialize_subcolumn_impl_with_check = [&deserialize_subcolumn_impl](ReadBuffer & buf_, const SerializationPtr & subcolumn_serialization_, IColumn & subcolumn_) -> ReturnType + { + auto & peekable_buf = assert_cast(buf_); + if constexpr (throw_exception) + { + deserialize_subcolumn_impl(peekable_buf, subcolumn_serialization_, subcolumn_); + assert(!peekable_buf.hasUnreadData()); + } + else + { + if (!deserialize_subcolumn_impl(peekable_buf, subcolumn_serialization_, subcolumn_)) + return false; + if (likely(!peekable_buf.hasUnreadData())) + return true; + return false; + } + }; + + PeekableReadBuffer peekable_buf(buf, true); + return do_deserialize_subcolumn(peekable_buf, subcolumn_serialization, subcolumn, check_for_empty_string, deserialize_subcolumn_impl_with_check); + }; + + return deserializeTextImpl(column, istr, deserialize_subcolumn); +} + +void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextJSONMapImpl(column, istr, settings); } bool SerializationMap::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - auto reader = [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) - return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); - return subcolumn_serialization->tryDeserializeTextJSON(subcolumn, buf, settings); - }; - - return deserializeTextImpl(column, istr, reader); + return 
deserializeTextJSONMapImpl(column, istr, settings); } void SerializationMap::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h index cfcde445c1f..9bdc110c445 100644 --- a/src/DataTypes/Serializations/SerializationMap.h +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -74,6 +74,9 @@ private: template ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const; + + template + ReturnType deserializeTextJSONMapImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; }; } diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 3cd767e58eb..e72dd3a42f5 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -844,52 +844,25 @@ bool SerializationNullable::tryDeserializeNullJSON(DB::ReadBuffer & istr) return checkString("null", istr); } -namespace -{ - -enum class Strategy : uint8_t -{ - Deserialize, - DeserializeNoEmptyCheck, - TryDeserialize -}; - -template struct ReturnTypeImpl; -template <> struct ReturnTypeImpl { using Type = void; }; -template <> struct ReturnTypeImpl { using Type = bool; }; -template <> struct ReturnTypeImpl { using Type = void; }; - -template -using ReturnType = typename ReturnTypeImpl::Type; - -template struct AlwaysFalse : std::false_type {}; - -template -ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null) +template +ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null) { auto check_for_null = [](ReadBuffer & buf){ return 
checkStringByFirstCharacterAndAssertTheRest("null", buf); }; - auto deserialize_nested = [&nested, &settings](IColumn & nested_column, ReadBuffer & buf) -> ReturnType + auto deserialize_nested = [&nested, &settings](IColumn & nested_column, ReadBuffer & buf) { - if constexpr (strategy == Strategy::TryDeserialize) + if constexpr (std::is_same_v) return nested->tryDeserializeTextJSON(nested_column, buf, settings); - else if constexpr (strategy == Strategy::Deserialize) - nested->deserializeTextJSON(nested_column, buf, settings); - else if constexpr (strategy == Strategy::DeserializeNoEmptyCheck) - nested->deserializeTextNoEmptyCheckJSON(nested_column, buf, settings); - else - static_assert(AlwaysFalse::value); + nested->deserializeTextJSON(nested_column, buf, settings); }; - return deserializeImpl>(column, istr, check_for_null, deserialize_nested, is_null); -} - + return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); } void SerializationNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { ColumnNullable & col = assert_cast(column); bool is_null; - deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null); + deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null); safeAppendToNullMap(col, is_null); } @@ -897,27 +870,20 @@ bool SerializationNullable::tryDeserializeTextJSON(IColumn & column, ReadBuffer { ColumnNullable & col = assert_cast(column); bool is_null; - return deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); + return deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); } bool SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) { bool is_null; - 
deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); - return !is_null; -} - -bool SerializationNullable::deserializeNullAsDefaultOrNestedTextNoEmptyCheckJSON(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) -{ - bool is_null; - deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); + deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); return !is_null; } bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) { bool is_null; - return deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); + return deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); } void SerializationNullable::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index c5215e2a39f..f7d2d2eadf0 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -88,7 +88,6 @@ public: static bool deserializeNullAsDefaultOrNestedTextQuoted(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); static bool deserializeNullAsDefaultOrNestedTextCSV(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); static bool deserializeNullAsDefaultOrNestedTextJSON(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); - static bool 
deserializeNullAsDefaultOrNestedTextNoEmptyCheckJSON(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); static bool deserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); /// If Check for NULL and deserialize value into non-nullable column or insert default value of nested type. diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 594a23ab507..7238c2bdbd7 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -317,24 +318,100 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf { static constexpr bool throw_exception = std::is_same_v; - auto deserialize_element = [&](IColumn & element_column, size_t element_pos) + static constexpr auto EMPTY_STRING = "\"\""; + static constexpr auto EMPTY_STRING_LENGTH = std::string_view(EMPTY_STRING).length(); + + auto do_deserialize_element = [](IColumn & element_column, size_t element_pos, ReadBuffer & buf, auto && check_for_empty_string, auto && deserialize) -> ReturnType + { + if (check_for_empty_string(buf)) + { + element_column.insertDefault(); + return ReturnType(true); + } + + return deserialize(element_column, element_pos, buf); + }; + + auto deserialize_element_impl = [&settings, this](IColumn & element_column, size_t element_pos, ReadBuffer & buf) -> ReturnType { if constexpr (throw_exception) { if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) - SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(element_column, istr, settings, elems[element_pos]); + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(element_column, buf, settings, 
elems[element_pos]); else - elems[element_pos]->deserializeTextJSON(element_column, istr, settings); - return true; + elems[element_pos]->deserializeTextJSON(element_column, buf, settings); } else { if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) - return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(element_column, istr, settings, elems[element_pos]); - return elems[element_pos]->tryDeserializeTextJSON(element_column, istr, settings); + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(element_column, buf, settings, elems[element_pos]); + return elems[element_pos]->tryDeserializeTextJSON(element_column, buf, settings); } }; + auto deserialize_element = [&settings, &do_deserialize_element, &deserialize_element_impl, &istr](IColumn & element_column, size_t element_pos) -> ReturnType + { + if (!settings.json.empty_as_default || istr.eof() || *istr.position() != EMPTY_STRING[0]) + return do_deserialize_element(element_column, element_pos, istr, [](ReadBuffer &) { return false; }, deserialize_element_impl); + + if (istr.available() >= EMPTY_STRING_LENGTH) + { + /// We have enough data in buffer to check if we have an empty string. + auto check_for_empty_string = [](ReadBuffer & buf_) -> bool + { + auto * pos = buf_.position(); + if (checkString(EMPTY_STRING, buf_)) + return true; + else + { + buf_.position() = pos; + return false; + } + }; + + return do_deserialize_element(element_column, element_pos, istr, check_for_empty_string, deserialize_element_impl); + } + + /// We don't have enough data in buffer to check if we have an empty string. + /// Use PeekableReadBuffer to make a checkpoint before checking for an + /// empty string and rollback if check was failed. 
+ + auto check_for_empty_string = [](ReadBuffer & buf_) -> bool + { + auto & peekable_buf = assert_cast(buf_); + peekable_buf.setCheckpoint(); + SCOPE_EXIT(peekable_buf.dropCheckpoint()); + if (checkString(EMPTY_STRING, peekable_buf)) + return true; + else + { + peekable_buf.rollbackToCheckpoint(); + return false; + } + }; + + auto deserialize_element_impl_with_check = [&deserialize_element_impl](IColumn & element_column_, size_t element_pos_, ReadBuffer & buf_) -> ReturnType + { + auto & peekable_buf = assert_cast(buf_); + if constexpr (throw_exception) + { + deserialize_element_impl(element_column_, element_pos_, peekable_buf); + assert(!peekable_buf.hasUnreadData()); + } + else + { + if (!deserialize_element_impl(element_column_, element_pos_, peekable_buf)) + return false; + if (likely(!peekable_buf.hasUnreadData())) + return true; + return false; + } + }; + + PeekableReadBuffer peekable_buf(istr, true); + return do_deserialize_element(element_column, element_pos, peekable_buf, check_for_empty_string, deserialize_element_impl_with_check); + }; + if (settings.json.read_named_tuples_as_objects && have_explicit_names) { diff --git a/src/DataTypes/Serializations/SerializationUUID.cpp b/src/DataTypes/Serializations/SerializationUUID.cpp index 21a0ccf676c..f18466ad8ad 100644 --- a/src/DataTypes/Serializations/SerializationUUID.cpp +++ b/src/DataTypes/Serializations/SerializationUUID.cpp @@ -94,7 +94,7 @@ void SerializationUUID::serializeTextJSON(const IColumn & column, size_t row_num writeChar('"', ostr); } -void SerializationUUID::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationUUID::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { UUID x; assertChar('"', istr); @@ -103,7 +103,7 @@ void SerializationUUID::deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBu assert_cast(column).getData().push_back(x); } -bool 
SerializationUUID::tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +bool SerializationUUID::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { UUID x; if (!checkChar('"', istr) || !tryReadText(x, istr) || !checkChar('"', istr)) diff --git a/src/DataTypes/Serializations/SerializationUUID.h b/src/DataTypes/Serializations/SerializationUUID.h index 185d1d44c16..458504f8f42 100644 --- a/src/DataTypes/Serializations/SerializationUUID.h +++ b/src/DataTypes/Serializations/SerializationUUID.h @@ -1,12 +1,11 @@ #pragma once #include -#include namespace DB { -class SerializationUUID : public SerializationAsStringNonTrivialJSON +class SerializationUUID : public SimpleTextSerialization { public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -16,8 +15,8 @@ public: void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - bool tryDeserializeTextNoEmptyCheckJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer 
& istr, const FormatSettings & settings) const override; diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 73189e81f97..34f55ef00d8 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include #include #include @@ -268,6 +270,9 @@ namespace JSONUtils const FormatSettings & format_settings, bool yield_strings) { + static constexpr auto EMPTY_STRING = "\"\""; + static constexpr auto EMPTY_STRING_LENGTH = std::string_view(EMPTY_STRING).length(); + try { bool as_nullable = format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type); @@ -286,33 +291,78 @@ namespace JSONUtils return true; } - if (format_settings.json.empty_as_default && type->isNonTriviallySerializedAsStringJSON()) + auto do_deserialize = [](IColumn & column_, ReadBuffer & buf_, auto && check_for_empty_string, auto && deserialize) -> bool { - /// We have a non-numeric non-string data type at the top level. - /// At first glance, it looks like we sort of duplicate the work done in - /// SerializationAsStringNonTrivialJSON. Actually we need to proceed as - /// done here because we want to return false if we inserted a default - /// value on purpose, which the ISerialization interface does not allow for. 
- if (tryMatchEmptyString(in)) + if (check_for_empty_string(buf_)) { - column.insertDefault(); + column_.insertDefault(); return false; } + else + return deserialize(column_, buf_); + }; - if (as_nullable) - return SerializationNullable::deserializeNullAsDefaultOrNestedTextNoEmptyCheckJSON(column, in, format_settings, serialization); - - serialization->deserializeTextNoEmptyCheckJSON(column, in, format_settings); - return true; - } - else + auto deserialize_impl = [as_nullable, &format_settings, &serialization](IColumn & column_, ReadBuffer & buf_) -> bool { if (as_nullable) - return SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column, in, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column_, buf_, format_settings, serialization); - serialization->deserializeTextJSON(column, in, format_settings); + serialization->deserializeTextJSON(column_, buf_, format_settings); return true; + }; + + if (!format_settings.json.empty_as_default || in.eof() || *in.position() != EMPTY_STRING[0]) + return do_deserialize(column, in, [](ReadBuffer &) { return false; }, deserialize_impl); + + if (in.available() >= EMPTY_STRING_LENGTH) + { + /// We have enough data in buffer to check if we have an empty string. + auto check_for_empty_string = [](ReadBuffer & buf_) + { + auto * pos = buf_.position(); + if (checkString(EMPTY_STRING, buf_)) + return true; + else + { + buf_.position() = pos; + return false; + } + }; + + return do_deserialize(column, in, check_for_empty_string, deserialize_impl); } + + /// We don't have enough data in buffer to check if we have an empty string. + /// Use PeekableReadBuffer to make a checkpoint before checking for an + /// empty string and rollback if check was failed. 
+ + auto check_for_empty_string = [](ReadBuffer & buf_) -> bool + { + auto & peekable_buf = assert_cast(buf_); + peekable_buf.setCheckpoint(); + SCOPE_EXIT(peekable_buf.dropCheckpoint()); + if (checkString(EMPTY_STRING, peekable_buf)) + return true; + else + { + peekable_buf.rollbackToCheckpoint(); + return false; + } + }; + + auto deserialize_impl_with_check = [&deserialize_impl](IColumn & column_, ReadBuffer & buf_) -> bool + { + auto & peekable_buf = assert_cast(buf_); + + if (!deserialize_impl(column_, peekable_buf)) + return false; + if (likely(!peekable_buf.hasUnreadData())) + return true; + return false; + }; + + PeekableReadBuffer peekable_buf(in, true); + return do_deserialize(column, peekable_buf, check_for_empty_string, deserialize_impl_with_check); } catch (Exception & e) { diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index e8ef667a4e3..9559462e62b 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -1137,23 +1137,6 @@ template void readCSVStringInto(String & s, ReadBuffer & b template void readCSVStringInto(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); template void readCSVStringInto, false, false>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); -bool tryMatchEmptyString(ReadBuffer & buf) -{ - if (buf.eof() || *buf.position() != '"') - return false; - - ++buf.position(); - - if (buf.eof() || *buf.position() != '"') - { - --buf.position(); - return false; - } - - ++buf.position(); - - return true; -} template ReturnType readJSONStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::JSON & settings) diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 59c1923e02a..ffba4fafb5c 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -665,10 +665,6 @@ void readStringUntilEOFInto(Vector & s, ReadBuffer & buf); template void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings); -/// Consumes the current token if it is 
an empty string, i.e. two consecutive double quotes, -/// Returns true if consumed. -bool tryMatchEmptyString(ReadBuffer & buf); - /// ReturnType is either bool or void. If bool, the function will return false instead of throwing an exception. template ReturnType readJSONStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::JSON & settings); diff --git a/tests/queries/0_stateless/03210_json_empty_as_default.reference b/tests/queries/0_stateless/03215_json_empty_as_default.reference similarity index 95% rename from tests/queries/0_stateless/03210_json_empty_as_default.reference rename to tests/queries/0_stateless/03215_json_empty_as_default.reference index 1c689228cdf..1a98f45577a 100644 --- a/tests/queries/0_stateless/03210_json_empty_as_default.reference +++ b/tests/queries/0_stateless/03215_json_empty_as_default.reference @@ -24,18 +24,18 @@ SELECT * FROM table1 ORDER BY address ASC; -- Nullable -- { echoOn } SELECT x FROM format(JSONEachRow, 'x Nullable(IPv6)', '{"x":""}'); -:: +\N -- Compound types SELECT x FROM format(JSONEachRow, 'x Array(UUID)', '{"x":["00000000-0000-0000-0000-000000000000","b15f852c-c41a-4fd6-9247-1929c841715e",""]}'); ['00000000-0000-0000-0000-000000000000','b15f852c-c41a-4fd6-9247-1929c841715e','00000000-0000-0000-0000-000000000000'] SELECT x FROM format(JSONEachRow, 'x Array(Nullable(IPv6))', '{"x":["",""]}'); -['::','::'] +[NULL,NULL] SELECT x FROM format(JSONEachRow, 'x Tuple(Date, IPv4, String)', '{"x":["", "", "abc"]}'); ('1970-01-01','0.0.0.0','abc') SELECT x FROM format(JSONEachRow, 'x Map(String, IPv6)', '{"x":{"abc": ""}}'); {'abc':'::'} SELECT x FROM format(JSONEachRow, 'x Variant(Date, UUID)', '{"x":""}'); -00000000-0000-0000-0000-000000000000 +\N -- Deep composition SELECT x FROM format(JSONEachRow, 'x Array(Array(IPv6))', '{"x":[["2001:db8:3333:4444:CCCC:DDDD:EEEE:FFFF", ""], ["", "2001:db8:3333:4444:5555:6666:7777:8888"]]}'); 
[['2001:db8:3333:4444:cccc:dddd:eeee:ffff','::'],['::','2001:db8:3333:4444:5555:6666:7777:8888']] @@ -44,4 +44,4 @@ SELECT x FROM format(JSONEachRow, 'x Variant(Date, Array(UUID))', '{"x":["", "b1 SELECT x FROM format(JSONEachRow, 'x Tuple(Array(UUID), Tuple(UUID, Map(String, IPv6)))', '{"x":[[""], ["",{"abc":""}]]}'); (['00000000-0000-0000-0000-000000000000'],('00000000-0000-0000-0000-000000000000',{'abc':'::'})) SELECT x FROM format(JSONEachRow, 'x Map(Tuple(Date,IPv4), Variant(UUID,IPv6))', '{"x":{["",""]:""}}'); -{('1970-01-01','0.0.0.0'):'00000000-0000-0000-0000-000000000000'} +{('1970-01-01','0.0.0.0'):NULL} diff --git a/tests/queries/0_stateless/03210_json_empty_as_default.sql b/tests/queries/0_stateless/03215_json_empty_as_default.sql similarity index 100% rename from tests/queries/0_stateless/03210_json_empty_as_default.sql rename to tests/queries/0_stateless/03215_json_empty_as_default.sql From 3c586d80c85168d13620042d1656fad67f35d09e Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Sat, 3 Aug 2024 10:57:27 +0200 Subject: [PATCH 09/13] post-rebase fixes --- src/Core/SettingsChangesHistory.cpp | 261 +--------------------------- 1 file changed, 1 insertion(+), 260 deletions(-) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 6a3b32f0d1b..2f99910d7a0 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -88,6 +88,7 @@ static std::initializer_list col >= '2023-01-01' AND col <= '2023-12-31')"}, - {"extract_key_value_pairs_max_pairs_per_row", 0, 0, "Max number of pairs that can be produced by the `extractKeyValuePairs` function. 
Used as a safeguard against consuming too much memory."}, - {"default_view_definer", "CURRENT_USER", "CURRENT_USER", "Allows to set default `DEFINER` option while creating a view"}, - {"default_materialized_view_sql_security", "DEFINER", "DEFINER", "Allows to set a default value for SQL SECURITY option when creating a materialized view"}, - {"default_normal_view_sql_security", "INVOKER", "INVOKER", "Allows to set default `SQL SECURITY` option while creating a normal view"}, - {"mysql_map_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, - {"mysql_map_fixed_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, - }}, - {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, - {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, - {"output_format_arrow_use_signed_indexes_for_dictionary", false, true, "Use signed indexes type for Arrow dictionaries by default as it's recommended"}, - {"allow_experimental_variant_type", false, false, "Add new experimental Variant type"}, - {"use_variant_as_common_type", false, false, "Allow to use Variant in if/multiIf if there is no common type"}, - {"output_format_arrow_use_64_bit_indexes_for_dictionary", false, false, "Allow to use 64 bit indexes type in Arrow dictionaries"}, - {"parallel_replicas_mark_segment_size", 128, 128, "Add new setting to control segment size in new parallel replicas coordinator implementation"}, - {"ignore_materialized_views_with_dropped_target_table", false, false, "Add new setting to allow to ignore materialized views with dropped target table"}, - {"output_format_compression_level", 3, 3, "Allow to change compression level in the query output"}, - {"output_format_compression_zstd_window_log", 0, 0, "Allow to change zstd window log in the query output when zstd compression is used"}, - 
{"enable_zstd_qat_codec", false, false, "Add new ZSTD_QAT codec"}, - {"enable_vertical_final", false, true, "Use vertical final by default"}, - {"output_format_arrow_use_64_bit_indexes_for_dictionary", false, false, "Allow to use 64 bit indexes type in Arrow dictionaries"}, - {"max_rows_in_set_to_optimize_join", 100000, 0, "Disable join optimization as it prevents from read in order optimization"}, - {"output_format_pretty_color", true, "auto", "Setting is changed to allow also for auto value, disabling ANSI escapes if output is not a tty"}, - {"function_visible_width_behavior", 0, 1, "We changed the default behavior of `visibleWidth` to be more precise"}, - {"max_estimated_execution_time", 0, 0, "Separate max_execution_time and max_estimated_execution_time"}, - {"iceberg_engine_ignore_schema_evolution", false, false, "Allow to ignore schema evolution in Iceberg table engine"}, - {"optimize_injective_functions_in_group_by", false, true, "Replace injective functions by it's arguments in GROUP BY section in analyzer"}, - {"update_insert_deduplication_token_in_dependent_materialized_views", false, false, "Allow to update insert deduplication token with table identifier during insert in dependent materialized views"}, - {"azure_max_unexpected_write_error_retries", 4, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write"}, - {"split_parts_ranges_into_intersecting_and_non_intersecting_final", false, true, "Allow to split parts ranges into intersecting and non intersecting during FINAL optimization"}, - {"split_intersecting_parts_ranges_into_layers_final", true, true, "Allow to split intersecting parts ranges into layers during FINAL optimization"}}}, - {"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."}, - {"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"}, - 
{"input_format_orc_allow_missing_columns", false, true, "Allow missing columns in ORC files by default"}, - {"input_format_arrow_allow_missing_columns", false, true, "Allow missing columns in Arrow files by default"}}}, - {"23.11", {{"parsedatetime_parse_without_leading_zeros", false, true, "Improved compatibility with MySQL DATE_FORMAT/STR_TO_DATE"}}}, - {"23.9", {{"optimize_group_by_constant_keys", false, true, "Optimize group by constant keys by default"}, - {"input_format_json_try_infer_named_tuples_from_objects", false, true, "Try to infer named Tuples from JSON objects by default"}, - {"input_format_json_read_numbers_as_strings", false, true, "Allow to read numbers as strings in JSON formats by default"}, - {"input_format_json_read_arrays_as_strings", false, true, "Allow to read arrays as strings in JSON formats by default"}, - {"input_format_json_infer_incomplete_types_as_strings", false, true, "Allow to infer incomplete types as Strings in JSON formats by default"}, - {"input_format_json_try_infer_numbers_from_strings", true, false, "Don't infer numbers from strings in JSON formats by default to prevent possible parsing errors"}, - {"http_write_exception_in_output_format", false, true, "Output valid JSON/XML on exception in HTTP streaming."}}}, - {"23.8", {{"rewrite_count_distinct_if_with_count_distinct_implementation", false, true, "Rewrite countDistinctIf with count_distinct_implementation configuration"}}}, - {"23.7", {{"function_sleep_max_microseconds_per_block", 0, 3000000, "In previous versions, the maximum sleep time of 3 seconds was applied only for `sleep`, but not for `sleepEachRow` function. In the new version, we introduce this setting. If you set compatibility with the previous versions, we will disable the limit altogether."}}}, - {"23.6", {{"http_send_timeout", 180, 30, "3 minutes seems crazy long. 
Note that this is timeout for a single network write call, not for the whole upload operation."}, - {"http_receive_timeout", 180, 30, "See http_send_timeout."}}}, - {"23.5", {{"input_format_parquet_preserve_order", true, false, "Allow Parquet reader to reorder rows for better parallelism."}, - {"parallelize_output_from_storages", false, true, "Allow parallelism when executing queries that read from file/url/s3/etc. This may reorder rows."}, - {"use_with_fill_by_sorting_prefix", false, true, "Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently"}, - {"output_format_parquet_compliant_nested_types", false, true, "Change an internal field name in output Parquet file schema."}}}, - {"23.4", {{"allow_suspicious_indices", true, false, "If true, index can defined with identical expressions"}, - {"allow_nonconst_timezone_arguments", true, false, "Allow non-const timezone arguments in certain time-related functions like toTimeZone(), fromUnixTimestamp*(), snowflakeToDateTime*()."}, - {"connect_timeout_with_failover_ms", 50, 1000, "Increase default connect timeout because of async connect"}, - {"connect_timeout_with_failover_secure_ms", 100, 1000, "Increase default secure connect timeout because of async connect"}, - {"hedged_connection_timeout_ms", 100, 50, "Start new connection in hedged requests after 50 ms instead of 100 to correspond with previous connect timeout"}, - {"formatdatetime_f_prints_single_zero", true, false, "Improved compatibility with MySQL DATE_FORMAT()/STR_TO_DATE()"}, - {"formatdatetime_parsedatetime_m_is_month_name", false, true, "Improved compatibility with MySQL DATE_FORMAT/STR_TO_DATE"}}}, - {"23.3", {{"output_format_parquet_version", "1.0", "2.latest", "Use latest Parquet format version for output format"}, - {"input_format_json_ignore_unknown_keys_in_named_tuple", false, true, "Improve parsing JSON objects as named tuples"}, - 
{"input_format_native_allow_types_conversion", false, true, "Allow types conversion in Native input forma"}, - {"output_format_arrow_compression_method", "none", "lz4_frame", "Use lz4 compression in Arrow output format by default"}, - {"output_format_parquet_compression_method", "snappy", "lz4", "Use lz4 compression in Parquet output format by default"}, - {"output_format_orc_compression_method", "none", "lz4_frame", "Use lz4 compression in ORC output format by default"}, - {"async_query_sending_for_remote", false, true, "Create connections and send query async across shards"}}}, - {"23.2", {{"output_format_parquet_fixed_string_as_fixed_byte_array", false, true, "Use Parquet FIXED_LENGTH_BYTE_ARRAY type for FixedString by default"}, - {"output_format_arrow_fixed_string_as_fixed_byte_array", false, true, "Use Arrow FIXED_SIZE_BINARY type for FixedString by default"}, - {"query_plan_remove_redundant_distinct", false, true, "Remove redundant Distinct step in query plan"}, - {"optimize_duplicate_order_by_and_distinct", true, false, "Remove duplicate ORDER BY and DISTINCT if it's possible"}, - {"insert_keeper_max_retries", 0, 20, "Enable reconnections to Keeper on INSERT, improve reliability"}}}, - {"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"}, - {"input_format_json_defaults_for_missing_elements_in_named_tuple", false, true, "Allow missing elements in JSON objects while reading named tuples by default"}, - {"input_format_csv_detect_header", false, true, "Detect header in CSV format by default"}, - {"input_format_tsv_detect_header", false, true, "Detect header in TSV format by default"}, - {"input_format_custom_detect_header", false, true, "Detect header in CustomSeparated format by default"}, - {"query_plan_remove_redundant_sorting", false, true, "Remove redundant sorting in query plan. 
For example, sorting steps related to ORDER BY clauses in subqueries"}}}, - {"22.12", {{"max_size_to_preallocate_for_aggregation", 10'000'000, 100'000'000, "This optimizes performance"}, - {"query_plan_aggregation_in_order", 0, 1, "Enable some refactoring around query plan"}, - {"format_binary_max_string_size", 0, 1_GiB, "Prevent allocating large amount of memory"}}}, - {"22.11", {{"use_structure_from_insertion_table_in_table_functions", 0, 2, "Improve using structure from insertion table in table functions"}}}, - {"22.9", {{"force_grouping_standard_compatibility", false, true, "Make GROUPING function output the same as in SQL standard and other DBMS"}}}, - {"22.7", {{"cross_to_inner_join_rewrite", 1, 2, "Force rewrite comma join to inner"}, - {"enable_positional_arguments", false, true, "Enable positional arguments feature by default"}, - {"format_csv_allow_single_quotes", true, false, "Most tools don't treat single quote in CSV specially, don't do it by default too"}}}, - {"22.6", {{"output_format_json_named_tuples_as_objects", false, true, "Allow to serialize named tuples as JSON objects in JSON formats by default"}, - {"input_format_skip_unknown_fields", false, true, "Optimize reading subset of columns for some input formats"}}}, - {"22.5", {{"memory_overcommit_ratio_denominator", 0, 1073741824, "Enable memory overcommit feature by default"}, - {"memory_overcommit_ratio_denominator_for_user", 0, 1073741824, "Enable memory overcommit feature by default"}}}, - {"22.4", {{"allow_settings_after_format_in_insert", true, false, "Do not allow SETTINGS after FORMAT for INSERT queries because ClickHouse interpret SETTINGS as some values, which is misleading"}}}, - {"22.3", {{"cast_ipv4_ipv6_default_on_conversion_error", true, false, "Make functions cast(value, 'IPv4') and cast(value, 'IPv6') behave same as toIPv4 and toIPv6 functions"}}}, - {"21.12", {{"stream_like_engine_allow_direct_select", true, false, "Do not allow direct select for Kafka/RabbitMQ/FileLog by 
default"}}}, - {"21.9", {{"output_format_decimal_trailing_zeros", true, false, "Do not output trailing zeros in text representation of Decimal types by default for better looking output"}, - {"use_hedged_requests", false, true, "Enable Hedged Requests feature by default"}}}, - {"21.7", {{"legacy_column_name_of_tuple_literal", true, false, "Add this setting only for compatibility reasons. It makes sense to set to 'true', while doing rolling update of cluster from version lower than 21.7 to higher"}}}, - {"21.5", {{"async_socket_for_remote", false, true, "Fix all problems and turn on asynchronous reads from socket for remote queries by default again"}}}, - {"21.3", {{"async_socket_for_remote", true, false, "Turn off asynchronous reads from socket for remote queries because of some problems"}, - {"optimize_normalize_count_variants", false, true, "Rewrite aggregate functions that semantically equals to count() as count() by default"}, - {"normalize_function_names", false, true, "Normalize function names to their canonical names, this was needed for projection query routing"}}}, - {"21.2", {{"enable_global_with_statement", false, true, "Propagate WITH statements to UNION queries and all subqueries by default"}}}, - {"21.1", {{"insert_quorum_parallel", false, true, "Use parallel quorum inserts by default. 
It is significantly more convenient to use than sequential quorum inserts"}, - {"input_format_null_as_default", false, true, "Allow to insert NULL as default for input formats by default"}, - {"optimize_on_insert", false, true, "Enable data optimization on INSERT by default for better user experience"}, - {"use_compact_format_in_distributed_parts_names", false, true, "Use compact format for async INSERT into Distributed tables by default"}}}, - {"20.10", {{"format_regexp_escaping_rule", "Escaped", "Raw", "Use Raw as default escaping rule for Regexp format to male the behaviour more like to what users expect"}}}, - {"20.7", {{"show_table_uuid_in_table_create_query_if_not_nil", true, false, "Stop showing UID of the table in its CREATE query for Engine=Atomic"}}}, - {"20.5", {{"input_format_with_names_use_header", false, true, "Enable using header with names for formats with WithNames/WithNamesAndTypes suffixes"}, - {"allow_suspicious_codecs", true, false, "Don't allow to specify meaningless compression codecs"}}}, - {"20.4", {{"validate_polygons", false, true, "Throw exception if polygon is invalid in function pointInPolygon by default instead of returning possibly wrong results"}}}, - {"19.18", {{"enable_scalar_subquery_optimization", false, true, "Prevent scalar subqueries from (de)serializing large scalar values and possibly avoid running the same subquery more than once"}}}, - {"19.14", {{"any_join_distinct_right_table_keys", true, false, "Disable ANY RIGHT and ANY FULL JOINs by default to avoid inconsistency"}}}, - {"19.12", {{"input_format_defaults_for_omitted_fields", false, true, "Enable calculation of complex default expressions for omitted fields for some input formats, because it should be the expected behaviour"}}}, - {"19.5", {{"max_partitions_per_insert_block", 0, 100, "Add a limit for the number of partitions in one block"}}}, - {"18.12.17", {{"enable_optimize_predicate_expression", 0, 1, "Optimize predicates to subqueries by default"}}}, }; From 
229fffcd56414727fc806c711b7ca8d16f876878 Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Tue, 6 Aug 2024 19:31:30 +0200 Subject: [PATCH 10/13] post-review changes --- src/Core/SettingsChangesHistory.cpp | 1 + .../Serializations/SerializationArray.cpp | 104 +--------------- .../Serializations/SerializationMap.cpp | 99 +-------------- .../Serializations/SerializationTuple.cpp | 116 +++--------------- .../Serializations/SerializationTuple.h | 3 + src/Formats/FormatFactory.cpp | 1 - src/Formats/JSONUtils.cpp | 31 ++--- src/Formats/JSONUtils.h | 109 ++++++++++++++++ 8 files changed, 154 insertions(+), 310 deletions(-) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 2f99910d7a0..e385f72fb38 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -89,6 +89,7 @@ static std::initializer_list #include #include -#include #include #include #include #include #include +#include namespace DB { @@ -620,108 +620,16 @@ namespace template ReturnType deserializeTextJSONArrayImpl(IColumn & column, ReadBuffer & istr, const SerializationPtr & nested, const FormatSettings & settings) { - static constexpr auto throw_exception = std::is_same_v; - - static constexpr auto EMPTY_STRING = "\"\""; - static constexpr auto EMPTY_STRING_LENGTH = std::string_view(EMPTY_STRING).length(); - - auto do_deserialize_nested = [](IColumn & nested_column, ReadBuffer & buf, auto && check_for_empty_string, auto && deserialize) -> ReturnType + auto deserializer = [&nested](IColumn & column_, ReadBuffer & istr_, auto && deserialize_nested) -> ReturnType { - if (check_for_empty_string(buf)) + auto adapter = [&deserialize_nested, &istr_, &nested](IColumn & nested_column) -> ReturnType { - nested_column.insertDefault(); - return ReturnType(true); - } - return deserialize(nested_column, buf); - }; - - auto deserialize_nested_impl = [&settings, &nested](IColumn & nested_column, ReadBuffer & buf) -> ReturnType - { - if constexpr 
(throw_exception) - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) - SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested); - else - nested->deserializeTextJSON(nested_column, buf, settings); - return; - } - else - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) - return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested); - else - return nested->tryDeserializeTextJSON(nested_column, buf, settings); - } - }; - - auto deserialize_nested = [&settings, &do_deserialize_nested, &deserialize_nested_impl](IColumn & nested_column, ReadBuffer & buf) -> ReturnType - { - if (!settings.json.empty_as_default || buf.eof() || *buf.position() != EMPTY_STRING[0]) - return deserialize_nested_impl(nested_column, buf); - - if (buf.available() >= EMPTY_STRING_LENGTH) - { - /// We have enough data in buffer to check if we have an empty string. - auto check_for_empty_string = [](ReadBuffer & buf_) -> bool - { - auto * pos = buf_.position(); - if (checkString(EMPTY_STRING, buf_)) - return true; - else - { - buf_.position() = pos; - return false; - } - }; - - return do_deserialize_nested(nested_column, buf, check_for_empty_string, deserialize_nested_impl); - } - - /// We don't have enough data in buffer to check if we have an empty string. - /// Use PeekableReadBuffer to make a checkpoint before checking for an - /// empty string and rollback if check was failed. 
- - auto check_for_empty_string = [](ReadBuffer & buf_) -> bool - { - auto & peekable_buf = assert_cast(buf_); - peekable_buf.setCheckpoint(); - SCOPE_EXIT(peekable_buf.dropCheckpoint()); - if (checkString(EMPTY_STRING, peekable_buf)) - return true; - else - { - peekable_buf.rollbackToCheckpoint(); - return false; - } + return deserialize_nested(nested_column, istr_, nested); }; - - auto deserialize_nested_impl_with_check = [&deserialize_nested_impl](IColumn & nested_column_, ReadBuffer & buf_) -> ReturnType - { - auto & peekable_buf = assert_cast(buf_); - if constexpr (throw_exception) - { - deserialize_nested_impl(nested_column_, peekable_buf); - assert(!peekable_buf.hasUnreadData()); - } - else - { - if (!deserialize_nested_impl(nested_column_, peekable_buf)) - return false; - if (likely(!peekable_buf.hasUnreadData())) - return true; - return false; - } - }; - - PeekableReadBuffer peekable_buf(buf, true); - return do_deserialize_nested(nested_column, peekable_buf, check_for_empty_string, deserialize_nested_impl_with_check); + return deserializeTextImpl(column_, istr_, adapter, false); }; - return deserializeTextImpl(column, istr, - [&deserialize_nested, &istr](IColumn & nested_column) -> ReturnType - { - return deserialize_nested(nested_column, istr); - }, false); + return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, deserializer); } } diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index 4a9f701c5c1..0412a85ee44 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -6,9 +6,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -319,103 +319,16 @@ void SerializationMap::serializeTextJSONPretty(const IColumn & column, size_t ro template ReturnType SerializationMap::deserializeTextJSONMapImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & 
settings) const { - static constexpr auto throw_exception = std::is_same_v; - - static constexpr auto EMPTY_STRING = "\"\""; - static constexpr auto EMPTY_STRING_LENGTH = std::string_view(EMPTY_STRING).length(); - - auto do_deserialize_subcolumn = [](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn, auto && check_for_empty_string, auto && deserialize) -> ReturnType + auto deserializer = [this](IColumn & column_, ReadBuffer & istr_, auto && deserialize_nested) -> ReturnType { - if (check_for_empty_string(buf)) + auto adapter = [&deserialize_nested](ReadBuffer & buf, const SerializationPtr & nested_column_serialization, IColumn & nested_column) -> ReturnType { - subcolumn.insertDefault(); - return ReturnType(true); - } - return deserialize(buf, subcolumn_serialization, subcolumn); - }; - - auto deserialize_subcolumn_impl = [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) -> ReturnType - { - if constexpr (throw_exception) - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) - SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); - else - subcolumn_serialization->deserializeTextJSON(subcolumn, buf, settings); - } - else - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) - return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); - else - return subcolumn_serialization->tryDeserializeTextJSON(subcolumn, buf, settings); - } - }; - - auto deserialize_subcolumn = [&settings, &do_deserialize_subcolumn, &deserialize_subcolumn_impl](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) -> ReturnType - { - if (!settings.json.empty_as_default || buf.eof() || *buf.position() != EMPTY_STRING[0]) - return do_deserialize_subcolumn(buf, 
subcolumn_serialization, subcolumn, [](ReadBuffer &){ return false; }, deserialize_subcolumn_impl); - - if (buf.available() >= EMPTY_STRING_LENGTH) - { - /// We have enough data in buffer to check if we have an empty string. - auto check_for_empty_string = [](ReadBuffer & buf_) -> bool - { - auto * pos = buf_.position(); - if (checkString(EMPTY_STRING, buf_)) - return true; - else - { - buf_.position() = pos; - return false; - } - }; - - return do_deserialize_subcolumn(buf, subcolumn_serialization, subcolumn, check_for_empty_string, deserialize_subcolumn_impl); - } - - /// We don't have enough data in buffer to check if we have an empty string. - /// Use PeekableReadBuffer to make a checkpoint before checking for an - /// empty string and rollback if check was failed. - - auto check_for_empty_string = [](ReadBuffer & buf_) -> bool - { - auto & peekable_buf = assert_cast(buf_); - peekable_buf.setCheckpoint(); - SCOPE_EXIT(peekable_buf.dropCheckpoint()); - if (checkString(EMPTY_STRING, peekable_buf)) - return true; - else - { - peekable_buf.rollbackToCheckpoint(); - return false; - } + return deserialize_nested(nested_column, buf, nested_column_serialization); }; - - auto deserialize_subcolumn_impl_with_check = [&deserialize_subcolumn_impl](ReadBuffer & buf_, const SerializationPtr & subcolumn_serialization_, IColumn & subcolumn_) -> ReturnType - { - auto & peekable_buf = assert_cast(buf_); - if constexpr (throw_exception) - { - deserialize_subcolumn_impl(peekable_buf, subcolumn_serialization_, subcolumn_); - assert(!peekable_buf.hasUnreadData()); - } - else - { - if (!deserialize_subcolumn_impl(peekable_buf, subcolumn_serialization_, subcolumn_)) - return false; - if (likely(!peekable_buf.hasUnreadData())) - return true; - return false; - } - }; - - PeekableReadBuffer peekable_buf(buf, true); - return do_deserialize_subcolumn(peekable_buf, subcolumn_serialization, subcolumn, check_for_empty_string, deserialize_subcolumn_impl_with_check); + return 
this->deserializeTextImpl(column_, istr_, adapter); }; - return deserializeTextImpl(column, istr, deserialize_subcolumn); + return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, deserializer); } void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 7238c2bdbd7..459677d40cd 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include @@ -314,103 +314,9 @@ void SerializationTuple::serializeTextJSONPretty(const IColumn & column, size_t } template -ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +ReturnType SerializationTuple::deserializeTextJSONTupleImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserialize_element) const { - static constexpr bool throw_exception = std::is_same_v; - - static constexpr auto EMPTY_STRING = "\"\""; - static constexpr auto EMPTY_STRING_LENGTH = std::string_view(EMPTY_STRING).length(); - - auto do_deserialize_element = [](IColumn & element_column, size_t element_pos, ReadBuffer & buf, auto && check_for_empty_string, auto && deserialize) -> ReturnType - { - if (check_for_empty_string(buf)) - { - element_column.insertDefault(); - return ReturnType(true); - } - - return deserialize(element_column, element_pos, buf); - }; - - auto deserialize_element_impl = [&settings, this](IColumn & element_column, size_t element_pos, ReadBuffer & buf) -> ReturnType - { - if constexpr (throw_exception) - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) - SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(element_column, 
buf, settings, elems[element_pos]); - else - elems[element_pos]->deserializeTextJSON(element_column, buf, settings); - } - else - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) - return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(element_column, buf, settings, elems[element_pos]); - return elems[element_pos]->tryDeserializeTextJSON(element_column, buf, settings); - } - }; - - auto deserialize_element = [&settings, &do_deserialize_element, &deserialize_element_impl, &istr](IColumn & element_column, size_t element_pos) -> ReturnType - { - if (!settings.json.empty_as_default || istr.eof() || *istr.position() != EMPTY_STRING[0]) - return do_deserialize_element(element_column, element_pos, istr, [](ReadBuffer &) { return false; }, deserialize_element_impl); - - if (istr.available() >= EMPTY_STRING_LENGTH) - { - /// We have enough data in buffer to check if we have an empty string. - auto check_for_empty_string = [](ReadBuffer & buf_) -> bool - { - auto * pos = buf_.position(); - if (checkString(EMPTY_STRING, buf_)) - return true; - else - { - buf_.position() = pos; - return false; - } - }; - - return do_deserialize_element(element_column, element_pos, istr, check_for_empty_string, deserialize_element_impl); - } - - /// We don't have enough data in buffer to check if we have an empty string. - /// Use PeekableReadBuffer to make a checkpoint before checking for an - /// empty string and rollback if check was failed. 
- - auto check_for_empty_string = [](ReadBuffer & buf_) -> bool - { - auto & peekable_buf = assert_cast(buf_); - peekable_buf.setCheckpoint(); - SCOPE_EXIT(peekable_buf.dropCheckpoint()); - if (checkString(EMPTY_STRING, peekable_buf)) - return true; - else - { - peekable_buf.rollbackToCheckpoint(); - return false; - } - }; - - auto deserialize_element_impl_with_check = [&deserialize_element_impl](IColumn & element_column_, size_t element_pos_, ReadBuffer & buf_) -> ReturnType - { - auto & peekable_buf = assert_cast(buf_); - if constexpr (throw_exception) - { - deserialize_element_impl(element_column_, element_pos_, peekable_buf); - assert(!peekable_buf.hasUnreadData()); - } - else - { - if (!deserialize_element_impl(element_column_, element_pos_, peekable_buf)) - return false; - if (likely(!peekable_buf.hasUnreadData())) - return true; - return false; - } - }; - - PeekableReadBuffer peekable_buf(istr, true); - return do_deserialize_element(element_column, element_pos, peekable_buf, check_for_empty_string, deserialize_element_impl_with_check); - }; + static constexpr auto throw_exception = std::is_same_v; if (settings.json.read_named_tuples_as_objects && have_explicit_names) @@ -583,6 +489,22 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf } } +template +ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto deserializer = [&settings, this](IColumn & column_, ReadBuffer & istr_, auto && deserialize_nested) -> ReturnType + { + auto adapter = [&deserialize_nested, &istr_, this](IColumn & nested_column, size_t element_pos) -> ReturnType + { + return deserialize_nested(nested_column, istr_, this->elems[element_pos]); + }; + + return this->deserializeTextJSONTupleImpl(column_, istr_, settings, adapter); + }; + + return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, deserializer); +} + void 
SerializationTuple::deserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { deserializeTextJSONImpl(column, istr, settings); diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index 810673d8b21..4a55ea6eedf 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -81,6 +81,9 @@ private: template ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const; + template + ReturnType deserializeTextJSONTupleImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserialize_element) const; + template ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 59131f34697..c1723a65e6d 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -151,7 +151,6 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.json.try_infer_objects_as_tuples = settings.input_format_json_try_infer_named_tuples_from_objects; format_settings.json.throw_on_bad_escape_sequence = settings.input_format_json_throw_on_bad_escape_sequence; format_settings.json.ignore_unnecessary_fields = settings.input_format_json_ignore_unnecessary_fields; - format_settings.json.case_insensitive_column_matching = settings.input_format_json_case_insensitive_column_matching; format_settings.json.empty_as_default = settings.input_format_json_empty_as_default; format_settings.null_as_default = settings.input_format_null_as_default; format_settings.force_null_for_omitted_fields = settings.input_format_force_null_for_omitted_fields; diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 34f55ef00d8..4f0187b4d87 100644 --- 
a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -3,10 +3,7 @@ #include #include #include -#include -#include #include -#include #include #include #include @@ -311,8 +308,8 @@ namespace JSONUtils return true; }; - if (!format_settings.json.empty_as_default || in.eof() || *in.position() != EMPTY_STRING[0]) - return do_deserialize(column, in, [](ReadBuffer &) { return false; }, deserialize_impl); + if (!format_settings.json.empty_as_default || in.eof() || *in.position() != EMPTY_STRING[0]) + return deserialize_impl(column, in); if (in.available() >= EMPTY_STRING_LENGTH) { @@ -322,11 +319,8 @@ namespace JSONUtils auto * pos = buf_.position(); if (checkString(EMPTY_STRING, buf_)) return true; - else - { - buf_.position() = pos; - return false; - } + buf_.position() = pos; + return false; }; return do_deserialize(column, in, check_for_empty_string, deserialize_impl); @@ -343,22 +337,17 @@ namespace JSONUtils SCOPE_EXIT(peekable_buf.dropCheckpoint()); if (checkString(EMPTY_STRING, peekable_buf)) return true; - else - { - peekable_buf.rollbackToCheckpoint(); - return false; - } + peekable_buf.rollbackToCheckpoint(); + return false; }; auto deserialize_impl_with_check = [&deserialize_impl](IColumn & column_, ReadBuffer & buf_) -> bool { auto & peekable_buf = assert_cast(buf_); - - if (!deserialize_impl(column_, peekable_buf)) - return false; - if (likely(!peekable_buf.hasUnreadData())) - return true; - return false; + bool res = deserialize_impl(column_, peekable_buf); + if (unlikely(peekable_buf.hasUnreadData())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect state while parsing JSON: PeekableReadBuffer has unread data in own memory: {}", String(peekable_buf.position(), peekable_buf.available())); + return res; }; PeekableReadBuffer peekable_buf(in, true); diff --git a/src/Formats/JSONUtils.h b/src/Formats/JSONUtils.h index e2ac3467971..2800017bfed 100644 --- a/src/Formats/JSONUtils.h +++ b/src/Formats/JSONUtils.h @@ -2,11 +2,15 @@ #include #include 
+#include #include #include +#include #include +#include #include #include +#include #include #include @@ -16,6 +20,11 @@ namespace DB class Block; struct JSONInferenceInfo; +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + namespace JSONUtils { std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t max_rows); @@ -136,6 +145,106 @@ namespace JSONUtils bool skipUntilFieldInObject(ReadBuffer & in, const String & desired_field_name, const FormatSettings::JSON & settings); void skipTheRestOfObject(ReadBuffer & in, const FormatSettings::JSON & settings); + + template + ReturnType deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserializer) + { + static constexpr auto throw_exception = std::is_same_v; + + static constexpr auto EMPTY_STRING = "\"\""; + static constexpr auto EMPTY_STRING_LENGTH = std::string_view(EMPTY_STRING).length(); + + auto do_deserialize_nested = [](IColumn & nested_column, ReadBuffer & buf, auto && check_for_empty_string, auto && deserialize, const SerializationPtr & nested_column_serialization) -> ReturnType + { + if (check_for_empty_string(buf)) + { + nested_column.insertDefault(); + return ReturnType(true); + } + return deserialize(nested_column, buf, nested_column_serialization); + }; + + auto deserialize_nested_impl = [&settings](IColumn & nested_column, ReadBuffer & buf, const SerializationPtr & nested_column_serialization) -> ReturnType + { + if constexpr (throw_exception) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested_column_serialization); + else + nested_column_serialization->deserializeTextJSON(nested_column, buf, settings); + } + else + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return 
SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested_column_serialization); + return nested_column_serialization->tryDeserializeTextJSON(nested_column, buf, settings); + } + }; + + auto deserialize_nested = [&settings, &do_deserialize_nested, &deserialize_nested_impl](IColumn & nested_column, ReadBuffer & buf, const SerializationPtr & nested_column_serialization) -> ReturnType + { + if (!settings.json.empty_as_default || buf.eof() || *buf.position() != EMPTY_STRING[0]) + return deserialize_nested_impl(nested_column, buf, nested_column_serialization); + + if (buf.available() >= EMPTY_STRING_LENGTH) + { + /// We have enough data in buffer to check if we have an empty string. + auto check_for_empty_string = [](ReadBuffer & buf_) -> bool + { + auto * pos = buf_.position(); + if (checkString(EMPTY_STRING, buf_)) + return true; + buf_.position() = pos; + return false; + }; + + return do_deserialize_nested(nested_column, buf, check_for_empty_string, deserialize_nested_impl, nested_column_serialization); + } + + /// We don't have enough data in buffer to check if we have an empty string. + /// Use PeekableReadBuffer to make a checkpoint before checking for an + /// empty string and rollback if check was failed. 
+ + auto check_for_empty_string = [](ReadBuffer & buf_) -> bool + { + auto & peekable_buf = assert_cast(buf_); + peekable_buf.setCheckpoint(); + SCOPE_EXIT(peekable_buf.dropCheckpoint()); + if (checkString(EMPTY_STRING, peekable_buf)) + return true; + peekable_buf.rollbackToCheckpoint(); + return false; + }; + + auto deserialize_nested_impl_with_check = [&deserialize_nested_impl](IColumn & nested_column_, ReadBuffer & buf_, const SerializationPtr & nested_column_serialization_) -> ReturnType + { + auto & peekable_buf = assert_cast(buf_); + + auto enforceNoUnreadData = [&peekable_buf]() -> void + { + if (unlikely(peekable_buf.hasUnreadData())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect state while parsing JSON: PeekableReadBuffer has unread data in own memory: {}", String(peekable_buf.position(), peekable_buf.available())); + }; + + if constexpr (throw_exception) + { + deserialize_nested_impl(nested_column_, peekable_buf, nested_column_serialization_); + enforceNoUnreadData(); + } + else + { + bool res = deserialize_nested_impl(nested_column_, peekable_buf, nested_column_serialization_); + enforceNoUnreadData(); + return res; + } + }; + + PeekableReadBuffer peekable_buf(buf, true); + return do_deserialize_nested(nested_column, peekable_buf, check_for_empty_string, deserialize_nested_impl_with_check, nested_column_serialization); + }; + + return deserializer(column, istr, deserialize_nested); + } } } From a39a4b108099346b129cb162edbbf85456ca650c Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Wed, 7 Aug 2024 11:31:36 +0200 Subject: [PATCH 11/13] better --- .../Serializations/SerializationArray.cpp | 29 +++++++++++++--- .../Serializations/SerializationMap.cpp | 29 +++++++++++++--- .../Serializations/SerializationMap.h | 2 +- .../Serializations/SerializationTuple.cpp | 33 +++++++++++++++---- .../Serializations/SerializationTuple.h | 4 +-- src/Formats/JSONUtils.h | 4 +-- 6 files changed, 79 insertions(+), 22 deletions(-) diff --git 
a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index 5891a0a6e75..60f908a249b 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -618,7 +618,7 @@ void SerializationArray::serializeTextJSONPretty(const IColumn & column, size_t namespace { template -ReturnType deserializeTextJSONArrayImpl(IColumn & column, ReadBuffer & istr, const SerializationPtr & nested, const FormatSettings & settings) +ReturnType deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const SerializationPtr & nested, const FormatSettings & settings) { auto deserializer = [&nested](IColumn & column_, ReadBuffer & istr_, auto && deserialize_nested) -> ReturnType { @@ -626,21 +626,40 @@ ReturnType deserializeTextJSONArrayImpl(IColumn & column, ReadBuffer & istr, con { return deserialize_nested(nested_column, istr_, nested); }; - return deserializeTextImpl(column_, istr_, adapter, false); + return deserializeTextImpl(column_, istr_, std::move(adapter), false); }; - return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, deserializer); + return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, std::move(deserializer)); } } void SerializationArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - deserializeTextJSONArrayImpl(column, istr, nested, settings); + if (settings.json.empty_as_default) + deserializeEmpyStringAsDefaultOrNested(column, istr, nested, settings); + else + deserializeTextImpl(column, istr, + [&settings, &istr, this](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); + else + nested->deserializeTextJSON(nested_column, istr, settings); + }, false); } bool 
SerializationArray::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - return deserializeTextJSONArrayImpl(column, istr, nested, settings); + if (settings.json.empty_as_default) + return deserializeEmpyStringAsDefaultOrNested(column, istr, nested, settings); + + return deserializeTextImpl(column, istr, + [&settings, &istr, this](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); + return nested->tryDeserializeTextJSON(nested_column, istr, settings); + }, false); } diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index 0412a85ee44..96c21f19805 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -317,7 +317,7 @@ void SerializationMap::serializeTextJSONPretty(const IColumn & column, size_t ro } template -ReturnType SerializationMap::deserializeTextJSONMapImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +ReturnType SerializationMap::deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { auto deserializer = [this](IColumn & column_, ReadBuffer & istr_, auto && deserialize_nested) -> ReturnType { @@ -325,20 +325,39 @@ ReturnType SerializationMap::deserializeTextJSONMapImpl(IColumn & column, ReadBu { return deserialize_nested(nested_column, buf, nested_column_serialization); }; - return this->deserializeTextImpl(column_, istr_, adapter); + return this->deserializeTextImpl(column_, istr_, std::move(adapter)); }; - return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, deserializer); + return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, 
std::move(deserializer)); } void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - deserializeTextJSONMapImpl(column, istr, settings); + if (settings.json.empty_as_default) + deserializeEmpyStringAsDefaultOrNested(column, istr, settings); + else + deserializeTextImpl(column, istr, + [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); + else + subcolumn_serialization->deserializeTextJSON(subcolumn, buf, settings); + }); } bool SerializationMap::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - return deserializeTextJSONMapImpl(column, istr, settings); + if (settings.json.empty_as_default) + return deserializeEmpyStringAsDefaultOrNested(column, istr, settings); + + return deserializeTextImpl(column, istr, + [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); + return subcolumn_serialization->tryDeserializeTextJSON(subcolumn, buf, settings); + }); } void SerializationMap::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h index 9bdc110c445..ddf8047f061 100644 --- a/src/DataTypes/Serializations/SerializationMap.h +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -76,7 +76,7 @@ private: ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, 
Reader && reader) const; template - ReturnType deserializeTextJSONMapImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + ReturnType deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; }; } diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 459677d40cd..f8cb894c19b 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -314,7 +314,7 @@ void SerializationTuple::serializeTextJSONPretty(const IColumn & column, size_t } template -ReturnType SerializationTuple::deserializeTextJSONTupleImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserialize_element) const +ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserialize_element) const { static constexpr auto throw_exception = std::is_same_v; @@ -490,29 +490,48 @@ ReturnType SerializationTuple::deserializeTextJSONTupleImpl(IColumn & column, Re } template -ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +ReturnType SerializationTuple::deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { auto deserializer = [&settings, this](IColumn & column_, ReadBuffer & istr_, auto && deserialize_nested) -> ReturnType { auto adapter = [&deserialize_nested, &istr_, this](IColumn & nested_column, size_t element_pos) -> ReturnType { - return deserialize_nested(nested_column, istr_, this->elems[element_pos]); + return deserialize_nested(nested_column, istr_, elems[element_pos]); }; - return this->deserializeTextJSONTupleImpl(column_, istr_, settings, adapter); + return deserializeTextJSONImpl(column_, istr_, settings, std::move(adapter)); }; - 
return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, deserializer); + return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, std::move(deserializer)); } void SerializationTuple::deserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - deserializeTextJSONImpl(column, istr, settings); + if (settings.json.empty_as_default) + deserializeEmpyStringAsDefaultOrNested(column, istr, settings); + else + deserializeTextJSONImpl(column, istr, settings, + [&settings, &istr, this](IColumn & nested_column, size_t element_pos) -> void + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, elems[element_pos]); + else + elems[element_pos]->deserializeTextJSON(nested_column, istr, settings); + }); } bool SerializationTuple::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - return deserializeTextJSONImpl(column, istr, settings); + if (settings.json.empty_as_default) + return deserializeEmpyStringAsDefaultOrNested(column, istr, settings); + + return deserializeTextJSONImpl(column, istr, settings, + [&settings, &istr, this](IColumn & nested_column, size_t element_pos) -> bool + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, elems[element_pos]); + return elems[element_pos]->tryDeserializeTextJSON(nested_column, istr, settings); + }); } diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index 4a55ea6eedf..54084617d3b 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -82,10 +82,10 @@ private: 
ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const; template - ReturnType deserializeTextJSONTupleImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserialize_element) const; + ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserialize_element) const; template - ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + ReturnType deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; template ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; diff --git a/src/Formats/JSONUtils.h b/src/Formats/JSONUtils.h index 2800017bfed..a8e7113388a 100644 --- a/src/Formats/JSONUtils.h +++ b/src/Formats/JSONUtils.h @@ -181,9 +181,9 @@ namespace JSONUtils } }; - auto deserialize_nested = [&settings, &do_deserialize_nested, &deserialize_nested_impl](IColumn & nested_column, ReadBuffer & buf, const SerializationPtr & nested_column_serialization) -> ReturnType + auto deserialize_nested = [&do_deserialize_nested, &deserialize_nested_impl](IColumn & nested_column, ReadBuffer & buf, const SerializationPtr & nested_column_serialization) -> ReturnType { - if (!settings.json.empty_as_default || buf.eof() || *buf.position() != EMPTY_STRING[0]) + if (buf.eof() || *buf.position() != EMPTY_STRING[0]) return deserialize_nested_impl(nested_column, buf, nested_column_serialization); if (buf.available() >= EMPTY_STRING_LENGTH) From 29bc7cf5d58264e52fa90a83206c9508b208f93a Mon Sep 17 00:00:00 2001 From: Alexis Arnaud Date: Tue, 13 Aug 2024 14:50:37 +0200 Subject: [PATCH 12/13] post-review changes --- src/Core/SettingsChangesHistory.cpp | 1 - .../Serializations/SerializationArray.cpp | 61 ++++---- .../Serializations/SerializationArray.h | 4 + 
.../Serializations/SerializationMap.cpp | 73 ++++----- .../Serializations/SerializationMap.h | 2 +- .../Serializations/SerializationTuple.cpp | 68 +++++---- .../Serializations/SerializationTuple.h | 6 +- src/Formats/JSONUtils.cpp | 143 ++++++++++-------- src/Formats/JSONUtils.h | 112 +------------- ... => 03222_json_empty_as_default.reference} | 0 ...lt.sql => 03222_json_empty_as_default.sql} | 0 ...pty_as_default_small_read_buffer.reference | 8 + ...json_empty_as_default_small_read_buffer.sh | 31 ++++ 13 files changed, 241 insertions(+), 268 deletions(-) rename tests/queries/0_stateless/{03215_json_empty_as_default.reference => 03222_json_empty_as_default.reference} (100%) rename tests/queries/0_stateless/{03215_json_empty_as_default.sql => 03222_json_empty_as_default.sql} (100%) create mode 100644 tests/queries/0_stateless/03222_json_empty_as_default_small_read_buffer.reference create mode 100755 tests/queries/0_stateless/03222_json_empty_as_default_small_read_buffer.sh diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index e385f72fb38..ae08566ebd2 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -88,7 +88,6 @@ static std::initializer_list -ReturnType deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const SerializationPtr & nested, const FormatSettings & settings) +ReturnType SerializationArray::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - auto deserializer = [&nested](IColumn & column_, ReadBuffer & istr_, auto && deserialize_nested) -> ReturnType + auto deserialize_nested = [&settings, this](IColumn & nested_column, ReadBuffer & buf) -> ReturnType { - auto adapter = [&deserialize_nested, &istr_, &nested](IColumn & nested_column) -> ReturnType + if constexpr (std::is_same_v) { - return deserialize_nested(nested_column, istr_, nested); - }; - return deserializeTextImpl(column_, istr_, 
std::move(adapter), false); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested); + else + nested->deserializeTextJSON(nested_column, buf, settings); + } + else + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested); + return nested->tryDeserializeTextJSON(nested_column, buf, settings); + } }; - return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, std::move(deserializer)); -} + if (settings.json.empty_as_default) + return deserializeTextImpl(column, istr, + [&deserialize_nested, &istr](IColumn & nested_column) -> ReturnType + { + return JSONUtils::deserializeEmpyStringAsDefaultOrNested(nested_column, istr, deserialize_nested); + }, false); + else + return deserializeTextImpl(column, istr, + [&deserialize_nested, &istr](IColumn & nested_column) -> ReturnType + { + return deserialize_nested(nested_column, istr); + }, false); } + void SerializationArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - if (settings.json.empty_as_default) - deserializeEmpyStringAsDefaultOrNested(column, istr, nested, settings); - else - deserializeTextImpl(column, istr, - [&settings, &istr, this](IColumn & nested_column) - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) - SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); - else - nested->deserializeTextJSON(nested_column, istr, settings); - }, false); + deserializeTextJSONImpl(column, istr, settings); } bool SerializationArray::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - if (settings.json.empty_as_default) - 
return deserializeEmpyStringAsDefaultOrNested(column, istr, nested, settings); - - return deserializeTextImpl(column, istr, - [&settings, &istr, this](IColumn & nested_column) - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) - return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); - return nested->tryDeserializeTextJSON(nested_column, istr, settings); - }, false); + return deserializeTextJSONImpl(column, istr, settings); } diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index c3353f0c251..7e34abfac90 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -82,6 +82,10 @@ public: SerializationPtr create(const SerializationPtr & prev) const override; ColumnPtr create(const ColumnPtr & prev) const override; }; + +private: + template + ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; }; } diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index 96c21f19805..ae864cbf7b4 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -316,48 +316,53 @@ void SerializationMap::serializeTextJSONPretty(const IColumn & column, size_t ro writeChar('}', ostr); } + template -ReturnType SerializationMap::deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +ReturnType SerializationMap::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - auto deserializer = [this](IColumn & column_, ReadBuffer & istr_, auto && deserialize_nested) -> ReturnType + auto deserialize_nested = [&settings](IColumn & subcolumn, ReadBuffer & buf, const SerializationPtr & 
subcolumn_serialization) -> ReturnType { - auto adapter = [&deserialize_nested](ReadBuffer & buf, const SerializationPtr & nested_column_serialization, IColumn & nested_column) -> ReturnType + if constexpr (std::is_same_v) { - return deserialize_nested(nested_column, buf, nested_column_serialization); - }; - return this->deserializeTextImpl(column_, istr_, std::move(adapter)); - }; - - return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, std::move(deserializer)); -} - -void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (settings.json.empty_as_default) - deserializeEmpyStringAsDefaultOrNested(column, istr, settings); - else - deserializeTextImpl(column, istr, - [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) - SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); - else - subcolumn_serialization->deserializeTextJSON(subcolumn, buf, settings); - }); -} - -bool SerializationMap::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (settings.json.empty_as_default) - return deserializeEmpyStringAsDefaultOrNested(column, istr, settings); - - return deserializeTextImpl(column, istr, - [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); + else + subcolumn_serialization->deserializeTextJSON(subcolumn, buf, settings); + } + else { if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) return 
SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); return subcolumn_serialization->tryDeserializeTextJSON(subcolumn, buf, settings); - }); + } + }; + + if (settings.json.empty_as_default) + return deserializeTextImpl(column, istr, + [&deserialize_nested](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) -> ReturnType + { + return JSONUtils::deserializeEmpyStringAsDefaultOrNested(subcolumn, buf, + [&deserialize_nested, &subcolumn_serialization](IColumn & subcolumn_, ReadBuffer & buf_) -> ReturnType + { + return deserialize_nested(subcolumn_, buf_, subcolumn_serialization); + }); + }); + else + return deserializeTextImpl(column, istr, + [&deserialize_nested](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) -> ReturnType + { + return deserialize_nested(subcolumn, buf, subcolumn_serialization); + }); +} + +void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextJSONImpl(column, istr, settings); +} + +bool SerializationMap::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return deserializeTextJSONImpl(column, istr, settings); } void SerializationMap::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h index ddf8047f061..007d153ec7e 100644 --- a/src/DataTypes/Serializations/SerializationMap.h +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -76,7 +76,7 @@ private: ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const; template - ReturnType deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + ReturnType 
deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; }; } diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index f8cb894c19b..e1fcb1a8d48 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -314,7 +314,7 @@ void SerializationTuple::serializeTextJSONPretty(const IColumn & column, size_t } template -ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserialize_element) const +ReturnType SerializationTuple::deserializeTupleJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserialize_element) const { static constexpr auto throw_exception = std::is_same_v; @@ -490,48 +490,52 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf } template -ReturnType SerializationTuple::deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - auto deserializer = [&settings, this](IColumn & column_, ReadBuffer & istr_, auto && deserialize_nested) -> ReturnType + auto deserialize_nested = [&settings](IColumn & nested_column, ReadBuffer & buf, const SerializationPtr & nested_column_serialization) -> ReturnType { - auto adapter = [&deserialize_nested, &istr_, this](IColumn & nested_column, size_t element_pos) -> ReturnType + if constexpr (std::is_same_v) { - return deserialize_nested(nested_column, istr_, elems[element_pos]); - }; - - return deserializeTextJSONImpl(column_, istr_, settings, std::move(adapter)); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + 
SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested_column_serialization); + else + nested_column_serialization->deserializeTextJSON(nested_column, buf, settings); + } + else + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested_column_serialization); + else + return nested_column_serialization->tryDeserializeTextJSON(nested_column, buf, settings); + } }; - return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, istr, settings, std::move(deserializer)); -} - -void SerializationTuple::deserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const -{ if (settings.json.empty_as_default) - deserializeEmpyStringAsDefaultOrNested(column, istr, settings); + return deserializeTupleJSONImpl(column, istr, settings, + [&deserialize_nested, &istr, this](IColumn & nested_column, size_t element_pos) -> ReturnType + { + return JSONUtils::deserializeEmpyStringAsDefaultOrNested(nested_column, istr, + [&deserialize_nested, element_pos, this](IColumn & nested_column_, ReadBuffer & buf) -> ReturnType + { + return deserialize_nested(nested_column_, buf, elems[element_pos]); + }); + }); else - deserializeTextJSONImpl(column, istr, settings, - [&settings, &istr, this](IColumn & nested_column, size_t element_pos) -> void - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) - SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, elems[element_pos]); - else - elems[element_pos]->deserializeTextJSON(nested_column, istr, settings); - }); + return deserializeTupleJSONImpl(column, istr, settings, + [&deserialize_nested, &istr, this](IColumn & nested_column, size_t element_pos) -> ReturnType + { + return deserialize_nested(nested_column, istr, 
elems[element_pos]); + }); } -bool SerializationTuple::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - if (settings.json.empty_as_default) - return deserializeEmpyStringAsDefaultOrNested(column, istr, settings); + deserializeTextJSONImpl(column, istr, settings); +} - return deserializeTextJSONImpl(column, istr, settings, - [&settings, &istr, this](IColumn & nested_column, size_t element_pos) -> bool - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) - return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, elems[element_pos]); - return elems[element_pos]->tryDeserializeTextJSON(nested_column, istr, settings); - }); +bool SerializationTuple::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return deserializeTextJSONImpl(column, istr, settings); } diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index 54084617d3b..c51adb6e536 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -82,10 +82,10 @@ private: ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const; template - ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserialize_element) const; + ReturnType deserializeTupleJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserialize_element) const; - template - ReturnType deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + template + ReturnType deserializeTextJSONImpl(IColumn 
& column, ReadBuffer & istr, const FormatSettings & settings) const; template ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 4f0187b4d87..d85c9898074 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -2,11 +2,14 @@ #include #include #include +#include #include #include +#include #include #include #include +#include #include @@ -267,9 +270,6 @@ namespace JSONUtils const FormatSettings & format_settings, bool yield_strings) { - static constexpr auto EMPTY_STRING = "\"\""; - static constexpr auto EMPTY_STRING_LENGTH = std::string_view(EMPTY_STRING).length(); - try { bool as_nullable = format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type); @@ -288,70 +288,19 @@ namespace JSONUtils return true; } - auto do_deserialize = [](IColumn & column_, ReadBuffer & buf_, auto && check_for_empty_string, auto && deserialize) -> bool - { - if (check_for_empty_string(buf_)) - { - column_.insertDefault(); - return false; - } - else - return deserialize(column_, buf_); - }; - - auto deserialize_impl = [as_nullable, &format_settings, &serialization](IColumn & column_, ReadBuffer & buf_) -> bool + auto deserialize = [as_nullable, &format_settings, &serialization](IColumn & column_, ReadBuffer & buf) -> bool { if (as_nullable) - return SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column_, buf_, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column_, buf, format_settings, serialization); - serialization->deserializeTextJSON(column_, buf_, format_settings); + serialization->deserializeTextJSON(column_, buf, format_settings); return true; }; - if (!format_settings.json.empty_as_default || in.eof() || *in.position() != EMPTY_STRING[0]) - return deserialize_impl(column, in); - - if (in.available() >= EMPTY_STRING_LENGTH) - { - /// We 
have enough data in buffer to check if we have an empty string. - auto check_for_empty_string = [](ReadBuffer & buf_) - { - auto * pos = buf_.position(); - if (checkString(EMPTY_STRING, buf_)) - return true; - buf_.position() = pos; - return false; - }; - - return do_deserialize(column, in, check_for_empty_string, deserialize_impl); - } - - /// We don't have enough data in buffer to check if we have an empty string. - /// Use PeekableReadBuffer to make a checkpoint before checking for an - /// empty string and rollback if check was failed. - - auto check_for_empty_string = [](ReadBuffer & buf_) -> bool - { - auto & peekable_buf = assert_cast(buf_); - peekable_buf.setCheckpoint(); - SCOPE_EXIT(peekable_buf.dropCheckpoint()); - if (checkString(EMPTY_STRING, peekable_buf)) - return true; - peekable_buf.rollbackToCheckpoint(); - return false; - }; - - auto deserialize_impl_with_check = [&deserialize_impl](IColumn & column_, ReadBuffer & buf_) -> bool - { - auto & peekable_buf = assert_cast(buf_); - bool res = deserialize_impl(column_, peekable_buf); - if (unlikely(peekable_buf.hasUnreadData())) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect state while parsing JSON: PeekableReadBuffer has unread data in own memory: {}", String(peekable_buf.position(), peekable_buf.available())); - return res; - }; - - PeekableReadBuffer peekable_buf(in, true); - return do_deserialize(column, peekable_buf, check_for_empty_string, deserialize_impl_with_check); + if (format_settings.json.empty_as_default) + return JSONUtils::deserializeEmpyStringAsDefaultOrNested(column, in, deserialize); + else + return deserialize(column, in); } catch (Exception & e) { @@ -915,6 +864,78 @@ namespace JSONUtils } } + template + ReturnType deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const NestedDeserialize & deserialize_nested) + { + static constexpr auto throw_exception = std::is_same_v; + + static constexpr auto EMPTY_STRING = "\"\""; + static constexpr auto 
EMPTY_STRING_LENGTH = std::string_view(EMPTY_STRING).length(); + + if (istr.eof() || *istr.position() != EMPTY_STRING[0]) + return deserialize_nested(column, istr); + + auto do_deserialize = [](IColumn & column_, ReadBuffer & buf, auto && check_for_empty_string, auto && deserialize) -> ReturnType + { + if (check_for_empty_string(buf)) + { + column_.insertDefault(); + return ReturnType(default_column_return_value); + } + return deserialize(column_, buf); + }; + + if (istr.available() >= EMPTY_STRING_LENGTH) + { + /// We have enough data in buffer to check if we have an empty string. + auto check_for_empty_string = [](ReadBuffer & buf) -> bool + { + auto * pos = buf.position(); + if (checkString(EMPTY_STRING, buf)) + return true; + buf.position() = pos; + return false; + }; + + return do_deserialize(column, istr, check_for_empty_string, deserialize_nested); + } + + /// We don't have enough data in buffer to check if we have an empty string. + /// Use PeekableReadBuffer to make a checkpoint before checking for an + /// empty string and roll back if the check fails. 
+ + auto check_for_empty_string = [](ReadBuffer & buf) -> bool + { + auto & peekable_buf = assert_cast(buf); + peekable_buf.setCheckpoint(); + SCOPE_EXIT(peekable_buf.dropCheckpoint()); + if (checkString(EMPTY_STRING, peekable_buf)) + return true; + peekable_buf.rollbackToCheckpoint(); + return false; + }; + + auto deserialize_nested_with_check = [&deserialize_nested](IColumn & column_, ReadBuffer & buf) -> ReturnType + { + auto & peekable_buf = assert_cast(buf); + if constexpr (throw_exception) + deserialize_nested(column_, peekable_buf); + else if (!deserialize_nested(column_, peekable_buf)) + return ReturnType(false); + + if (unlikely(peekable_buf.hasUnreadData())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect state while parsing JSON: PeekableReadBuffer has unread data in own memory: {}", String(peekable_buf.position(), peekable_buf.available())); + + return ReturnType(true); + }; + + PeekableReadBuffer peekable_buf(istr, true); + return do_deserialize(column, peekable_buf, check_for_empty_string, deserialize_nested_with_check); + } + + template void deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const NestedDeserialize & deserialize_nested); + template bool deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const NestedDeserialize & deserialize_nested); + template bool deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const NestedDeserialize & deserialize_nested); } } diff --git a/src/Formats/JSONUtils.h b/src/Formats/JSONUtils.h index a8e7113388a..50e182c7d27 100644 --- a/src/Formats/JSONUtils.h +++ b/src/Formats/JSONUtils.h @@ -2,16 +2,13 @@ #include #include -#include #include #include -#include #include -#include #include #include -#include #include +#include #include namespace DB @@ -20,11 +17,6 @@ namespace DB class Block; struct JSONInferenceInfo; -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - namespace JSONUtils { std::pair 
fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t max_rows); @@ -147,104 +139,14 @@ namespace JSONUtils void skipTheRestOfObject(ReadBuffer & in, const FormatSettings::JSON & settings); template - ReturnType deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, auto && deserializer) - { - static constexpr auto throw_exception = std::is_same_v; + using NestedDeserialize = std::function; - static constexpr auto EMPTY_STRING = "\"\""; - static constexpr auto EMPTY_STRING_LENGTH = std::string_view(EMPTY_STRING).length(); + template + ReturnType deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const NestedDeserialize & deserialize_nested); - auto do_deserialize_nested = [](IColumn & nested_column, ReadBuffer & buf, auto && check_for_empty_string, auto && deserialize, const SerializationPtr & nested_column_serialization) -> ReturnType - { - if (check_for_empty_string(buf)) - { - nested_column.insertDefault(); - return ReturnType(true); - } - return deserialize(nested_column, buf, nested_column_serialization); - }; - - auto deserialize_nested_impl = [&settings](IColumn & nested_column, ReadBuffer & buf, const SerializationPtr & nested_column_serialization) -> ReturnType - { - if constexpr (throw_exception) - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) - SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested_column_serialization); - else - nested_column_serialization->deserializeTextJSON(nested_column, buf, settings); - } - else - { - if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) - return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, buf, settings, nested_column_serialization); - return nested_column_serialization->tryDeserializeTextJSON(nested_column, buf, settings); - } 
- }; - - auto deserialize_nested = [&do_deserialize_nested, &deserialize_nested_impl](IColumn & nested_column, ReadBuffer & buf, const SerializationPtr & nested_column_serialization) -> ReturnType - { - if (buf.eof() || *buf.position() != EMPTY_STRING[0]) - return deserialize_nested_impl(nested_column, buf, nested_column_serialization); - - if (buf.available() >= EMPTY_STRING_LENGTH) - { - /// We have enough data in buffer to check if we have an empty string. - auto check_for_empty_string = [](ReadBuffer & buf_) -> bool - { - auto * pos = buf_.position(); - if (checkString(EMPTY_STRING, buf_)) - return true; - buf_.position() = pos; - return false; - }; - - return do_deserialize_nested(nested_column, buf, check_for_empty_string, deserialize_nested_impl, nested_column_serialization); - } - - /// We don't have enough data in buffer to check if we have an empty string. - /// Use PeekableReadBuffer to make a checkpoint before checking for an - /// empty string and rollback if check was failed. 
- - auto check_for_empty_string = [](ReadBuffer & buf_) -> bool - { - auto & peekable_buf = assert_cast(buf_); - peekable_buf.setCheckpoint(); - SCOPE_EXIT(peekable_buf.dropCheckpoint()); - if (checkString(EMPTY_STRING, peekable_buf)) - return true; - peekable_buf.rollbackToCheckpoint(); - return false; - }; - - auto deserialize_nested_impl_with_check = [&deserialize_nested_impl](IColumn & nested_column_, ReadBuffer & buf_, const SerializationPtr & nested_column_serialization_) -> ReturnType - { - auto & peekable_buf = assert_cast(buf_); - - auto enforceNoUnreadData = [&peekable_buf]() -> void - { - if (unlikely(peekable_buf.hasUnreadData())) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect state while parsing JSON: PeekableReadBuffer has unread data in own memory: {}", String(peekable_buf.position(), peekable_buf.available())); - }; - - if constexpr (throw_exception) - { - deserialize_nested_impl(nested_column_, peekable_buf, nested_column_serialization_); - enforceNoUnreadData(); - } - else - { - bool res = deserialize_nested_impl(nested_column_, peekable_buf, nested_column_serialization_); - enforceNoUnreadData(); - return res; - } - }; - - PeekableReadBuffer peekable_buf(buf, true); - return do_deserialize_nested(nested_column, peekable_buf, check_for_empty_string, deserialize_nested_impl_with_check, nested_column_serialization); - }; - - return deserializer(column, istr, deserialize_nested); - } + extern template void deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const NestedDeserialize & deserialize_nested); + extern template bool deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const NestedDeserialize & deserialize_nested); + extern template bool deserializeEmpyStringAsDefaultOrNested(IColumn & column, ReadBuffer & istr, const NestedDeserialize & deserialize_nested); } } diff --git a/tests/queries/0_stateless/03215_json_empty_as_default.reference 
b/tests/queries/0_stateless/03222_json_empty_as_default.reference similarity index 100% rename from tests/queries/0_stateless/03215_json_empty_as_default.reference rename to tests/queries/0_stateless/03222_json_empty_as_default.reference diff --git a/tests/queries/0_stateless/03215_json_empty_as_default.sql b/tests/queries/0_stateless/03222_json_empty_as_default.sql similarity index 100% rename from tests/queries/0_stateless/03215_json_empty_as_default.sql rename to tests/queries/0_stateless/03222_json_empty_as_default.sql diff --git a/tests/queries/0_stateless/03222_json_empty_as_default_small_read_buffer.reference b/tests/queries/0_stateless/03222_json_empty_as_default_small_read_buffer.reference new file mode 100644 index 00000000000..8176d7895d8 --- /dev/null +++ b/tests/queries/0_stateless/03222_json_empty_as_default_small_read_buffer.reference @@ -0,0 +1,8 @@ +Array(UUID) +{"x":["00000000-0000-0000-0000-000000000000","b15f852c-c41a-4fd6-9247-1929c841715e","00000000-0000-0000-0000-000000000000"]} +{"x":["00000000-0000-0000-0000-000000000000","b15f852c-c41a-4fd6-9247-1929c841715e","00000000-0000-0000-0000-000000000000"]} +{"x":["00000000-0000-0000-0000-000000000000","b15f852c-c41a-4fd6-9247-1929c841715e","00000000-0000-0000-0000-000000000000"]} +Tuple(Array(UUID), Tuple(UUID, Map(String, IPv6))) +{"x":[["00000000-0000-0000-0000-000000000000"],["00000000-0000-0000-0000-000000000000",{"abc":"::"}]]} +{"x":[["00000000-0000-0000-0000-000000000000"],["00000000-0000-0000-0000-000000000000",{"abc":"::"}]]} +{"x":[["00000000-0000-0000-0000-000000000000"],["00000000-0000-0000-0000-000000000000",{"abc":"::"}]]} diff --git a/tests/queries/0_stateless/03222_json_empty_as_default_small_read_buffer.sh b/tests/queries/0_stateless/03222_json_empty_as_default_small_read_buffer.sh new file mode 100755 index 00000000000..6b69fb2e9dc --- /dev/null +++ b/tests/queries/0_stateless/03222_json_empty_as_default_small_read_buffer.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Tags: 
no-parallel + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.json + +# Wrapper for clickhouse-local to always output in JSONEachRow format, that +# way format settings will not affect output. +function clickhouse_local() +{ + $CLICKHOUSE_LOCAL --output-format JSONEachRow "$@" +} + +echo 'Array(UUID)' +echo '{"x":["00000000-0000-0000-0000-000000000000","b15f852c-c41a-4fd6-9247-1929c841715e",""]}' > $DATA_FILE +# Use increasingly smaller read buffers. +clickhouse_local -q "SELECT x FROM file('$DATA_FILE', 'JSONEachRow', 'x Array(UUID)') SETTINGS input_format_json_empty_as_default=1, input_format_parallel_parsing=0, storage_file_read_method='read', max_read_buffer_size=4" +clickhouse_local -q "SELECT x FROM file('$DATA_FILE', 'JSONEachRow', 'x Array(UUID)') SETTINGS input_format_json_empty_as_default=1, input_format_parallel_parsing=0, storage_file_read_method='read', max_read_buffer_size=2" +clickhouse_local -q "SELECT x FROM file('$DATA_FILE', 'JSONEachRow', 'x Array(UUID)') SETTINGS input_format_json_empty_as_default=1, input_format_parallel_parsing=0, storage_file_read_method='read', max_read_buffer_size=1" + +echo 'Tuple(Array(UUID), Tuple(UUID, Map(String, IPv6)))' +echo '{"x":[[""], ["",{"abc":""}]]}' > $DATA_FILE +# Use increasingly smaller read buffers. 
+clickhouse_local -q "SELECT x FROM file('$DATA_FILE', 'JSONEachRow', 'x Tuple(Array(UUID), Tuple(UUID, Map(String, IPv6)))') SETTINGS input_format_json_empty_as_default=1, input_format_parallel_parsing=0, storage_file_read_method='read', max_read_buffer_size=16" +clickhouse_local -q "SELECT x FROM file('$DATA_FILE', 'JSONEachRow', 'x Tuple(Array(UUID), Tuple(UUID, Map(String, IPv6)))') SETTINGS input_format_json_empty_as_default=1, input_format_parallel_parsing=0, storage_file_read_method='read', max_read_buffer_size=8" +clickhouse_local -q "SELECT x FROM file('$DATA_FILE', 'JSONEachRow', 'x Tuple(Array(UUID), Tuple(UUID, Map(String, IPv6)))') SETTINGS input_format_json_empty_as_default=1, input_format_parallel_parsing=0, storage_file_read_method='read', max_read_buffer_size=1" + +rm $DATA_FILE From 2c0ddd10a0c711a1b65895f171b2e570eccef3ba Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:49:48 +0200 Subject: [PATCH 13/13] Fix build --- src/Core/SettingsChangesHistory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index b4ba5202aa7..54852a37318 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -71,7 +71,7 @@ static std::initializer_list