From 22da93e239ffd4402ba27aee4c982742082cc9fc Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 23 Mar 2023 21:41:01 +0000 Subject: [PATCH 1/7] Cosmetics --- src/Functions/formatDateTime.cpp | 99 +++++++++++++++----------------- src/Functions/parseDateTime.cpp | 26 ++++----- 2 files changed, 58 insertions(+), 67 deletions(-) diff --git a/src/Functions/formatDateTime.cpp b/src/Functions/formatDateTime.cpp index bbb4c3ba5b0..daea8b3a7b0 100644 --- a/src/Functions/formatDateTime.cpp +++ b/src/Functions/formatDateTime.cpp @@ -39,21 +39,17 @@ namespace ErrorCodes namespace { -struct FormatDateTimeTraits +enum class SupportInteger { - enum class SupportInteger - { - Yes, - No - }; - - enum class FormatSyntax - { - MySQL, - Joda - }; + Yes, + No }; +enum class FormatSyntax +{ + MySQL, + Joda +}; template struct InstructionValueTypeMap {}; template <> struct InstructionValueTypeMap { using InstructionValueType = UInt32; }; @@ -85,11 +81,9 @@ constexpr std::string_view weekdaysFull[] = {"Sunday", "Monday", "Tuesday", "Wed constexpr std::string_view weekdaysShort[] = {"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"}; -constexpr std::string_view monthsFull[] - = {"January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"}; +constexpr std::string_view monthsFull[] = {"January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"}; -constexpr std::string_view monthsShort[] - = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}; +constexpr std::string_view monthsShort[] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}; /** formatDateTime(time, 'format') * Performs formatting of time, according to provided format. @@ -129,7 +123,7 @@ constexpr std::string_view monthsShort[] * * PS. We can make this function to return FixedString. Currently it returns String. */ -template +template class FunctionFormatDateTimeImpl : public IFunction { private: @@ -157,7 +151,7 @@ private: /// This is the reason why we use raw function pointer in MySQL format and std::function /// in Joda format. using Func = std::conditional_t< - format_syntax == FormatDateTimeTraits::FormatSyntax::MySQL, + format_syntax == FormatSyntax::MySQL, size_t (*)(char *, Time, UInt64, UInt32, const DateLUTImpl &), std::function>; @@ -257,7 +251,10 @@ private: return pos; } public: - static size_t mysqlNoop(char *, Time, UInt64, UInt32, const DateLUTImpl &) { return 0; } + static size_t mysqlNoop(char *, Time, UInt64, UInt32, const DateLUTImpl &) + { + return 0; + } static size_t mysqlCentury(char * dest, Time source, UInt64, UInt32, const DateLUTImpl & timezone) { @@ -430,8 +427,7 @@ private: return writeNumber2(dest, ToSecondImpl::execute(source, timezone)); } - static size_t - mysqlFractionalSecond(char * dest, Time /*source*/, UInt64 fractional_second, UInt32 scale, const DateLUTImpl & /*timezone*/) + static size_t mysqlFractionalSecond(char * dest, Time /*source*/, UInt64 fractional_second, UInt32 scale, const DateLUTImpl & /*timezone*/) { if (scale == 0) scale = 1; @@ -672,7 +668,7 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if constexpr (support_integer == FormatDateTimeTraits::SupportInteger::Yes) + if constexpr (support_integer == SupportInteger::Yes) { if (arguments.size() != 1 && arguments.size() != 2 && arguments.size() != 3) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, @@ -718,7 +714,7 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, [[maybe_unused]] size_t input_rows_count) const override { ColumnPtr res; - if constexpr (support_integer == FormatDateTimeTraits::SupportInteger::Yes) + if constexpr (support_integer == SupportInteger::Yes) { if (arguments.size() == 1) { @@ -793,7 +789,7 @@ public: using T = typename InstructionValueTypeMap::InstructionValueType; std::vector> instructions; String out_template; - auto result_size = parseFormat(format, instructions, scale, out_template); + size_t out_template_size = parseFormat(format, instructions, scale, out_template); const DateLUTImpl * time_zone_tmp = nullptr; if (castType(arguments[0].type.get(), [&]([[maybe_unused]] const auto & type) { return true; })) @@ -807,26 +803,26 @@ public: const auto & vec = times->getData(); auto col_res = ColumnString::create(); - auto & dst_data = col_res->getChars(); - auto & dst_offsets = col_res->getOffsets(); - dst_data.resize(vec.size() * (result_size + 1)); - dst_offsets.resize(vec.size()); + auto & res_data = col_res->getChars(); + auto & res_offsets = col_res->getOffsets(); + res_data.resize(vec.size() * (out_template_size + 1)); + res_offsets.resize(vec.size()); - if constexpr (format_syntax == FormatDateTimeTraits::FormatSyntax::MySQL) + if constexpr (format_syntax == FormatSyntax::MySQL) { - /// Fill result with literals. + /// Fill result with template. { - UInt8 * begin = dst_data.data(); - UInt8 * end = begin + dst_data.size(); - UInt8 * pos = begin; + const UInt8 * const begin = res_data.data(); + const UInt8 * const end = res_data.data() + res_data.size(); + UInt8 * pos = res_data.data(); if (pos < end) { - memcpy(pos, out_template.data(), result_size + 1); /// With zero terminator. - pos += result_size + 1; + memcpy(pos, out_template.data(), out_template_size + 1); /// With zero terminator. mystring[mystring.size()] = '\0' is guaranteed since C++11. + pos += out_template_size + 1; } - /// Fill by copying exponential growing ranges. + /// Copy exponentially growing ranges. while (pos < end) { size_t bytes_to_copy = std::min(pos - begin, end - pos); @@ -836,7 +832,7 @@ public: } } - auto * begin = reinterpret_cast(dst_data.data()); + auto * begin = reinterpret_cast(res_data.data()); auto * pos = begin; for (size_t i = 0; i < vec.size(); ++i) { @@ -844,9 +840,7 @@ public: { const auto c = DecimalUtils::split(vec[i], scale); for (auto & instruction : instructions) - { instruction.perform(pos, static_cast(c.whole), c.fractional, scale, time_zone); - } } else { @@ -855,21 +849,19 @@ public: } *pos++ = '\0'; - dst_offsets[i] = pos - begin; + res_offsets[i] = pos - begin; } - dst_data.resize(pos - begin); + res_data.resize(pos - begin); return col_res; } template size_t parseFormat(const String & format, std::vector> & instructions, UInt32 scale, String & out_template) const { - static_assert( - format_syntax == FormatDateTimeTraits::FormatSyntax::MySQL || format_syntax == FormatDateTimeTraits::FormatSyntax::Joda, - "format syntax must be one of MySQL or Joda"); + static_assert(format_syntax == FormatSyntax::MySQL || format_syntax == FormatSyntax::Joda); - if constexpr (format_syntax == FormatDateTimeTraits::FormatSyntax::MySQL) + if constexpr (format_syntax == FormatSyntax::MySQL) return parseMySQLFormat(format, instructions, scale, out_template); else return parseJodaFormat(format, instructions, scale, out_template); @@ -914,13 +906,13 @@ public: switch (*pos) { - // Abbreviated weekday [Mon...Sun] + // Abbreviated weekday [Mon-Sun] case 'a': instructions.emplace_back(&Instruction::mysqlDayOfWeekTextShort); out_template += "Mon"; break; - // Abbreviated month [Jan...Dec] + // Abbreviated month [Jan-Dec] case 'b': instructions.emplace_back(&Instruction::mysqlMonthOfYearTextShort); out_template += "Jan"; @@ -958,12 +950,10 @@ public: // Fractional seconds case 'f': - { /// If the time data type has no fractional part, then we print '0' as the fractional part. instructions.emplace_back(&Instruction::mysqlFractionalSecond); out_template += String(std::max(1, scale), '0'); break; - } // Short YYYY-MM-DD date, equivalent to %Y-%m-%d 2001-08-23 case 'F': @@ -1013,7 +1003,7 @@ public: out_template += "0"; break; - // Full weekday [Monday...Sunday] + // Full weekday [Monday-Sunday] case 'W': instructions.emplace_back(&Instruction::mysqlDayOfWeekTextLong); out_template += "Monday"; @@ -1186,6 +1176,7 @@ public: size_t reserve_size = 0; const char * pos = format.data(); const char * end = format.data() + format.size(); + while (pos < end) { const char * cur_token = pos; @@ -1392,10 +1383,10 @@ struct NameFromUnixTimeInJodaSyntax }; -using FunctionFormatDateTime = FunctionFormatDateTimeImpl; -using FunctionFromUnixTimestamp = FunctionFormatDateTimeImpl; -using FunctionFormatDateTimeInJodaSyntax = FunctionFormatDateTimeImpl; -using FunctionFromUnixTimestampInJodaSyntax = FunctionFormatDateTimeImpl; +using FunctionFormatDateTime = FunctionFormatDateTimeImpl; +using FunctionFromUnixTimestamp = FunctionFormatDateTimeImpl; +using FunctionFormatDateTimeInJodaSyntax = FunctionFormatDateTimeImpl; +using FunctionFromUnixTimestampInJodaSyntax = FunctionFormatDateTimeImpl; } diff --git a/src/Functions/parseDateTime.cpp b/src/Functions/parseDateTime.cpp index abee7e0d8f8..cd3c0d993d0 100644 --- a/src/Functions/parseDateTime.cpp +++ b/src/Functions/parseDateTime.cpp @@ -101,16 +101,16 @@ namespace bool is_year_of_era = false; /// If true, year is calculated from era and year of era, the latter cannot be zero or negative. bool has_year = false; /// Whether year was explicitly specified. - /// If is_clock_hour = true, is_hour_of_half_day = true, hour's range is [1, 12] - /// If is_clock_hour = true, is_hour_of_half_day = false, hour's range is [1, 24] - /// If is_clock_hour = false, is_hour_of_half_day = true, hour's range is [0, 11] - /// If is_clock_hour = false, is_hour_of_half_day = false, hour's range is [0, 23] + /// If hour_starts_at_1 = true, is_hour_of_half_day = true, hour's range is [1, 12] + /// If hour_starts_at_1 = true, is_hour_of_half_day = false, hour's range is [1, 24] + /// If hour_starts_at_1 = false, is_hour_of_half_day = true, hour's range is [0, 11] + /// If hour_starts_at_1 = false, is_hour_of_half_day = false, hour's range is [0, 23] Int32 hour = 0; Int32 minute = 0; /// range [0, 59] Int32 second = 0; /// range [0, 59] bool is_am = true; /// If is_hour_of_half_day = true and is_am = false (i.e. pm) then add 12 hours to the result DateTime - bool is_clock_hour = false; /// Whether the hour is clockhour + bool hour_starts_at_1 = false; /// Whether the hour is clockhour bool is_hour_of_half_day = false; /// Whether the hour is of half day bool has_time_zone_offset = false; /// If true, time zone offset is explicitly specified. @@ -137,7 +137,7 @@ namespace second = 0; is_am = true; - is_clock_hour = false; + hour_starts_at_1 = false; is_hour_of_half_day = false; has_time_zone_offset = false; @@ -275,23 +275,23 @@ namespace throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Unknown half day of day: {}", text); } - void setHour(Int32 hour_, bool is_hour_of_half_day_ = false, bool is_clock_hour_ = false) + void setHour(Int32 hour_, bool is_hour_of_half_day_ = false, bool hour_starts_at_1_ = false) { Int32 max_hour; Int32 min_hour; Int32 new_hour = hour_; - if (!is_hour_of_half_day_ && !is_clock_hour_) + if (!is_hour_of_half_day_ && !hour_starts_at_1_) { max_hour = 23; min_hour = 0; } - else if (!is_hour_of_half_day_ && is_clock_hour_) + else if (!is_hour_of_half_day_ && hour_starts_at_1_) { max_hour = 24; min_hour = 1; new_hour = hour_ % 24; } - else if (is_hour_of_half_day_ && !is_clock_hour_) + else if (is_hour_of_half_day_ && !hour_starts_at_1_) { max_hour = 11; min_hour = 0; @@ -306,16 +306,16 @@ namespace if (hour_ < min_hour || hour_ > max_hour) throw Exception( ErrorCodes::CANNOT_PARSE_DATETIME, - "Value {} for hour must be in the range [{}, {}] if_hour_of_half_day={} and is_clock_hour={}", + "Value {} for hour must be in the range [{}, {}] if_hour_of_half_day={} and hour_starts_at_1={}", hour, max_hour, min_hour, is_hour_of_half_day_, - is_clock_hour_); + hour_starts_at_1_); hour = new_hour; is_hour_of_half_day = is_hour_of_half_day_; - is_clock_hour = is_clock_hour_; + hour_starts_at_1 = hour_starts_at_1_; } void setMinute(Int32 minute_) From 3db38dbb5a1a227c749549faf4ec0f140bc267b7 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 23 Mar 2023 22:16:10 +0000 Subject: [PATCH 2/7] Replace mySQL date formatter M behavior from minutes to month name --- .../functions/date-time-functions.md | 8 +- src/Functions/formatDateTime.cpp | 576 +++++++++++++----- src/Functions/parseDateTime.cpp | 40 +- .../00718_format_datetime.reference | 4 +- .../0_stateless/00718_format_datetime.sql | 3 + ...00921_datetime64_compatibility_long.python | 2 +- ...21_datetime64_compatibility_long.reference | 2 +- .../0_stateless/01411_from_unixtime.reference | 2 +- .../0_stateless/02564_date_format.reference | 2 +- .../02668_parse_datetime.reference | 7 + .../0_stateless/02668_parse_datetime.sql | 6 +- 11 files changed, 493 insertions(+), 159 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index d06ab253cf7..425d67ed5a0 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1276,16 +1276,16 @@ Using replacement fields, you can define a pattern for the resulting string. “ | %k | hour in 24h format (00-23) | 22 | | %l | hour in 12h format (01-12) | 09 | | %m | month as an integer number (01-12) | 01 | -| %M | minute (00-59) | 33 | +| %M | full month name (January-December) | January | | %n | new-line character (‘’) | | | %p | AM or PM designation | PM | | %Q | Quarter (1-4) | 1 | -| %r | 12-hour HH:MM AM/PM time, equivalent to %H:%M %p | 10:30 PM | -| %R | 24-hour HH:MM time, equivalent to %H:%M | 22:33 | +| %r | 12-hour HH:MM AM/PM time, equivalent to %H:%i %p | 10:30 PM | +| %R | 24-hour HH:MM time, equivalent to %H:%i | 22:33 | | %s | second (00-59) | 44 | | %S | second (00-59) | 44 | | %t | horizontal-tab character (’) | | -| %T | ISO 8601 time format (HH:MM:SS), equivalent to %H:%M:%S | 22:33:44 | +| %T | ISO 8601 time format (HH:MM:SS), equivalent to %H:%i:%S | 22:33:44 | | %u | ISO 8601 weekday as number with Monday as 1 (1-7) | 2 | | %V | ISO 8601 week number (01-53) | 01 | | %w | weekday as a integer number with Sunday as 0 (0-6) | 2 | diff --git a/src/Functions/formatDateTime.cpp b/src/Functions/formatDateTime.cpp index daea8b3a7b0..c243222db91 100644 --- a/src/Functions/formatDateTime.cpp +++ b/src/Functions/formatDateTime.cpp @@ -109,13 +109,13 @@ constexpr std::string_view monthsShort[] = {"Jan", "Feb", "Mar", "Apr", "May", " * * Performance on Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz: * - * WITH formatDateTime(now() + number, '%H:%M:%S') AS x SELECT count() FROM system.numbers WHERE NOT ignore(x); + * WITH formatDateTime(now() + number, '%H:%i:%S') AS x SELECT count() FROM system.numbers WHERE NOT ignore(x); * - 97 million rows per second per core; * * WITH formatDateTime(toDateTime('2018-01-01 00:00:00') + number, '%F %T') AS x SELECT count() FROM system.numbers WHERE NOT ignore(x) * - 71 million rows per second per core; * - * select count() from (select formatDateTime(t, '%m/%d/%Y %H:%M:%S') from (select toDateTime('2018-01-01 00:00:00')+number as t from numbers(100000000))); + * select count() from (select formatDateTime(t, '%m/%d/%Y %H:%i:%S') from (select toDateTime('2018-01-01 00:00:00')+number as t from numbers(100000000))); * - 53 million rows per second per core; * * select count() from (select formatDateTime(t, 'Hello %Y World') from (select toDateTime('2018-01-01 00:00:00')+number as t from numbers(100000000))); @@ -146,26 +146,34 @@ private: class Instruction { public: - /// Using std::function will cause performance degradation in MySQL format by 0.45x. - /// But std::function is required for Joda format to capture extra variables. - /// This is the reason why we use raw function pointer in MySQL format and std::function - /// in Joda format. - using Func = std::conditional_t< - format_syntax == FormatSyntax::MySQL, - size_t (*)(char *, Time, UInt64, UInt32, const DateLUTImpl &), - std::function>; + /// Joda format generally requires capturing extra variables (i.e. holding state) which is more convenient with + /// std::function and std::bind. Unfortunately, std::function causes a performance degradation by 0.45x compared to raw function + /// pointers. For MySQL format, we generally prefer raw function pointers. Because of the special case that not all formatters are + /// fixed-width formatters (see mysqlLiteral), we still need to be able to store state. For that reason, we use member function + /// pointers instead of static function pointers. + using FuncMysql = size_t (Instruction