From a194871b910eae3b2e1e5c09dad559edd0c38e3e Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Wed, 16 Oct 2024 19:40:09 +0800 Subject: [PATCH] support parseDateTime64 for joda syntax --- src/Functions/parseDateTime.cpp | 178 +++++++++++++++++++++++++++----- 1 file changed, 155 insertions(+), 23 deletions(-) diff --git a/src/Functions/parseDateTime.cpp b/src/Functions/parseDateTime.cpp index a4b23745308..c80355ecfb7 100644 --- a/src/Functions/parseDateTime.cpp +++ b/src/Functions/parseDateTime.cpp @@ -185,6 +185,7 @@ namespace Int32 hour = 0; Int32 minute = 0; /// range [0, 59] Int32 second = 0; /// range [0, 59] + Int32 microsecond = 0; /// range [0, 999999] bool is_am = true; /// If is_hour_of_half_day = true and is_am = false (i.e. pm) then add 12 hours to the result DateTime bool hour_starts_at_1 = false; /// Whether the hour is clockhour @@ -212,6 +213,7 @@ namespace hour = 0; minute = 0; second = 0; + microsecond = 0; is_am = true; hour_starts_at_1 = false; @@ -437,6 +439,16 @@ namespace return {}; } + [[nodiscard]] + VoidOrError setMicrosecond(Int32 microsecond_) + { + if (microsecond_ < 0 || microsecond_ > 999999) + RETURN_ERROR(ErrorCodes::CANNOT_PARSE_DATETIME, "Value {} for microsecond must be in the range [0, 999999]", microsecond_) + + microsecond = microsecond_; + return {}; + } + /// For debug [[maybe_unused]] String toString() const { @@ -559,7 +571,7 @@ namespace }; /// _FUNC_(str[, format, timezone]) - template + template class FunctionParseDateTimeImpl : public IFunction { public: @@ -598,13 +610,72 @@ namespace validateFunctionArguments(*this, arguments, mandatory_args, optional_args); String time_zone_name = getTimeZone(arguments).getTimeZone(); - DataTypePtr date_type = std::make_shared(time_zone_name); + DataTypePtr date_type = nullptr; + if constexpr (parseDateTime64) + { + String format = getFormat(arguments); + std::vector instructions = parseFormat(format); + UInt32 scale = 0; + if (!instructions.empty()) + { + for (const auto & ins : instructions) + { + if (scale > 0) + break; + const String fragment = ins.getFragment(); + for (size_t i = 0; i < fragment.size(); i++) + { + if (fragment[i] != 'S') + { + scale = 0; + break; + } + else + scale++; + } + } + } + date_type = std::make_shared(scale, time_zone_name); + } + else + date_type = std::make_shared(time_zone_name); if (error_handling == ErrorHandling::Null) return std::make_shared(date_type); return date_type; } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + ColumnUInt8::MutablePtr col_null_map; + if constexpr (error_handling == ErrorHandling::Null) + col_null_map = ColumnUInt8::create(input_rows_count, 0); + PaddedPODArray & null_map_data = col_null_map->getData(); + if constexpr (parseDateTime64) + { + const DataTypeDateTime64 * datatime64_type = checkAndGetDataType(removeNullable(result_type).get()); + auto col_res = ColumnDateTime64::create(input_rows_count, datatime64_type->getScale()); + PaddedPODArray & res_data = col_res->getData(); + executeImpl2(arguments, result_type, input_rows_count, res_data, null_map_data); + if constexpr (error_handling == ErrorHandling::Null) + return ColumnNullable::create(std::move(col_res), std::move(col_null_map)); + else + return col_res; + } + else + { + auto col_res = ColumnDateTime::create(input_rows_count); + PaddedPODArray & res_data = col_res->getData(); + executeImpl2(arguments, result_type, input_rows_count, res_data, null_map_data); + if constexpr (error_handling == ErrorHandling::Null) + return ColumnNullable::create(std::move(col_res), std::move(col_null_map)); + else + return col_res; + } + } + + template + void executeImpl2(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count, + PaddedPODArray & res_data, PaddedPODArray & null_map_data) const { const auto * col_str = checkAndGetColumn(arguments[0].column.get()); if (!col_str) @@ -618,14 +689,6 @@ namespace const auto & time_zone = getTimeZone(arguments); std::vector instructions = parseFormat(format); - auto col_res = ColumnDateTime::create(input_rows_count); - - ColumnUInt8::MutablePtr col_null_map; - if constexpr (error_handling == ErrorHandling::Null) - col_null_map = ColumnUInt8::create(input_rows_count, 0); - - auto & res_data = col_res->getData(); - /// Make datetime fit in a cache line. alignas(64) DateTime datetime; for (size_t i = 0; i < input_rows_count; ++i) @@ -653,7 +716,7 @@ namespace else if constexpr (error_handling == ErrorHandling::Null) { res_data[i] = 0; - col_null_map->getData()[i] = 1; + null_map_data[i] = 1; error = true; break; } @@ -672,7 +735,7 @@ namespace Int64OrError result = 0; /// Ensure all input was consumed - if (cur < end) + if (!parseDateTime64 && cur < end) { result = tl::unexpected(ErrorCodeAndMessage( ErrorCodes::CANNOT_PARSE_DATETIME, @@ -684,7 +747,16 @@ namespace if (result.has_value()) { if (result = datetime.buildDateTime(time_zone); result.has_value()) - res_data[i] = static_cast(*result); + { + if constexpr (parseDateTime64) + { + const DataTypeDateTime64 * datatime64_type = checkAndGetDataType(removeNullable(result_type).get()); + Int64 multipler = DecimalUtils::scaleMultiplier(datatime64_type->getScale()); + res_data[i] = static_cast(*result) * multipler + datetime.microsecond; + } + else + res_data[i] = static_cast(*result); + } } if (!result.has_value()) @@ -696,7 +768,7 @@ namespace else if constexpr (error_handling == ErrorHandling::Null) { res_data[i] = 0; - col_null_map->getData()[i] = 1; + null_map_data[i] = 1; } else { @@ -706,11 +778,6 @@ namespace } } } - - if constexpr (error_handling == ErrorHandling::Null) - return ColumnNullable::create(std::move(col_res), std::move(col_null_map)); - else - return col_res; } @@ -742,6 +809,8 @@ namespace explicit Instruction(const String & literal_) : literal(literal_), fragment("LITERAL") { } explicit Instruction(String && literal_) : literal(std::move(literal_)), fragment("LITERAL") { } + const String getFragment() const { return fragment; } + /// For debug [[maybe_unused]] String toString() const { @@ -1625,6 +1694,59 @@ namespace RETURN_ERROR_IF_FAILED(date.setSecond(second)) return cur; } + + [[nodiscard]] + static PosOrError jodaMicroSecondOfSecond(size_t repetitions, Pos cur, Pos end, const String & fragment, DateTime & date) + { + Int32 microsecond; + ASSIGN_RESULT_OR_RETURN_ERROR(cur, (readNumberWithVariableLength(cur, end, false, false, false, repetitions, std::max(repetitions, 2uz), fragment, microsecond))) + RETURN_ERROR_IF_FAILED(date.setMicrosecond(microsecond)) + return cur; + } + + [[nodiscard]] + static PosOrError jodaTimezoneId(size_t, Pos cur, Pos end, const String &, DateTime & date) + { + String dateTimeZone = ""; + while (cur <= end) + { + dateTimeZone += *cur; + ++cur; + } + const DateLUTImpl & utc_time_zone = DateLUT::instance("UTC"); + const DateLUTImpl & date_time_zone = DateLUT::instance(dateTimeZone); + const auto timezoneOffset = date_time_zone.getTimeOffsetAtStartOfLUT() - utc_time_zone.getTimeOffsetAtStartOfLUT(); + date.has_time_zone_offset = true; + date.time_zone_offset = timezoneOffset; + return cur; + } + + [[nodiscard]] + static PosOrError jodaTimezoneOffset(size_t repetitions, Pos cur, Pos end, const String & fragment, DateTime & date) + { + RETURN_ERROR_IF_FAILED(checkSpace(cur, end, 5, "jodaTimezoneOffset requires size >= 5", fragment)) + Int32 sign; + if (*cur == '-') + sign = -1; + else if (*cur == '+') + sign = 1; + else + RETURN_ERROR( + ErrorCodes::CANNOT_PARSE_DATETIME, + "Unable to parse fragment {} from {} because of unknown sign time zone offset: {}", + fragment, + std::string_view(cur, end - cur), + std::string_view(cur, 1)) + ++cur; + + Int32 hour; + ASSIGN_RESULT_OR_RETURN_ERROR(cur, (readNumberWithVariableLength(cur, end, false, false, false, repetitions, std::max(repetitions, 2uz), fragment, hour))) + Int32 minute; + ASSIGN_RESULT_OR_RETURN_ERROR(cur, (readNumberWithVariableLength(cur, end, false, false, false, repetitions, std::max(repetitions, 2uz), fragment, minute))) + date.has_time_zone_offset = true; + date.time_zone_offset = sign * (hour * 3600 + minute * 60); + return cur; + } }; /// NOLINTEND(readability-else-after-return) @@ -2007,11 +2129,14 @@ namespace instructions.emplace_back(ACTION_ARGS_WITH_BIND(Instruction::jodaSecondOfMinute, repetitions)); break; case 'S': - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "format is not supported for fractional seconds"); + instructions.emplace_back(ACTION_ARGS_WITH_BIND(Instruction::jodaMicroSecondOfSecond, repetitions)); + break; case 'z': - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "format is not supported for timezone"); + instructions.emplace_back(ACTION_ARGS_WITH_BIND(Instruction::jodaTimezoneId, repetitions)); + break; case 'Z': - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "format is not supported for timezone offset id"); + instructions.emplace_back(ACTION_ARGS_WITH_BIND(Instruction::jodaTimezoneOffset, repetitions)); + break; default: if (isalpha(*cur_token)) throw Exception( @@ -2097,12 +2222,18 @@ namespace static constexpr auto name = "parseDateTimeInJodaSyntaxOrNull"; }; + struct NameParseDateTime64InJodaSyntaxOrNull + { + static constexpr auto name = "parseDateTime64InJodaSyntaxOrNull"; + }; + using FunctionParseDateTime = FunctionParseDateTimeImpl; using FunctionParseDateTimeOrZero = FunctionParseDateTimeImpl; using FunctionParseDateTimeOrNull = FunctionParseDateTimeImpl; using FunctionParseDateTimeInJodaSyntax = FunctionParseDateTimeImpl; using FunctionParseDateTimeInJodaSyntaxOrZero = FunctionParseDateTimeImpl; using FunctionParseDateTimeInJodaSyntaxOrNull = FunctionParseDateTimeImpl; + using FunctionParseDateTime64InJodaSyntaxOrNull = FunctionParseDateTimeImpl; } REGISTER_FUNCTION(ParseDateTime) @@ -2116,6 +2247,7 @@ REGISTER_FUNCTION(ParseDateTime) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); }