Merge remote-tracking branch 'origin/master' into pr-local-plan

Igor Nikonov 2024-09-02 14:06:46 +00:00
commit d064692125
28 changed files with 548 additions and 32 deletions

View File

@ -104,7 +104,7 @@ Events that occur at the same second may lay in the sequence in an undefined ord
**Parameters**
- `pattern` — Pattern string. See [Pattern syntax](#sequencematch).
- `pattern` — Pattern string. See [Pattern syntax](#pattern-syntax).
**Returned values**
@ -113,8 +113,7 @@ Events that occur at the same second may lay in the sequence in an undefined ord
Type: `UInt8`.
<a name="sequence-function-pattern-syntax"></a>
**Pattern syntax**
#### Pattern syntax
- `(?N)` — Matches the condition argument at position `N`. Conditions are numbered in the `[1, 32]` range. For example, `(?1)` matches the argument passed to the `cond1` parameter.
@ -196,7 +195,7 @@ sequenceCount(pattern)(timestamp, cond1, cond2, ...)
**Parameters**
- `pattern` — Pattern string. See [Pattern syntax](#sequencematch).
- `pattern` — Pattern string. See [Pattern syntax](#pattern-syntax).
**Returned values**
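For readers of this hunk, a minimal, hypothetical illustration of the `(?N)` pattern syntax both functions reference (the data below is invented for the example):

```sql
-- (?1) and (?2) refer to the first and second condition arguments.
SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2) AS matched
FROM
(
    SELECT toDateTime('2024-01-01 00:00:00') + number AS time, number
    FROM numbers(4)
);
-- matched = 1: an event satisfying `number = 1` is later followed by one satisfying `number = 2`.
```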

View File

@ -453,8 +453,8 @@ As we can see, after inserting paths `e` and `f.g` the limit was reached and we
### During merges of data parts in MergeTree table engines
During merge of several data parts in MergeTree table the `JSON` column in the resulting data part can reach the limit of dynamic paths won't be able to store all paths from source parts as subcolumns.
In this case ClickHouse chooses what paths will remain as subcolumns after merge and what types will be stored in the shared data structure. In most cases ClickHouse tries to keep paths that contains
During a merge of several data parts in a MergeTree table, the `JSON` column in the resulting data part can reach the limit of dynamic paths and won't be able to store all paths from the source parts as subcolumns.
In this case ClickHouse chooses which paths will remain as subcolumns after the merge and which paths will be stored in the shared data structure. In most cases ClickHouse tries to keep the paths that contain
the largest number of non-null values and move the rarest paths to the shared data structure, but it depends on the implementation.
Let's see an example of such a merge. First, let's create a table with a `JSON` column, set the limit of dynamic paths to `3`, and insert values with `5` different paths:
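The file's own worked example follows this line and is not part of the hunk; as a standalone sketch of the same setup (assuming the experimental `JSON` type with its `max_dynamic_paths` parameter, which may require enabling `allow_experimental_json_type`):

```sql
SET allow_experimental_json_type = 1;  -- needed while the type is experimental

CREATE TABLE test (json JSON(max_dynamic_paths = 3)) ENGINE = MergeTree ORDER BY tuple();

-- Two inserts produce two parts; together they reference 5 distinct paths.
INSERT INTO test VALUES ('{"a": 1}'), ('{"a": 2, "b": 2}'), ('{"a": 3, "c": 3}');
INSERT INTO test VALUES ('{"d": 4}'), ('{"e": 5}');

OPTIMIZE TABLE test FINAL;  -- merge the parts; only 3 paths can remain as subcolumns

SELECT JSONDynamicPaths(json) AS subcolumn_paths, JSONSharedDataPaths(json) AS shared_paths
FROM test;
```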

View File

@ -2019,7 +2019,7 @@ Alias: `dateTrunc`.
`unit` argument is case-insensitive.
- `value` — Date and time. [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md).
- `value` — Date and time. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md).
- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). If not specified, the function uses the timezone of the `value` parameter. [String](../data-types/string.md).
**Returned value**
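A brief illustration of the widened `value` types (the expected results here match the queries added to the tests later in this commit):

```sql
SELECT dateTrunc('month', toDate32('2022-09-16'));  -- 2022-09-01
SELECT dateTrunc('day',   toDate32('2022-09-16'));  -- 2022-09-16 00:00:00
SELECT dateTrunc('hour',  toDate32('2022-09-16'));  -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
```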

View File

@ -677,4 +677,122 @@ void GetAllChildrenNumberCommand::execute(const ASTKeeperQuery * query, KeeperCl
std::cout << totalNumChildren << "\n";
}
namespace
{
class CPMVOperation
{
constexpr static UInt64 kTryLimit = 1000;
public:
CPMVOperation(String src_, String dest_, bool remove_src_, KeeperClient * client_)
: src(std::move(src_)), dest(std::move(dest_)), remove_src(remove_src_), client(client_)
{
}
bool isTryLimitReached() const { return failed_tries_count >= kTryLimit; }
bool isCompleted() const { return is_completed; }
void perform()
{
Coordination::Stat src_stat;
String data = client->zookeeper->get(src, &src_stat);
Coordination::Requests ops{
zkutil::makeCheckRequest(src, src_stat.version),
zkutil::makeCreateRequest(dest, data, zkutil::CreateMode::Persistent), // Do we need to copy ACLs here?
};
if (remove_src)
ops.push_back(zkutil::makeRemoveRequest(src, src_stat.version));
Coordination::Responses responses;
auto code = client->zookeeper->tryMulti(ops, responses);
switch (code)
{
case Coordination::Error::ZOK: {
is_completed = true;
return;
}
case Coordination::Error::ZBADVERSION: {
++failed_tries_count;
if (isTryLimitReached())
zkutil::KeeperMultiException::check(code, ops, responses);
return;
}
default:
zkutil::KeeperMultiException::check(code, ops, responses);
}
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unreachable");
}
private:
String src;
String dest;
bool remove_src = false;
KeeperClient * client = nullptr;
bool is_completed = false;
uint64_t failed_tries_count = 0;
};
}
bool CPCommand::parse(IParser::Pos & pos, std::shared_ptr<ASTKeeperQuery> & node, [[maybe_unused]] Expected & expected) const
{
String src_path;
if (!parseKeeperPath(pos, expected, src_path))
return false;
node->args.push_back(std::move(src_path));
String to_path;
if (!parseKeeperPath(pos, expected, to_path))
return false;
node->args.push_back(std::move(to_path));
return true;
}
void CPCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const
{
auto src = client->getAbsolutePath(query->args[0].safeGet<String>());
auto dest = client->getAbsolutePath(query->args[1].safeGet<String>());
CPMVOperation operation(std::move(src), std::move(dest), /*remove_src_=*/false, /*client_=*/client);
while (!operation.isTryLimitReached() && !operation.isCompleted())
operation.perform();
}
bool MVCommand::parse(IParser::Pos & pos, std::shared_ptr<ASTKeeperQuery> & node, Expected & expected) const
{
String src_path;
if (!parseKeeperPath(pos, expected, src_path))
return false;
node->args.push_back(std::move(src_path));
String to_path;
if (!parseKeeperPath(pos, expected, to_path))
return false;
node->args.push_back(std::move(to_path));
return true;
}
void MVCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const
{
auto src = client->getAbsolutePath(query->args[0].safeGet<String>());
auto dest = client->getAbsolutePath(query->args[1].safeGet<String>());
CPMVOperation operation(std::move(src), std::move(dest), /*remove_src_=*/true, /*client_=*/client);
while (!operation.isTryLimitReached() && !operation.isCompleted())
operation.perform();
}
}

View File

@ -266,4 +266,32 @@ class GetAllChildrenNumberCommand : public IKeeperClientCommand
}
};
class CPCommand : public IKeeperClientCommand
{
String getName() const override { return "cp"; }
bool parse(IParser::Pos & pos, std::shared_ptr<ASTKeeperQuery> & node, Expected & expected) const override;
void execute(const ASTKeeperQuery * query, KeeperClient * client) const override;
String getHelpMessage() const override
{
return "{} <src> <dest> -- Copies 'src' node to 'dest' path.";
}
};
class MVCommand : public IKeeperClientCommand
{
String getName() const override { return "mv"; }
bool parse(IParser::Pos & pos, std::shared_ptr<ASTKeeperQuery> & node, Expected & expected) const override;
void execute(const ASTKeeperQuery * query, KeeperClient * client) const override;
String getHelpMessage() const override
{
return "{} <src> <dest> -- Moves 'src' node to the 'dest' path.";
}
};
}

View File

@ -212,6 +212,8 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */)
std::make_shared<FourLetterWordCommand>(),
std::make_shared<GetDirectChildrenNumberCommand>(),
std::make_shared<GetAllChildrenNumberCommand>(),
std::make_shared<CPCommand>(),
std::make_shared<MVCommand>(),
});
String home_path;

View File

@ -68,7 +68,10 @@ public:
if (data().isEqualTo(to.data()))
counter += to.counter;
else if (!data().has() || counter < to.counter)
{
data().set(to.data(), arena);
counter = to.counter - counter;
}
else
counter -= to.counter;
}

View File

@ -75,9 +75,9 @@
M(GlobalThread, "Number of threads in global thread pool.") \
M(GlobalThreadActive, "Number of threads in global thread pool running a task.") \
M(GlobalThreadScheduled, "Number of queued or active jobs in global thread pool.") \
M(LocalThread, "Number of threads in local thread pools. The threads in local thread pools are taken from the global thread pool.") \
M(LocalThreadActive, "Number of threads in local thread pools running a task.") \
M(LocalThreadScheduled, "Number of queued or active jobs in local thread pools.") \
M(LocalThread, "Obsolete. Number of threads in local thread pools. The threads in local thread pools are taken from the global thread pool.") \
M(LocalThreadActive, "Obsolete. Number of threads in local thread pools running a task.") \
M(LocalThreadScheduled, "Obsolete. Number of queued or active jobs in local thread pools.") \
M(MergeTreeDataSelectExecutorThreads, "Number of threads in the MergeTreeDataSelectExecutor thread pool.") \
M(MergeTreeDataSelectExecutorThreadsActive, "Number of threads in the MergeTreeDataSelectExecutor thread pool running a task.") \
M(MergeTreeDataSelectExecutorThreadsScheduled, "Number of queued or active jobs in the MergeTreeDataSelectExecutor thread pool.") \
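These descriptions surface directly in `system.metrics`, so the change is user-visible; a quick way to inspect them on a running server (a sketch):

```sql
SELECT metric, value, description
FROM system.metrics
WHERE metric LIKE 'LocalThread%' OR metric LIKE 'GlobalThread%'
ORDER BY metric;
```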

View File

@ -86,6 +86,20 @@
M(NetworkReceiveBytes, "Total number of bytes received from network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \
M(NetworkSendBytes, "Total number of bytes send to network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \
\
M(GlobalThreadPoolExpansions, "Counts the total number of times new threads have been added to the global thread pool. This metric indicates the frequency of expansions in the global thread pool to accommodate increased processing demands.") \
M(GlobalThreadPoolShrinks, "Counts the total number of times the global thread pool has shrunk by removing threads. This occurs when the number of idle threads exceeds max_thread_pool_free_size, indicating adjustments in the global thread pool size in response to decreased thread utilization.") \
M(GlobalThreadPoolThreadCreationMicroseconds, "Total time spent waiting for new threads to start.") \
M(GlobalThreadPoolLockWaitMicroseconds, "Total time threads have spent waiting for locks in the global thread pool.") \
M(GlobalThreadPoolJobs, "Counts the number of jobs that have been pushed to the global thread pool.") \
M(GlobalThreadPoolJobWaitTimeMicroseconds, "Measures the elapsed time from when a job is scheduled in the thread pool to when it is picked up for execution by a worker thread. This metric helps identify delays in job processing, indicating the responsiveness of the thread pool to new tasks.") \
M(LocalThreadPoolExpansions, "Counts the total number of times threads have been borrowed from the global thread pool to expand local thread pools.") \
M(LocalThreadPoolShrinks, "Counts the total number of times threads have been returned to the global thread pool from local thread pools.") \
M(LocalThreadPoolThreadCreationMicroseconds, "Total time local thread pools have spent waiting to borrow a thread from the global pool.") \
M(LocalThreadPoolLockWaitMicroseconds, "Total time threads have spent waiting for locks in the local thread pools.") \
M(LocalThreadPoolJobs, "Counts the number of jobs that have been pushed to the local thread pools.") \
M(LocalThreadPoolBusyMicroseconds, "Total time threads have spent executing the actual work.") \
M(LocalThreadPoolJobWaitTimeMicroseconds, "Measures the elapsed time from when a job is scheduled in the thread pool to when it is picked up for execution by a worker thread. This metric helps identify delays in job processing, indicating the responsiveness of the thread pool to new tasks.") \
\
M(DiskS3GetRequestThrottlerCount, "Number of DiskS3 GET and SELECT requests passed through throttler.") \
M(DiskS3GetRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to conform DiskS3 GET and SELECT request throttling.") \
M(DiskS3PutRequestThrottlerCount, "Number of DiskS3 PUT, COPY, POST and LIST requests passed through throttler.") \
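Once the server has scheduled some work, the new counters can be read from `system.events` (a minimal sketch):

```sql
SELECT event, value
FROM system.events
WHERE event LIKE 'GlobalThreadPool%' OR event LIKE 'LocalThreadPool%'
ORDER BY event;
```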

View File

@ -34,13 +34,16 @@ bool ProgressIndication::updateProgress(const Progress & value)
void ProgressIndication::resetProgress()
{
watch.restart();
progress.reset();
show_progress_bar = false;
written_progress_chars = 0;
write_progress_on_update = false;
{
std::lock_guard lock(progress_mutex);
progress.reset();
show_progress_bar = false;
written_progress_chars = 0;
write_progress_on_update = false;
}
{
std::lock_guard lock(profile_events_mutex);
watch.restart();
cpu_usage_meter.reset(getElapsedNanoseconds());
hosts_data.clear();
}
@ -90,6 +93,8 @@ ProgressIndication::MemoryUsage ProgressIndication::getMemoryUsage() const
void ProgressIndication::writeFinalProgress()
{
std::lock_guard lock(progress_mutex);
if (progress.read_rows < 1000)
return;
@ -271,6 +276,8 @@ void ProgressIndication::writeProgress(WriteBufferFromFileDescriptor & message)
void ProgressIndication::clearProgressOutput(WriteBufferFromFileDescriptor & message)
{
std::lock_guard lock(progress_mutex);
if (written_progress_chars)
{
written_progress_chars = 0;

View File

@ -115,6 +115,8 @@ private:
/// It is possible concurrent access to the following:
/// - writeProgress() (class properties) (guarded with progress_mutex)
/// - hosts_data/cpu_usage_meter (guarded with profile_events_mutex)
///
/// It is also possible to have more races if query is cancelled, so that clearProgressOutput() is called concurrently
mutable std::mutex profile_events_mutex;
mutable std::mutex progress_mutex;

View File

@ -1,4 +1,5 @@
#include <Common/ThreadPool.h>
#include <Common/ProfileEvents.h>
#include <Common/setThreadName.h>
#include <Common/Exception.h>
#include <Common/getNumberOfPhysicalCPUCores.h>
@ -27,6 +28,25 @@ namespace CurrentMetrics
extern const Metric GlobalThreadScheduled;
}
namespace ProfileEvents
{
extern const Event GlobalThreadPoolExpansions;
extern const Event GlobalThreadPoolShrinks;
extern const Event GlobalThreadPoolThreadCreationMicroseconds;
extern const Event GlobalThreadPoolLockWaitMicroseconds;
extern const Event GlobalThreadPoolJobs;
extern const Event GlobalThreadPoolJobWaitTimeMicroseconds;
extern const Event LocalThreadPoolExpansions;
extern const Event LocalThreadPoolShrinks;
extern const Event LocalThreadPoolThreadCreationMicroseconds;
extern const Event LocalThreadPoolLockWaitMicroseconds;
extern const Event LocalThreadPoolJobs;
extern const Event LocalThreadPoolBusyMicroseconds;
extern const Event LocalThreadPoolJobWaitTimeMicroseconds;
}
class JobWithPriority
{
public:
@ -40,6 +60,7 @@ public:
/// Call stacks of all jobs' schedulings leading to this one
std::vector<StackTrace::FramePointers> frame_pointers;
bool enable_job_stack_trace = false;
Stopwatch job_create_time;
JobWithPriority(
Job job_, Priority priority_, CurrentMetrics::Metric metric,
@ -59,6 +80,13 @@ public:
{
return priority > rhs.priority; // Reversed for `priority_queue` max-heap to yield minimum value (i.e. highest priority) first
}
UInt64 elapsedMicroseconds() const
{
return job_create_time.elapsedMicroseconds();
}
};
static constexpr auto DEFAULT_THREAD_NAME = "ThreadPool";
@ -180,14 +208,18 @@ ReturnType ThreadPoolImpl<Thread>::scheduleImpl(Job job, Priority priority, std:
};
{
Stopwatch watch;
std::unique_lock lock(mutex);
ProfileEvents::increment(
std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolLockWaitMicroseconds : ProfileEvents::LocalThreadPoolLockWaitMicroseconds,
watch.elapsedMicroseconds());
if (CannotAllocateThreadFaultInjector::injectFault())
return on_error("fault injected");
auto pred = [this] { return !queue_size || scheduled_jobs < queue_size || shutdown; };
if (wait_microseconds) /// Check for optional. Condition is true if the optional is set and the value is zero.
if (wait_microseconds) /// Check for optional. Condition is true if the optional is set. Even if the value is zero.
{
if (!job_finished.wait_for(lock, std::chrono::microseconds(*wait_microseconds), pred))
return on_error(fmt::format("no free thread (timeout={})", *wait_microseconds));
@ -216,7 +248,13 @@ ReturnType ThreadPoolImpl<Thread>::scheduleImpl(Job job, Priority priority, std:
try
{
Stopwatch watch2;
threads.front() = Thread([this, it = threads.begin()] { worker(it); });
ProfileEvents::increment(
std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolThreadCreationMicroseconds : ProfileEvents::LocalThreadPoolThreadCreationMicroseconds,
watch2.elapsedMicroseconds());
ProfileEvents::increment(
std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolExpansions : ProfileEvents::LocalThreadPoolExpansions);
}
catch (...)
{
@ -239,6 +277,8 @@ ReturnType ThreadPoolImpl<Thread>::scheduleImpl(Job job, Priority priority, std:
/// Wake up a free thread to run the new job.
new_job_or_shutdown.notify_one();
ProfileEvents::increment(std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolJobs : ProfileEvents::LocalThreadPoolJobs);
return static_cast<ReturnType>(true);
}
@ -262,7 +302,14 @@ void ThreadPoolImpl<Thread>::startNewThreadsNoLock()
try
{
Stopwatch watch;
threads.front() = Thread([this, it = threads.begin()] { worker(it); });
ProfileEvents::increment(
std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolThreadCreationMicroseconds : ProfileEvents::LocalThreadPoolThreadCreationMicroseconds,
watch.elapsedMicroseconds());
ProfileEvents::increment(
std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolExpansions : ProfileEvents::LocalThreadPoolExpansions);
}
catch (...)
{
@ -293,7 +340,11 @@ void ThreadPoolImpl<Thread>::scheduleOrThrow(Job job, Priority priority, uint64_
template <typename Thread>
void ThreadPoolImpl<Thread>::wait()
{
Stopwatch watch;
std::unique_lock lock(mutex);
ProfileEvents::increment(
std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolLockWaitMicroseconds : ProfileEvents::LocalThreadPoolLockWaitMicroseconds,
watch.elapsedMicroseconds());
/// Signal here just in case.
/// If threads are waiting on condition variables, but there are some jobs in the queue
/// then it will prevent us from deadlock.
@ -334,7 +385,11 @@ void ThreadPoolImpl<Thread>::finalize()
/// Wait for all currently running jobs to finish (we don't wait for all scheduled jobs here like the function wait() does).
for (auto & thread : threads)
{
thread.join();
ProfileEvents::increment(
std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolShrinks : ProfileEvents::LocalThreadPoolShrinks);
}
threads.clear();
}
@ -391,7 +446,11 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
std::optional<JobWithPriority> job_data;
{
Stopwatch watch;
std::unique_lock lock(mutex);
ProfileEvents::increment(
std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolLockWaitMicroseconds : ProfileEvents::LocalThreadPoolLockWaitMicroseconds,
watch.elapsedMicroseconds());
// Finish with previous job if any
if (job_is_done)
@ -424,6 +483,8 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
{
thread_it->detach();
threads.erase(thread_it);
ProfileEvents::increment(
std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolShrinks : ProfileEvents::LocalThreadPoolShrinks);
}
return;
}
@ -433,6 +494,10 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
job_data = std::move(const_cast<JobWithPriority &>(jobs.top()));
jobs.pop();
ProfileEvents::increment(
std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolJobWaitTimeMicroseconds : ProfileEvents::LocalThreadPoolJobWaitTimeMicroseconds,
job_data->elapsedMicroseconds());
/// We don't run jobs after `shutdown` is set, but we have to properly dequeue all jobs and finish them.
if (shutdown)
{
@ -459,7 +524,22 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
CurrentMetrics::Increment metric_active_pool_threads(metric_active_threads);
job_data->job();
if constexpr (!std::is_same_v<Thread, std::thread>)
{
Stopwatch watch;
job_data->job();
// This metric is less relevant for the global thread pool, as it would show large values (time while
// a thread was used by local pools) and increment only when local pools are destroyed.
//
// In cases where global pool threads are used directly (without a local thread pool), distinguishing
// them is difficult.
ProfileEvents::increment(ProfileEvents::LocalThreadPoolBusyMicroseconds, watch.elapsedMicroseconds());
}
else
{
job_data->job();
}
if (thread_trace_context.root_span.isTraceEnabled())
{

View File

@ -131,7 +131,7 @@ private:
bool threads_remove_themselves = true;
const bool shutdown_on_exception = true;
boost::heap::priority_queue<JobWithPriority> jobs;
boost::heap::priority_queue<JobWithPriority,boost::heap::stable<true>> jobs;
std::list<Thread> threads;
std::exception_ptr first_exception;
std::stack<OnDestroyCallback> on_destroy_callbacks;

View File

@ -406,7 +406,7 @@ void UserDefinedSQLObjectsZooKeeperStorage::syncObjects(const zkutil::ZooKeeperP
LOG_DEBUG(log, "Syncing user-defined {} objects", object_type);
Strings object_names = getObjectNamesAndSetWatch(zookeeper, object_type);
getLock();
auto lock = getLock();
/// Remove stale objects
removeAllObjectsExcept(object_names);

View File

@ -2,6 +2,7 @@
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDate32.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeInterval.h>
#include <Formats/FormatSettings.h>
@ -43,6 +44,7 @@ public:
enum ResultType
{
Date,
Date32,
DateTime,
DateTime64,
};
@ -75,15 +77,15 @@ public:
bool second_argument_is_date = false;
auto check_second_argument = [&] {
if (!isDate(arguments[1].type) && !isDateTime(arguments[1].type) && !isDateTime64(arguments[1].type))
if (!isDateOrDate32(arguments[1].type) && !isDateTime(arguments[1].type) && !isDateTime64(arguments[1].type))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of 2nd argument of function {}. "
"Should be a date or a date with time", arguments[1].type->getName(), getName());
second_argument_is_date = isDate(arguments[1].type);
second_argument_is_date = isDateOrDate32(arguments[1].type);
if (second_argument_is_date && ((datepart_kind == IntervalKind::Kind::Hour)
|| (datepart_kind == IntervalKind::Kind::Minute) || (datepart_kind == IntervalKind::Kind::Second)))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type Date of argument for function {}", getName());
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument for function {}", arguments[1].type->getName(), getName());
};
auto check_timezone_argument = [&] {
@ -119,6 +121,8 @@ public:
if (result_type == ResultType::Date)
return std::make_shared<DataTypeDate>();
if (result_type == ResultType::Date32)
return std::make_shared<DataTypeDate32>();
else if (result_type == ResultType::DateTime)
return std::make_shared<DataTypeDateTime>(extractTimeZoneNameFromFunctionArguments(arguments, 2, 1, false));
else

View File

@ -44,9 +44,9 @@ public:
auto check_first_argument = [&]
{
const DataTypePtr & type_arg1 = arguments[0].type;
if (!isDate(type_arg1) && !isDateTime(type_arg1) && !isDateTime64(type_arg1))
if (!isDateOrDate32(type_arg1) && !isDateTime(type_arg1) && !isDateTime64(type_arg1))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of 1st argument of function {}, expected a Date, DateTime or DateTime64",
"Illegal type {} of 1st argument of function {}, expected a Date, Date32, DateTime or DateTime64",
type_arg1->getName(), getName());
value_is_date = isDate(type_arg1);
};
@ -56,6 +56,7 @@ public:
enum class ResultType : uint8_t
{
Date,
Date32,
DateTime,
DateTime64
};
@ -128,6 +129,8 @@ public:
{
case ResultType::Date:
return std::make_shared<DataTypeDate>();
case ResultType::Date32:
return std::make_shared<DataTypeDate32>();
case ResultType::DateTime:
return std::make_shared<DataTypeDateTime>(extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false));
case ResultType::DateTime64:
@ -185,7 +188,13 @@ private:
if (time_column_vec)
return dispatchForIntervalColumn(assert_cast<const DataTypeDate &>(time_column_type), *time_column_vec, interval_column, result_type, time_zone, input_rows_count);
}
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal column for 1st argument of function {}, expected a Date, DateTime or DateTime64", getName());
else if (isDate32(time_column_type))
{
const auto * time_column_vec = checkAndGetColumn<ColumnDate32>(&time_column_col);
if (time_column_vec)
return dispatchForIntervalColumn(assert_cast<const DataTypeDate32 &>(time_column_type), *time_column_vec, interval_column, result_type, time_zone, input_rows_count);
}
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal column for 1st argument of function {}, expected a Date, Date32, DateTime or DateTime64", getName());
}
template <typename TimeDataType, typename TimeColumnType>

View File

@ -1245,6 +1245,13 @@ void AlterCommands::prepare(const StorageInMemoryMetadata & metadata)
{
auto columns = metadata.columns;
auto ast_to_str = [](const ASTPtr & query) -> String
{
if (!query)
return "";
return queryToString(query);
};
for (size_t i = 0; i < size(); ++i)
{
auto & command = (*this)[i];
@ -1277,6 +1284,11 @@ void AlterCommands::prepare(const StorageInMemoryMetadata & metadata)
if (!has_column && command.if_exists)
command.ignore = true;
}
else if (command.type == AlterCommand::MODIFY_ORDER_BY)
{
if (ast_to_str(command.order_by) == ast_to_str(metadata.sorting_key.definition_ast))
command.ignore = true;
}
}
prepared = true;

View File

@ -67,6 +67,11 @@ def test_aggregate_states(start_cluster):
f"select hex(initializeAggregation('{function_name}State', 'foo'))"
).strip()
def get_final_value_unhex(node, function_name, value):
return node.query(
f"select finalizeAggregation(unhex('{value}')::AggregateFunction({function_name}, String))"
).strip()
for aggregate_function in aggregate_functions:
logging.info("Checking %s", aggregate_function)
@ -99,13 +104,39 @@ def test_aggregate_states(start_cluster):
upstream_state = get_aggregate_state_hex(upstream, aggregate_function)
if upstream_state != backward_state:
logging.info(
"Failed %s, %s (backward) != %s (upstream)",
aggregate_function,
backward_state,
upstream_state,
)
failed += 1
allowed_changes_if_result_is_the_same = ["anyHeavy"]
if aggregate_function in allowed_changes_if_result_is_the_same:
backward_final_from_upstream = get_final_value_unhex(
backward, aggregate_function, upstream_state
)
upstream_final_from_backward = get_final_value_unhex(
upstream, aggregate_function, backward_state
)
if backward_final_from_upstream == upstream_final_from_backward:
logging.info(
"OK %s (but different intermediate states)", aggregate_function
)
passed += 1
else:
logging.error(
"Failed %s, Intermediate: %s (backward) != %s (upstream). Final from intermediate: %s (backward from upstream state) != %s (upstream from backward state)",
aggregate_function,
backward_state,
upstream_state,
backward_final_from_upstream,
upstream_final_from_backward,
)
failed += 1
else:
logging.error(
"Failed %s, %s (backward) != %s (upstream)",
aggregate_function,
backward_state,
upstream_state,
)
failed += 1
else:
logging.info("OK %s", aggregate_function)
passed += 1
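The round trip this test performs through `initializeAggregation`/`finalizeAggregation` can be reproduced in plain SQL; a sketch of the same check for a single function:

```sql
-- Serialize an intermediate anyHeavy state to hex, then finalize it again.
WITH hex(initializeAggregation('anyHeavyState', 'foo')) AS state_hex
SELECT finalizeAggregation(CAST(unhex(state_hex), 'AggregateFunction(anyHeavy, String)')) AS final_value;
-- final_value = 'foo'; only the final value (not the byte-for-byte state) must match across versions.
```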

View File

@ -0,0 +1,10 @@
<clickhouse>
<database_atomic_delay_before_drop_table_sec>10</database_atomic_delay_before_drop_table_sec>
<allow_moving_table_directory_to_trash>1</allow_moving_table_directory_to_trash>
<merge_tree>
<initialization_retry_period>10</initialization_retry_period>
</merge_tree>
<max_database_replicated_create_table_thread_pool_size>50</max_database_replicated_create_table_thread_pool_size>
<allow_experimental_transactions>42</allow_experimental_transactions>
<async_load_databases>false</async_load_databases>
</clickhouse>

View File

@ -0,0 +1,11 @@
<clickhouse>
<profiles>
<default>
</default>
</profiles>
<users>
<default>
<profile>default</profile>
</default>
</users>
</clickhouse>

View File

@ -0,0 +1,70 @@
import pytest
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import assert_eq_with_retry
cluster = ClickHouseCluster(__file__)
shard1_node = cluster.add_instance(
"shard1_node",
main_configs=["configs/config.xml"],
user_configs=["configs/settings.xml"],
with_zookeeper=True,
stay_alive=True,
macros={"shard": 1, "replica": 1},
)
shard2_node = cluster.add_instance(
"shard2_node",
main_configs=["configs/config.xml"],
user_configs=["configs/settings.xml"],
with_zookeeper=True,
stay_alive=True,
macros={"shard": 2, "replica": 1},
)
all_nodes = [
shard1_node,
shard2_node,
]
@pytest.fixture(scope="module")
def started_cluster():
try:
cluster.start()
yield cluster
finally:
cluster.shutdown()
def test_alter_modify_order_by(started_cluster):
shard1_node.query("DROP DATABASE IF EXISTS alter_modify_order_by SYNC;")
shard2_node.query("DROP DATABASE IF EXISTS alter_modify_order_by SYNC;")
shard1_node.query(
"CREATE DATABASE alter_modify_order_by ENGINE = Replicated('/test/database/alter_modify_order_by', '{shard}', '{replica}');"
)
shard1_node.query(
"CREATE TABLE alter_modify_order_by.t1 (id Int64, score Int64) ENGINE = ReplicatedMergeTree('/test/tables/{uuid}/{shard}', '{replica}') ORDER BY (id);"
)
shard1_node.query("ALTER TABLE alter_modify_order_by.t1 modify order by (id);")
shard2_node.query(
"CREATE DATABASE alter_modify_order_by ENGINE = Replicated('/test/database/alter_modify_order_by', '{shard}', '{replica}');"
)
query = (
"select count() from system.tables where database = 'alter_modify_order_by';"
)
expected = shard1_node.query(query)
assert_eq_with_retry(shard2_node, query, expected)
query = "show create table alter_modify_order_by.t1;"
assert shard1_node.query(query) == shard2_node.query(query)
shard1_node.query("DROP DATABASE IF EXISTS alter_modify_order_by SYNC;")
shard2_node.query("DROP DATABASE IF EXISTS alter_modify_order_by SYNC;")

View File

@ -0,0 +1 @@
a

View File

@ -0,0 +1,4 @@
DROP TABLE IF EXISTS t;
CREATE TABLE t (letter String) ENGINE=MergeTree order by () partition by letter;
INSERT INTO t VALUES ('a'), ('a'), ('a'), ('a'), ('b'), ('a'), ('a'), ('a'), ('a'), ('a'), ('a'), ('a'), ('a'), ('a'), ('a'), ('a'), ('c');
SELECT anyHeavy(if(letter != 'b', letter, NULL)) FROM t;

View File

@ -0,0 +1,33 @@
-- { echoOn }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 YEAR);
2022-01-01
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 QUARTER);
2022-07-01
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 MONTH);
2022-09-01
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 WEEK);
2022-09-12
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 DAY);
2022-09-16 00:00:00
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 HOUR); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 MINUTE); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 SECOND); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 MILLISECOND); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 MICROSECOND); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 NANOSECOND); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('YEAR', toDate32('2022-09-16'));
2022-01-01
select date_trunc('QUARTER', toDate32('2022-09-16'));
2022-07-01
select date_trunc('MONTH', toDate32('2022-09-16'));
2022-09-01
select date_trunc('WEEK', toDate32('2022-09-16'));
2022-09-12
select date_trunc('DAY', toDate32('2022-09-16'));
2022-09-16 00:00:00
select date_trunc('HOUR', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('MINUTE', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('SECOND', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('MILLISECOND', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('MICROSECOND', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('NANOSECOND', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }

View File

@ -0,0 +1,26 @@
-- { echoOn }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 YEAR);
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 QUARTER);
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 MONTH);
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 WEEK);
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 DAY);
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 HOUR); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 MINUTE); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 SECOND); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 MILLISECOND); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 MICROSECOND); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select toStartOfInterval(toDate32('2022-09-16'), INTERVAL 1 NANOSECOND); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('YEAR', toDate32('2022-09-16'));
select date_trunc('QUARTER', toDate32('2022-09-16'));
select date_trunc('MONTH', toDate32('2022-09-16'));
select date_trunc('WEEK', toDate32('2022-09-16'));
select date_trunc('DAY', toDate32('2022-09-16'));
select date_trunc('HOUR', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('MINUTE', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('SECOND', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('MILLISECOND', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('MICROSECOND', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select date_trunc('NANOSECOND', toDate32('2022-09-16')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }

View File

@ -0,0 +1,15 @@
initial
A C
simple copy
A C D
node-A
simple move
A C H
node-A
move node with childs -- must be error
Transaction failed (Not empty): Op #2, path: /test-keeper-client-default/A
A C H
move node to existing
Transaction failed (Node exists): Op #1, path: /test-keeper-client-default/A
A C H
clean up

View File

@ -0,0 +1,37 @@
#!/usr/bin/env bash
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
path="/test-keeper-client-$CLICKHOUSE_DATABASE"
$CLICKHOUSE_KEEPER_CLIENT -q "rm '$path'" >& /dev/null
$CLICKHOUSE_KEEPER_CLIENT -q "create '$path' 'root'"
$CLICKHOUSE_KEEPER_CLIENT -q "create '$path/A' 'node-A'"
$CLICKHOUSE_KEEPER_CLIENT -q "create '$path/A/B' 'node-B'"
$CLICKHOUSE_KEEPER_CLIENT -q "create '$path/C' 'node-B'"
echo 'initial'
$CLICKHOUSE_KEEPER_CLIENT -q "ls '$path'"
echo 'simple copy'
$CLICKHOUSE_KEEPER_CLIENT -q "cp '$path/A' '$path/D'"
$CLICKHOUSE_KEEPER_CLIENT -q "ls '$path'"
$CLICKHOUSE_KEEPER_CLIENT -q "get '$path/D'"
echo 'simple move'
$CLICKHOUSE_KEEPER_CLIENT -q "mv '$path/D' '$path/H'"
$CLICKHOUSE_KEEPER_CLIENT -q "ls '$path'"
$CLICKHOUSE_KEEPER_CLIENT -q "get '$path/H'"
echo 'move node with childs -- must be error'
$CLICKHOUSE_KEEPER_CLIENT -q "mv '$path/A' '$path/ERROR'" 2>&1
$CLICKHOUSE_KEEPER_CLIENT -q "ls '$path'"
echo 'move node to existing'
$CLICKHOUSE_KEEPER_CLIENT -q "mv '$path/C' '$path/A'" 2>&1
$CLICKHOUSE_KEEPER_CLIENT -q "ls '$path'"
echo 'clean up'
$CLICKHOUSE_KEEPER_CLIENT -q "rmr '$path'"