diff --git a/.gitmodules b/.gitmodules
index 151dc28c55b..ba71a8ae3a7 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -258,9 +258,6 @@
 [submodule "contrib/wyhash"]
     path = contrib/wyhash
     url = https://github.com/wangyi-fudan/wyhash
-[submodule "contrib/hashidsxx"]
-    path = contrib/hashidsxx
-    url = https://github.com/schoentoon/hashidsxx
 [submodule "contrib/nats-io"]
     path = contrib/nats-io
     url = https://github.com/ClickHouse/nats.c
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index 2af468970f1..0f68c0cbc7c 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -164,7 +164,6 @@
 add_contrib (libpq-cmake libpq)
 add_contrib (nuraft-cmake NuRaft)
 add_contrib (fast_float-cmake fast_float)
 add_contrib (datasketches-cpp-cmake datasketches-cpp)
-add_contrib (hashidsxx-cmake hashidsxx)
 
 option(ENABLE_NLP "Enable NLP functions support" ${ENABLE_LIBRARIES})
 if (ENABLE_NLP)
diff --git a/contrib/hashidsxx b/contrib/hashidsxx
deleted file mode 160000
index 783f6911ccf..00000000000
--- a/contrib/hashidsxx
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 783f6911ccfdaca83e3cfac084c4aad888a80cee
diff --git a/contrib/hashidsxx-cmake/CMakeLists.txt b/contrib/hashidsxx-cmake/CMakeLists.txt
deleted file mode 100644
index 17f3888bd94..00000000000
--- a/contrib/hashidsxx-cmake/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/hashidsxx")
-
-set (SRCS
-    "${LIBRARY_DIR}/hashids.cpp"
-)
-
-set (HDRS
-    "${LIBRARY_DIR}/hashids.h"
-)
-
-add_library(_hashidsxx ${SRCS} ${HDRS})
-target_include_directories(_hashidsxx SYSTEM PUBLIC "${LIBRARY_DIR}")
-
-add_library(ch_contrib::hashidsxx ALIAS _hashidsxx)
diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh
index 828c73e6781..e25b5fdbfed 100755
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@@ -141,7 +141,6 @@ function clone_submodules
     contrib/jemalloc
     contrib/replxx
     contrib/wyhash
-    contrib/hashidsxx
     contrib/c-ares
     contrib/morton-nd
     contrib/xxHash
diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md
index c6b978506a1..87d84425029 100644
--- a/docs/en/sql-reference/functions/date-time-functions.md
+++ b/docs/en/sql-reference/functions/date-time-functions.md
@@ -1449,7 +1449,7 @@ Using replacement fields, you can define a pattern for the resulting string. “Example” column shows formatting result for `2018-01-02 22:33:44`.
 | %n | new-line character (‘’) | |
 | %p | AM or PM designation | PM |
 | %Q | Quarter (1-4) | 1 |
-| %r | 12-hour HH:MM AM/PM time, equivalent to %H:%i %p | 10:30 PM |
+| %r | 12-hour HH:MM AM/PM time, equivalent to %h:%i %p | 10:30 PM |
 | %R | 24-hour HH:MM time, equivalent to %H:%i | 22:33 |
 | %s | second (00-59) | 44 |
 | %S | second (00-59) | 44 |
diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md
index 556fe622c27..06097d92480 100644
--- a/docs/en/sql-reference/functions/hash-functions.md
+++ b/docs/en/sql-reference/functions/hash-functions.md
@@ -51,7 +51,7 @@ Calculates the MD5 from a string and returns the resulting set of bytes as FixedString(16).
 If you do not need MD5 in particular, but you need a decent cryptographic 128-bit hash, use the ‘sipHash128’ function instead.
 If you want to get the same result as output by the md5sum utility, use lower(hex(MD5(s))).
 
-## sipHash64 {#hash_functions-siphash64}
+## sipHash64 {#hash_functions-siphash64}
 
 Produces a 64-bit [SipHash](https://en.wikipedia.org/wiki/SipHash) hash value.
@@ -63,9 +63,9 @@ This is a cryptographic hash function. It works at least three times faster than the MD5 hash function.
 
 The function [interprets](/docs/en/sql-reference/functions/type-conversion-functions.md/#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the hash value for each of them. It then combines the hashes by the following algorithm:
 
-1. The first and the second hash value are concatenated to an array which is hashed.
-2. The previously calculated hash value and the hash of the third input parameter are hashed in a similar way.
-3. This calculation is repeated for all remaining hash values of the original input.
+1. The first and the second hash value are concatenated to an array which is hashed.
+2. The previously calculated hash value and the hash of the third input parameter are hashed in a similar way.
+3. This calculation is repeated for all remaining hash values of the original input.
 
 **Arguments**
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp
index 9202d4b32c1..33fdcc9c1a8 100644
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@@ -1637,17 +1637,26 @@ try
         global_context->initializeTraceCollector();
 
         /// Set up server-wide memory profiler (for total memory tracker).
-        UInt64 total_memory_profiler_step = config().getUInt64("total_memory_profiler_step", 0);
-        if (total_memory_profiler_step)
+        if (server_settings.total_memory_profiler_step)
         {
-            total_memory_tracker.setProfilerStep(total_memory_profiler_step);
+            total_memory_tracker.setProfilerStep(server_settings.total_memory_profiler_step);
         }
 
-        double total_memory_tracker_sample_probability = config().getDouble("total_memory_tracker_sample_probability", 0);
-        if (total_memory_tracker_sample_probability > 0.0)
+        if (server_settings.total_memory_tracker_sample_probability > 0.0)
         {
-            total_memory_tracker.setSampleProbability(total_memory_tracker_sample_probability);
+            total_memory_tracker.setSampleProbability(server_settings.total_memory_tracker_sample_probability);
         }
+
+        if (server_settings.total_memory_profiler_sample_min_allocation_size)
+        {
+            total_memory_tracker.setSampleMinAllocationSize(server_settings.total_memory_profiler_sample_min_allocation_size);
+        }
+
+        if (server_settings.total_memory_profiler_sample_max_allocation_size)
+        {
+            total_memory_tracker.setSampleMaxAllocationSize(server_settings.total_memory_profiler_sample_max_allocation_size);
+        }
+
     }
 #endif
diff --git a/src/Common/CurrentThread.cpp b/src/Common/CurrentThread.cpp
index 057b1eeda12..ac5b712279e 100644
--- a/src/Common/CurrentThread.cpp
+++ b/src/Common/CurrentThread.cpp
@@ -3,7 +3,6 @@
 #include "CurrentThread.h"
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp
index 81cac2617c5..52cae0768dc 100644
--- a/src/Common/MemoryTracker.cpp
+++ b/src/Common/MemoryTracker.cpp
@@ -229,7 +229,7 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryTracker * query_tracker)
     }
 
     std::bernoulli_distribution sample(sample_probability);
-    if (unlikely(sample_probability > 0.0 && sample(thread_local_rng)))
+    if (unlikely(sample_probability > 0.0 && isSizeOkForSampling(size) && sample(thread_local_rng)))
     {
         MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global);
         DB::TraceSender::send(DB::TraceType::MemorySample, StackTrace(), {.size = size});
@@ -413,7 +413,7 @@ void MemoryTracker::free(Int64 size)
     }
 
     std::bernoulli_distribution sample(sample_probability);
-    if (unlikely(sample_probability > 0.0 && sample(thread_local_rng)))
+    if (unlikely(sample_probability > 0.0 && isSizeOkForSampling(size) && sample(thread_local_rng)))
     {
         MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global);
         DB::TraceSender::send(DB::TraceType::MemorySample, StackTrace(), {.size = -size});
@@ -534,6 +534,12 @@ void MemoryTracker::setOrRaiseProfilerLimit(Int64 value)
         ;
 }
 
+bool MemoryTracker::isSizeOkForSampling(UInt64 size) const
+{
+    /// We can avoid comparing min_allocation_size_bytes with zero, because we cannot have a 0-byte allocation/deallocation
+    return ((max_allocation_size_bytes == 0 || size <= max_allocation_size_bytes) && size >= min_allocation_size_bytes);
+}
+
 bool canEnqueueBackgroundTask()
 {
     auto limit = background_memory_tracker.getSoftLimit();
diff --git a/src/Common/MemoryTracker.h b/src/Common/MemoryTracker.h
index 4e29d40c953..768dc8a7404 100644
--- a/src/Common/MemoryTracker.h
+++ b/src/Common/MemoryTracker.h
@@ -67,6 +67,12 @@ private:
     /// To randomly sample allocations and deallocations in trace_log.
     double sample_probability = 0;
 
+    /// Randomly sample allocations only larger than or equal to this size
+    UInt64 min_allocation_size_bytes = 0;
+
+    /// Randomly sample allocations only smaller than or equal to this size
+    UInt64 max_allocation_size_bytes = 0;
+
     /// Singly-linked list. All information will be passed to subsequent memory trackers also (it allows to implement trackers hierarchy).
     /// In terms of tree nodes it is the list of parents. Lifetime of these trackers should "include" lifetime of current tracker.
     std::atomic<MemoryTracker *> parent {};
@@ -88,6 +94,8 @@ private:
     void setOrRaiseProfilerLimit(Int64 value);
 
+    bool isSizeOkForSampling(UInt64 size) const;
+
     /// allocImpl(...) and free(...) should not be used directly
     friend struct CurrentMemoryTracker;
     void allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryTracker * query_tracker = nullptr);
@@ -165,6 +173,16 @@ public:
         sample_probability = value;
     }
 
+    void setSampleMinAllocationSize(UInt64 value)
+    {
+        min_allocation_size_bytes = value;
+    }
+
+    void setSampleMaxAllocationSize(UInt64 value)
+    {
+        max_allocation_size_bytes = value;
+    }
+
     void setProfilerStep(Int64 value)
     {
         profiler_step = value;
diff --git a/src/Common/TaskStatsInfoGetter.cpp b/src/Common/NetlinkMetricsProvider.cpp
similarity index 93%
rename from src/Common/TaskStatsInfoGetter.cpp
rename to src/Common/NetlinkMetricsProvider.cpp
index 867a50c8cce..4c228bcc6fc 100644
--- a/src/Common/TaskStatsInfoGetter.cpp
+++ b/src/Common/NetlinkMetricsProvider.cpp
@@ -1,4 +1,4 @@
-#include "TaskStatsInfoGetter.h"
+#include "NetlinkMetricsProvider.h"
 #include
 #include
 #include
@@ -200,7 +200,7 @@ bool checkPermissionsImpl()
     if (!res)
         return false;
 
-    /// Check that we can successfully initialize TaskStatsInfoGetter.
+    /// Check that we can successfully initialize NetlinkMetricsProvider.
     /// It will ask about family id through Netlink.
     /// On some LXC containers we have capability but we still cannot use Netlink.
    /// There is evidence that Linux fedora-riscv 6.1.22 gives something strange instead of the expected result.
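For reference, here is a minimal standalone sketch of the sampling gate that the MemoryTracker hunks above introduce; the names below are illustrative, not ClickHouse API. An allocation is reported to trace_log only when it lies inside the configured [min, max] byte range (a max of 0 meaning "no upper bound") and then wins the probability draw:

#include <cstdint>
#include <random>

static thread_local std::mt19937_64 sampling_rng{std::random_device{}()};

/// Mirrors isSizeOkForSampling() combined with the bernoulli draw in allocImpl()/free().
bool shouldSampleAllocation(uint64_t size, uint64_t min_bytes, uint64_t max_bytes, double probability)
{
    if (probability <= 0.0)
        return false;
    if (size < min_bytes)
        return false;                        /// below the lower bound: never sampled
    if (max_bytes != 0 && size > max_bytes)
        return false;                        /// above the upper bound: never sampled
    return std::bernoulli_distribution(probability)(sampling_rng);
}

Checking the size bounds before drawing keeps the common path cheap: out-of-range allocations skip the random draw entirely.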
@@ -208,7 +208,7 @@ bool checkPermissionsImpl()
     try
     {
         ::taskstats stats{};
-        TaskStatsInfoGetter().getStat(stats, static_cast<pid_t>(getThreadId()));
+        NetlinkMetricsProvider().getStat(stats, static_cast<pid_t>(getThreadId()));
     }
     catch (const Exception & e)
     {
@@ -244,14 +244,14 @@ UInt16 getFamilyId(int fd)
 }
 
-bool TaskStatsInfoGetter::checkPermissions()
+bool NetlinkMetricsProvider::checkPermissions()
 {
     static bool res = checkPermissionsImpl();
     return res;
 }
 
-TaskStatsInfoGetter::TaskStatsInfoGetter()
+NetlinkMetricsProvider::NetlinkMetricsProvider()
 {
     netlink_socket_fd = ::socket(PF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
     if (netlink_socket_fd < 0)
@@ -293,7 +293,7 @@ TaskStatsInfoGetter::TaskStatsInfoGetter()
 }
 
-void TaskStatsInfoGetter::getStat(::taskstats & out_stats, pid_t tid) const
+void NetlinkMetricsProvider::getStat(::taskstats & out_stats, pid_t tid) const
 {
     NetlinkMessage answer = query(netlink_socket_fd, taskstats_family_id, tid, TASKSTATS_CMD_GET, TASKSTATS_CMD_ATTR_PID, &tid, sizeof(tid));
 
@@ -318,7 +318,7 @@ void TaskStatsInfoGetter::getStat(::taskstats & out_stats, pid_t tid) const
 }
 
-TaskStatsInfoGetter::~TaskStatsInfoGetter()
+NetlinkMetricsProvider::~NetlinkMetricsProvider()
 {
     if (netlink_socket_fd >= 0)
     {
@@ -335,15 +335,15 @@ TaskStatsInfoGetter::~TaskStatsInfoGetter()
 namespace DB
 {
 
-bool TaskStatsInfoGetter::checkPermissions()
+bool NetlinkMetricsProvider::checkPermissions()
 {
     return false;
 }
 
-TaskStatsInfoGetter::TaskStatsInfoGetter() = default;
-TaskStatsInfoGetter::~TaskStatsInfoGetter() = default;
+NetlinkMetricsProvider::NetlinkMetricsProvider() = default;
+NetlinkMetricsProvider::~NetlinkMetricsProvider() = default;
 
-void TaskStatsInfoGetter::getStat(::taskstats &, pid_t) const
+void NetlinkMetricsProvider::getStat(::taskstats &, pid_t) const
 {
 }
diff --git a/src/Common/TaskStatsInfoGetter.h b/src/Common/NetlinkMetricsProvider.h
similarity index 85%
rename from src/Common/TaskStatsInfoGetter.h
rename to src/Common/NetlinkMetricsProvider.h
index 66655d7ad0d..8a54f33be80 100644
--- a/src/Common/TaskStatsInfoGetter.h
+++ b/src/Common/NetlinkMetricsProvider.h
@@ -15,11 +15,11 @@ namespace DB
 ///
 /// [1]: https://elixir.bootlin.com/linux/v5.18-rc4/source/kernel/tsacct.c#L101
 ///
-class TaskStatsInfoGetter : private boost::noncopyable
+class NetlinkMetricsProvider : private boost::noncopyable
 {
 public:
-    TaskStatsInfoGetter();
-    ~TaskStatsInfoGetter();
+    NetlinkMetricsProvider();
+    ~NetlinkMetricsProvider();
 
     void getStat(::taskstats & out_stats, pid_t tid) const;
diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp
index c542945c78d..0b80e2f3f97 100644
--- a/src/Common/OptimizedRegularExpression.cpp
+++ b/src/Common/OptimizedRegularExpression.cpp
@@ -1,6 +1,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #define MIN_LENGTH_FOR_STRSTR 3
@@ -50,6 +51,8 @@ const char * analyzeImpl(
     bool & is_trivial,
     Literals & global_alternatives)
 {
+    checkStackSize();
+
     /** The expression is trivial if all the metacharacters in it are escaped.
      * The non-alternative string is
      * a string outside parentheses,
diff --git a/src/Common/ThreadProfileEvents.cpp b/src/Common/ThreadProfileEvents.cpp
index a94fd81559a..256f53df011 100644
--- a/src/Common/ThreadProfileEvents.cpp
+++ b/src/Common/ThreadProfileEvents.cpp
@@ -2,7 +2,7 @@
 
 #if defined(OS_LINUX)
 
-#include "TaskStatsInfoGetter.h"
+#include "NetlinkMetricsProvider.h"
 #include "ProcfsMetricsProvider.h"
 #include "hasLinuxCapability.h"
 
@@ -99,7 +99,7 @@ TasksStatsCounters::MetricsProvider TasksStatsCounters::findBestAvailableProvider()
     static std::optional<MetricsProvider> provider =
         []() -> MetricsProvider
     {
-        if (TaskStatsInfoGetter::checkPermissions())
+        if (NetlinkMetricsProvider::checkPermissions())
         {
             return MetricsProvider::Netlink;
         }
@@ -119,7 +119,7 @@ TasksStatsCounters::TasksStatsCounters(const UInt64 tid, const MetricsProvider provider)
     switch (provider)
     {
     case MetricsProvider::Netlink:
-        stats_getter = [metrics_provider = std::make_shared<TaskStatsInfoGetter>(), tid]()
+        stats_getter = [metrics_provider = std::make_shared<NetlinkMetricsProvider>(), tid]()
         {
             ::taskstats result{};
             metrics_provider->getStat(result, static_cast<pid_t>(tid));
diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h
index 1a9f226041b..f7a6c9e950e 100644
--- a/src/Core/ServerSettings.h
+++ b/src/Core/ServerSettings.h
@@ -81,8 +81,12 @@ namespace DB
     M(UInt64, background_schedule_pool_size, 128, "The maximum number of threads that will be used for constantly executing some lightweight periodic operations.", 0) \
     M(UInt64, background_message_broker_schedule_pool_size, 16, "The maximum number of threads that will be used for executing background operations for message streaming.", 0) \
     M(UInt64, background_distributed_schedule_pool_size, 16, "The maximum number of threads that will be used for executing distributed sends.", 0) \
-    M(Bool, display_secrets_in_show_and_select, false, "Allow showing secrets in SHOW and SELECT queries via a format setting and a grant", 0)
-
+    M(Bool, display_secrets_in_show_and_select, false, "Allow showing secrets in SHOW and SELECT queries via a format setting and a grant", 0) \
+    \
+    M(UInt64, total_memory_profiler_step, 0, "Whenever server memory usage becomes larger than every next step in number of bytes the memory profiler will collect the allocating stack trace. Zero means disabled memory profiler. Values lower than a few megabytes will slow down server.", 0) \
+    M(Double, total_memory_tracker_sample_probability, 0, "Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless of the size of the allocation (can be changed with `memory_profiler_sample_min_allocation_size` and `memory_profiler_sample_max_allocation_size`). Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. You may want to set 'max_untracked_memory' to 0 for extra fine-grained sampling.", 0) \
+    M(UInt64, total_memory_profiler_sample_min_allocation_size, 0, "Collect random allocations of size greater than or equal to the specified value with probability equal to `total_memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold work as expected.", 0) \
+    M(UInt64, total_memory_profiler_sample_max_allocation_size, 0, "Collect random allocations of size less than or equal to the specified value with probability equal to `total_memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold work as expected.", 0)
 
 DECLARE_SETTINGS_TRAITS(ServerSettingsTraits, SERVER_SETTINGS)
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index cfcb56729d2..68896b33068 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -427,7 +427,9 @@ class IColumn;
     M(UInt64, memory_overcommit_ratio_denominator_for_user, 1_GiB, "It represents soft memory limit on the global level. This value is used to compute query overcommit ratio.", 0) \
     M(UInt64, max_untracked_memory, (4 * 1024 * 1024), "Small allocations and deallocations are grouped in thread local variable and tracked or profiled only when amount (in absolute value) becomes larger than specified value. If the value is higher than 'memory_profiler_step' it will be effectively lowered to 'memory_profiler_step'.", 0) \
     M(UInt64, memory_profiler_step, (4 * 1024 * 1024), "Whenever query memory usage becomes larger than every next step in number of bytes the memory profiler will collect the allocating stack trace. Zero means disabled memory profiler. Values lower than a few megabytes will slow down query processing.", 0) \
-    M(Float, memory_profiler_sample_probability, 0., "Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless of the size of the allocation. Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. You may want to set 'max_untracked_memory' to 0 for extra fine-grained sampling.", 0) \
+    M(Float, memory_profiler_sample_probability, 0., "Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless of the size of the allocation (can be changed with `memory_profiler_sample_min_allocation_size` and `memory_profiler_sample_max_allocation_size`). Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. You may want to set 'max_untracked_memory' to 0 for extra fine-grained sampling.", 0) \
+    M(UInt64, memory_profiler_sample_min_allocation_size, 0, "Collect random allocations of size greater than or equal to the specified value with probability equal to `memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold work as expected.", 0) \
+    M(UInt64, memory_profiler_sample_max_allocation_size, 0, "Collect random allocations of size less than or equal to the specified value with probability equal to `memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold work as expected.", 0) \
     M(Bool, trace_profile_events, false, "Send to system.trace_log profile event and value of increment on each increment with 'ProfileEvent' trace_type", 0) \
     \
     M(UInt64, memory_usage_overcommit_max_wait_microseconds, 5'000'000, "Maximum time thread will wait for memory to be freed in the case of memory overcommit. 
If timeout is reached and memory is not freed, exception is thrown.", 0) \ @@ -761,7 +763,7 @@ class IColumn; /** Experimental functions */ \ M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \ M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \ - M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions (hashid, etc)", 0) \ + M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \ diff --git a/src/Disks/IO/ThreadPoolReader.cpp b/src/Disks/IO/ThreadPoolReader.cpp index effa19bc1af..cd3f2d8dea0 100644 --- a/src/Disks/IO/ThreadPoolReader.cpp +++ b/src/Disks/IO/ThreadPoolReader.cpp @@ -114,7 +114,7 @@ std::future ThreadPoolReader::submit(Request reques /// It reports real time spent including the time spent while thread was preempted doing nothing. /// And it is Ok for the purpose of this watch (it is used to lower the number of threads to read from tables). /// Sometimes it is better to use taskstats::blkio_delay_total, but it is quite expensive to get it - /// (TaskStatsInfoGetter has about 500K RPS). + /// (NetlinkMetricsProvider has about 500K RPS). Stopwatch watch(CLOCK_MONOTONIC); SCOPE_EXIT({ diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index 2f5c8a212f2..06436488050 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -21,7 +21,6 @@ list (APPEND PUBLIC_LIBS dbms ch_contrib::metrohash ch_contrib::murmurhash - ch_contrib::hashidsxx ch_contrib::morton_nd ) diff --git a/src/Functions/CountSubstringsImpl.h b/src/Functions/CountSubstringsImpl.h index de00e9397d6..8ba9ee99de8 100644 --- a/src/Functions/CountSubstringsImpl.h +++ b/src/Functions/CountSubstringsImpl.h @@ -49,6 +49,9 @@ struct CountSubstringsImpl /// FIXME: suboptimal memset(&res[0], 0, res.size() * sizeof(res[0])); + if (needle.empty()) + return; // Return all zeros + /// Current index in the array of strings. 
        size_t i = 0;
@@ -223,16 +226,19 @@ struct CountSubstringsImpl
             const char * needle_beg = reinterpret_cast<const char *>(&needle_data[prev_needle_offset]);
             size_t needle_size = needle_offsets[i] - prev_needle_offset - 1;
 
-            typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(needle_beg, needle_size);
-
-            const UInt8 * end = reinterpret_cast<const UInt8 *>(haystack.data() + haystack.size());
-            const UInt8 * beg = reinterpret_cast<const UInt8 *>(Impl::advancePos(haystack.data(), reinterpret_cast<const char *>(end), start - 1));
-
-            const UInt8 * pos;
-            while ((pos = searcher.search(beg, end)) < end)
+            if (needle_size > 0)
             {
-                ++res[i];
-                beg = pos + needle_size;
+                typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(needle_beg, needle_size);
+
+                const UInt8 * end = reinterpret_cast<const UInt8 *>(haystack.data() + haystack.size());
+                const UInt8 * beg = reinterpret_cast<const UInt8 *>(Impl::advancePos(haystack.data(), reinterpret_cast<const char *>(end), start - 1));
+
+                const UInt8 * pos;
+                while ((pos = searcher.search(beg, end)) < end)
+                {
+                    ++res[i];
+                    beg = pos + needle_size;
+                }
             }
         }
diff --git a/src/Functions/FunctionHashID.cpp b/src/Functions/FunctionHashID.cpp
deleted file mode 100644
index 829b3d9d2f6..00000000000
--- a/src/Functions/FunctionHashID.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include "FunctionHashID.h"
-#include
-
-namespace DB
-{
-
-REGISTER_FUNCTION(HashID)
-{
-    factory.registerFunction<FunctionHashID>();
-}
-
-}
diff --git a/src/Functions/FunctionHashID.h b/src/Functions/FunctionHashID.h
deleted file mode 100644
index 680c3f6430b..00000000000
--- a/src/Functions/FunctionHashID.h
+++ /dev/null
@@ -1,170 +0,0 @@
-#pragma once
-
-#include "config.h"
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-namespace DB
-{
-
-namespace ErrorCodes
-{
-    extern const int BAD_ARGUMENTS;
-    extern const int ILLEGAL_COLUMN;
-    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
-    extern const int SUPPORT_IS_DISABLED;
-    extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION;
-    extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
-}
-
-// hashid(string, salt)
-class FunctionHashID : public IFunction
-{
-public:
-    static constexpr auto name = "hashid";
-
-    static FunctionPtr create(ContextPtr context)
-    {
-        if (!context->getSettingsRef().allow_experimental_hash_functions)
-            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
-                "Hashing function '{}' is experimental. 
Set `allow_experimental_hash_functions` setting to enable it", name); - - return std::make_shared(); - } - - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 0; } - - bool isVariadic() const override { return true; } - - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - - bool useDefaultImplementationForConstants() const override { return true; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3}; } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - if (arguments.empty()) - throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} expects at least one argument", getName()); - - const auto & id_col = arguments[0]; - if (!isUnsignedInteger(id_col.type)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "First argument of function {} must be unsigned integer, got {}", - getName(), - arguments[0].type->getName()); - - if (arguments.size() > 1) - { - const auto & hash_col = arguments[1]; - if (!isString(hash_col.type)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Second argument of function {} must be String, got {}", - getName(), - arguments[1].type->getName()); - } - - if (arguments.size() > 2) - { - const auto & min_length_col = arguments[2]; - if (!isUInt8(min_length_col.type)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Third argument of function {} must be UInt8, got {}", - getName(), - arguments[2].type->getName()); - } - - if (arguments.size() > 3) - { - const auto & alphabet_col = arguments[3]; - if (!isString(alphabet_col.type)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Fourth argument of function {} must be String, got {}", - getName(), - arguments[3].type->getName()); - } - - if (arguments.size() > 4) - { - throw Exception( - ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, - "Function {} expect no more than four arguments (integer, salt, min_length, optional_alphabet), got {}", - getName(), - arguments.size()); - } - - return std::make_shared(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override - { - const auto & numcolumn = arguments[0].column; - - if (checkAndGetColumn(numcolumn.get()) || checkAndGetColumn(numcolumn.get()) - || checkAndGetColumn(numcolumn.get()) || checkAndGetColumn(numcolumn.get())) - { - std::string salt; - UInt8 min_length = 0; - std::string alphabet; - - if (arguments.size() >= 4) - { - const auto & alphabetcolumn = arguments[3].column; - if (const auto * alpha_col = checkAndGetColumnConst(alphabetcolumn.get())) - { - alphabet = alpha_col->getValue(); - if (alphabet.find('\0') != std::string::npos) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Custom alphabet must not contain null character"); - } - } - else - alphabet.assign(DEFAULT_ALPHABET); - - if (arguments.size() >= 3) - { - const auto & minlengthcolumn = arguments[2].column; - if (const auto * min_length_col = checkAndGetColumnConst(minlengthcolumn.get())) - min_length = min_length_col->getValue(); - } - - if (arguments.size() >= 2) - { - const auto & saltcolumn = arguments[1].column; - if (const auto * salt_col = checkAndGetColumnConst(saltcolumn.get())) - salt = salt_col->getValue(); - } - - hashidsxx::Hashids hash(salt, min_length, alphabet); - - auto col_res = ColumnString::create(); - - for (size_t i = 0; i < 
input_rows_count; ++i) - { - col_res->insert(hash.encode({numcolumn->getUInt(i)})); - } - - return col_res; - } - else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function hashid", - arguments[0].column->getName()); - } -}; - -} diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 82944630b10..6af683777c3 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -79,51 +79,28 @@ namespace impl UInt64 key1 = 0; }; - struct SipHashKeyColumns + static SipHashKey parseSipHashKey(const ColumnWithTypeAndName & key) { - ColumnPtr key0; - ColumnPtr key1; - bool is_const; + SipHashKey ret{}; - size_t size() const - { - assert(key0 && key1); - assert(key0->size() == key1->size()); - return key0->size(); - } - SipHashKey getKey(size_t i) const - { - if (is_const) - i = 0; - const auto & key0data = assert_cast(*key0).getData(); - const auto & key1data = assert_cast(*key1).getData(); - return {key0data[i], key1data[i]}; - } - }; - - static SipHashKeyColumns parseSipHashKeyColumns(const ColumnWithTypeAndName & key) - { - const ColumnTuple * tuple = nullptr; - const auto * column = key.column.get(); - bool is_const = false; - if (isColumnConst(*column)) - { - is_const = true; - tuple = checkAndGetColumnConstData(column); - } - else - tuple = checkAndGetColumn(column); + const auto * tuple = checkAndGetColumn(key.column.get()); if (!tuple) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "key must be a tuple"); + if (tuple->tupleSize() != 2) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "wrong tuple size: key must be a tuple of 2 UInt64"); - SipHashKeyColumns ret{tuple->getColumnPtr(0), tuple->getColumnPtr(1), is_const}; - assert(ret.key0); - if (!checkColumn(*ret.key0)) + if (tuple->empty()) + return ret; + + if (const auto * key0col = checkAndGetColumn(&(tuple->getColumn(0)))) + ret.key0 = key0col->get64(0); + else throw Exception(ErrorCodes::NOT_IMPLEMENTED, "first element of the key tuple is not UInt64"); - assert(ret.key1); - if (!checkColumn(*ret.key1)) + + if (const auto * key1col = checkAndGetColumn(&(tuple->getColumn(1)))) + ret.key1 = key1col->get64(0); + else throw Exception(ErrorCodes::NOT_IMPLEMENTED, "second element of the key tuple is not UInt64"); return ret; @@ -352,10 +329,8 @@ struct SipHash64KeyedImpl static constexpr auto name = "sipHash64Keyed"; using ReturnType = UInt64; using Key = impl::SipHashKey; - using KeyColumns = impl::SipHashKeyColumns; - static KeyColumns parseKeyColumns(const ColumnWithTypeAndName & key) { return impl::parseSipHashKeyColumns(key); } - static Key getKey(const KeyColumns & key, size_t i) { return key.getKey(i); } + static Key parseKey(const ColumnWithTypeAndName & key) { return impl::parseSipHashKey(key); } static UInt64 applyKeyed(const Key & key, const char * begin, size_t size) { return sipHash64Keyed(key.key0, key.key1, begin, size); } @@ -396,10 +371,8 @@ struct SipHash128KeyedImpl static constexpr auto name = "sipHash128Keyed"; using ReturnType = UInt128; using Key = impl::SipHashKey; - using KeyColumns = impl::SipHashKeyColumns; - static KeyColumns parseKeyColumns(const ColumnWithTypeAndName & key) { return impl::parseSipHashKeyColumns(key); } - static Key getKey(const KeyColumns & key, size_t i) { return key.getKey(i); } + static Key parseKey(const ColumnWithTypeAndName & key) { return impl::parseSipHashKey(key); } static UInt128 applyKeyed(const Key & key, const char * begin, size_t size) { return sipHash128Keyed(key.key0, key.key1, begin, size); } 
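For reference, a minimal sketch of the contract that the parseSipHashKey() revert above restores: the (key0, key1) pair is read once from row 0 of the key tuple and treated as a constant for the whole block, instead of being re-read per row. Types and names here are simplified stand-ins, not ClickHouse API:

#include <cstdint>
#include <stdexcept>
#include <vector>

struct SipHashKey { uint64_t key0 = 0; uint64_t key1 = 0; };

/// "columns" stands in for the two UInt64 columns of the key tuple;
/// both inner vectors are assumed to have the same length.
SipHashKey parseConstantKey(const std::vector<std::vector<uint64_t>> & columns)
{
    if (columns.size() != 2)
        throw std::invalid_argument("wrong tuple size: key must be a tuple of 2 UInt64");

    SipHashKey ret;
    if (columns[0].empty())
        return ret;            /// empty block: fall back to the default (0, 0) key

    ret.key0 = columns[0][0];  /// only row 0 is consulted, so the key is constant
    ret.key1 = columns[1][0];
    return ret;
}

Reading only row 0 is what makes the key a per-query constant rather than a per-row value; the per-row KeyColumns machinery removed above existed to support the latter.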
@@ -425,43 +398,13 @@ struct SipHash128ReferenceImpl using ReturnType = UInt128; - static UInt128 combineHashes(UInt128 h1, UInt128 h2) { return combineHashesFunc(h1, h2); } + static UInt128 combineHashes(UInt128 h1, UInt128 h2) { return combineHashesFunc(h1, h2); } static UInt128 apply(const char * data, const size_t size) { return sipHash128Reference(data, size); } static constexpr bool use_int_hash_for_pods = false; }; -struct SipHash128ReferenceKeyedImpl -{ - static constexpr auto name = "sipHash128ReferenceKeyed"; - using ReturnType = UInt128; - using Key = impl::SipHashKey; - using KeyColumns = impl::SipHashKeyColumns; - - static KeyColumns parseKeyColumns(const ColumnWithTypeAndName & key) { return impl::parseSipHashKeyColumns(key); } - static Key getKey(const KeyColumns & key, size_t i) { return key.getKey(i); } - - static UInt128 applyKeyed(const Key & key, const char * begin, size_t size) - { - return sipHash128ReferenceKeyed(key.key0, key.key1, begin, size); - } - - static UInt128 combineHashesKeyed(const Key & key, UInt128 h1, UInt128 h2) - { -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - UInt128 tmp; - reverseMemcpy(&tmp, &h1, sizeof(UInt128)); - h1 = tmp; - reverseMemcpy(&tmp, &h2, sizeof(UInt128)); - h2 = tmp; -#endif - UInt128 hashes[] = {h1, h2}; - return applyKeyed(key, reinterpret_cast(hashes), 2 * sizeof(UInt128)); - } - - static constexpr bool use_int_hash_for_pods = false; -}; /** Why we need MurmurHash2? * MurmurHash2 is an outdated hash function, superseded by MurmurHash3 and subsequently by CityHash, xxHash, HighwayHash. @@ -1080,7 +1023,7 @@ private: DECLARE_MULTITARGET_CODE( -template +template class FunctionAnyHash : public IFunction { public: @@ -1090,12 +1033,9 @@ private: using ToType = typename Impl::ReturnType; template - void executeIntType(const KeyColumnsType & key_cols, const IColumn * column, typename ColumnVector::Container & vec_to) const + void executeIntType(const KeyType & key, const IColumn * column, typename ColumnVector::Container & vec_to) const { using ColVecType = ColumnVectorOrDecimal; - KeyType key{}; - if constexpr (Keyed) - key = Impl::getKey(key_cols, 0); if (const ColVecType * col_from = checkAndGetColumn(column)) { @@ -1104,9 +1044,6 @@ private: for (size_t i = 0; i < size; ++i) { ToType hash; - if constexpr (Keyed) - if (!key_cols.is_const && i != 0) - key = Impl::getKey(key_cols, i); if constexpr (Impl::use_int_hash_for_pods) { @@ -1140,14 +1077,6 @@ private: } else if (auto col_from_const = checkAndGetColumnConst(column)) { - if constexpr (Keyed) - { - if (!key_cols.is_const) - { - ColumnPtr full_column = col_from_const->convertToFullColumn(); - return executeIntType(key_cols, full_column.get(), vec_to); - } - } auto value = col_from_const->template getValue(); ToType hash; @@ -1178,15 +1107,8 @@ private: if constexpr (first) vec_to.assign(size, hash); else - { for (size_t i = 0; i < size; ++i) - { - if constexpr (Keyed) - if (!key_cols.is_const && i != 0) - key = Impl::getKey(key_cols, i); vec_to[i] = combineHashes(key, vec_to[i], hash); - } - } } else throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", @@ -1194,12 +1116,9 @@ private: } template - void executeBigIntType(const KeyColumnsType & key_cols, const IColumn * column, typename ColumnVector::Container & vec_to) const + void executeBigIntType(const KeyType & key, const IColumn * column, typename ColumnVector::Container & vec_to) const { using ColVecType = ColumnVectorOrDecimal; - KeyType key{}; - if constexpr (Keyed) - key = 
Impl::getKey(key_cols, 0); if (const ColVecType * col_from = checkAndGetColumn(column)) { @@ -1208,9 +1127,6 @@ private: for (size_t i = 0; i < size; ++i) { ToType hash; - if constexpr (Keyed) - if (!key_cols.is_const && i != 0) - key = Impl::getKey(key_cols, i); if constexpr (std::endian::native == std::endian::little) hash = apply(key, reinterpret_cast(&vec_from[i]), sizeof(vec_from[i])); else @@ -1227,14 +1143,6 @@ private: } else if (auto col_from_const = checkAndGetColumnConst(column)) { - if constexpr (Keyed) - { - if (!key_cols.is_const) - { - ColumnPtr full_column = col_from_const->convertToFullColumn(); - return executeBigIntType(key_cols, full_column.get(), vec_to); - } - } auto value = col_from_const->template getValue(); ToType hash; @@ -1250,15 +1158,8 @@ private: if constexpr (first) vec_to.assign(size, hash); else - { for (size_t i = 0; i < size; ++i) - { - if constexpr (Keyed) - if (!key_cols.is_const && i != 0) - key = Impl::getKey(key_cols, i); vec_to[i] = combineHashes(key, vec_to[i], hash); - } - } } else throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", @@ -1266,16 +1167,10 @@ private: } template - void executeGeneric(const KeyColumnsType & key_cols, const IColumn * column, typename ColumnVector::Container & vec_to) const + void executeGeneric(const KeyType & key, const IColumn * column, typename ColumnVector::Container & vec_to) const { - KeyType key{}; - if constexpr (Keyed) - key = Impl::getKey(key_cols, 0); for (size_t i = 0, size = column->size(); i < size; ++i) { - if constexpr (Keyed) - if (!key_cols.is_const && i != 0) - key = Impl::getKey(key_cols, i); StringRef bytes = column->getDataAt(i); const ToType hash = apply(key, bytes.data, bytes.size); if constexpr (first) @@ -1286,11 +1181,8 @@ private: } template - void executeString(const KeyColumnsType & key_cols, const IColumn * column, typename ColumnVector::Container & vec_to) const + void executeString(const KeyType & key, const IColumn * column, typename ColumnVector::Container & vec_to) const { - KeyType key{}; - if constexpr (Keyed) - key = Impl::getKey(key_cols, 0); if (const ColumnString * col_from = checkAndGetColumn(column)) { const typename ColumnString::Chars & data = col_from->getChars(); @@ -1300,9 +1192,6 @@ private: ColumnString::Offset current_offset = 0; for (size_t i = 0; i < size; ++i) { - if constexpr (Keyed) - if (!key_cols.is_const && i != 0) - key = Impl::getKey(key_cols, i); const ToType hash = apply(key, reinterpret_cast(&data[current_offset]), offsets[i] - current_offset - 1); @@ -1323,9 +1212,6 @@ private: for (size_t i = 0; i < size; ++i) { - if constexpr (Keyed) - if (!key_cols.is_const && i != 0) - key = Impl::getKey(key_cols, i); const ToType hash = apply(key, reinterpret_cast(&data[i * n]), n); if constexpr (first) vec_to[i] = hash; @@ -1335,14 +1221,6 @@ private: } else if (const ColumnConst * col_from_const = checkAndGetColumnConstStringOrFixedString(column)) { - if constexpr (Keyed) - { - if (!key_cols.is_const) - { - ColumnPtr full_column = col_from_const->convertToFullColumn(); - return executeString(key_cols, full_column.get(), vec_to); - } - } String value = col_from_const->getValue(); const ToType hash = apply(key, value.data(), value.size()); const size_t size = vec_to.size(); @@ -1350,15 +1228,8 @@ private: if constexpr (first) vec_to.assign(size, hash); else - { for (size_t i = 0; i < size; ++i) - { - if constexpr (Keyed) - if (!key_cols.is_const && i != 0) - key = Impl::getKey(key_cols, i); vec_to[i] = combineHashes(key, 
vec_to[i], hash); - } - } } else throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", @@ -1366,7 +1237,7 @@ private: } template - void executeArray(const KeyColumnsType & key_cols, const IDataType * type, const IColumn * column, typename ColumnVector::Container & vec_to) const + void executeArray(const KeyType & key, const IDataType * type, const IColumn * column, typename ColumnVector::Container & vec_to) const { const IDataType * nested_type = typeid_cast(*type).getNestedType().get(); @@ -1378,19 +1249,13 @@ private: typename ColumnVector::Container vec_temp(nested_size); bool nested_is_first = true; - executeForArgument(key_cols, nested_type, nested_column, vec_temp, nested_is_first); + executeForArgument(key, nested_type, nested_column, vec_temp, nested_is_first); const size_t size = offsets.size(); ColumnArray::Offset current_offset = 0; - KeyType key{}; - if constexpr (Keyed) - key = Impl::getKey(key_cols, 0); for (size_t i = 0; i < size; ++i) { - if constexpr (Keyed) - if (!key_cols.is_const && i != 0) - key = Impl::getKey(key_cols, i); ColumnArray::Offset next_offset = offsets[i]; ToType hash; @@ -1414,7 +1279,7 @@ private: { /// NOTE: here, of course, you can do without the materialization of the column. ColumnPtr full_column = col_from_const->convertToFullColumn(); - executeArray(key_cols, type, full_column.get(), vec_to); + executeArray(key, type, full_column.get(), vec_to); } else throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", @@ -1422,7 +1287,7 @@ private: } template - void executeAny(const KeyColumnsType & key_cols, const IDataType * from_type, const IColumn * icolumn, typename ColumnVector::Container & vec_to) const + void executeAny(const KeyType & key, const IDataType * from_type, const IColumn * icolumn, typename ColumnVector::Container & vec_to) const { WhichDataType which(from_type); @@ -1430,45 +1295,40 @@ private: throw Exception(ErrorCodes::LOGICAL_ERROR, "Argument column '{}' size {} doesn't match result column size {} of function {}", icolumn->getName(), icolumn->size(), vec_to.size(), getName()); - if constexpr (Keyed) - if ((!key_cols.is_const && key_cols.size() != vec_to.size()) - || (key_cols.is_const && key_cols.size() != 1)) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Key column size {} doesn't match result column size {} of function {}", key_cols.size(), vec_to.size(), getName()); - - if (which.isUInt8()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isUInt16()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isUInt32()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isUInt64()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isUInt128()) executeBigIntType(key_cols, icolumn, vec_to); - else if (which.isUInt256()) executeBigIntType(key_cols, icolumn, vec_to); - else if (which.isInt8()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isInt16()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isInt32()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isInt64()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isInt128()) executeBigIntType(key_cols, icolumn, vec_to); - else if (which.isInt256()) executeBigIntType(key_cols, icolumn, vec_to); - else if (which.isUUID()) executeBigIntType(key_cols, icolumn, vec_to); - else if (which.isIPv4()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isIPv6()) executeBigIntType(key_cols, icolumn, vec_to); - else if 
(which.isEnum8()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isEnum16()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isDate()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isDate32()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isDateTime()) executeIntType(key_cols, icolumn, vec_to); + if (which.isUInt8()) executeIntType(key, icolumn, vec_to); + else if (which.isUInt16()) executeIntType(key, icolumn, vec_to); + else if (which.isUInt32()) executeIntType(key, icolumn, vec_to); + else if (which.isUInt64()) executeIntType(key, icolumn, vec_to); + else if (which.isUInt128()) executeBigIntType(key, icolumn, vec_to); + else if (which.isUInt256()) executeBigIntType(key, icolumn, vec_to); + else if (which.isInt8()) executeIntType(key, icolumn, vec_to); + else if (which.isInt16()) executeIntType(key, icolumn, vec_to); + else if (which.isInt32()) executeIntType(key, icolumn, vec_to); + else if (which.isInt64()) executeIntType(key, icolumn, vec_to); + else if (which.isInt128()) executeBigIntType(key, icolumn, vec_to); + else if (which.isInt256()) executeBigIntType(key, icolumn, vec_to); + else if (which.isUUID()) executeBigIntType(key, icolumn, vec_to); + else if (which.isIPv4()) executeIntType(key, icolumn, vec_to); + else if (which.isIPv6()) executeBigIntType(key, icolumn, vec_to); + else if (which.isEnum8()) executeIntType(key, icolumn, vec_to); + else if (which.isEnum16()) executeIntType(key, icolumn, vec_to); + else if (which.isDate()) executeIntType(key, icolumn, vec_to); + else if (which.isDate32()) executeIntType(key, icolumn, vec_to); + else if (which.isDateTime()) executeIntType(key, icolumn, vec_to); /// TODO: executeIntType() for Decimal32/64 leads to incompatible result - else if (which.isDecimal32()) executeBigIntType(key_cols, icolumn, vec_to); - else if (which.isDecimal64()) executeBigIntType(key_cols, icolumn, vec_to); - else if (which.isDecimal128()) executeBigIntType(key_cols, icolumn, vec_to); - else if (which.isDecimal256()) executeBigIntType(key_cols, icolumn, vec_to); - else if (which.isFloat32()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isFloat64()) executeIntType(key_cols, icolumn, vec_to); - else if (which.isString()) executeString(key_cols, icolumn, vec_to); - else if (which.isFixedString()) executeString(key_cols, icolumn, vec_to); - else if (which.isArray()) executeArray(key_cols, from_type, icolumn, vec_to); - else executeGeneric(key_cols, icolumn, vec_to); + else if (which.isDecimal32()) executeBigIntType(key, icolumn, vec_to); + else if (which.isDecimal64()) executeBigIntType(key, icolumn, vec_to); + else if (which.isDecimal128()) executeBigIntType(key, icolumn, vec_to); + else if (which.isDecimal256()) executeBigIntType(key, icolumn, vec_to); + else if (which.isFloat32()) executeIntType(key, icolumn, vec_to); + else if (which.isFloat64()) executeIntType(key, icolumn, vec_to); + else if (which.isString()) executeString(key, icolumn, vec_to); + else if (which.isFixedString()) executeString(key, icolumn, vec_to); + else if (which.isArray()) executeArray(key, from_type, icolumn, vec_to); + else executeGeneric(key, icolumn, vec_to); } - void executeForArgument(const KeyColumnsType & key_cols, const IDataType * type, const IColumn * column, typename ColumnVector::Container & vec_to, bool & is_first) const + void executeForArgument(const KeyType & key, const IDataType * type, const IColumn * column, typename ColumnVector::Container & vec_to, bool & is_first) const { /// Flattening of tuples. 
if (const ColumnTuple * tuple = typeid_cast(column)) @@ -1477,7 +1337,7 @@ private: const DataTypes & tuple_types = typeid_cast(*type).getElements(); size_t tuple_size = tuple_columns.size(); for (size_t i = 0; i < tuple_size; ++i) - executeForArgument(key_cols, tuple_types[i].get(), tuple_columns[i].get(), vec_to, is_first); + executeForArgument(key, tuple_types[i].get(), tuple_columns[i].get(), vec_to, is_first); } else if (const ColumnTuple * tuple_const = checkAndGetColumnConstData(column)) { @@ -1487,24 +1347,24 @@ private: for (size_t i = 0; i < tuple_size; ++i) { auto tmp = ColumnConst::create(tuple_columns[i], column->size()); - executeForArgument(key_cols, tuple_types[i].get(), tmp.get(), vec_to, is_first); + executeForArgument(key, tuple_types[i].get(), tmp.get(), vec_to, is_first); } } else if (const auto * map = checkAndGetColumn(column)) { const auto & type_map = assert_cast(*type); - executeForArgument(key_cols, type_map.getNestedType().get(), map->getNestedColumnPtr().get(), vec_to, is_first); + executeForArgument(key, type_map.getNestedType().get(), map->getNestedColumnPtr().get(), vec_to, is_first); } else if (const auto * const_map = checkAndGetColumnConst(column)) { - executeForArgument(key_cols, type, const_map->convertToFullColumnIfConst().get(), vec_to, is_first); + executeForArgument(key, type, const_map->convertToFullColumnIfConst().get(), vec_to, is_first); } else { if (is_first) - executeAny(key_cols, type, column, vec_to); + executeAny(key, type, column, vec_to); else - executeAny(key_cols, type, column, vec_to); + executeAny(key, type, column, vec_to); } is_first = false; @@ -1535,39 +1395,39 @@ public: { auto col_to = ColumnVector::create(input_rows_count); - if (input_rows_count != 0) + typename ColumnVector::Container & vec_to = col_to->getData(); + + /// If using a "keyed" algorithm, the first argument is the key and + /// the data starts from the second argument. + /// Otherwise there is no key and all arguments are interpreted as data. + constexpr size_t first_data_argument = Keyed; + + if (arguments.size() <= first_data_argument) { - typename ColumnVector::Container & vec_to = col_to->getData(); + /// Return a fixed random-looking magic number when input is empty + vec_to.assign(input_rows_count, static_cast(0xe28dbde7fe22e41c)); + } - /// If using a "keyed" algorithm, the first argument is the key and - /// the data starts from the second argument. - /// Otherwise there is no key and all arguments are interpreted as data. - constexpr size_t first_data_argument = Keyed; + KeyType key{}; + if constexpr (Keyed) + if (!arguments.empty()) + key = Impl::parseKey(arguments[0]); - if (arguments.size() <= first_data_argument) - { - /// Return a fixed random-looking magic number when input is empty - vec_to.assign(input_rows_count, static_cast(0xe28dbde7fe22e41c)); - } - - KeyColumnsType key_cols{}; - if constexpr (Keyed) - if (!arguments.empty()) - key_cols = Impl::parseKeyColumns(arguments[0]); - - /// The function supports arbitrary number of arguments of arbitrary types. - bool is_first_argument = true; - for (size_t i = first_data_argument; i < arguments.size(); ++i) - { - const auto & col = arguments[i]; - executeForArgument(key_cols, col.type.get(), col.column.get(), vec_to, is_first_argument); - } + /// The function supports arbitrary number of arguments of arbitrary types. 
+ bool is_first_argument = true; + for (size_t i = first_data_argument; i < arguments.size(); ++i) + { + const auto & col = arguments[i]; + executeForArgument(key, col.type.get(), col.column.get(), vec_to, is_first_argument); } if constexpr (std::is_same_v) /// backward-compatible { auto col_to_fixed_string = ColumnFixedString::create(sizeof(UInt128)); - col_to_fixed_string->getChars() = std::move(*reinterpret_cast(&col_to->getData())); + const auto & data = col_to->getData(); + auto & chars = col_to_fixed_string->getChars(); + chars.resize(data.size() * sizeof(UInt128)); + memcpy(chars.data(), data.data(), data.size() * sizeof(UInt128)); return col_to_fixed_string; } @@ -1593,19 +1453,17 @@ public: ) // DECLARE_MULTITARGET_CODE -template -class FunctionAnyHash : public TargetSpecific::Default::FunctionAnyHash +template +class FunctionAnyHash : public TargetSpecific::Default::FunctionAnyHash { public: explicit FunctionAnyHash(ContextPtr context) : selector(context) { - selector - .registerImplementation>(); + selector.registerImplementation>(); #if USE_MULTITARGET_CODE - selector.registerImplementation>(); - selector - .registerImplementation>(); + selector.registerImplementation>(); + selector.registerImplementation>(); #endif } @@ -1841,7 +1699,7 @@ struct NameIntHash32 { static constexpr auto name = "intHash32"; }; struct NameIntHash64 { static constexpr auto name = "intHash64"; }; using FunctionSipHash64 = FunctionAnyHash; -using FunctionSipHash64Keyed = FunctionAnyHash; +using FunctionSipHash64Keyed = FunctionAnyHash; using FunctionIntHash32 = FunctionIntHash; using FunctionIntHash64 = FunctionIntHash; #if USE_SSL @@ -1855,10 +1713,8 @@ using FunctionSHA384 = FunctionStringHashFixedString; using FunctionSHA512 = FunctionStringHashFixedString; #endif using FunctionSipHash128 = FunctionAnyHash; -using FunctionSipHash128Keyed = FunctionAnyHash; +using FunctionSipHash128Keyed = FunctionAnyHash; using FunctionSipHash128Reference = FunctionAnyHash; -using FunctionSipHash128ReferenceKeyed - = FunctionAnyHash; using FunctionCityHash64 = FunctionAnyHash; using FunctionFarmFingerprint64 = FunctionAnyHash; using FunctionFarmHash64 = FunctionAnyHash; diff --git a/src/Functions/FunctionsHashingMisc.cpp b/src/Functions/FunctionsHashingMisc.cpp index f56568b2508..56c3c1ed00c 100644 --- a/src/Functions/FunctionsHashingMisc.cpp +++ b/src/Functions/FunctionsHashingMisc.cpp @@ -20,11 +20,6 @@ REGISTER_FUNCTION(Hashing) .examples{{"hash", "SELECT hex(sipHash128Reference('foo', '\\x01', 3))", ""}}, .categories{"Hash"} }); - factory.registerFunction(FunctionDocumentation{ - .description = "Same as [sipHash128Reference](#hash_functions-siphash128reference) but additionally takes an explicit key argument " - "instead of using a fixed key.", - .examples{{"hash", "SELECT hex(sipHash128ReferenceKeyed((506097522914230528, 1084818905618843912),'foo', '\\x01', 3));", ""}}, - .categories{"Hash"}}); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); diff --git a/src/IO/ReadBufferFromFileDescriptor.cpp b/src/IO/ReadBufferFromFileDescriptor.cpp index 67bc01279c3..6c0c1681a4c 100644 --- a/src/IO/ReadBufferFromFileDescriptor.cpp +++ b/src/IO/ReadBufferFromFileDescriptor.cpp @@ -95,7 +95,7 @@ size_t ReadBufferFromFileDescriptor::readImpl(char * to, size_t min_bytes, size_ /// It reports real time spent including the time spent while thread was preempted doing nothing. /// And it is Ok for the purpose of this watch (it is used to lower the number of threads to read from tables). 
/// Sometimes it is better to use taskstats::blkio_delay_total, but it is quite expensive to get it - /// (TaskStatsInfoGetter has about 500K RPS). + /// (NetlinkMetricsProvider has about 500K RPS). watch.stop(); ProfileEvents::increment(ProfileEvents::DiskReadElapsedMicroseconds, watch.elapsedMicroseconds()); diff --git a/src/IO/SynchronousReader.cpp b/src/IO/SynchronousReader.cpp index 7cef3bd8963..e1c654e48a3 100644 --- a/src/IO/SynchronousReader.cpp +++ b/src/IO/SynchronousReader.cpp @@ -78,7 +78,7 @@ std::future SynchronousReader::submit(Request reque /// It reports real time spent including the time spent while thread was preempted doing nothing. /// And it is Ok for the purpose of this watch (it is used to lower the number of threads to read from tables). /// Sometimes it is better to use taskstats::blkio_delay_total, but it is quite expensive to get it - /// (TaskStatsInfoGetter has about 500K RPS). + /// (NetlinkMetricsProvider has about 500K RPS). watch.stop(); ProfileEvents::increment(ProfileEvents::DiskReadElapsedMicroseconds, watch.elapsedMicroseconds()); diff --git a/src/IO/examples/CMakeLists.txt b/src/IO/examples/CMakeLists.txt index b42aa1a4f96..12b85c483a1 100644 --- a/src/IO/examples/CMakeLists.txt +++ b/src/IO/examples/CMakeLists.txt @@ -73,3 +73,9 @@ target_link_libraries (snappy_read_buffer PRIVATE clickhouse_common_io) clickhouse_add_executable (hadoop_snappy_read_buffer hadoop_snappy_read_buffer.cpp) target_link_libraries (hadoop_snappy_read_buffer PRIVATE clickhouse_common_io) +if (TARGET ch_contrib::hdfs) + clickhouse_add_executable (read_buffer_from_hdfs read_buffer_from_hdfs.cpp) + target_link_libraries (read_buffer_from_hdfs PRIVATE dbms ch_contrib::hdfs) +endif () + + diff --git a/src/IO/examples/read_buffer_from_hdfs.cpp b/src/IO/examples/read_buffer_from_hdfs.cpp new file mode 100644 index 00000000000..da4e5298681 --- /dev/null +++ b/src/IO/examples/read_buffer_from_hdfs.cpp @@ -0,0 +1,25 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace DB; + +int main() +{ + setenv("LIBHDFS3_CONF", "/path/to/hdfs-site.xml", true); /// NOLINT + String hdfs_uri = "hdfs://cluster_name"; + String hdfs_file_path = "/path/to/hdfs/file"; + ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); + ReadSettings read_settings; + ReadBufferFromHDFS read_buffer(hdfs_uri, hdfs_file_path, *config, read_settings, 2097152UL, false); + + String download_path = "./download"; + WriteBufferFromFile write_buffer(download_path); + copyData(read_buffer, write_buffer); + return 0; +} diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 91d1c63e832..42cc7b80a66 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -870,13 +870,12 @@ void FileCache::loadMetadata() } size_t total_size = 0; - for (auto key_prefix_it = fs::directory_iterator{metadata.getBaseDirectory()}; - key_prefix_it != fs::directory_iterator();) + for (auto key_prefix_it = fs::directory_iterator{metadata.getBaseDirectory()}; key_prefix_it != fs::directory_iterator(); + key_prefix_it++) { const fs::path key_prefix_directory = key_prefix_it->path(); - key_prefix_it++; - if (!fs::is_directory(key_prefix_directory)) + if (!key_prefix_it->is_directory()) { if (key_prefix_directory.filename() != "status") { @@ -887,19 +886,19 @@ void FileCache::loadMetadata() continue; } - if (fs::is_empty(key_prefix_directory)) + fs::directory_iterator key_it{key_prefix_directory}; + if (key_it == 
fs::directory_iterator{}) { LOG_DEBUG(log, "Removing empty key prefix directory: {}", key_prefix_directory.string()); fs::remove(key_prefix_directory); continue; } - for (fs::directory_iterator key_it{key_prefix_directory}; key_it != fs::directory_iterator();) + for (/* key_it already initialized to verify emptiness */; key_it != fs::directory_iterator(); key_it++) { const fs::path key_directory = key_it->path(); - ++key_it; - if (!fs::is_directory(key_directory)) + if (!key_it->is_directory()) { LOG_DEBUG( log, @@ -908,7 +907,7 @@ void FileCache::loadMetadata() continue; } - if (fs::is_empty(key_directory)) + if (fs::directory_iterator{key_directory} == fs::directory_iterator{}) { LOG_DEBUG(log, "Removing empty key directory: {}", key_directory.string()); fs::remove(key_directory); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 9e4d1e8d1e2..cc1277e08b9 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -1461,15 +1461,24 @@ void Context::addQueryAccessInfo( void Context::addQueryAccessInfo(const Names & partition_names) { if (isGlobalContext()) - { throw Exception(ErrorCodes::LOGICAL_ERROR, "Global context cannot have query access info"); - } std::lock_guard lock(query_access_info.mutex); for (const auto & partition_name : partition_names) - { query_access_info.partitions.emplace(partition_name); - } +} + +void Context::addQueryAccessInfo(const QualifiedProjectionName & qualified_projection_name) +{ + if (!qualified_projection_name) + return; + + if (isGlobalContext()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Global context cannot have query access info"); + + std::lock_guard lock(query_access_info.mutex); + query_access_info.projections.emplace(fmt::format( + "{}.{}", qualified_projection_name.storage_id.getFullTableName(), backQuoteIfNeed(qualified_projection_name.projection_name))); } void Context::addQueryFactoriesInfo(QueryLogFactories factory_type, const String & created_object) const diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 3a8d41bf130..fa210f04451 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -658,6 +658,14 @@ public: const String & view_name = {}); void addQueryAccessInfo(const Names & partition_names); + struct QualifiedProjectionName + { + StorageID storage_id = StorageID::createEmpty(); + String projection_name; + explicit operator bool() const { return !projection_name.empty(); } + }; + void addQueryAccessInfo(const QualifiedProjectionName & qualified_projection_name); + /// Supported factories for records in query_log enum class QueryLogFactories diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp index ee5c288afbb..29add31fd5d 100644 --- a/src/Interpreters/JoinedTables.cpp +++ b/src/Interpreters/JoinedTables.cpp @@ -337,6 +337,11 @@ std::shared_ptr JoinedTables::makeTableJoin(const ASTSelectQuery & se LOG_TRACE(&Poco::Logger::get("JoinedTables"), "Can't use dictionary join: dictionary '{}' was not found", dictionary_name); return nullptr; } + if (dictionary->getSpecialKeyType() == DictionarySpecialKeyType::Range) + { + LOG_TRACE(&Poco::Logger::get("JoinedTables"), "Can't use dictionary join: dictionary '{}' is a range dictionary", dictionary_name); + return nullptr; + } auto dictionary_kv = std::dynamic_pointer_cast(dictionary); table_join->setStorageJoin(dictionary_kv); diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index 1503e396298..c299572ef41 100644 --- 
diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp
index 9e4d1e8d1e2..cc1277e08b9 100644
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@@ -1461,15 +1461,24 @@ void Context::addQueryAccessInfo(
 void Context::addQueryAccessInfo(const Names & partition_names)
 {
     if (isGlobalContext())
-    {
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Global context cannot have query access info");
-    }
 
     std::lock_guard lock(query_access_info.mutex);
     for (const auto & partition_name : partition_names)
-    {
         query_access_info.partitions.emplace(partition_name);
-    }
+}
+
+void Context::addQueryAccessInfo(const QualifiedProjectionName & qualified_projection_name)
+{
+    if (!qualified_projection_name)
+        return;
+
+    if (isGlobalContext())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Global context cannot have query access info");
+
+    std::lock_guard lock(query_access_info.mutex);
+    query_access_info.projections.emplace(fmt::format(
+        "{}.{}", qualified_projection_name.storage_id.getFullTableName(), backQuoteIfNeed(qualified_projection_name.projection_name)));
 }
 
 void Context::addQueryFactoriesInfo(QueryLogFactories factory_type, const String & created_object) const
diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h
index 3a8d41bf130..fa210f04451 100644
--- a/src/Interpreters/Context.h
+++ b/src/Interpreters/Context.h
@@ -658,6 +658,14 @@ public:
         const String & view_name = {});
     void addQueryAccessInfo(const Names & partition_names);
 
+    struct QualifiedProjectionName
+    {
+        StorageID storage_id = StorageID::createEmpty();
+        String projection_name;
+        explicit operator bool() const { return !projection_name.empty(); }
+    };
+    void addQueryAccessInfo(const QualifiedProjectionName & qualified_projection_name);
+
     /// Supported factories for records in query_log
     enum class QueryLogFactories
     {
diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp
index ee5c288afbb..29add31fd5d 100644
--- a/src/Interpreters/JoinedTables.cpp
+++ b/src/Interpreters/JoinedTables.cpp
@@ -337,6 +337,11 @@ std::shared_ptr<TableJoin> JoinedTables::makeTableJoin(const ASTSelectQuery & select_query_)
         LOG_TRACE(&Poco::Logger::get("JoinedTables"), "Can't use dictionary join: dictionary '{}' was not found", dictionary_name);
         return nullptr;
     }
+    if (dictionary->getSpecialKeyType() == DictionarySpecialKeyType::Range)
+    {
+        LOG_TRACE(&Poco::Logger::get("JoinedTables"), "Can't use dictionary join: dictionary '{}' is a range dictionary", dictionary_name);
+        return nullptr;
+    }
 
     auto dictionary_kv = std::dynamic_pointer_cast<const IKeyValueEntity>(dictionary);
     table_join->setStorageJoin(dictionary_kv);
diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp
index 1503e396298..c299572ef41 100644
--- a/src/Interpreters/ProcessList.cpp
+++ b/src/Interpreters/ProcessList.cpp
@@ -223,7 +223,10 @@ ProcessList::insert(const String & query_, const IAST * ast, ContextMutablePtr query_context)
     {
         /// Set up memory profiling
         thread_group->memory_tracker.setProfilerStep(settings.memory_profiler_step);
+
         thread_group->memory_tracker.setSampleProbability(settings.memory_profiler_sample_probability);
+        thread_group->memory_tracker.setSampleMinAllocationSize(settings.memory_profiler_sample_min_allocation_size);
+        thread_group->memory_tracker.setSampleMaxAllocationSize(settings.memory_profiler_sample_max_allocation_size);
         thread_group->performance_counters.setTraceProfileEvents(settings.trace_profile_events);
     }
 
diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h
index ba3befab59b..5d14a57759f 100644
--- a/src/Interpreters/TableJoin.h
+++ b/src/Interpreters/TableJoin.h
@@ -223,10 +223,10 @@ public:
     {
         /// When join_algorithm = 'default' (not specified by user) we use hash or direct algorithm.
         /// It's behaviour that was initially supported by clickhouse.
-        bool is_enbaled_by_default = val == JoinAlgorithm::DEFAULT
+        bool is_enabled_by_default = val == JoinAlgorithm::DEFAULT
             || val == JoinAlgorithm::HASH
             || val == JoinAlgorithm::DIRECT;
-        if (join_algorithm.isSet(JoinAlgorithm::DEFAULT) && is_enbaled_by_default)
+        if (join_algorithm.isSet(JoinAlgorithm::DEFAULT) && is_enabled_by_default)
             return true;
         return join_algorithm.isSet(val);
     }
diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp
index 398bea26b87..bac16c05533 100644
--- a/src/Interpreters/ThreadStatusExt.cpp
+++ b/src/Interpreters/ThreadStatusExt.cpp
@@ -83,6 +83,8 @@ ThreadGroupPtr ThreadGroup::createForBackgroundProcess(ContextPtr storage_context)
     const Settings & settings = storage_context->getSettingsRef();
     group->memory_tracker.setProfilerStep(settings.memory_profiler_step);
     group->memory_tracker.setSampleProbability(settings.memory_profiler_sample_probability);
+    group->memory_tracker.setSampleMinAllocationSize(settings.memory_profiler_sample_min_allocation_size);
+    group->memory_tracker.setSampleMaxAllocationSize(settings.memory_profiler_sample_max_allocation_size);
     group->memory_tracker.setSoftLimit(settings.memory_overcommit_ratio_denominator);
     group->memory_tracker.setParent(&background_memory_tracker);
     if (settings.memory_tracker_fault_probability > 0.0)
diff --git a/src/Planner/PlannerJoins.cpp b/src/Planner/PlannerJoins.cpp
index 7da10a8523b..e495b0967e9 100644
--- a/src/Planner/PlannerJoins.cpp
+++ b/src/Planner/PlannerJoins.cpp
@@ -542,7 +542,8 @@ void trySetStorageInTableJoin(const QueryTreeNodePtr & table_expression, std::shared_ptr<TableJoin> & table_join)
     if (!table_join->isEnabledAlgorithm(JoinAlgorithm::DIRECT))
         return;
 
-    if (auto storage_dictionary = std::dynamic_pointer_cast<StorageDictionary>(storage); storage_dictionary)
+    if (auto storage_dictionary = std::dynamic_pointer_cast<StorageDictionary>(storage);
+        storage_dictionary && storage_dictionary->getDictionary()->getSpecialKeyType() != DictionarySpecialKeyType::Range)
         table_join->setStorageJoin(std::dynamic_pointer_cast<const IKeyValueEntity>(storage_dictionary->getDictionary()));
     else if (auto storage_key_value = std::dynamic_pointer_cast<IKeyValueEntity>(storage); storage_key_value)
         table_join->setStorageJoin(storage_key_value);
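
Note: the ProcessList and ThreadStatusExt hunks above wire the new memory_profiler_sample_min_allocation_size / memory_profiler_sample_max_allocation_size settings into the per-query and background memory trackers; together with the isSizeOkForSampling() check added to MemoryTracker::allocImpl() and MemoryTracker::free(), they restrict memory sampling to a size window. A hedged sketch of the gating logic, as a free function rather than the real MemoryTracker member; the exact semantics (inclusive bounds, 0 meaning "no upper bound") are an assumption inferred from the integration test further below:

    #include <cstdint>
    #include <cassert>

    /// Sketch: sample an allocation only if its size falls inside [min_size, max_size];
    /// max_size == 0 is assumed to mean "no upper bound". The real check lives in
    /// MemoryTracker::isSizeOkForSampling() and may differ in detail.
    static bool isSizeOkForSampling(uint64_t size, uint64_t min_size, uint64_t max_size)
    {
        return size >= min_size && (max_size == 0 || size <= max_size);
    }

    int main()
    {
        assert(!isSizeOkForSampling(1024, 4096, 8192));  /// below the window
        assert(isSizeOkForSampling(4096, 4096, 8192));   /// inside the window
        assert(!isSizeOkForSampling(16384, 4096, 8192)); /// above the window
        assert(isSizeOkForSampling(16384, 4096, 0));     /// no upper bound configured
        return 0;
    }
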
diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp
index e611bb5b2ef..eab4d3f5d43 100644
--- a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp
+++ b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp
@@ -92,18 +92,6 @@ static AggregateProjectionInfo getAggregatingProjectionInfo(
     return info;
 }
 
-static bool hasNullableOrMissingColumn(const DAGIndex & index, const Names & names)
-{
-    for (const auto & query_name : names)
-    {
-        auto jt = index.find(query_name);
-        if (jt == index.end() || jt->second->result_type->isNullable())
-            return true;
-    }
-
-    return false;
-}
-
 struct AggregateFunctionMatch
 {
     const AggregateDescription * description = nullptr;
@@ -170,20 +158,14 @@ std::optional<AggregateFunctionMatches> matchAggregateFunctions(
     }
 
     /// This is a special case for the function count().
-    /// We can assume that 'count(expr) == count()' if expr is not nullable.
-    if (typeid_cast<const AggregateFunctionCount *>(candidate.function.get()))
+    /// We can assume that 'count(expr) == count()' if expr is not nullable,
+    /// which can be verified by simply casting to `AggregateFunctionCount *`.
+    if (typeid_cast<const AggregateFunctionCount *>(aggregate.function.get()))
     {
-        bool has_nullable_or_missing_arg = false;
-        has_nullable_or_missing_arg |= hasNullableOrMissingColumn(query_index, aggregate.argument_names);
-        has_nullable_or_missing_arg |= hasNullableOrMissingColumn(proj_index, candidate.argument_names);
-
-        if (!has_nullable_or_missing_arg)
-        {
-            /// we can ignore arguments for count()
-            found_match = true;
-            res.push_back({&candidate, DataTypes()});
-            break;
-        }
+        /// we can ignore arguments for count()
+        found_match = true;
+        res.push_back({&candidate, DataTypes()});
+        break;
     }
 
     /// Now, function names and types matched.
@@ -628,8 +610,16 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & nodes)
         //     candidates.minmax_projection->block.dumpStructure());
 
         Pipe pipe(std::make_shared<SourceFromSingleChunk>(std::move(candidates.minmax_projection->block)));
-        projection_reading = std::make_unique<ReadFromPreparedSource>(std::move(pipe));
-
+        projection_reading = std::make_unique<ReadFromPreparedSource>(
+            std::move(pipe),
+            context,
+            query_info.is_internal
+                ? Context::QualifiedProjectionName{}
+                : Context::QualifiedProjectionName
+                {
+                    .storage_id = reading->getMergeTreeData().getStorageID(),
+                    .projection_name = candidates.minmax_projection->candidate.projection->name,
+                });
         has_ordinary_parts = !candidates.minmax_projection->normal_parts.empty();
         if (has_ordinary_parts)
             reading->resetParts(std::move(candidates.minmax_projection->normal_parts));
@@ -661,7 +651,16 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & nodes)
     {
         auto header = proj_snapshot->getSampleBlockForColumns(best_candidate->dag->getRequiredColumnsNames());
         Pipe pipe(std::make_shared<SourceFromSingleChunk>(std::move(header)));
-        projection_reading = std::make_unique<ReadFromPreparedSource>(std::move(pipe));
+        projection_reading = std::make_unique<ReadFromPreparedSource>(
+            std::move(pipe),
+            context,
+            query_info.is_internal
+                ? Context::QualifiedProjectionName{}
+                : Context::QualifiedProjectionName
+                {
+                    .storage_id = reading->getMergeTreeData().getStorageID(),
+                    .projection_name = best_candidate->projection->name,
+                });
     }
 
     has_ordinary_parts = best_candidate->merge_tree_ordinary_select_result_ptr != nullptr;
diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp
index 2a03a082d89..727afcb1a99 100644
--- a/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp
+++ b/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp
@@ -187,7 +187,16 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes)
     if (!projection_reading)
     {
         Pipe pipe(std::make_shared<SourceFromSingleChunk>(proj_snapshot->getSampleBlockForColumns(required_columns)));
-        projection_reading = std::make_unique<ReadFromPreparedSource>(std::move(pipe));
+        projection_reading = std::make_unique<ReadFromPreparedSource>(
+            std::move(pipe),
+            context,
+            query_info.is_internal
+                ? Context::QualifiedProjectionName{}
+                : Context::QualifiedProjectionName
+                {
+                    .storage_id = reading->getMergeTreeData().getStorageID(),
+                    .projection_name = best_candidate->projection->name,
+                });
     }
 
     bool has_ordinary_parts = best_candidate->merge_tree_ordinary_select_result_ptr != nullptr;
diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
index 13de5d1d140..2d2412f7e36 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -1761,6 +1761,10 @@ void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &)
                 fmt::format("{}.{}", data.getStorageID().getFullNameNotQuoted(), part.data_part->info.partition_id));
         }
         context->getQueryContext()->addQueryAccessInfo(partition_names);
+
+        if (storage_snapshot->projection)
+            context->getQueryContext()->addQueryAccessInfo(
+                Context::QualifiedProjectionName{.storage_id = data.getStorageID(), .projection_name = storage_snapshot->projection->name});
     }
 
     ProfileEvents::increment(ProfileEvents::SelectedParts, result.selected_parts);
diff --git a/src/Processors/QueryPlan/ReadFromPreparedSource.cpp b/src/Processors/QueryPlan/ReadFromPreparedSource.cpp
index 7446203ec35..a24c4dbe4d0 100644
--- a/src/Processors/QueryPlan/ReadFromPreparedSource.cpp
+++ b/src/Processors/QueryPlan/ReadFromPreparedSource.cpp
@@ -4,14 +4,19 @@
 
 namespace DB
 {
 
-ReadFromPreparedSource::ReadFromPreparedSource(Pipe pipe_)
+ReadFromPreparedSource::ReadFromPreparedSource(Pipe pipe_, ContextPtr context_, Context::QualifiedProjectionName qualified_projection_name_)
     : ISourceStep(DataStream{.header = pipe_.getHeader()})
     , pipe(std::move(pipe_))
+    , context(std::move(context_))
+    , qualified_projection_name(std::move(qualified_projection_name_))
 {
 }
 
 void ReadFromPreparedSource::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &)
 {
+    if (context && context->hasQueryContext())
+        context->getQueryContext()->addQueryAccessInfo(qualified_projection_name);
+
     for (const auto & processor : pipe.getProcessors())
         processors.emplace_back(processor);
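
Note: initializePipeline() above calls addQueryAccessInfo() unconditionally; the no-op case is handled by Context::addQueryAccessInfo() itself, which returns early because QualifiedProjectionName converts to false while projection_name is empty (the explicit operator bool in the Context.h hunk). The pattern in isolation, with simplified stand-in types and hypothetical names:

    #include <string>
    #include <iostream>

    struct QualifiedProjectionName
    {
        std::string storage_id; /// simplified stand-in for StorageID
        std::string projection_name;
        explicit operator bool() const { return !projection_name.empty(); }
    };

    void addQueryAccessInfo(const QualifiedProjectionName & name)
    {
        if (!name) /// empty projection name: nothing to record
            return;
        std::cout << name.storage_id << "." << name.projection_name << '\n';
    }

    int main()
    {
        addQueryAccessInfo({});                          /// silently ignored
        addQueryAccessInfo({"db.table", "proj_by_day"}); /// recorded
        return 0;
    }

This lets every call site pass a default-constructed name (as the internal-query branches above do) instead of branching before each call.
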
diff --git a/src/Processors/QueryPlan/ReadFromPreparedSource.h b/src/Processors/QueryPlan/ReadFromPreparedSource.h
index 05e3ebd5102..2606f501009 100644
--- a/src/Processors/QueryPlan/ReadFromPreparedSource.h
+++ b/src/Processors/QueryPlan/ReadFromPreparedSource.h
@@ -1,4 +1,6 @@
 #pragma once
+
+#include <Interpreters/Context.h>
 #include <Processors/QueryPlan/ISourceStep.h>
 #include <QueryPipeline/Pipe.h>
 
@@ -9,7 +11,8 @@ namespace DB
 class ReadFromPreparedSource : public ISourceStep
 {
 public:
-    explicit ReadFromPreparedSource(Pipe pipe_);
+    explicit ReadFromPreparedSource(
+        Pipe pipe_, ContextPtr context_ = nullptr, Context::QualifiedProjectionName qualified_projection_name_ = {});
 
     String getName() const override { return "ReadFromPreparedSource"; }
 
@@ -18,6 +21,7 @@ public:
 protected:
     Pipe pipe;
     ContextPtr context;
+    Context::QualifiedProjectionName qualified_projection_name;
 };
 
 class ReadFromStorageStep : public ReadFromPreparedSource
diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/HDFS/ReadBufferFromHDFS.cpp
index ee8e0764db0..483f0894cc4 100644
--- a/src/Storages/HDFS/ReadBufferFromHDFS.cpp
+++ b/src/Storages/HDFS/ReadBufferFromHDFS.cpp
@@ -89,7 +89,7 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory<SeekableReadBuffer>
                 {})", file_offset, read_until_position - 1);
 
-            num_bytes_to_read = read_until_position - file_offset;
+            num_bytes_to_read = std::min(read_until_position - file_offset, internal_buffer.size());
         }
         else
         {
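
Note: the ReadBufferFromHDFS fix above caps each read request at the internal buffer size; previously num_bytes_to_read could exceed internal_buffer.size() whenever read_until_position lay far ahead of file_offset. The clamp in isolation, with simplified types and hypothetical names (the real code operates on the buffer's live size):

    #include <algorithm>
    #include <cstdint>
    #include <cassert>

    /// Sketch: never request more bytes than the destination buffer can hold.
    static int64_t bytesToRead(int64_t file_offset, int64_t read_until_position, size_t buffer_size)
    {
        return std::min<int64_t>(read_until_position - file_offset, buffer_size);
    }

    int main()
    {
        /// a 10 MiB read window but only a 2 MiB buffer: clamp to the buffer
        assert(bytesToRead(0, 10 << 20, 2 << 20) == 2 << 20);
        /// less than a buffer's worth remaining: read exactly the remainder
        assert(bytesToRead(9 << 20, 10 << 20, 2 << 20) == 1 << 20);
        return 0;
    }
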
diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt
index e0f259306aa..19b90a39800 100644
--- a/tests/analyzer_tech_debt.txt
+++ b/tests/analyzer_tech_debt.txt
@@ -130,3 +130,4 @@
 02581_share_big_sets_between_mutation_tasks_long
 02581_share_big_sets_between_multiple_mutations_tasks_long
 00992_system_parts_race_condition_zookeeper_long
+02815_range_dict_no_direct_join
diff --git a/tests/integration/test_memory_profiler_min_max_borders/__init__.py b/tests/integration/test_memory_profiler_min_max_borders/__init__.py
new file mode 100644
index 00000000000..e5a0d9b4834
--- /dev/null
+++ b/tests/integration/test_memory_profiler_min_max_borders/__init__.py
@@ -0,0 +1 @@
+#!/usr/bin/env python3
diff --git a/tests/integration/test_memory_profiler_min_max_borders/configs/max_untracked_memory.xml b/tests/integration/test_memory_profiler_min_max_borders/configs/max_untracked_memory.xml
new file mode 100644
index 00000000000..56fc5ed34ca
--- /dev/null
+++ b/tests/integration/test_memory_profiler_min_max_borders/configs/max_untracked_memory.xml
@@ -0,0 +1,7 @@
+<clickhouse>
+    <profiles>
+        <default>
+            <max_untracked_memory>1</max_untracked_memory>
+        </default>
+    </profiles>
+</clickhouse>
diff --git a/tests/integration/test_memory_profiler_min_max_borders/configs/memory_profiler.xml b/tests/integration/test_memory_profiler_min_max_borders/configs/memory_profiler.xml
new file mode 100644
index 00000000000..5b3e17d145f
--- /dev/null
+++ b/tests/integration/test_memory_profiler_min_max_borders/configs/memory_profiler.xml
@@ -0,0 +1,5 @@
+<clickhouse>
+    <total_memory_tracker_sample_probability>1</total_memory_tracker_sample_probability>
+    <total_memory_profiler_sample_min_allocation_size>4096</total_memory_profiler_sample_min_allocation_size>
+    <total_memory_profiler_sample_max_allocation_size>8192</total_memory_profiler_sample_max_allocation_size>
+</clickhouse>
diff --git a/tests/integration/test_memory_profiler_min_max_borders/test.py b/tests/integration/test_memory_profiler_min_max_borders/test.py
new file mode 100644
index 00000000000..6ab971fa9c4
--- /dev/null
+++ b/tests/integration/test_memory_profiler_min_max_borders/test.py
@@ -0,0 +1,37 @@
+from helpers.cluster import ClickHouseCluster
+import pytest
+
+cluster = ClickHouseCluster(__file__)
+node = cluster.add_instance(
+    "node",
+    main_configs=["configs/memory_profiler.xml"],
+    user_configs=["configs/max_untracked_memory.xml"],
+)
+
+
+@pytest.fixture(scope="module")
+def started_cluster():
+    try:
+        cluster.start()
+        yield cluster
+
+    finally:
+        cluster.shutdown()
+
+
+def test_trace_boundaries_work(started_cluster):
+    node.query("select randomPrintableASCII(number) from numbers(1000) FORMAT Null")
+    node.query("SYSTEM FLUSH LOGS")
+
+    assert (
+        node.query(
+            "SELECT countDistinct(abs(size)) > 0 FROM system.trace_log where trace_type = 'MemorySample'"
+        )
+        == "1\n"
+    )
+    assert (
+        node.query(
+            "SELECT count() FROM system.trace_log where trace_type = 'MemorySample' and (abs(size) > 8192 or abs(size) < 4096)"
+        )
+        == "0\n"
+    )
diff --git a/tests/performance/re2_regex_caching.xml b/tests/performance/re2_regex_caching.xml
index 6edc83097ba..9778a8d4c0c 100644
--- a/tests/performance/re2_regex_caching.xml
+++ b/tests/performance/re2_regex_caching.xml
@@ -24,8 +24,8 @@
             <value>'.*' || toString(number) || '.'</value>
             <value>'.*' || toString(number % 10) || '.'</value>
-
-            <value>'([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?([^ @]+)@([^ @]+)([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])' || toString(number)</value>
+
+            <value>'([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?([^ @]+)@([^ @]+)([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])' || toString(number % 10)</value>
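
Note: the integration test above exercises exactly the configured sampling window: max_untracked_memory is set to 1 so every allocation reaches the tracker, the sample probability of 1 makes every tracked allocation a candidate, and the two assertions then require that system.trace_log contains MemorySample rows and that none of them fall outside the [4096, 8192] byte window from memory_profiler.xml. In the re2_regex_caching benchmark, replacing toString(number) with toString(number % 10) in the long-pattern substitution bounds the number of distinct compiled regexes to ten, presumably so the run measures regex-cache behavior rather than unbounded compilation of unique patterns.
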