Merge branch 'ClickHouse:master' into hilbert-index-analysis

This commit is contained in:
Artem Mustafin 2024-06-06 00:54:44 +03:00 committed by GitHub
commit fe63c3409f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
444 changed files with 10800 additions and 6501 deletions

View File

@ -10,3 +10,11 @@ assignees: ''
> Make sure to check documentation https://clickhouse.com/docs/en/ first. If the question is concise and probably has a short answer, asking it in [community Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-1gh9ds7f4-PgDhJAaF8ad5RbWBAAjzFg) is probably the fastest way to find the answer. For more complicated questions, consider asking them on StackOverflow with "clickhouse" tag https://stackoverflow.com/questions/tagged/clickhouse
> If you still prefer GitHub issues, remove all this text and ask your question here.
**Company or project name**
Put your company name or project description here
**Question**
Your question

View File

@ -9,6 +9,10 @@ assignees: ''
> (you don't have to strictly follow this form)
**Company or project name**
> Put your company name or project description here
**Use case**
> A clear and concise description of what the intended usage scenario is.

View File

@ -9,6 +9,10 @@ assignees: ''
(you don't have to strictly follow this form)
**Company or project name**
Put your company name or project description here
**Describe the unexpected behaviour**
A clear and concise description of what works not as it is supposed to.

View File

@ -9,6 +9,10 @@ assignees: ''
(you don't have to strictly follow this form)
**Company or project name**
Put your company name or project description here
**Describe the unexpected behaviour**
A clear and concise description of what works not as it is supposed to.

View File

@ -9,6 +9,9 @@ assignees: ''
(you don't have to strictly follow this form)
**Company or project name**
Put your company name or project description here
**Describe the issue**
A clear and concise description of what works not as it is supposed to.

View File

@ -9,6 +9,10 @@ assignees: ''
> Make sure that `git diff` result is empty and you've just pulled fresh master. Try cleaning up cmake cache. Just in case, official build instructions are published here: https://clickhouse.com/docs/en/development/build/
**Company or project name**
> Put your company name or project description here
**Operating system**
> OS kind or distribution, specific version/release, non-standard kernel if any. If you are trying to build inside virtual machine, please mention it too.

View File

@ -8,6 +8,9 @@ labels: comp-documentation
(you don't have to strictly follow this form)
**Company or project name**
Put your company name or project description here
**Describe the issue**
A clear and concise description of what's wrong in documentation.

View File

@ -9,6 +9,9 @@ assignees: ''
(you don't have to strictly follow this form)
**Company or project name**
Put your company name or project description here
**Describe the situation**
What exactly works slower than expected?

View File

@ -9,6 +9,9 @@ assignees: ''
(you don't have to strictly follow this form)
**Company or project name**
Put your company name or project description here
**Describe the issue**
A clear and concise description of what works not as it is supposed to.

View File

@ -11,6 +11,10 @@ assignees: ''
> You have to provide the following information whenever possible.
**Company or project name**
> Put your company name or project description here
**Describe what's wrong**
> A clear and concise description of what works not as it is supposed to.

View File

@ -7,6 +7,10 @@ assignees: ''
---
**Company or project name**
Put your company name or project description here
**I have tried the following solutions**: https://clickhouse.com/docs/en/faq/troubleshooting/#troubleshooting-installation-errors
**Installation type**

View File

@ -106,7 +106,8 @@ jobs:
data: ${{ needs.RunConfig.outputs.data }}
# stage for jobs that do not prohibit merge
Tests_3:
needs: [RunConfig, Builds_1]
# Tests_3 should not wait for Tests_1/Tests_2 and should not be blocked by them on master branch since all jobs need to run there.
needs: [RunConfig, Builds_1, Builds_2]
if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Tests_3') }}
uses: ./.github/workflows/reusable_test_stage.yml
with:

View File

@ -135,7 +135,7 @@ jobs:
data: ${{ needs.RunConfig.outputs.data }}
# stage for jobs that do not prohibit merge
Tests_3:
needs: [RunConfig, Tests_1, Tests_2]
needs: [RunConfig, Builds_1, Tests_1, Builds_2, Tests_2]
if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Tests_3') }}
uses: ./.github/workflows/reusable_test_stage.yml
with:

View File

@ -58,7 +58,7 @@ jobs:
env:
GITHUB_JOB_OVERRIDDEN: ${{inputs.test_name}}${{ fromJson(inputs.data).jobs_data.jobs_params[inputs.test_name].num_batches > 1 && format('-{0}',matrix.batch) || '' }}
strategy:
fail-fast: false # we always wait for entire matrix
fail-fast: false # we always wait for the entire matrix
matrix:
batch: ${{ fromJson(inputs.data).jobs_data.jobs_params[inputs.test_name].batches }}
steps:

3
.gitignore vendored
View File

@ -21,6 +21,9 @@
*.stderr
*.stdout
# llvm-xray logs
xray-log.*
/docs/build
/docs/publish
/docs/edit

View File

@ -1,29 +0,0 @@
### CI modificators (add a leading space to apply) ###
## To avoid a merge commit in CI:
#no_merge_commit
## To discard CI cache:
#no_ci_cache
## To not test (only style check):
#do_not_test
## To run specified set of tests in CI:
#ci_set_<SET_NAME>
#ci_set_reduced
#ci_set_arm
#ci_set_integration
#ci_set_old_analyzer
## To run specified job in CI:
#job_<JOB NAME>
#job_stateless_tests_release
#job_package_debug
#job_integration_tests_asan
## To run only specified batches for multi-batch job(s)
#batch_2
#batch_1_2_3

View File

@ -11,7 +11,7 @@
### <a id="245"></a> ClickHouse release 24.5, 2024-05-30
#### Backward Incompatible Change
* Renamed "inverted indexes" to "full-text indexes" which is a less technical / more user-friendly name. This also changes internal table metadata and breaks tables with existing (experimental) inverted indexes. Please make to drop such indexes before upgrade and re-create them after upgrade. [#62884](https://github.com/ClickHouse/ClickHouse/pull/62884) ([Robert Schulze](https://github.com/rschu1ze)).
* Renamed "inverted indexes" to "full-text indexes" which is a less technical / more user-friendly name. This also changes internal table metadata and breaks tables with existing (experimental) inverted indexes. Please make sure to drop such indexes before upgrade and re-create them after upgrade. [#62884](https://github.com/ClickHouse/ClickHouse/pull/62884) ([Robert Schulze](https://github.com/rschu1ze)).
* Usage of functions `neighbor`, `runningAccumulate`, `runningDifferenceStartingWithFirstValue`, `runningDifference` deprecated (because it is error-prone). Proper window functions should be used instead. To enable them back, set `allow_deprecated_error_prone_window_functions = 1` or set `compatibility = '24.4'` or lower. [#63132](https://github.com/ClickHouse/ClickHouse/pull/63132) ([Nikita Taranov](https://github.com/nickitat)).
* Queries from `system.columns` will work faster if there is a large number of columns, but many databases or tables are not granted for `SHOW TABLES`. Note that in previous versions, if you grant `SHOW COLUMNS` to individual columns without granting `SHOW TABLES` to the corresponding tables, the `system.columns` table will show these columns, but in a new version, it will skip the table entirely. Remove trace log messages "Access granted" and "Access denied" that slowed down queries. [#63439](https://github.com/ClickHouse/ClickHouse/pull/63439) ([Alexey Milovidov](https://github.com/alexey-milovidov)).

View File

@ -122,6 +122,8 @@ add_library(global-libs INTERFACE)
include (cmake/sanitize.cmake)
include (cmake/xray_instrumentation.cmake)
option(ENABLE_COLORED_BUILD "Enable colors in compiler output" ON)
set (CMAKE_COLOR_MAKEFILE ${ENABLE_COLORED_BUILD}) # works only for the makefile generator
@ -208,8 +210,6 @@ option(OMIT_HEAVY_DEBUG_SYMBOLS
"Do not generate debugger info for heavy modules (ClickHouse functions and dictionaries, some contrib)"
${OMIT_HEAVY_DEBUG_SYMBOLS_DEFAULT})
option(USE_DEBUG_HELPERS "Enable debug helpers" ${USE_DEBUG_HELPERS})
option(BUILD_STANDALONE_KEEPER "Build keeper as small standalone binary" OFF)
if (NOT BUILD_STANDALONE_KEEPER)
option(CREATE_KEEPER_SYMLINK "Create symlink for clickhouse-keeper to main server binary" ON)

View File

@ -19,7 +19,10 @@ The following versions of ClickHouse server are currently supported with securit
| 24.3 | ✔️ |
| 24.2 | ❌ |
| 24.1 | ❌ |
| 23.* | ❌ |
| 23.12 | ❌ |
| 23.11 | ❌ |
| 23.10 | ❌ |
| 23.9 | ❌ |
| 23.8 | ✔️ |
| 23.7 | ❌ |
| 23.6 | ❌ |

View File

@ -34,15 +34,6 @@ set (SRCS
throwError.cpp
)
if (USE_DEBUG_HELPERS)
get_target_property(MAGIC_ENUM_INCLUDE_DIR ch_contrib::magic_enum INTERFACE_INCLUDE_DIRECTORIES)
# CMake generator expression will do insane quoting when it encounters special character like quotes, spaces, etc.
# Prefixing "SHELL:" will force it to use the original text.
set (INCLUDE_DEBUG_HELPERS "SHELL:-I\"${MAGIC_ENUM_INCLUDE_DIR}\" -include \"${ClickHouse_SOURCE_DIR}/base/base/iostream_debug_helpers.h\"")
# Use generator expression as we don't want to pollute CMAKE_CXX_FLAGS, which will interfere with CMake check system.
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${INCLUDE_DEBUG_HELPERS}>)
endif ()
add_library (common ${SRCS})
if (WITH_COVERAGE)

View File

@ -1,187 +0,0 @@
#pragma once
#include "demangle.h"
#include "getThreadId.h"
#include <type_traits>
#include <tuple>
#include <iomanip>
#include <iostream>
#include <magic_enum.hpp>
/** Usage:
*
* DUMP(variable...)
*/
template <typename Out, typename T>
Out & dumpValue(Out &, T &&);
/// Catch-all case: enabled only for priority == -1 and accepts a value of any type.
/// The value is ignored (the parameter is unnamed); the placeholder "{...}" is written instead.
template <int priority, typename Out, typename T>
requires(priority == -1)
Out & dumpImpl(Out & out, T &&) // NOLINT(cppcoreguidelines-missing-std-forward)
{
return out << "{...}";
}
/// An object that can be output with operator << (enabled only for priority == 0).
/// The trailing defaulted pointer parameter is never used; its type is well-formed only when
/// `out << x` compiles, which removes this overload from consideration for non-streamable types.
template <int priority, typename Out, typename T>
requires(priority == 0)
Out & dumpImpl(Out & out, T && x, std::decay_t<decltype(std::declval<Out &>() << std::declval<T>())> * = nullptr) // NOLINT(cppcoreguidelines-missing-std-forward)
{
return out << x;
}
/// A pointer-like object (enabled only for priority == 1): anything that can be dereferenced with unary operator *.
/// Writes "nullptr" when `!x` is true, otherwise dumps the pointee through dumpValue.
/// The trailing defaulted pointer parameter is unused; it is well-formed only when `*x` compiles.
template <int priority, typename Out, typename T>
requires(priority == 1
/// Protect from the case when operator * do effectively nothing (function pointer).
&& !std::is_same_v<std::decay_t<T>, std::decay_t<decltype(*std::declval<T>())>>)
Out & dumpImpl(Out & out, T && x, std::decay_t<decltype(*std::declval<T>())> * = nullptr) // NOLINT(cppcoreguidelines-missing-std-forward)
{
if (!x)
return out << "nullptr";
return dumpValue(out, *x);
}
/// Container (enabled only for priority == 2): any type for which std::begin(x) is well-formed.
/// Output format is "{elem1, elem2, ...}", each element being printed through dumpValue.
template <int priority, typename Out, typename T>
requires(priority == 2)
Out & dumpImpl(Out & out, T && x, std::decay_t<decltype(std::begin(std::declval<T>()))> * = nullptr) // NOLINT(cppcoreguidelines-missing-std-forward)
{
    out << "{";

    /// The separator goes before every element except the very first one.
    bool need_separator = false;
    for (const auto & elem : x)
    {
        if (need_separator)
            out << ", ";
        need_separator = true;

        dumpValue(out, elem);
    }

    return out << "}";
}
/// Enum (enabled only for priority == 3 and enum types): output the name returned by magic_enum::enum_name.
template <int priority, typename Out, typename T>
requires(priority == 3 && std::is_enum_v<std::decay_t<T>>)
Out & dumpImpl(Out & out, T && x) // NOLINT(cppcoreguidelines-missing-std-forward)
{
return out << magic_enum::enum_name(x);
}
/// std::string and const char * (priority == 3) - output as a quoted string via std::quoted,
/// not as a container of characters or as a pointer.
template <int priority, typename Out, typename T>
requires(priority == 3 && (std::is_same_v<std::decay_t<T>, std::string> || std::is_same_v<std::decay_t<T>, const char *>))
Out & dumpImpl(Out & out, T && x) // NOLINT(cppcoreguidelines-missing-std-forward)
{
return out << std::quoted(x);
}
/// UInt8 (unsigned char, priority == 3) - output as a number, not as a character:
/// the value is converted to int before being streamed.
template <int priority, typename Out, typename T>
requires(priority == 3 && std::is_same_v<std::decay_t<T>, unsigned char>)
Out & dumpImpl(Out & out, T && x) // NOLINT(cppcoreguidelines-missing-std-forward)
{
return out << int(x);
}
/// Tuple, pair: prints element number N and then recurses into element N + 1.
/// The opening "{" is emitted before element 0 and the closing "}" after the last element;
/// all other elements are preceded by ", ".
template <size_t N, typename Out, typename T>
Out & dumpTupleImpl(Out & out, T && x) // NOLINT(cppcoreguidelines-missing-std-forward)
{
    if constexpr (N != 0)
        out << ", ";
    else
        out << "{";

    dumpValue(out, std::get<N>(x));

    if constexpr (N + 1 != std::tuple_size_v<std::decay_t<T>>)
        dumpTupleImpl<N + 1>(out, x);
    else
        out << "}";

    return out;
}
/// Tuple-like object (enabled only for priority == 4): anything std::get<0> can be applied to.
/// The trailing defaulted pointer parameter is unused; it is well-formed only when std::get<0>(x) compiles.
/// Printing is delegated to dumpTupleImpl, starting from element 0.
template <int priority, typename Out, typename T>
requires(priority == 4)
Out & dumpImpl(Out & out, T && x, std::decay_t<decltype(std::get<0>(std::declval<T>()))> * = nullptr) // NOLINT(cppcoreguidelines-missing-std-forward)
{
return dumpTupleImpl<0>(out, x);
}
/// Preferred dispatch overload: calls dumpImpl<priority> for the value.
/// The type of the last (pointer) parameter is well-formed only when dumpImpl<priority>(out, x) compiles,
/// so this overload is removed from overload resolution when no dumpImpl matches at this priority.
template <int priority, typename Out, typename T>
Out & dumpDispatchPriorities(Out & out, T && x, std::decay_t<decltype(dumpImpl<priority>(std::declval<Out &>(), std::declval<T>()))> *) // NOLINT(cppcoreguidelines-missing-std-forward)
{
return dumpImpl<priority>(out, x);
}
/// Tag type implicitly constructible from a pointer (callers in this header pass nullptr).
/// Converting nullptr to LowPriority is a user-defined conversion, which ranks below the
/// nullptr-to-pointer conversion, so an overload taking LowPriority is chosen only as a fallback.
// NOLINTNEXTLINE(google-explicit-constructor)
struct LowPriority { LowPriority(void *) {} };
/// Fallback dispatch overload (takes LowPriority instead of a pointer):
/// retries the dispatch with the priority decreased by one.
template <int priority, typename Out, typename T>
Out & dumpDispatchPriorities(Out & out, T && x, LowPriority) // NOLINT(cppcoreguidelines-missing-std-forward)
{
return dumpDispatchPriorities<priority - 1>(out, x, nullptr);
}
/// Dumps a single value to `out` and returns the stream.
/// Starts the dumpDispatchPriorities search at priority 5; the search walks downwards
/// until some dumpImpl overload accepts the value.
template <typename Out, typename T>
Out & dumpValue(Out & out, T && x) // NOLINT(cppcoreguidelines-missing-std-forward)
{
return dumpDispatchPriorities<5>(out, x, nullptr);
}
/// Writes "<demangled type> <name> = <value>; " to `out` and returns the stream.
/// `name` is the textual form of the dumped expression (the DUMPVAR macro passes #VAR).
template <typename Out, typename T>
Out & dump(Out & out, const char * name, T && x) // NOLINT(cppcoreguidelines-missing-std-forward)
{
    using Value = std::decay_t<std::remove_reference_t<T>>;

    /// For a string literal the type and the name carry no information, so only its text is printed.
    if constexpr (std::is_same_v<const char *, Value>)
    {
        const auto name_len = strlen(name);
        const auto value_len = strlen(x);

        /// The expression text looks like "..." - i.e. it is wrapped in double quotes.
        const bool name_is_quoted = name_len > 2 && name[0] == '"' && name[name_len - 1] == '"';

        /// ... and what is between the quotes starts like the value itself.
        if (name_is_quoted && value_len > 0
            && strncmp(name + 1, x, std::min(value_len, name_len) - 1) == 0)
            return out << x;
    }

    out << demangle(typeid(x).name()) << " " << name << " = ";
    return dumpValue(out, x) << "; ";
}
/// Silence clang's warning about variadic macros invoked with zero variadic arguments.
#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
/// Dumps one expression to std::cerr; the stringified expression is used as its name.
#define DUMPVAR(VAR) ::dump(std::cerr, #VAR, (VAR));
/// Line prefix: "<file>:<line> [ <thread id> ] ".
#define DUMPHEAD std::cerr << __FILE__ << ':' << __LINE__ << " [ " << getThreadId() << " ] ";
/// Line terminator.
#define DUMPTAIL std::cerr << '\n';
/// DUMPn prints the prefix, then n variables, then the terminator, as a single statement.
#define DUMP1(V1) do { DUMPHEAD DUMPVAR(V1) DUMPTAIL } while(0)
#define DUMP2(V1, V2) do { DUMPHEAD DUMPVAR(V1) DUMPVAR(V2) DUMPTAIL } while(0)
#define DUMP3(V1, V2, V3) do { DUMPHEAD DUMPVAR(V1) DUMPVAR(V2) DUMPVAR(V3) DUMPTAIL } while(0)
#define DUMP4(V1, V2, V3, V4) do { DUMPHEAD DUMPVAR(V1) DUMPVAR(V2) DUMPVAR(V3) DUMPVAR(V4) DUMPTAIL } while(0)
#define DUMP5(V1, V2, V3, V4, V5) do { DUMPHEAD DUMPVAR(V1) DUMPVAR(V2) DUMPVAR(V3) DUMPVAR(V4) DUMPVAR(V5) DUMPTAIL } while(0)
#define DUMP6(V1, V2, V3, V4, V5, V6) do { DUMPHEAD DUMPVAR(V1) DUMPVAR(V2) DUMPVAR(V3) DUMPVAR(V4) DUMPVAR(V5) DUMPVAR(V6) DUMPTAIL } while(0)
#define DUMP7(V1, V2, V3, V4, V5, V6, V7) do { DUMPHEAD DUMPVAR(V1) DUMPVAR(V2) DUMPVAR(V3) DUMPVAR(V4) DUMPVAR(V5) DUMPVAR(V6) DUMPVAR(V7) DUMPTAIL } while(0)
#define DUMP8(V1, V2, V3, V4, V5, V6, V7, V8) do { DUMPHEAD DUMPVAR(V1) DUMPVAR(V2) DUMPVAR(V3) DUMPVAR(V4) DUMPVAR(V5) DUMPVAR(V6) DUMPVAR(V7) DUMPVAR(V8) DUMPTAIL } while(0)
#define DUMP9(V1, V2, V3, V4, V5, V6, V7, V8, V9) do { DUMPHEAD DUMPVAR(V1) DUMPVAR(V2) DUMPVAR(V3) DUMPVAR(V4) DUMPVAR(V5) DUMPVAR(V6) DUMPVAR(V7) DUMPVAR(V8) DUMPVAR(V9) DUMPTAIL } while(0)
/// Argument counting trick (handles 1 to 9 arguments): the arguments shift the descending
/// number list so that the macro parameter N lands on the argument count.
/// https://groups.google.com/forum/#!searchin/kona-dev/variadic$20macro%7Csort:date/kona-dev/XMA-lDOqtlI/GCzdfZsD41sJ
#define VA_NUM_ARGS_IMPL(x1, x2, x3, x4, x5, x6, x7, x8, x9, N, ...) N
#define VA_NUM_ARGS(...) VA_NUM_ARGS_IMPL(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
/// Two-level concatenation so that NUM_ARGS is macro-expanded before being pasted onto PREFIX.
#define MAKE_VAR_MACRO_IMPL_CONCAT(PREFIX, NUM_ARGS) PREFIX ## NUM_ARGS
#define MAKE_VAR_MACRO_IMPL(PREFIX, NUM_ARGS) MAKE_VAR_MACRO_IMPL_CONCAT(PREFIX, NUM_ARGS)
#define MAKE_VAR_MACRO(PREFIX, ...) MAKE_VAR_MACRO_IMPL(PREFIX, VA_NUM_ARGS(__VA_ARGS__))
/// DUMP(a, b, ...) expands to DUMPn(a, b, ...), where n is the number of arguments.
#define DUMP(...) MAKE_VAR_MACRO(DUMP, __VA_ARGS__)(__VA_ARGS__)

View File

@ -1,2 +0,0 @@
# Example executable built from dump_variable.cpp and linked against clickhouse_common_io.
clickhouse_add_executable (dump_variable dump_variable.cpp)
target_link_libraries (dump_variable PRIVATE clickhouse_common_io)

View File

@ -1,70 +0,0 @@
#include <base/iostream_debug_helpers.h>
#include <iostream>
#include <memory>
#include <vector>
#include <map>
#include <set>
#include <tuple>
#include <array>
#include <utility>
/// Declared but never defined: main() only dumps a null pointer to it.
struct S1;
/// Empty type for which no operator << is defined.
struct S2 {};
/// Type with a container member; a custom operator << is defined for it below.
struct S3
{
std::set<const char *> m1;
};
/// Prints S3 as "S3 {m1=<contents of m1>}", the member being formatted by dumpValue.
std::ostream & operator<<(std::ostream & stream, const S3 & what)
{
    stream << "S3 {m1=";
    dumpValue(stream, what.m1);
    stream << "}";
    return stream;
}
/// Exercises the DUMP macro on values of various kinds; everything is written to std::cerr by the macro.
int main(int, char **)
{
// A plain variable, then several arguments at once (variable, literal, pointer).
int x = 1;
DUMP(x);
DUMP(x, 1, &x);
// Smart pointers.
DUMP(std::make_unique<int>(1));
DUMP(std::make_shared<int>(1));
// Containers and tuple-like types.
std::vector<int> vec{1, 2, 3};
DUMP(vec);
auto pair = std::make_pair(1, 2);
DUMP(pair);
auto tuple = std::make_tuple(1, 2, 3);
DUMP(tuple);
std::map<int, std::string> map{{1, "hello"}, {2, "world"}};
DUMP(map);
std::initializer_list<const char *> list{"hello", "world"};
DUMP(list);
std::array<const char *, 2> arr{{"hello", "world"}};
DUMP(arr);
//DUMP([]{});
// Null pointer to a type that is only declared.
S1 * s = nullptr;
DUMP(s);
// Temporary of a type without operator <<.
DUMP(S2());
std::set<const char *> variants = {"hello", "world"};
DUMP(variants);
// Type with a user-defined operator <<.
S3 s3 {{"hello", "world"}};
DUMP(s3);
return 0;
}

View File

@ -0,0 +1,20 @@
# Optional LLVM XRay function-call instrumentation.
# https://llvm.org/docs/XRay.html
#
# When ENABLE_XRAY is ON and the target is amd64 Linux, the XRay instrumentation flag and the
# USE_XRAY definition are appended to the C and C++ compiler flags. Otherwise this file returns
# early and leaves the flags untouched.

option (ENABLE_XRAY "Enable LLVM XRay" OFF)

if (NOT ENABLE_XRAY)
    message (STATUS "Not using LLVM XRay")
    return()
endif()

# The status message must describe the same platforms as the condition: only amd64 Linux passes.
if (NOT (ARCH_AMD64 AND OS_LINUX))
    message (STATUS "Not using LLVM XRay, only amd64 Linux is supported")
    return()
endif()

# The target clang must support xray, otherwise it should error on invalid option
set (XRAY_FLAGS "-fxray-instrument -DUSE_XRAY")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${XRAY_FLAGS}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${XRAY_FLAGS}")

message (STATUS "Using LLVM XRay")

View File

@ -285,7 +285,7 @@ stop_logs_replication
# Try to get logs while server is running
failed_to_save_logs=0
for table in query_log zookeeper_log trace_log transactions_info_log metric_log
for table in query_log zookeeper_log trace_log transactions_info_log metric_log blob_storage_log
do
err=$(clickhouse-client -q "select * from system.$table into outfile '/test_output/$table.tsv.gz' format TSVWithNamesAndTypes")
echo "$err"
@ -339,7 +339,7 @@ if [ $failed_to_save_logs -ne 0 ]; then
# directly
# - even though ci auto-compress some files (but not *.tsv) it does this only
# for files >64MB, we want this files to be compressed explicitly
for table in query_log zookeeper_log trace_log transactions_info_log metric_log
for table in query_log zookeeper_log trace_log transactions_info_log metric_log blob_storage_log
do
clickhouse-local "$data_path_config" --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then

View File

@ -15,7 +15,6 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \
file \
libxml2-utils \
moreutils \
python3-fuzzywuzzy \
python3-pip \
yamllint \
locales \
@ -23,8 +22,18 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*
# python-magic is the same version as in Ubuntu 22.04
RUN pip3 install black==23.12.0 boto3 codespell==2.2.1 mypy==1.8.0 PyGithub unidiff pylint==3.1.0 \
python-magic==0.4.24 requests types-requests \
RUN pip3 install \
PyGithub \
black==23.12.0 \
boto3 \
codespell==2.2.1 \
mypy==1.8.0 \
pylint==3.1.0 \
python-magic==0.4.24 \
requests \
thefuzz \
types-requests \
unidiff \
&& rm -rf /root/.cache/pip
RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8

View File

@ -71,7 +71,7 @@ If it fails, fix the style errors following the [code style guide](style.md).
```sh
mkdir -p /tmp/test_output
# running all checks
docker run --rm --volume=.:/ClickHouse --volume=/tmp/test_output:/test_output -u $(id -u ${USER}):$(id -g ${USER}) --cap-add=SYS_PTRACE clickhouse/style-test
python3 tests/ci/style_check.py --no-push
# run specified check script (e.g.: ./check-mypy)
docker run --rm --volume=.:/ClickHouse --volume=/tmp/test_output:/test_output -u $(id -u ${USER}):$(id -g ${USER}) --cap-add=SYS_PTRACE --entrypoint= -w/ClickHouse/utils/check-style clickhouse/style-test ./check-mypy

View File

@ -34,10 +34,11 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name
- `options` — MongoDB connection string options (optional parameter).
:::tip
If you are using the MongoDB Atlas cloud offering please add these options:
If you are using the MongoDB Atlas cloud offering:
```
'connectTimeoutMS=10000&ssl=true&authSource=admin'
- connection url can be obtained from 'Atlas SQL' option
- use options: 'connectTimeoutMS=10000&ssl=true&authSource=admin'
```
:::

View File

@ -37,7 +37,7 @@ ways, for example with respect to their DDL/DQL syntax or performance/compressio
To use full-text indexes, first enable them in the configuration:
```sql
SET allow_experimental_inverted_index = true;
SET allow_experimental_full_text_index = true;
```
A full-text index can be defined on a string column using the following syntax

View File

@ -39,8 +39,8 @@ If you need to update rows frequently, we recommend using the [`ReplacingMergeTr
``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
(
name1 [type1] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr1] [COMMENT ...] [CODEC(codec1)] [STATISTIC(stat1)] [TTL expr1] [PRIMARY KEY] [SETTINGS (name = value, ...)],
name2 [type2] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr2] [COMMENT ...] [CODEC(codec2)] [STATISTIC(stat2)] [TTL expr2] [PRIMARY KEY] [SETTINGS (name = value, ...)],
name1 [type1] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr1] [COMMENT ...] [CODEC(codec1)] [STATISTICS(stat1)] [TTL expr1] [PRIMARY KEY] [SETTINGS (name = value, ...)],
name2 [type2] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr2] [COMMENT ...] [CODEC(codec2)] [STATISTICS(stat2)] [TTL expr2] [PRIMARY KEY] [SETTINGS (name = value, ...)],
...
INDEX index_name1 expr1 TYPE type1(...) [GRANULARITY value1],
INDEX index_name2 expr2 TYPE type2(...) [GRANULARITY value2],
@ -178,6 +178,10 @@ Additional parameters that control the behavior of the `MergeTree` (optional):
`max_partitions_to_read` — Limits the maximum number of partitions that can be accessed in one query. You can also specify setting [max_partitions_to_read](/docs/en/operations/settings/merge-tree-settings.md/#max-partitions-to-read) in the global setting.
#### allow_experimental_optimized_row_order
`allow_experimental_optimized_row_order` - Experimental. Enables the optimization of the row order during inserts to improve the compressibility of the data for compression codecs (e.g. LZ4). Analyzes and reorders the data, and thus increases the CPU overhead of inserts.
**Example of Sections Setting**
``` sql
@ -1039,12 +1043,12 @@ ClickHouse versions 22.3 through 22.7 use a different cache configuration, see [
## Column Statistics (Experimental) {#column-statistics}
The statistic declaration is in the columns section of the `CREATE` query for tables from the `*MergeTree*` Family when we enable `set allow_experimental_statistic = 1`.
The statistics declaration is in the columns section of the `CREATE` query for tables from the `*MergeTree*` Family when we enable `set allow_experimental_statistics = 1`.
``` sql
CREATE TABLE tab
(
a Int64 STATISTIC(tdigest),
a Int64 STATISTICS(TDigest, Uniq),
b Float64
)
ENGINE = MergeTree
@ -1054,19 +1058,23 @@ ORDER BY a
We can also manipulate statistics with `ALTER` statements.
```sql
ALTER TABLE tab ADD STATISTIC b TYPE tdigest;
ALTER TABLE tab DROP STATISTIC a TYPE tdigest;
ALTER TABLE tab ADD STATISTICS b TYPE TDigest, Uniq;
ALTER TABLE tab DROP STATISTICS a;
```
These lightweight statistics aggregate information about distribution of values in columns.
They can be used for query optimization when we enable `set allow_statistic_optimize = 1`.
These lightweight statistics aggregate information about the distribution of values in columns. Statistics are stored in every part and updated on every insert.
They can be used for prewhere optimization only if we enable `set allow_statistics_optimize = 1`.
#### Available Types of Column Statistics {#available-types-of-column-statistics}
- `tdigest`
- `TDigest`
Stores distribution of values from numeric columns in [TDigest](https://github.com/tdunning/t-digest) sketch.
- `Uniq`
Estimates the number of distinct values of a column using HyperLogLog.
## Column-level Settings {#column-level-settings}
Certain MergeTree settings can be overridden at column level:

View File

@ -885,3 +885,47 @@ Default value: false
**See Also**
- [exclude_deleted_rows_for_part_size_in_merge](#exclude_deleted_rows_for_part_size_in_merge) setting
### allow_experimental_optimized_row_order
Controls if the row order should be optimized during inserts to improve the compressibility of the newly inserted table part.
MergeTree tables are (optionally) compressed using [compression codecs](../../sql-reference/statements/create/table.md#column_compression_codec).
Generic compression codecs such as LZ4 and ZSTD achieve maximum compression rates if the data exposes patterns.
Long runs of the same value typically compress very well.
If this setting is enabled, ClickHouse attempts to store the data in newly inserted parts in a row order that minimizes the number of equal-value runs across the columns of the new table part.
In other words, a small number of equal-value runs means that individual runs are long and compress well.
Finding the optimal row order is computationally infeasible (NP hard).
Therefore, ClickHouse uses heuristics to quickly find a row order which still improves compression rates over the original row order.
<details markdown="1">
<summary>Heuristics for finding a row order</summary>
It is generally possible to shuffle the rows of a table (or table part) freely as SQL considers the same table (table part) in different row order equivalent.
This freedom of shuffling rows is restricted when a primary key is defined for the table.
In ClickHouse, a primary key `C1, C2, ..., CN` enforces that the table rows are sorted by columns `C1`, `C2`, ... `CN` ([clustered index](https://en.wikipedia.org/wiki/Database_index#Clustered)).
As a result, rows can only be shuffled within "equivalence classes" of rows, i.e. rows which have the same values in their primary key columns.
The intuition is that primary keys with high-cardinality, e.g. primary keys involving a `DateTime64` timestamp column, lead to many small equivalence classes.
Likewise, tables with a low-cardinality primary key create few and large equivalence classes.
A table with no primary key represents the extreme case of a single equivalence class which spans all rows.
The fewer and the larger the equivalence classes are, the higher the degree of freedom when re-shuffling rows.
The heuristic applied to find the best row order within each equivalence class was suggested by D. Lemire and O. Kaser in [Reordering columns for smaller indexes](https://doi.org/10.1016/j.ins.2011.02.002) and is based on sorting the rows within each equivalence class by ascending cardinality of the non-primary key columns.
It performs three steps:
1. Find all equivalence classes based on the row values in primary key columns.
2. For each equivalence class, calculate (usually estimate) the cardinalities of the non-primary-key columns.
3. For each equivalence class, sort the rows in order of ascending non-primary-key column cardinality.
</details>
If enabled, insert operations incur additional CPU costs to analyze and optimize the row order of the new data.
INSERTs are expected to take 30-50% longer depending on the data characteristics.
Compression rates of LZ4 or ZSTD improve on average by 20-40%.
This setting works best for tables with no primary key or a low-cardinality primary key, i.e. a table with only few distinct primary key values.
High-cardinality primary keys, e.g. involving timestamp columns of type `DateTime64`, are not expected to benefit from this setting.

View File

@ -5108,7 +5108,7 @@ a Tuple(
)
```
## allow_experimental_statistic {#allow_experimental_statistic}
## allow_experimental_statistics {#allow_experimental_statistics}
Allows defining columns with [statistics](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) and [manipulating statistics](../../engines/table-engines/mergetree-family/mergetree.md#column-statistics).

View File

@ -5,10 +5,57 @@ sidebar_position: 107
# corr
Syntax: `corr(x, y)`
Calculates the [Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient):
$$
\frac{\Sigma{(x - \bar{x})(y - \bar{y})}}{\sqrt{\Sigma{(x - \bar{x})^2} * \Sigma{(y - \bar{y})^2}}}
$$
Calculates the Pearson correlation coefficient: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`.
:::note
This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `corrStable` function. It works slower but provides a lower computational error.
:::
This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the [`corrStable`](../reference/corrstable.md) function. It is slower but provides a more accurate result.
:::
**Syntax**
```sql
corr(x, y)
```
**Arguments**
- `x` — first variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
- `y` — second variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
**Returned Value**
- The Pearson correlation coefficient. [Float64](../../data-types/float.md).
**Example**
Query:
```sql
DROP TABLE IF EXISTS series;
CREATE TABLE series
(
i UInt32,
x_value Float64,
y_value Float64
)
ENGINE = Memory;
INSERT INTO series(i, x_value, y_value) VALUES (1, 5.6, -4.4),(2, -9.6, 3),(3, -1.3, -4),(4, 5.3, 9.7),(5, 4.4, 0.037),(6, -8.6, -7.8),(7, 5.1, 9.3),(8, 7.9, -3.6),(9, -8.2, 0.62),(10, -3, 7.3);
```
```sql
SELECT corr(x_value, y_value)
FROM series;
```
Result:
```response
┌─corr(x_value, y_value)─┐
│ 0.1730265755453256 │
└────────────────────────┘
```

View File

@ -0,0 +1,55 @@
---
slug: /en/sql-reference/aggregate-functions/reference/corrmatrix
sidebar_position: 108
---
# corrMatrix
Computes the correlation matrix over N variables.
**Syntax**
```sql
corrMatrix(x[, ...])
```
**Arguments**
- `x` — a variable number of parameters. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
**Returned value**
- Correlation matrix. [Array](../../data-types/array.md)([Array](../../data-types/array.md)([Float64](../../data-types/float.md))).
**Example**
Query:
```sql
DROP TABLE IF EXISTS test;
CREATE TABLE test
(
a UInt32,
b Float64,
c Float64,
d Float64
)
ENGINE = Memory;
INSERT INTO test(a, b, c, d) VALUES (1, 5.6, -4.4, 2.6), (2, -9.6, 3, 3.3), (3, -1.3, -4, 1.2), (4, 5.3, 9.7, 2.3), (5, 4.4, 0.037, 1.222), (6, -8.6, -7.8, 2.1233), (7, 5.1, 9.3, 8.1222), (8, 7.9, -3.6, 9.837), (9, -8.2, 0.62, 8.43555), (10, -3, 7.3, 6.762);
```
```sql
SELECT arrayMap(x -> round(x, 3), arrayJoin(corrMatrix(a, b, c, d))) AS corrMatrix
FROM test;
```
Result:
```response
┌─corrMatrix─────────────┐
1. │ [1,-0.096,0.243,0.746] │
2. │ [-0.096,1,0.173,0.106] │
3. │ [0.243,0.173,1,0.258] │
4. │ [0.746,0.106,0.258,1] │
└────────────────────────┘
```

View File

@ -0,0 +1,58 @@
---
slug: /en/sql-reference/aggregate-functions/reference/corrstable
sidebar_position: 107
---
# corrStable
Calculates the [Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient):
$$
\frac{\Sigma{(x - \bar{x})(y - \bar{y})}}{\sqrt{\Sigma{(x - \bar{x})^2} * \Sigma{(y - \bar{y})^2}}}
$$
Similar to the [`corr`](../reference/corr.md) function, but uses a numerically stable algorithm. As a result, `corrStable` is slower than `corr` but produces a more accurate result.
**Syntax**
```sql
corrStable(x, y)
```
**Arguments**
- `x` — first variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
- `y` — second variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
**Returned Value**
- The Pearson correlation coefficient. [Float64](../../data-types/float.md).
**Example**
Query:
```sql
DROP TABLE IF EXISTS series;
CREATE TABLE series
(
i UInt32,
x_value Float64,
y_value Float64
)
ENGINE = Memory;
INSERT INTO series(i, x_value, y_value) VALUES (1, 5.6, -4.4),(2, -9.6, 3),(3, -1.3, -4),(4, 5.3, 9.7),(5, 4.4, 0.037),(6, -8.6, -7.8),(7, 5.1, 9.3),(8, 7.9, -3.6),(9, -8.2, 0.62),(10, -3, 7.3);
```
```sql
SELECT corrStable(x_value, y_value)
FROM series;
```
Result:
```response
┌─corrStable(x_value, y_value)─┐
│ 0.17302657554532558 │
└──────────────────────────────┘
```

View File

@ -1,14 +1,54 @@
---
slug: /en/sql-reference/aggregate-functions/reference/covarpop
sidebar_position: 36
sidebar_position: 37
---
# covarPop
Syntax: `covarPop(x, y)`
Calculates the population covariance:
Calculates the value of `Σ((x - x̅)(y - y̅)) / n`.
$$
\frac{\Sigma{(x - \bar{x})(y - \bar{y})}}{n}
$$
:::note
This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarPopStable` function. It works slower but provides a lower computational error.
:::
This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the [`covarPopStable`](../reference/covarpopstable.md) function. It works slower but provides a lower computational error.
:::
**Syntax**
```sql
covarPop(x, y)
```
**Arguments**
- `x` — first variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
- `y` — second variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
**Returned Value**
- The population covariance between `x` and `y`. [Float64](../../data-types/float.md).
**Example**
Query:
```sql
DROP TABLE IF EXISTS series;
CREATE TABLE series(i UInt32, x_value Float64, y_value Float64) ENGINE = Memory;
INSERT INTO series(i, x_value, y_value) VALUES (1, 5.6, -4.4),(2, -9.6, 3),(3, -1.3, -4),(4, 5.3, 9.7),(5, 4.4, 0.037),(6, -8.6, -7.8),(7, 5.1, 9.3),(8, 7.9, -3.6),(9, -8.2, 0.62),(10, -3, 7.3);
```
```sql
SELECT covarPop(x_value, y_value)
FROM series;
```
Result:
```response
┌─covarPop(x_value, y_value)─┐
│ 6.485648 │
└────────────────────────────┘
```

View File

@ -0,0 +1,55 @@
---
slug: /en/sql-reference/aggregate-functions/reference/covarpopmatrix
sidebar_position: 36
---
# covarPopMatrix
Returns the population covariance matrix over N variables.
**Syntax**
```sql
covarPopMatrix(x[, ...])
```
**Arguments**
- `x` — a variable number of parameters. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
**Returned Value**
- Population covariance matrix. [Array](../../data-types/array.md)([Array](../../data-types/array.md)([Float64](../../data-types/float.md))).
**Example**
Query:
```sql
DROP TABLE IF EXISTS test;
CREATE TABLE test
(
a UInt32,
b Float64,
c Float64,
d Float64
)
ENGINE = Memory;
INSERT INTO test(a, b, c, d) VALUES (1, 5.6, -4.4, 2.6), (2, -9.6, 3, 3.3), (3, -1.3, -4, 1.2), (4, 5.3, 9.7, 2.3), (5, 4.4, 0.037, 1.222), (6, -8.6, -7.8, 2.1233), (7, 5.1, 9.3, 8.1222), (8, 7.9, -3.6, 9.837), (9, -8.2, 0.62, 8.43555), (10, -3, 7.3, 6.762);
```
```sql
SELECT arrayMap(x -> round(x, 3), arrayJoin(covarPopMatrix(a, b, c, d))) AS covarPopMatrix
FROM test;
```
Result:
```response
┌─covarPopMatrix────────────┐
1. │ [8.25,-1.76,4.08,6.748] │
2. │ [-1.76,41.07,6.486,2.132] │
3. │ [4.08,6.486,34.21,4.755] │
4. │ [6.748,2.132,4.755,9.93] │
└───────────────────────────┘
```

View File

@ -0,0 +1,60 @@
---
slug: /en/sql-reference/aggregate-functions/reference/covarpopstable
sidebar_position: 36
---
# covarPopStable
Calculates the value of the population covariance:
$$
\frac{\Sigma{(x - \bar{x})(y - \bar{y})}}{n}
$$
It is similar to the [covarPop](../reference/covarpop.md) function, but uses a numerically stable algorithm. As a result, `covarPopStable` is slower than `covarPop` but produces a more accurate result.
**Syntax**
```sql
covarPopStable(x, y)
```
**Arguments**
- `x` — first variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
- `y` — second variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
**Returned Value**
- The population covariance between `x` and `y`. [Float64](../../data-types/float.md).
**Example**
Query:
```sql
DROP TABLE IF EXISTS series;
CREATE TABLE series(i UInt32, x_value Float64, y_value Float64) ENGINE = Memory;
INSERT INTO series(i, x_value, y_value) VALUES (1, 5.6,-4.4),(2, -9.6,3),(3, -1.3,-4),(4, 5.3,9.7),(5, 4.4,0.037),(6, -8.6,-7.8),(7, 5.1,9.3),(8, 7.9,-3.6),(9, -8.2,0.62),(10, -3,7.3);
```
```sql
SELECT covarPopStable(x_value, y_value)
FROM
(
SELECT
x_value,
y_value
FROM series
);
```
Result:
```response
┌─covarPopStable(x_value, y_value)─┐
│ 6.485648 │
└──────────────────────────────────┘
```

View File

@ -7,8 +7,74 @@ sidebar_position: 37
Calculates the value of `Σ((x - x̅)(y - y̅)) / (n - 1)`.
Returns Float64. When `n <= 1`, returns `nan`.
:::note
This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarSampStable` function. It works slower but provides a lower computational error.
This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the [`covarSampStable`](../reference/covarsampstable.md) function. It works slower but provides a lower computational error.
:::
**Syntax**
```sql
covarSamp(x, y)
```
**Arguments**
- `x` — first variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
- `y` — second variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
**Returned Value**
- The sample covariance between `x` and `y`. For `n <= 1`, `nan` is returned. [Float64](../../data-types/float.md).
**Example**
Query:
```sql
DROP TABLE IF EXISTS series;
CREATE TABLE series(i UInt32, x_value Float64, y_value Float64) ENGINE = Memory;
INSERT INTO series(i, x_value, y_value) VALUES (1, 5.6,-4.4),(2, -9.6,3),(3, -1.3,-4),(4, 5.3,9.7),(5, 4.4,0.037),(6, -8.6,-7.8),(7, 5.1,9.3),(8, 7.9,-3.6),(9, -8.2,0.62),(10, -3,7.3);
```
```sql
SELECT covarSamp(x_value, y_value)
FROM
(
SELECT
x_value,
y_value
FROM series
);
```
Result:
```response
┌─covarSamp(x_value, y_value)─┐
│ 7.206275555555556 │
└─────────────────────────────┘
```
Query:
```sql
SELECT covarSamp(x_value, y_value)
FROM
(
SELECT
x_value,
y_value
FROM series LIMIT 1
);
```
Result:
```response
┌─covarSamp(x_value, y_value)─┐
│ nan │
└─────────────────────────────┘
```

View File

@ -0,0 +1,57 @@
---
slug: /en/sql-reference/aggregate-functions/reference/covarsampmatrix
sidebar_position: 38
---
# covarSampMatrix
Returns the sample covariance matrix over N variables.
**Syntax**
```sql
covarSampMatrix(x[, ...])
```
**Arguments**
- `x` — a variable number of parameters. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
**Returned Value**
- Sample covariance matrix. [Array](../../data-types/array.md)([Array](../../data-types/array.md)([Float64](../../data-types/float.md))).
**Example**
Query:
```sql
DROP TABLE IF EXISTS test;
CREATE TABLE test
(
a UInt32,
b Float64,
c Float64,
d Float64
)
ENGINE = Memory;
INSERT INTO test(a, b, c, d) VALUES (1, 5.6, -4.4, 2.6), (2, -9.6, 3, 3.3), (3, -1.3, -4, 1.2), (4, 5.3, 9.7, 2.3), (5, 4.4, 0.037, 1.222), (6, -8.6, -7.8, 2.1233), (7, 5.1, 9.3, 8.1222), (8, 7.9, -3.6, 9.837), (9, -8.2, 0.62, 8.43555), (10, -3, 7.3, 6.762);
```
```sql
SELECT arrayMap(x -> round(x, 3), arrayJoin(covarSampMatrix(a, b, c, d))) AS covarSampMatrix
FROM test;
```
Result:
```response
┌─covarSampMatrix─────────────┐
1. │ [9.167,-1.956,4.534,7.498] │
2. │ [-1.956,45.634,7.206,2.369] │
3. │ [4.534,7.206,38.011,5.283] │
4. │ [7.498,2.369,5.283,11.034] │
└─────────────────────────────┘
```

View File

@ -0,0 +1,73 @@
---
slug: /en/sql-reference/aggregate-functions/reference/covarsampstable
sidebar_position: 37
---
# covarSampStable
Calculates the value of `Σ((x - x̅)(y - y̅)) / (n - 1)`. Similar to [covarSamp](../reference/covarsamp.md) but works slower while providing a lower computational error.
**Syntax**
```sql
covarSampStable(x, y)
```
**Arguments**
- `x` — first variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
- `y` — second variable. [(U)Int*](../../data-types/int-uint.md), [Float*](../../data-types/float.md), [Decimal](../../data-types/decimal.md).
**Returned Value**
- The sample covariance between `x` and `y`. For `n <= 1`, `inf` is returned. [Float64](../../data-types/float.md).
**Example**
Query:
```sql
DROP TABLE IF EXISTS series;
CREATE TABLE series(i UInt32, x_value Float64, y_value Float64) ENGINE = Memory;
INSERT INTO series(i, x_value, y_value) VALUES (1, 5.6,-4.4),(2, -9.6,3),(3, -1.3,-4),(4, 5.3,9.7),(5, 4.4,0.037),(6, -8.6,-7.8),(7, 5.1,9.3),(8, 7.9,-3.6),(9, -8.2,0.62),(10, -3,7.3);
```
```sql
SELECT covarSampStable(x_value, y_value)
FROM
(
SELECT
x_value,
y_value
FROM series
);
```
Result:
```response
┌─covarSampStable(x_value, y_value)─┐
│ 7.206275555555556 │
└───────────────────────────────────┘
```
Query:
```sql
SELECT covarSampStable(x_value, y_value)
FROM
(
SELECT
x_value,
y_value
FROM series LIMIT 1
);
```
Result:
```response
┌─covarSampStable(x_value, y_value)─┐
│ inf │
└───────────────────────────────────┘
```

View File

@ -9,110 +9,116 @@ toc_hidden: true
Standard aggregate functions:
- [count](/docs/en/sql-reference/aggregate-functions/reference/count.md)
- [min](/docs/en/sql-reference/aggregate-functions/reference/min.md)
- [max](/docs/en/sql-reference/aggregate-functions/reference/max.md)
- [sum](/docs/en/sql-reference/aggregate-functions/reference/sum.md)
- [avg](/docs/en/sql-reference/aggregate-functions/reference/avg.md)
- [any](/docs/en/sql-reference/aggregate-functions/reference/any.md)
- [stddevPop](/docs/en/sql-reference/aggregate-functions/reference/stddevpop.md)
- [stddevPopStable](/docs/en/sql-reference/aggregate-functions/reference/stddevpopstable.md)
- [stddevSamp](/docs/en/sql-reference/aggregate-functions/reference/stddevsamp.md)
- [stddevSampStable](/docs/en/sql-reference/aggregate-functions/reference/stddevsampstable.md)
- [varPop](/docs/en/sql-reference/aggregate-functions/reference/varpop.md)
- [varSamp](/docs/en/sql-reference/aggregate-functions/reference/varsamp.md)
- [corr](./corr.md)
- [covarPop](/docs/en/sql-reference/aggregate-functions/reference/covarpop.md)
- [covarSamp](/docs/en/sql-reference/aggregate-functions/reference/covarsamp.md)
- [entropy](./entropy.md)
- [exponentialMovingAverage](./exponentialmovingaverage.md)
- [intervalLengthSum](./intervalLengthSum.md)
- [kolmogorovSmirnovTest](./kolmogorovsmirnovtest.md)
- [mannwhitneyutest](./mannwhitneyutest.md)
- [median](./median.md)
- [rankCorr](./rankCorr.md)
- [sumKahan](./sumkahan.md)
- [studentTTest](./studentttest.md)
- [welchTTest](./welchttest.md)
- [count](../reference/count.md)
- [min](../reference/min.md)
- [max](../reference/max.md)
- [sum](../reference/sum.md)
- [avg](../reference/avg.md)
- [any](../reference/any.md)
- [stddevPop](../reference/stddevpop.md)
- [stddevPopStable](../reference/stddevpopstable.md)
- [stddevSamp](../reference/stddevsamp.md)
- [stddevSampStable](../reference/stddevsampstable.md)
- [varPop](../reference/varpop.md)
- [varSamp](../reference/varsamp.md)
- [corr](../reference/corr.md)
- [corrStable](../reference/corrstable.md)
- [corrMatrix](../reference/corrmatrix.md)
- [covarPop](../reference/covarpop.md)
- [covarPopStable](../reference/covarpopstable.md)
- [covarPopMatrix](../reference/covarpopmatrix.md)
- [covarSamp](../reference/covarsamp.md)
- [covarSampStable](../reference/covarsampstable.md)
- [covarSampMatrix](../reference/covarsampmatrix.md)
- [entropy](../reference/entropy.md)
- [exponentialMovingAverage](../reference/exponentialmovingaverage.md)
- [intervalLengthSum](../reference/intervalLengthSum.md)
- [kolmogorovSmirnovTest](../reference/kolmogorovsmirnovtest.md)
- [mannWhitneyUTest](../reference/mannwhitneyutest.md)
- [median](../reference/median.md)
- [rankCorr](../reference/rankCorr.md)
- [sumKahan](../reference/sumkahan.md)
- [studentTTest](../reference/studentttest.md)
- [welchTTest](../reference/welchttest.md)
ClickHouse-specific aggregate functions:
- [analysisOfVariance](/docs/en/sql-reference/aggregate-functions/reference/analysis_of_variance.md)
- [any](/docs/en/sql-reference/aggregate-functions/reference/any_respect_nulls.md)
- [anyHeavy](/docs/en/sql-reference/aggregate-functions/reference/anyheavy.md)
- [anyLast](/docs/en/sql-reference/aggregate-functions/reference/anylast.md)
- [anyLast](/docs/en/sql-reference/aggregate-functions/reference/anylast_respect_nulls.md)
- [boundingRatio](/docs/en/sql-reference/aggregate-functions/reference/boundrat.md)
- [first_value](/docs/en/sql-reference/aggregate-functions/reference/first_value.md)
- [last_value](/docs/en/sql-reference/aggregate-functions/reference/last_value.md)
- [argMin](/docs/en/sql-reference/aggregate-functions/reference/argmin.md)
- [argMax](/docs/en/sql-reference/aggregate-functions/reference/argmax.md)
- [avgWeighted](/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md)
- [topK](/docs/en/sql-reference/aggregate-functions/reference/topk.md)
- [topKWeighted](/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md)
- [deltaSum](./deltasum.md)
- [deltaSumTimestamp](./deltasumtimestamp.md)
- [groupArray](/docs/en/sql-reference/aggregate-functions/reference/grouparray.md)
- [groupArrayLast](/docs/en/sql-reference/aggregate-functions/reference/grouparraylast.md)
- [groupUniqArray](/docs/en/sql-reference/aggregate-functions/reference/groupuniqarray.md)
- [groupArrayInsertAt](/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md)
- [groupArrayMovingAvg](/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg.md)
- [groupArrayMovingSum](/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum.md)
- [groupArraySample](./grouparraysample.md)
- [groupArraySorted](/docs/en/sql-reference/aggregate-functions/reference/grouparraysorted.md)
- [groupArrayIntersect](./grouparrayintersect.md)
- [groupBitAnd](/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md)
- [groupBitOr](/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md)
- [groupBitXor](/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md)
- [groupBitmap](/docs/en/sql-reference/aggregate-functions/reference/groupbitmap.md)
- [groupBitmapAnd](/docs/en/sql-reference/aggregate-functions/reference/groupbitmapand.md)
- [groupBitmapOr](/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md)
- [groupBitmapXor](/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md)
- [sumWithOverflow](/docs/en/sql-reference/aggregate-functions/reference/sumwithoverflow.md)
- [sumMap](/docs/en/sql-reference/aggregate-functions/reference/summap.md)
- [sumMapWithOverflow](/docs/en/sql-reference/aggregate-functions/reference/summapwithoverflow.md)
- [sumMapFiltered](/docs/en/sql-reference/aggregate-functions/parametric-functions.md/#summapfiltered)
- [sumMapFilteredWithOverflow](/docs/en/sql-reference/aggregate-functions/parametric-functions.md/#summapfilteredwithoverflow)
- [minMap](/docs/en/sql-reference/aggregate-functions/reference/minmap.md)
- [maxMap](/docs/en/sql-reference/aggregate-functions/reference/maxmap.md)
- [skewSamp](/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md)
- [skewPop](/docs/en/sql-reference/aggregate-functions/reference/skewpop.md)
- [kurtSamp](/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md)
- [kurtPop](/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md)
- [uniq](/docs/en/sql-reference/aggregate-functions/reference/uniq.md)
- [uniqExact](/docs/en/sql-reference/aggregate-functions/reference/uniqexact.md)
- [uniqCombined](/docs/en/sql-reference/aggregate-functions/reference/uniqcombined.md)
- [uniqCombined64](/docs/en/sql-reference/aggregate-functions/reference/uniqcombined64.md)
- [uniqHLL12](/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md)
- [uniqTheta](/docs/en/sql-reference/aggregate-functions/reference/uniqthetasketch.md)
- [quantile](/docs/en/sql-reference/aggregate-functions/reference/quantile.md)
- [quantiles](/docs/en/sql-reference/aggregate-functions/reference/quantiles.md)
- [quantileExact](/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md)
- [quantileExactLow](/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md#quantileexactlow)
- [quantileExactHigh](/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md#quantileexacthigh)
- [quantileExactWeighted](/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md)
- [quantileTiming](/docs/en/sql-reference/aggregate-functions/reference/quantiletiming.md)
- [quantileTimingWeighted](/docs/en/sql-reference/aggregate-functions/reference/quantiletimingweighted.md)
- [quantileDeterministic](/docs/en/sql-reference/aggregate-functions/reference/quantiledeterministic.md)
- [quantileTDigest](/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md)
- [quantileTDigestWeighted](/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md)
- [quantileBFloat16](/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16)
- [quantileBFloat16Weighted](/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16weighted)
- [quantileDD](/docs/en/sql-reference/aggregate-functions/reference/quantileddsketch.md#quantileddsketch)
- [simpleLinearRegression](/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md)
- [singleValueOrNull](/docs/en/sql-reference/aggregate-functions/reference/singlevalueornull.md)
- [stochasticLinearRegression](/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md)
- [stochasticLogisticRegression](/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md)
- [categoricalInformationValue](/docs/en/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md)
- [contingency](./contingency.md)
- [cramersV](./cramersv.md)
- [cramersVBiasCorrected](./cramersvbiascorrected.md)
- [theilsU](./theilsu.md)
- [maxIntersections](./maxintersections.md)
- [maxIntersectionsPosition](./maxintersectionsposition.md)
- [meanZTest](./meanztest.md)
- [quantileGK](./quantileGK.md)
- [quantileInterpolatedWeighted](./quantileinterpolatedweighted.md)
- [sparkBar](./sparkbar.md)
- [sumCount](./sumcount.md)
- [largestTriangleThreeBuckets](./largestTriangleThreeBuckets.md)
- [analysisOfVariance](../reference/analysis_of_variance.md)
- [any_respect_nulls](../reference/any_respect_nulls.md)
- [anyHeavy](../reference/anyheavy.md)
- [anyLast](../reference/anylast.md)
- [anyLast_respect_nulls](../reference/anylast_respect_nulls.md)
- [boundingRatio](../reference/boundrat.md)
- [first_value](../reference/first_value.md)
- [last_value](../reference/last_value.md)
- [argMin](../reference/argmin.md)
- [argMax](../reference/argmax.md)
- [avgWeighted](../reference/avgweighted.md)
- [topK](../reference/topk.md)
- [topKWeighted](../reference/topkweighted.md)
- [deltaSum](../reference/deltasum.md)
- [deltaSumTimestamp](../reference/deltasumtimestamp.md)
- [groupArray](../reference/grouparray.md)
- [groupArrayLast](../reference/grouparraylast.md)
- [groupUniqArray](../reference/groupuniqarray.md)
- [groupArrayInsertAt](../reference/grouparrayinsertat.md)
- [groupArrayMovingAvg](../reference/grouparraymovingavg.md)
- [groupArrayMovingSum](../reference/grouparraymovingsum.md)
- [groupArraySample](../reference/grouparraysample.md)
- [groupArraySorted](../reference/grouparraysorted.md)
- [groupArrayIntersect](../reference/grouparrayintersect.md)
- [groupBitAnd](../reference/groupbitand.md)
- [groupBitOr](../reference/groupbitor.md)
- [groupBitXor](../reference/groupbitxor.md)
- [groupBitmap](../reference/groupbitmap.md)
- [groupBitmapAnd](../reference/groupbitmapand.md)
- [groupBitmapOr](../reference/groupbitmapor.md)
- [groupBitmapXor](../reference/groupbitmapxor.md)
- [sumWithOverflow](../reference/sumwithoverflow.md)
- [sumMap](../reference/summap.md)
- [sumMapWithOverflow](../reference/summapwithoverflow.md)
- [sumMapFiltered](../parametric-functions.md/#summapfiltered)
- [sumMapFilteredWithOverflow](../parametric-functions.md/#summapfilteredwithoverflow)
- [minMap](../reference/minmap.md)
- [maxMap](../reference/maxmap.md)
- [skewSamp](../reference/skewsamp.md)
- [skewPop](../reference/skewpop.md)
- [kurtSamp](../reference/kurtsamp.md)
- [kurtPop](../reference/kurtpop.md)
- [uniq](../reference/uniq.md)
- [uniqExact](../reference/uniqexact.md)
- [uniqCombined](../reference/uniqcombined.md)
- [uniqCombined64](../reference/uniqcombined64.md)
- [uniqHLL12](../reference/uniqhll12.md)
- [uniqTheta](../reference/uniqthetasketch.md)
- [quantile](../reference/quantile.md)
- [quantiles](../reference/quantiles.md)
- [quantileExact](../reference/quantileexact.md)
- [quantileExactLow](../reference/quantileexact.md#quantileexactlow)
- [quantileExactHigh](../reference/quantileexact.md#quantileexacthigh)
- [quantileExactWeighted](../reference/quantileexactweighted.md)
- [quantileTiming](../reference/quantiletiming.md)
- [quantileTimingWeighted](../reference/quantiletimingweighted.md)
- [quantileDeterministic](../reference/quantiledeterministic.md)
- [quantileTDigest](../reference/quantiletdigest.md)
- [quantileTDigestWeighted](../reference/quantiletdigestweighted.md)
- [quantileBFloat16](../reference/quantilebfloat16.md#quantilebfloat16)
- [quantileBFloat16Weighted](../reference/quantilebfloat16.md#quantilebfloat16weighted)
- [quantileDD](../reference/quantileddsketch.md#quantileddsketch)
- [simpleLinearRegression](../reference/simplelinearregression.md)
- [singleValueOrNull](../reference/singlevalueornull.md)
- [stochasticLinearRegression](../reference/stochasticlinearregression.md)
- [stochasticLogisticRegression](../reference/stochasticlogisticregression.md)
- [categoricalInformationValue](../reference/categoricalinformationvalue.md)
- [contingency](../reference/contingency.md)
- [cramersV](../reference/cramersv.md)
- [cramersVBiasCorrected](../reference/cramersvbiascorrected.md)
- [theilsU](../reference/theilsu.md)
- [maxIntersections](../reference/maxintersections.md)
- [maxIntersectionsPosition](../reference/maxintersectionsposition.md)
- [meanZTest](../reference/meanztest.md)
- [quantileGK](../reference/quantileGK.md)
- [quantileInterpolatedWeighted](../reference/quantileinterpolatedweighted.md)
- [sparkBar](../reference/sparkbar.md)
- [sumCount](../reference/sumcount.md)
- [largestTriangleThreeBuckets](../reference/largestTriangleThreeBuckets.md)

View File

@ -1,7 +1,7 @@
---
slug: /en/sql-reference/data-types/boolean
sidebar_position: 22
sidebar_label: Boolean
sidebar_label: Bool
---
# Bool

View File

@ -6,101 +6,106 @@ sidebar_label: Map(K, V)
# Map(K, V)
`Map(K, V)` data type stores `key:value` pairs.
The Map datatype is implemented as `Array(Tuple(key T1, value T2))`, which means that the order of keys in each map does not change, i.e., this data type maintains insertion order.
Data type `Map(K, V)` stores key-value pairs.
Unlike in other databases, the keys of a map are not required to be unique in ClickHouse, i.e. a map can contain two elements with the same key.
(The reason for that is that maps are internally implemented as `Array(Tuple(K, V))`.)
You can use the syntax `m[k]` to obtain the value for key `k` in map `m`.
Also, `m[k]` scans the map, i.e. the runtime of the operation is linear in the size of the map.
**Parameters**
- `key` — The key part of the pair. Arbitrary type, except [Nullable](../../sql-reference/data-types/nullable.md) and [LowCardinality](../../sql-reference/data-types/lowcardinality.md) nested with [Nullable](../../sql-reference/data-types/nullable.md) types.
- `value` — The value part of the pair. Arbitrary type, including [Map](../../sql-reference/data-types/map.md) and [Array](../../sql-reference/data-types/array.md).
To get the value from an `a Map('key', 'value')` column, use `a['key']` syntax. This lookup works now with a linear complexity.
- `K` — The type of the Map keys. Arbitrary type except [Nullable](../../sql-reference/data-types/nullable.md) and [LowCardinality](../../sql-reference/data-types/lowcardinality.md) nested with [Nullable](../../sql-reference/data-types/nullable.md) types.
- `V` — The type of the Map values. Arbitrary type.
**Examples**
Consider the table:
Create a table with a column of type map:
``` sql
CREATE TABLE table_map (a Map(String, UInt64)) ENGINE=Memory;
INSERT INTO table_map VALUES ({'key1':1, 'key2':10}), ({'key1':2,'key2':20}), ({'key1':3,'key2':30});
CREATE TABLE tab (m Map(String, UInt64)) ENGINE=Memory;
INSERT INTO tab VALUES ({'key1':1, 'key2':10}), ({'key1':2,'key2':20}), ({'key1':3,'key2':30});
```
Select all `key2` values:
To select `key2` values:
```sql
SELECT a['key2'] FROM table_map;
SELECT m['key2'] FROM tab;
```
Result:
```text
┌─arrayElement(a, 'key2')─┐
┌─arrayElement(m, 'key2')─┐
│ 10 │
│ 20 │
│ 30 │
└─────────────────────────┘
```
If there's no such `key` in the `Map()` column, the query returns zeros for numerical values, empty strings or empty arrays.
If the requested key `k` is not contained in the map, `m[k]` returns the value type's default value, e.g. `0` for integer types and `''` for string types.
To check whether a key exists in a map, you can use function [mapContains](../../sql-reference/functions/tuple-map-functions.md#mapcontains).
```sql
INSERT INTO table_map VALUES ({'key3':100}), ({});
SELECT a['key3'] FROM table_map;
CREATE TABLE tab (m Map(String, UInt64)) ENGINE=Memory;
INSERT INTO tab VALUES ({'key1':100}), ({});
SELECT m['key1'] FROM tab;
```
Result:
```text
┌─arrayElement(a, 'key3')─┐
┌─arrayElement(m, 'key1')─┐
│ 100 │
│ 0 │
└─────────────────────────┘
┌─arrayElement(a, 'key3')─┐
│ 0 │
│ 0 │
│ 0 │
└─────────────────────────┘
```
## Convert Tuple to Map Type
## Converting Tuple to Map
You can cast `Tuple()` as `Map()` using [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) function:
Values of type `Tuple()` can be cast to values of type `Map()` using function [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast):
**Example**
Query:
``` sql
SELECT CAST(([1, 2, 3], ['Ready', 'Steady', 'Go']), 'Map(UInt8, String)') AS map;
```
Result:
``` text
┌─map───────────────────────────┐
│ {1:'Ready',2:'Steady',3:'Go'} │
└───────────────────────────────┘
```
## Map.keys and Map.values Subcolumns
## Reading subcolumns of Map
To optimize `Map` column processing, in some cases you can use the `keys` and `values` subcolumns instead of reading the whole column.
To avoid reading the entire map, you can use subcolumns `keys` and `values` in some cases.
**Example**
Query:
``` sql
CREATE TABLE t_map (`a` Map(String, UInt64)) ENGINE = Memory;
CREATE TABLE tab (m Map(String, UInt64)) ENGINE = Memory;
INSERT INTO tab VALUES (map('key1', 1, 'key2', 2, 'key3', 3));
INSERT INTO t_map VALUES (map('key1', 1, 'key2', 2, 'key3', 3));
SELECT a.keys FROM t_map;
SELECT a.values FROM t_map;
SELECT m.keys FROM tab; -- same as mapKeys(m)
SELECT m.values FROM tab; -- same as mapValues(m)
```
Result:
``` text
┌─a.keys─────────────────┐
┌─m.keys─────────────────┐
│ ['key1','key2','key3'] │
└────────────────────────┘
┌─a.values─┐
┌─m.values─┐
│ [1,2,3] │
└──────────┘
```

View File

@ -415,8 +415,8 @@ Alias: `power(x, y)`
**Arguments**
- `x` - [(U)Int8/16/32/64](../data-types/int-uint.md) or [Float*](../data-types/float.md)
- `y` - [(U)Int8/16/32/64](../data-types/int-uint.md) or [Float*](../data-types/float.md)
- `x` - [(U)Int8/16/32/64](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md)
- `y` - [(U)Int8/16/32/64](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md)
**Returned value**
@ -635,8 +635,8 @@ atan2(y, x)
**Arguments**
- `y` — y-coordinate of the point through which the ray passes. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md).
- `x` — x-coordinate of the point through which the ray passes. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md).
- `y` — y-coordinate of the point through which the ray passes. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md).
- `x` — x-coordinate of the point through which the ray passes. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md).
**Returned value**
@ -670,8 +670,8 @@ hypot(x, y)
**Arguments**
- `x` — The first cathetus of a right-angle triangle. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md).
- `y` — The second cathetus of a right-angle triangle. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md).
- `x` — The first cathetus of a right-angle triangle. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md).
- `y` — The second cathetus of a right-angle triangle. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md).
**Returned value**
@ -838,6 +838,7 @@ degrees(x)
**Arguments**
- `x` — Input in radians. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md).
- `x` — Input in radians. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md).
**Returned value**

View File

@ -735,6 +735,8 @@ LIMIT 10
Given a size (number of bytes), this function returns a readable, rounded size with suffix (KB, MB, etc.) as string.
The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parseReadableSizeOrZero), and [parseReadableSizeOrNull](#parseReadableSizeOrNull).
**Syntax**
```sql
@ -766,6 +768,8 @@ Result:
Given a size (number of bytes), this function returns a readable, rounded size with suffix (KiB, MiB, etc.) as string.
The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parseReadableSizeOrZero), and [parseReadableSizeOrNull](#parseReadableSizeOrNull).
**Syntax**
```sql
@ -890,6 +894,122 @@ SELECT
└────────────────────┴────────────────────────────────────────────────┘
```
## parseReadableSize
Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes.
If the function is unable to parse the input value, it throws an exception.
The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize).
**Syntax**
```sql
parseReadableSize(x)
```
**Arguments**
- `x` : Readable size with ISO/IEC 80000-13 or decimal byte unit ([String](../../sql-reference/data-types/string.md)).
**Returned value**
- Number of bytes, rounded up to the nearest integer ([UInt64](../../sql-reference/data-types/int-uint.md)).
**Example**
```sql
SELECT
arrayJoin(['1 B', '1 KiB', '3 MB', '5.314 KiB']) AS readable_sizes,
parseReadableSize(readable_sizes) AS sizes;
```
```text
┌─readable_sizes─┬───sizes─┐
│ 1 B │ 1 │
│ 1 KiB │ 1024 │
│ 3 MB │ 3000000 │
│ 5.314 KiB │ 5442 │
└────────────────┴─────────┘
```
## parseReadableSizeOrNull
Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes.
If the function is unable to parse the input value, it returns `NULL`.
The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize).
**Syntax**
```sql
parseReadableSizeOrNull(x)
```
**Arguments**
- `x` : Readable size with ISO/IEC 80000-13 or decimal byte unit ([String](../../sql-reference/data-types/string.md)).
**Returned value**
- Number of bytes, rounded up to the nearest integer, or NULL if unable to parse the input (Nullable([UInt64](../../sql-reference/data-types/int-uint.md))).
**Example**
```sql
SELECT
arrayJoin(['1 B', '1 KiB', '3 MB', '5.314 KiB', 'invalid']) AS readable_sizes,
parseReadableSizeOrNull(readable_sizes) AS sizes;
```
```text
┌─readable_sizes─┬───sizes─┐
│ 1 B │ 1 │
│ 1 KiB │ 1024 │
│ 3 MB │ 3000000 │
│ 5.314 KiB │ 5442 │
│ invalid │ ᴺᵁᴸᴸ │
└────────────────┴─────────┘
```
## parseReadableSizeOrZero
Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. If the function is unable to parse the input value, it returns `0`.
The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize).
**Syntax**
```sql
parseReadableSizeOrZero(x)
```
**Arguments**
- `x` : Readable size with ISO/IEC 80000-13 or decimal byte unit ([String](../../sql-reference/data-types/string.md)).
**Returned value**
- Number of bytes, rounded up to the nearest integer, or 0 if unable to parse the input ([UInt64](../../sql-reference/data-types/int-uint.md)).
**Example**
```sql
SELECT
arrayJoin(['1 B', '1 KiB', '3 MB', '5.314 KiB', 'invalid']) AS readable_sizes,
parseReadableSizeOrZero(readable_sizes) AS sizes;
```
```text
┌─readable_sizes─┬───sizes─┐
│ 1 B │ 1 │
│ 1 KiB │ 1024 │
│ 3 MB │ 3000000 │
│ 5.314 KiB │ 5442 │
│ invalid │ 0 │
└────────────────┴─────────┘
```
## parseTimeDelta
Parse a sequence of numbers followed by something resembling a time unit.

View File

@ -6,7 +6,7 @@ sidebar_label: Maps
## map
Arranges `key:value` pairs into [Map(key, value)](../data-types/map.md) data type.
Creates a value of type [Map(key, value)](../data-types/map.md) from key-value pairs.
**Syntax**
@ -16,12 +16,12 @@ map(key1, value1[, key2, value2, ...])
**Arguments**
- `key` — The key part of the pair. Arbitrary type, except [Nullable](../data-types/nullable.md) and [LowCardinality](../data-types/lowcardinality.md) nested with [Nullable](../data-types/nullable.md).
- `value` — The value part of the pair. Arbitrary type, including [Map](../data-types/map.md) and [Array](../data-types/array.md).
- `key_n` — The keys of the map entries. Any type supported as key type of [Map](../data-types/map.md).
- `value_n` — The values of the map entries. Any type supported as value type of [Map](../data-types/map.md).
**Returned value**
- Data structure as `key:value` pairs. [Map(key, value)](../data-types/map.md).
- A map containing `key:value` pairs. [Map(key, value)](../data-types/map.md).
**Examples**
@ -41,35 +41,16 @@ Result:
└──────────────────────────────────────────────────┘
```
Query:
```sql
CREATE TABLE table_map (a Map(String, UInt64)) ENGINE = MergeTree() ORDER BY a;
INSERT INTO table_map SELECT map('key1', number, 'key2', number * 2) FROM numbers(3);
SELECT a['key2'] FROM table_map;
```
Result:
```text
┌─arrayElement(a, 'key2')─┐
│ 0 │
│ 2 │
│ 4 │
└─────────────────────────┘
```
**See Also**
- [Map(key, value)](../data-types/map.md) data type
## mapFromArrays
Merges an [Array](../data-types/array.md) of keys and an [Array](../data-types/array.md) of values into a [Map(key, value)](../data-types/map.md). Notice that the second argument could also be a [Map](../data-types/map.md), thus it is casted to an Array when executing.
Creates a map from an array of keys and an array of values.
The function is a convenient alternative to syntax `CAST([...], 'Map(key_type, value_type)')`.
For example, instead of writing
- `CAST((['aa', 'bb'], [4, 5]), 'Map(String, UInt32)')`, or
- `CAST([('aa',4), ('bb',5)], 'Map(String, UInt32)')`
The function is a more convenient alternative to `CAST((key_array, value_array_or_map), 'Map(key_type, value_type)')`. For example, instead of writing `CAST((['aa', 'bb'], [4, 5]), 'Map(String, UInt32)')`, you can write `mapFromArrays(['aa', 'bb'], [4, 5])`.
you can write `mapFromArrays(['aa', 'bb'], [4, 5])`.
**Syntax**
@ -81,12 +62,12 @@ Alias: `MAP_FROM_ARRAYS(keys, values)`
**Arguments**
- `keys`Given key array to create a map from. The nested type of array must be: [String](../data-types/string.md), [Integer](../data-types/int-uint.md), [LowCardinality](../data-types/lowcardinality.md), [FixedString](../data-types/fixedstring.md), [UUID](../data-types/uuid.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [Date32](../data-types/date32.md), [Enum](../data-types/enum.md)
- `values` - Given value array or map to create a map from.
- `keys` — Array of keys to create the map from. [Array(T)](../data-types/array.md) where `T` can be any type supported by [Map](../data-types/map.md) as key type.
- `values` - Array or map of values to create the map from. [Array](../data-types/array.md) or [Map](../data-types/map.md).
**Returned value**
- A map whose keys and values are constructed from the key array and value array/map.
- A map with keys and values constructed from the key array and value array/map.
**Example**
@ -94,14 +75,25 @@ Query:
```sql
select mapFromArrays(['a', 'b', 'c'], [1, 2, 3])
```
Result:
```
┌─mapFromArrays(['a', 'b', 'c'], [1, 2, 3])─┐
│ {'a':1,'b':2,'c':3} │
└───────────────────────────────────────────┘
```
`mapFromArrays` also accepts arguments of type [Map](../data-types/map.md). These are cast to an array of tuples during execution.
```sql
SELECT mapFromArrays([1, 2, 3], map('a', 1, 'b', 2, 'c', 3))
```
Result:
```
┌─mapFromArrays([1, 2, 3], map('a', 1, 'b', 2, 'c', 3))─┐
│ {1:('a',1),2:('b',2),3:('c',3)} │
└───────────────────────────────────────────────────────┘
@ -109,9 +101,11 @@ SELECT mapFromArrays([1, 2, 3], map('a', 1, 'b', 2, 'c', 3))
## extractKeyValuePairs
Extracts key-value pairs, i.e. a [Map(String, String)](../data-types/map.md), from a string. Parsing is robust towards noise (e.g. log files).
A key-value pair consists of a key, followed by a `key_value_delimiter` and a value. Key value pairs must be separated by `pair_delimiter`. Quoted keys and values are also supported.
Converts a string of key-value pairs to a [Map(String, String)](../data-types/map.md).
Parsing is tolerant towards noise (e.g. log files).
Key-value pairs in the input string consist of a key, followed by a key-value delimiter, and a value.
Key-value pairs are separated by a pair delimiter.
Keys and values can be quoted.
**Syntax**
@ -126,17 +120,17 @@ Alias:
**Arguments**
- `data` - String to extract key-value pairs from. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
- `key_value_delimiter` - Character to be used as delimiter between the key and the value. Defaults to `:`. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
- `pair_delimiters` - Set of character to be used as delimiters between pairs. Defaults to ` `, `,` and `;`. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
- `quoting_character` - Character to be used as quoting character. Defaults to `"`. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
- `key_value_delimiter` - Single character delimiting keys and values. Defaults to `:`. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
- `pair_delimiters` - Set of characters delimiting pairs. Defaults to ` `, `,` and `;`. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
- `quoting_character` - Single character used as quoting character. Defaults to `"`. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
**Returned values**
- A [Map(String, String)](../data-types/map.md) of key-value pairs.
- A map of key-value pairs. Type: [Map(String, String)](../data-types/map.md)
**Examples**
Simple case:
Query
``` sql
SELECT extractKeyValuePairs('name:neymar, age:31 team:psg,nationality:brazil') as kv
@ -150,7 +144,7 @@ Result:
└─────────────────────────────────────────────────────────────────────────┘
```
Single quote as quoting character:
With a single quote `'` as quoting character:
``` sql
SELECT extractKeyValuePairs('name:\'neymar\';\'age\':31;team:psg;nationality:brazil,last_key:last_value', ':', ';,', '\'') as kv
@ -178,9 +172,29 @@ Result:
└────────────────────────┘
```
To restore a map of string key-value pairs serialized with `toString`:
```sql
SELECT
map('John', '33', 'Paula', '31') AS m,
toString(m) as map_serialized,
extractKeyValuePairs(map_serialized, ':', ',', '\'') AS map_restored
FORMAT Vertical;
```
Result:
```
Row 1:
──────
m: {'John':'33','Paula':'31'}
map_serialized: {'John':'33','Paula':'31'}
map_restored: {'John':'33','Paula':'31'}
```
## extractKeyValuePairsWithEscaping
Same as `extractKeyValuePairs` but with escaping support.
Same as `extractKeyValuePairs` but supports escaping.
Supported escape sequences: `\x`, `\N`, `\a`, `\b`, `\e`, `\f`, `\n`, `\r`, `\t`, `\v` and `\0`.
Non-standard escape sequences are returned as-is (including the backslash) unless they are one of the following:
@ -229,20 +243,6 @@ Arguments are [maps](../data-types/map.md) or [tuples](../data-types/tuple.md#tu
**Example**
Query with a tuple:
```sql
SELECT mapAdd(([toUInt8(1), 2], [1, 1]), ([toUInt8(1), 2], [1, 1])) as res, toTypeName(res) as type;
```
Result:
```text
┌─res───────────┬─type───────────────────────────────┐
│ ([1,2],[2,2]) │ Tuple(Array(UInt8), Array(UInt64)) │
└───────────────┴────────────────────────────────────┘
```
Query with `Map` type:
```sql
@ -257,6 +257,20 @@ Result:
└──────────────────────────────┘
```
Query with a tuple:
```sql
SELECT mapAdd(([toUInt8(1), 2], [1, 1]), ([toUInt8(1), 2], [1, 1])) as res, toTypeName(res) as type;
```
Result:
```text
┌─res───────────┬─type───────────────────────────────┐
│ ([1,2],[2,2]) │ Tuple(Array(UInt8), Array(UInt64)) │
└───────────────┴────────────────────────────────────┘
```
## mapSubtract
Collect all the keys and subtract corresponding values.
@ -277,20 +291,6 @@ Arguments are [maps](../data-types/map.md) or [tuples](../data-types/tuple.md#tu
**Example**
Query with a tuple map:
```sql
SELECT mapSubtract(([toUInt8(1), 2], [toInt32(1), 1]), ([toUInt8(1), 2], [toInt32(2), 1])) as res, toTypeName(res) as type;
```
Result:
```text
┌─res────────────┬─type──────────────────────────────┐
│ ([1,2],[-1,0]) │ Tuple(Array(UInt8), Array(Int64)) │
└────────────────┴───────────────────────────────────┘
```
Query with `Map` type:
```sql
@ -305,55 +305,57 @@ Result:
└───────────────────────────────────┘
```
## mapPopulateSeries
Fills missing keys in the maps (key and value array pair), where keys are integers. Also, it supports specifying the max key, which is used to extend the keys array.
**Syntax**
Query with a tuple map:
```sql
mapPopulateSeries(keys, values[, max])
mapPopulateSeries(map[, max])
```
Generates a map (a tuple with two arrays or a value of `Map` type, depending on the arguments), where keys are a series of numbers, from minimum to maximum keys (or `max` argument if it specified) taken from the map with a step size of one, and corresponding values. If the value is not specified for the key, then it uses the default value in the resulting map. For repeated keys, only the first value (in order of appearing) gets associated with the key.
For array arguments the number of elements in `keys` and `values` must be the same for each row.
**Arguments**
Arguments are [maps](../data-types/map.md) or two [arrays](../data-types/array.md#data-type-array), where the first array represent keys, and the second array contains values for the each key.
Mapped arrays:
- `keys` — Array of keys. [Array](../data-types/array.md#data-type-array)([Int](../data-types/int-uint.md#uint-ranges)).
- `values` — Array of values. [Array](../data-types/array.md#data-type-array)([Int](../data-types/int-uint.md#uint-ranges)).
- `max` — Maximum key value. Optional. [Int8, Int16, Int32, Int64, Int128, Int256](../data-types/int-uint.md#int-ranges).
or
- `map` — Map with integer keys. [Map](../data-types/map.md).
**Returned value**
- Depending on the arguments returns a [map](../data-types/map.md) or a [tuple](../data-types/tuple.md#tuplet1-t2) of two [arrays](../data-types/array.md#data-type-array): keys in sorted order, and values the corresponding keys.
**Example**
Query with mapped arrays:
```sql
SELECT mapPopulateSeries([1,2,4], [11,22,44], 5) AS res, toTypeName(res) AS type;
SELECT mapSubtract(([toUInt8(1), 2], [toInt32(1), 1]), ([toUInt8(1), 2], [toInt32(2), 1])) as res, toTypeName(res) as type;
```
Result:
```text
┌─res──────────────────────────┬─type──────────────────────────────┐
│ ([1,2,3,4,5],[11,22,0,44,0]) │ Tuple(Array(UInt8), Array(UInt8)) │
└──────────────────────────────┴───────────────────────────────────┘
┌─res────────────┬─type──────────────────────────────┐
│ ([1,2],[-1,0]) │ Tuple(Array(UInt8), Array(Int64)) │
└────────────────┴───────────────────────────────────┘
```
## mapPopulateSeries
Fills missing key-value pairs in a map with integer keys.
To support extending the keys beyond the largest value, a maximum key can be specified.
More specifically, the function returns a map in which the keys form a series from the smallest to the largest key (or the `max` argument if it is specified) with a step size of 1, and corresponding values.
If no value is specified for a key, a default value is used as value.
In case keys repeat, only the first value (in order of appearance) is associated with the key.
**Syntax**
```sql
mapPopulateSeries(map[, max])
mapPopulateSeries(keys, values[, max])
```
For array arguments the number of elements in `keys` and `values` must be the same for each row.
**Arguments**
Arguments are [Maps](../data-types/map.md) or two [Arrays](../data-types/array.md#data-type-array), where the first array contains the keys and the second array contains the values for each key.
Mapped arrays:
- `map` — Map with integer keys. [Map](../data-types/map.md).
or
- `keys` — Array of keys. [Array](../data-types/array.md#data-type-array)([Int](../data-types/int-uint.md#uint-ranges)).
- `values` — Array of values. [Array](../data-types/array.md#data-type-array)([Int](../data-types/int-uint.md#uint-ranges)).
- `max` — Maximum key value. Optional. [Int8, Int16, Int32, Int64, Int128, Int256](../data-types/int-uint.md#int-ranges).
**Returned value**
- Depending on the arguments a [Map](../data-types/map.md) or a [Tuple](../data-types/tuple.md#tuplet1-t2) of two [Arrays](../data-types/array.md#data-type-array): keys in sorted order, and values for the corresponding keys.
**Example**
Query with `Map` type:
```sql
@ -368,9 +370,23 @@ Result:
└─────────────────────────────────────────┘
```
Query with mapped arrays:
```sql
SELECT mapPopulateSeries([1,2,4], [11,22,44], 5) AS res, toTypeName(res) AS type;
```
Result:
```text
┌─res──────────────────────────┬─type──────────────────────────────┐
│ ([1,2,3,4,5],[11,22,0,44,0]) │ Tuple(Array(UInt8), Array(UInt8)) │
└──────────────────────────────┴───────────────────────────────────┘
```
## mapContains
Determines whether the `map` contains the `key` parameter.
Returns whether a given key is contained in a given map.
**Syntax**
@ -381,7 +397,7 @@ mapContains(map, key)
**Arguments**
- `map` — Map. [Map](../data-types/map.md).
- `key` — Key. Type matches the type of keys of `map` parameter.
- `key` — Key. Type must match the key type of `map`.
**Returned value**
@ -392,11 +408,11 @@ mapContains(map, key)
Query:
```sql
CREATE TABLE test (a Map(String,String)) ENGINE = Memory;
CREATE TABLE tab (a Map(String, String)) ENGINE = Memory;
INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'});
INSERT INTO tab VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'});
SELECT mapContains(a, 'name') FROM test;
SELECT mapContains(a, 'name') FROM tab;
```
@ -411,9 +427,11 @@ Result:
## mapKeys
Returns all keys from the `map` parameter.
Returns the keys of a given map.
Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [keys](../data-types/map.md#map-subcolumns) subcolumn instead of reading and processing the whole column data. The query `SELECT mapKeys(m) FROM table` transforms to `SELECT m.keys FROM table`.
This function can be optimized by enabling setting [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns).
With the setting enabled, the function only reads the [keys](../data-types/map.md#map-subcolumns) subcolumn instead of the whole map.
The query `SELECT mapKeys(m) FROM table` is transformed to `SELECT m.keys FROM table`.
**Syntax**
@ -434,11 +452,11 @@ mapKeys(map)
Query:
```sql
CREATE TABLE test (a Map(String,String)) ENGINE = Memory;
CREATE TABLE tab (a Map(String, String)) ENGINE = Memory;
INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'});
INSERT INTO tab VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'});
SELECT mapKeys(a) FROM test;
SELECT mapKeys(a) FROM tab;
```
Result:
@ -452,9 +470,11 @@ Result:
## mapValues
Returns all values from the `map` parameter.
Returns the values of a given map.
Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [values](../data-types/map.md#map-subcolumns) subcolumn instead of reading and processing the whole column data. The query `SELECT mapValues(m) FROM table` transforms to `SELECT m.values FROM table`.
This function can be optimized by enabling setting [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns).
With the setting enabled, the function only reads the [values](../data-types/map.md#map-subcolumns) subcolumn instead of the whole map.
The query `SELECT mapValues(m) FROM table` is transformed to `SELECT m.values FROM table`.
**Syntax**
@ -475,11 +495,11 @@ mapValues(map)
Query:
```sql
CREATE TABLE test (a Map(String,String)) ENGINE = Memory;
CREATE TABLE tab (a Map(String, String)) ENGINE = Memory;
INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'});
INSERT INTO tab VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'});
SELECT mapValues(a) FROM test;
SELECT mapValues(a) FROM tab;
```
Result:
@ -512,11 +532,11 @@ mapContainsKeyLike(map, pattern)
Query:
```sql
CREATE TABLE test (a Map(String,String)) ENGINE = Memory;
CREATE TABLE tab (a Map(String, String)) ENGINE = Memory;
INSERT INTO test VALUES ({'abc':'abc','def':'def'}), ({'hij':'hij','klm':'klm'});
INSERT INTO tab VALUES ({'abc':'abc','def':'def'}), ({'hij':'hij','klm':'klm'});
SELECT mapContainsKeyLike(a, 'a%') FROM test;
SELECT mapContainsKeyLike(a, 'a%') FROM tab;
```
Result:
@ -530,6 +550,8 @@ Result:
## mapExtractKeyLike
Given a map with string keys and a LIKE pattern, this function returns a map with elements where the key matches the pattern.
**Syntax**
```sql
@ -543,18 +565,18 @@ mapExtractKeyLike(map, pattern)
**Returned value**
- A map contained elements the key of which matches the specified pattern. If there are no elements matched the pattern, it will return an empty map.
- A map containing the elements whose keys match the specified pattern. If no elements match the pattern, an empty map is returned.
**Example**
Query:
```sql
CREATE TABLE test (a Map(String,String)) ENGINE = Memory;
CREATE TABLE tab (a Map(String, String)) ENGINE = Memory;
INSERT INTO test VALUES ({'abc':'abc','def':'def'}), ({'hij':'hij','klm':'klm'});
INSERT INTO tab VALUES ({'abc':'abc','def':'def'}), ({'hij':'hij','klm':'klm'});
SELECT mapExtractKeyLike(a, 'a%') FROM test;
SELECT mapExtractKeyLike(a, 'a%') FROM tab;
```
Result:
@ -568,6 +590,8 @@ Result:
## mapApply
Applies a function to each element of a map.
**Syntax**
```sql
@ -608,6 +632,8 @@ Result:
## mapFilter
Filters a map by applying a function to each map element.
**Syntax**
```sql
@ -623,7 +649,6 @@ mapFilter(func, map)
- Returns a map containing only the elements in `map` for which `func(map1[i], ..., mapN[i])` returns something other than 0.
**Example**
Query:
@ -647,7 +672,6 @@ Result:
└─────────────────────┘
```
## mapUpdate
**Syntax**
@ -683,6 +707,9 @@ Result:
## mapConcat
Concatenates multiple maps based on the equality of their keys.
If elements with the same key exist in more than one input map, all elements are added to the result map, but only the first one is accessible via operator `[]`.
**Syntax**
```sql
@ -691,11 +718,11 @@ mapConcat(maps)
**Arguments**
- `maps` Arbitrary number of arguments of [Map](../data-types/map.md) type.
- `maps` — Arbitrarily many [Maps](../data-types/map.md).
**Returned value**
- Returns a map with concatenated maps passed as arguments. If there are same keys in two or more maps, all of them are added to the result map, but only the first one is accessible via operator `[]`
- Returns a map with concatenated maps passed as arguments.
**Examples**
@ -729,9 +756,12 @@ Result:
## mapExists(\[func,\], map)
Returns 1 if there is at least one key-value pair in `map` for which `func(key, value)` returns something other than 0. Otherwise, it returns 0.
Returns 1 if at least one key-value pair in `map` exists for which `func(key, value)` returns something other than 0. Otherwise, it returns 0.
Note that the `mapExists` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument.
:::note
`mapExists` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions).
You can pass a lambda function to it as the first argument.
:::
**Example**
@ -743,7 +773,7 @@ SELECT mapExists((k, v) -> (v = 1), map('k1', 1, 'k2', 2)) AS res
Result:
```text
```
┌─res─┐
│ 1 │
└─────┘
@ -753,7 +783,10 @@ Result:
Returns 1 if `func(key, value)` returns something other than 0 for all key-value pairs in `map`. Otherwise, it returns 0.
Note that the `mapAll` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument.
:::note
`mapAll` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions).
You can pass a lambda function to it as the first argument.
:::
**Example**
@ -765,7 +798,7 @@ SELECT mapAll((k, v) -> (v = 1), map('k1', 1, 'k2', 2)) AS res
Result:
```text
```
┌─res─┐
│ 0 │
└─────┘
@ -773,7 +806,8 @@ Result:
## mapSort(\[func,\], map)
Sorts the elements of the `map` in ascending order. If the `func` function is specified, sorting order is determined by the result of the `func` function applied to the keys and values of the map.
Sorts the elements of a map in ascending order.
If the `func` function is specified, the sorting order is determined by the result of the `func` function applied to the keys and values of the map.
**Examples**
@ -801,8 +835,8 @@ For more details see the [reference](../../sql-reference/functions/array-functio
## mapReverseSort(\[func,\], map)
Sorts the elements of the `map` in descending order. If the `func` function is specified, sorting order is determined by the result of the `func` function applied to the keys and values of the map.
Sorts the elements of a map in descending order.
If the `func` function is specified, the sorting order is determined by the result of the `func` function applied to the keys and values of the map.
**Examples**
@ -826,4 +860,4 @@ SELECT mapReverseSort((k, v) -> v, map('key2', 2, 'key3', 1, 'key1', 3)) AS map;
└──────────────────────────────┘
```
For more details see the [reference](../../sql-reference/functions/array-functions.md#array_functions-reverse-sort) for `arrayReverseSort` function.
For more details see function [arrayReverseSort](../../sql-reference/functions/array-functions.md#array_functions-reverse-sort).

View File

@ -16,7 +16,7 @@ Most `ALTER TABLE` queries modify table settings or data:
- [INDEX](/docs/en/sql-reference/statements/alter/skipping-index.md)
- [CONSTRAINT](/docs/en/sql-reference/statements/alter/constraint.md)
- [TTL](/docs/en/sql-reference/statements/alter/ttl.md)
- [STATISTIC](/docs/en/sql-reference/statements/alter/statistic.md)
- [STATISTICS](/docs/en/sql-reference/statements/alter/statistics.md)
- [APPLY DELETED MASK](/docs/en/sql-reference/statements/alter/apply-deleted-mask.md)
:::note

View File

@ -1,25 +0,0 @@
---
slug: /en/sql-reference/statements/alter/statistic
sidebar_position: 45
sidebar_label: STATISTIC
---
# Manipulating Column Statistics
The following operations are available:
- `ALTER TABLE [db].table ADD STATISTIC (columns list) TYPE type` - Adds statistic description to tables metadata.
- `ALTER TABLE [db].table DROP STATISTIC (columns list) TYPE type` - Removes statistic description from tables metadata and deletes statistic files from disk.
- `ALTER TABLE [db].table CLEAR STATISTIC (columns list) TYPE type` - Deletes statistic files from disk.
- `ALTER TABLE [db.]table MATERIALIZE STATISTIC (columns list) TYPE type` - Rebuilds the statistic for columns. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
The first two commands are lightweight in a sense that they only change metadata or remove files.
Also, they are replicated, syncing statistics metadata via ZooKeeper.
:::note
Statistic manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants).
:::

View File

@ -0,0 +1,33 @@
---
slug: /en/sql-reference/statements/alter/statistics
sidebar_position: 45
sidebar_label: STATISTICS
---
# Manipulating Column Statistics
The following operations are available:
- `ALTER TABLE [db.]table ADD STATISTICS (columns list) TYPE (type list)` - Adds statistics description to the table's metadata.
- `ALTER TABLE [db.]table MODIFY STATISTICS (columns list) TYPE (type list)` - Modifies statistics description in the table's metadata.
- `ALTER TABLE [db.]table DROP STATISTICS (columns list)` - Removes statistics from the metadata of the specified columns and deletes all statistics objects in all parts for the specified columns.
- `ALTER TABLE [db.]table CLEAR STATISTICS (columns list)` - Deletes all statistics objects in all parts for the specified columns. Statistics objects can be rebuilt using `ALTER TABLE MATERIALIZE STATISTICS`.
- `ALTER TABLE [db.]table MATERIALIZE STATISTICS (columns list)` - Rebuilds the statistic for columns. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
The first two commands are lightweight in the sense that they only change metadata or remove files.
Also, they are replicated, syncing statistics metadata via ZooKeeper.
Here is an example of adding two statistics types to two columns:
```
ALTER TABLE t1 MODIFY STATISTICS c, d TYPE TDigest, Uniq;
```
:::note
Statistic manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants).
:::

View File

@ -337,7 +337,7 @@ Then, when executing the query `SELECT name FROM users_a WHERE length(name) < 5;
Defines storage time for values. Can be specified only for MergeTree-family tables. For the detailed description, see [TTL for columns and tables](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl).
## Column Compression Codecs
## Column Compression Codecs {#column_compression_codec}
By default, ClickHouse applies `lz4` compression in the self-managed version, and `zstd` in ClickHouse Cloud.

View File

@ -304,8 +304,8 @@ atan2(y, x)
**Аргументы**
- `y` — координата y точки, в которую проведена линия. [Float64](../../sql-reference/data-types/float.md#float32-float64).
- `x` — координата х точки, в которую проведена линия. [Float64](../../sql-reference/data-types/float.md#float32-float64).
- `y` — координата y точки, в которую проведена линия. [Float64](../../sql-reference/data-types/float.md#float32-float64) или [Decimal](../../sql-reference/data-types/decimal.md).
- `x` — координата х точки, в которую проведена линия. [Float64](../../sql-reference/data-types/float.md#float32-float64) или [Decimal](../../sql-reference/data-types/decimal.md).
**Возвращаемое значение**
@ -341,8 +341,8 @@ hypot(x, y)
**Аргументы**
- `x` — первый катет прямоугольного треугольника. [Float64](../../sql-reference/data-types/float.md#float32-float64).
- `y` — второй катет прямоугольного треугольника. [Float64](../../sql-reference/data-types/float.md#float32-float64).
- `x` — первый катет прямоугольного треугольника. [Float64](../../sql-reference/data-types/float.md#float32-float64) или [Decimal](../../sql-reference/data-types/decimal.md).
- `y` — второй катет прямоугольного треугольника. [Float64](../../sql-reference/data-types/float.md#float32-float64) или [Decimal](../../sql-reference/data-types/decimal.md).
**Возвращаемое значение**

View File

@ -154,7 +154,8 @@ function _clickhouse_quote()
# Extract every option (everything that starts with "-") from the --help dialog.
function _clickhouse_get_options()
{
"$@" --help 2>&1 | awk -F '[ ,=<>.]' '{ for (i=1; i <= NF; ++i) { if (substr($i, 1, 1) == "-" && length($i) > 1) print $i; } }' | sort -u
# By default --help will not print all settings, this is done only under --verbose
"$@" --help --verbose 2>&1 | awk -F '[ ,=<>.]' '{ for (i=1; i <= NF; ++i) { if (substr($i, 1, 1) == "-" && length($i) > 1) print $i; } }' | sort -u
}
function _complete_for_clickhouse_generic_bin_impl()

View File

@ -57,7 +57,7 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv)
DB::KeeperSnapshotManager manager(1, keeper_context);
auto snp = manager.serializeSnapshotToBuffer(snapshot);
auto file_info = manager.serializeSnapshotBufferToDisk(*snp, storage.getZXID());
std::cout << "Snapshot serialized to path:" << fs::path(file_info.disk->getPath()) / file_info.path << std::endl;
std::cout << "Snapshot serialized to path:" << fs::path(file_info->disk->getPath()) / file_info->path << std::endl;
}
catch (...)
{

View File

@ -9,8 +9,6 @@ set (CLICKHOUSE_KEEPER_LINK
clickhouse_common_zookeeper
daemon
dbms
${LINK_RESOURCE_LIB}
)
clickhouse_program_add(keeper)
@ -210,8 +208,6 @@ if (BUILD_STANDALONE_KEEPER)
loggers_no_text_log
clickhouse_common_io
clickhouse_parsers # Otherwise compression will not built. FIXME.
${LINK_RESOURCE_LIB_STANDALONE_KEEPER}
)
set_target_properties(clickhouse-keeper PROPERTIES RUNTIME_OUTPUT_DIRECTORY ../)

View File

@ -14,8 +14,6 @@ set (CLICKHOUSE_SERVER_LINK
clickhouse_storages_system
clickhouse_table_functions
${LINK_RESOURCE_LIB}
PUBLIC
daemon
)

View File

@ -51,10 +51,11 @@ enum class AccessType : uint8_t
M(ALTER_CLEAR_INDEX, "CLEAR INDEX", TABLE, ALTER_INDEX) \
M(ALTER_INDEX, "INDEX", GROUP, ALTER_TABLE) /* allows to execute ALTER ORDER BY or ALTER {ADD|DROP...} INDEX */\
\
M(ALTER_ADD_STATISTIC, "ALTER ADD STATISTIC", TABLE, ALTER_STATISTIC) \
M(ALTER_DROP_STATISTIC, "ALTER DROP STATISTIC", TABLE, ALTER_STATISTIC) \
M(ALTER_MATERIALIZE_STATISTIC, "ALTER MATERIALIZE STATISTIC", TABLE, ALTER_STATISTIC) \
M(ALTER_STATISTIC, "STATISTIC", GROUP, ALTER_TABLE) /* allows to execute ALTER STATISTIC */\
M(ALTER_ADD_STATISTICS, "ALTER ADD STATISTIC", TABLE, ALTER_STATISTICS) \
M(ALTER_DROP_STATISTICS, "ALTER DROP STATISTIC", TABLE, ALTER_STATISTICS) \
M(ALTER_MODIFY_STATISTICS, "ALTER MODIFY STATISTIC", TABLE, ALTER_STATISTICS) \
M(ALTER_MATERIALIZE_STATISTICS, "ALTER MATERIALIZE STATISTIC", TABLE, ALTER_STATISTICS) \
M(ALTER_STATISTICS, "STATISTIC", GROUP, ALTER_TABLE) /* allows to execute ALTER STATISTIC */\
\
M(ALTER_ADD_PROJECTION, "ADD PROJECTION", TABLE, ALTER_PROJECTION) \
M(ALTER_DROP_PROJECTION, "DROP PROJECTION", TABLE, ALTER_PROJECTION) \

View File

@ -334,6 +334,18 @@ public:
compress(); // Allows reading/writing TDigests with different epsilon/max_centroids params
}
/// Returns the accumulated `count` of every centroid whose `mean` compares exactly equal to `value`.
/// Returns 0 when no centroid matches.
Float64 getCountEqual(Float64 value) const
{
    Float64 matched_count = 0;
    for (const auto & centroid : centroids)
    {
        /// Exact comparison on purpose: only centroids sitting precisely on `value` contribute.
        if (value != centroid.mean)
            continue;
        matched_count += centroid.count;
    }
    return matched_count;
}
Float64 getCountLessThan(Float64 value) const
{
bool first = true;

View File

@ -3777,7 +3777,8 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id
scope.scope_node->formatASTForErrorMessage());
auto & table_node = node->as<TableNode &>();
result_projection_names.push_back(table_node.getStorageID().getFullNameNotQuoted());
if (result_projection_names.empty())
result_projection_names.push_back(table_node.getStorageID().getFullNameNotQuoted());
break;
}
@ -5475,7 +5476,7 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier
/// Add current alias to non cached set, because in case of cyclic alias identifier should not be substituted from cache.
/// See 02896_cyclic_aliases_crash.
resolveExpressionNode(node, scope, true /*allow_lambda_expression*/, false /*allow_table_expression*/);
resolveExpressionNode(node, scope, true /*allow_lambda_expression*/, true /*allow_table_expression*/);
bool has_node_in_alias_table = false;
@ -5484,7 +5485,16 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier
{
has_node_in_alias_table = true;
if (!it->second->isEqual(*node))
bool matched = it->second->isEqual(*node);
if (!matched)
/// Table expression could be resolved as scalar subquery,
/// but for duplicating alias we allow table expression to be returned.
/// So, check constant node source expression as well.
if (const auto * constant_node = it->second->as<ConstantNode>())
if (const auto & source_expression = constant_node->getSourceExpression())
matched = source_expression->isEqual(*node);
if (!matched)
throw Exception(ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS,
"Multiple expressions {} and {} for alias {}. In scope {}",
node->formatASTForErrorMessage(),

View File

@ -22,15 +22,6 @@ include (configure_config.cmake)
configure_file (Common/config.h.in ${CONFIG_INCLUDE_PATH}/config.h)
configure_file (Common/config_version.cpp.in ${CONFIG_INCLUDE_PATH}/config_version.cpp)
if (USE_DEBUG_HELPERS)
get_target_property(MAGIC_ENUM_INCLUDE_DIR ch_contrib::magic_enum INTERFACE_INCLUDE_DIRECTORIES)
# CMake generator expression will do insane quoting when it encounters special character like quotes, spaces, etc.
# Prefixing "SHELL:" will force it to use the original text.
set (INCLUDE_DEBUG_HELPERS "SHELL:-I\"${ClickHouse_SOURCE_DIR}/base\" -I\"${MAGIC_ENUM_INCLUDE_DIR}\" -include \"${ClickHouse_SOURCE_DIR}/src/Core/iostream_debug_helpers.h\"")
# Use generator expression as we don't want to pollute CMAKE_CXX_FLAGS, which will interfere with CMake check system.
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${INCLUDE_DEBUG_HELPERS}>)
endif ()
# ClickHouse developers may use platform-dependent code under some macro (e.g. `#ifdef ENABLE_MULTITARGET`).
# If turned ON, this option defines such macro.
# See `src/Common/TargetSpecific.h`

View File

@ -598,7 +598,7 @@ DataTypePtr QueryFuzzer::fuzzDataType(DataTypePtr type)
{
auto key_type = fuzzDataType(type_map->getKeyType());
auto value_type = fuzzDataType(type_map->getValueType());
if (!DataTypeMap::checkKeyType(key_type))
if (!DataTypeMap::isValidKeyType(key_type))
key_type = type_map->getKeyType();
return std::make_shared<DataTypeMap>(key_type, value_type);

View File

@ -828,7 +828,7 @@ ColumnPtr ColumnArray::filterTuple(const Filter & filt, ssize_t result_size_hint
size_t tuple_size = tuple.tupleSize();
if (tuple_size == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty tuple");
return filterGeneric(filt, result_size_hint);
Columns temporary_arrays(tuple_size);
for (size_t i = 0; i < tuple_size; ++i)
@ -1265,7 +1265,7 @@ ColumnPtr ColumnArray::replicateTuple(const Offsets & replicate_offsets) const
size_t tuple_size = tuple.tupleSize();
if (tuple_size == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty tuple");
return replicateGeneric(replicate_offsets);
Columns temporary_arrays(tuple_size);
for (size_t i = 0; i < tuple_size; ++i)

View File

@ -1,5 +1,6 @@
#include <Common/Arena.h>
#include <Common/Exception.h>
#include <Common/HashTable/HashSet.h>
#include <Common/HashTable/Hash.h>
#include <Common/RadixSort.h>
#include <Common/SipHash.h>
@ -264,6 +265,23 @@ void ColumnDecimal<T>::updatePermutation(IColumn::PermutationSortDirection direc
}
}
/// Counts the distinct values found at rows permutation[from] .. permutation[to - 1]
/// by collecting them in a hash set. The count is exact for the rows visited.
template <is_decimal T>
size_t ColumnDecimal<T>::estimateCardinalityInPermutedRange(const IColumn::Permutation & permutation, const EqualRange & equal_range) const
{
    /// A range with zero or one row has exactly that many distinct values.
    if (const size_t rows_in_range = equal_range.size(); rows_in_range <= 1)
        return rows_in_range;

    /// TODO use sampling if the range is too large (e.g. 16k elements, but configurable)
    HashSet<T> distinct_values;
    for (size_t pos = equal_range.from; pos < equal_range.to; ++pos)
        distinct_values.insert(data[permutation[pos]]);

    return distinct_values.size();
}
template <is_decimal T>
ColumnPtr ColumnDecimal<T>::permute(const IColumn::Permutation & perm, size_t limit) const
{

View File

@ -97,6 +97,8 @@ public:
size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override;
void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int, IColumn::Permutation & res, EqualRanges& equal_ranges) const override;
size_t estimateCardinalityInPermutedRange(const IColumn::Permutation & permutation, const EqualRange & equal_range) const override;
MutableColumnPtr cloneResized(size_t size) const override;

View File

@ -5,6 +5,7 @@
#include <IO/WriteHelpers.h>
#include <Common/Arena.h>
#include <Common/HashTable/Hash.h>
#include <Common/HashTable/StringHashSet.h>
#include <Common/SipHash.h>
#include <Common/WeakHash.h>
#include <Common/assert_cast.h>
@ -200,6 +201,24 @@ void ColumnFixedString::updatePermutation(IColumn::PermutationSortDirection dire
updatePermutationImpl(limit, res, equal_ranges, ComparatorDescendingStable(*this), comparator_equal, DefaultSort(), DefaultPartialSort());
}
size_t ColumnFixedString::estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const
{
const size_t range_size = equal_range.size();
if (range_size <= 1)
return range_size;
/// TODO use sampling if the range is too large (e.g. 16k elements, but configurable)
StringHashSet elements;
bool inserted = false;
for (size_t i = equal_range.from; i < equal_range.to; ++i)
{
size_t permuted_i = permutation[i];
StringRef value = getDataAt(permuted_i);
elements.emplace(value, inserted);
}
return elements.size();
}
void ColumnFixedString::insertRangeFrom(const IColumn & src, size_t start, size_t length)
{
const ColumnFixedString & src_concrete = assert_cast<const ColumnFixedString &>(src);

View File

@ -142,6 +142,8 @@ public:
void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_ranges) const override;
size_t estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
ColumnPtr filter(const IColumn::Filter & filt, ssize_t result_size_hint) const override;

View File

@ -3,9 +3,12 @@
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/NumberTraits.h>
#include <Common/HashTable/HashSet.h>
#include <Common/HashTable/HashMap.h>
#include <Common/WeakHash.h>
#include <Common/assert_cast.h>
#include "Storages/IndicesDescription.h"
#include "base/types.h"
#include <base/sort.h>
#include <base/scope_guard.h>
@ -486,6 +489,21 @@ void ColumnLowCardinality::updatePermutationWithCollation(const Collator & colla
updatePermutationImpl(limit, res, equal_ranges, comparator, equal_comparator, DefaultSort(), DefaultPartialSort());
}
size_t ColumnLowCardinality::estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const
{
const size_t range_size = equal_range.size();
if (range_size <= 1)
return range_size;
HashSet<UInt64> elements;
for (size_t i = equal_range.from; i < equal_range.to; ++i)
{
UInt64 index = getIndexes().getUInt(permutation[i]);
elements.insert(index);
}
return elements.size();
}
std::vector<MutableColumnPtr> ColumnLowCardinality::scatter(ColumnIndex num_columns, const Selector & selector) const
{
auto columns = getIndexes().scatter(num_columns, selector);

View File

@ -145,6 +145,8 @@ public:
void updatePermutationWithCollation(const Collator & collator, IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override;
size_t estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const override;
ColumnPtr replicate(const Offsets & offsets) const override
{
return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().replicate(offsets));

View File

@ -1,4 +1,5 @@
#include <Common/Arena.h>
#include <Common/HashTable/StringHashSet.h>
#include <Common/SipHash.h>
#include <Common/assert_cast.h>
#include <Common/WeakHash.h>
@ -621,7 +622,7 @@ void ColumnNullable::updatePermutationImpl(IColumn::PermutationSortDirection dir
if (unlikely(stability == PermutationSortStability::Stable))
{
for (auto & null_range : null_ranges)
::sort(res.begin() + null_range.first, res.begin() + null_range.second);
::sort(std::ranges::next(res.begin(), null_range.from), std::ranges::next(res.begin(), null_range.to));
}
if (is_nulls_last || null_ranges.empty())
@ -660,6 +661,33 @@ void ColumnNullable::updatePermutationWithCollation(const Collator & collator, I
updatePermutationImpl(direction, stability, limit, null_direction_hint, res, equal_ranges, &collator);
}
size_t ColumnNullable::estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const
{
const size_t range_size = equal_range.size();
if (range_size <= 1)
return range_size;
/// TODO use sampling if the range is too large (e.g. 16k elements, but configurable)
StringHashSet elements;
bool has_null = false;
bool inserted = false;
for (size_t i = equal_range.from; i < equal_range.to; ++i)
{
size_t permuted_i = permutation[i];
if (isNullAt(permuted_i))
{
has_null = true;
}
else
{
StringRef value = getDataAt(permuted_i);
elements.emplace(value, inserted);
}
}
return elements.size() + (has_null ? 1 : 0);
}
void ColumnNullable::reserve(size_t n)
{
getNestedColumn().reserve(n);

View File

@ -109,6 +109,7 @@ public:
size_t limit, int null_direction_hint, Permutation & res) const override;
void updatePermutationWithCollation(const Collator & collator, IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override;
size_t estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const override;
void reserve(size_t n) override;
void shrinkToFit() override;
void ensureOwnership() override;

View File

@ -820,6 +820,9 @@ ColumnPtr recursiveRemoveSparse(const ColumnPtr & column)
if (const auto * column_tuple = typeid_cast<const ColumnTuple *>(column.get()))
{
auto columns = column_tuple->getColumns();
if (columns.empty())
return column;
for (auto & element : columns)
element = recursiveRemoveSparse(element);

View File

@ -5,6 +5,7 @@
#include <Columns/ColumnCompressed.h>
#include <Columns/MaskOperations.h>
#include <Common/Arena.h>
#include <Common/HashTable/StringHashSet.h>
#include <Common/HashTable/Hash.h>
#include <Common/WeakHash.h>
#include <Common/assert_cast.h>
@ -481,6 +482,23 @@ void ColumnString::updatePermutationWithCollation(const Collator & collator, Per
DefaultPartialSort());
}
size_t ColumnString::estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const
{
const size_t range_size = equal_range.size();
if (range_size <= 1)
return range_size;
/// TODO use sampling if the range is too large (e.g. 16k elements, but configurable)
StringHashSet elements;
bool inserted = false;
for (size_t i = equal_range.from; i < equal_range.to; ++i)
{
size_t permuted_i = permutation[i];
StringRef value = getDataAt(permuted_i);
elements.emplace(value, inserted);
}
return elements.size();
}
ColumnPtr ColumnString::replicate(const Offsets & replicate_offsets) const
{

View File

@ -260,6 +260,8 @@ public:
void updatePermutationWithCollation(const Collator & collator, IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int, Permutation & res, EqualRanges & equal_ranges) const override;
size_t estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const override;
ColumnPtr replicate(const Offsets & replicate_offsets) const override;
ColumnPtr compress() const override;

View File

@ -3,14 +3,16 @@
#include <Columns/ColumnCompressed.h>
#include <Columns/IColumnImpl.h>
#include <Core/Field.h>
#include <Processors/Transforms/ColumnGathererTransform.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
#include <Common/Arena.h>
#include <Common/WeakHash.h>
#include <Common/assert_cast.h>
#include <Common/iota.h>
#include <Common/typeid_cast.h>
#include <Columns/ColumnsCommon.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
#include <Processors/Transforms/ColumnGathererTransform.h>
#include <base/sort.h>
@ -23,6 +25,7 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED;
extern const int CANNOT_INSERT_VALUE_OF_DIFFERENT_SIZE_INTO_TUPLE;
extern const int LOGICAL_ERROR;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}
@ -44,6 +47,9 @@ std::string ColumnTuple::getName() const
ColumnTuple::ColumnTuple(MutableColumns && mutable_columns)
{
if (mutable_columns.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "This function cannot be used to construct empty tuple. It is a bug");
columns.reserve(mutable_columns.size());
for (auto & column : mutable_columns)
{
@ -52,15 +58,21 @@ ColumnTuple::ColumnTuple(MutableColumns && mutable_columns)
columns.push_back(std::move(column));
}
column_length = columns[0]->size();
}
/// Constructs a tuple column without any element columns; `len` is stored in `column_length`.
ColumnTuple::ColumnTuple(size_t len) : column_length(len) {}
ColumnTuple::Ptr ColumnTuple::create(const Columns & columns)
{
if (columns.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "This function cannot be used to construct empty tuple. It is a bug");
for (const auto & column : columns)
if (isColumnConst(*column))
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnTuple cannot have ColumnConst as its element");
auto column_tuple = ColumnTuple::create(MutableColumns());
auto column_tuple = ColumnTuple::create(columns[0]->size());
column_tuple->columns.assign(columns.begin(), columns.end());
return column_tuple;
@ -68,11 +80,14 @@ ColumnTuple::Ptr ColumnTuple::create(const Columns & columns)
ColumnTuple::Ptr ColumnTuple::create(const TupleColumns & columns)
{
if (columns.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "This function cannot be used to construct empty tuple. It is a bug");
for (const auto & column : columns)
if (isColumnConst(*column))
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnTuple cannot have ColumnConst as its element");
auto column_tuple = ColumnTuple::create(MutableColumns());
auto column_tuple = ColumnTuple::create(columns[0]->size());
column_tuple->columns = columns;
return column_tuple;
@ -80,6 +95,9 @@ ColumnTuple::Ptr ColumnTuple::create(const TupleColumns & columns)
MutableColumnPtr ColumnTuple::cloneEmpty() const
{
if (columns.empty())
return ColumnTuple::create(0);
const size_t tuple_size = columns.size();
MutableColumns new_columns(tuple_size);
for (size_t i = 0; i < tuple_size; ++i)
@ -90,6 +108,9 @@ MutableColumnPtr ColumnTuple::cloneEmpty() const
MutableColumnPtr ColumnTuple::cloneResized(size_t new_size) const
{
if (columns.empty())
return ColumnTuple::create(new_size);
const size_t tuple_size = columns.size();
MutableColumns new_columns(tuple_size);
for (size_t i = 0; i < tuple_size; ++i)
@ -98,6 +119,16 @@ MutableColumnPtr ColumnTuple::cloneResized(size_t new_size) const
return ColumnTuple::create(std::move(new_columns));
}
/// Number of rows in the tuple column.
size_t ColumnTuple::size() const
{
    /// Only a tuple without element columns relies on `column_length`.
    /// Otherwise the first element column is asked directly: keeping `column_length`
    /// consistent is difficult because many places manipulate the sub-columns directly.
    return columns.empty() ? column_length : columns.at(0)->size();
}
Field ColumnTuple::operator[](size_t n) const
{
Field res;
@ -144,6 +175,7 @@ void ColumnTuple::insert(const Field & x)
if (tuple.size() != tuple_size)
throw Exception(ErrorCodes::CANNOT_INSERT_VALUE_OF_DIFFERENT_SIZE_INTO_TUPLE, "Cannot insert value of different size into tuple");
++column_length;
for (size_t i = 0; i < tuple_size; ++i)
columns[i]->insert(tuple[i]);
}
@ -181,6 +213,7 @@ void ColumnTuple::insertFrom(const IColumn & src_, size_t n)
if (src.columns.size() != tuple_size)
throw Exception(ErrorCodes::CANNOT_INSERT_VALUE_OF_DIFFERENT_SIZE_INTO_TUPLE, "Cannot insert value of different size into tuple");
++column_length;
for (size_t i = 0; i < tuple_size; ++i)
columns[i]->insertFrom(*src.columns[i], n);
}
@ -199,18 +232,28 @@ void ColumnTuple::insertManyFrom(const IColumn & src, size_t position, size_t le
void ColumnTuple::insertDefault()
{
++column_length;
for (auto & column : columns)
column->insertDefault();
}
void ColumnTuple::popBack(size_t n)
{
column_length -= n;
for (auto & column : columns)
column->popBack(n);
}
StringRef ColumnTuple::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
{
if (columns.empty())
{
/// Has to put one useless byte into Arena, because serialization into zero number of bytes is ambiguous.
char * res = arena.allocContinue(1, begin);
*res = 0;
return { res, 1 };
}
StringRef res(begin, 0);
for (const auto & column : columns)
{
@ -232,6 +275,11 @@ char * ColumnTuple::serializeValueIntoMemory(size_t n, char * memory) const
const char * ColumnTuple::deserializeAndInsertFromArena(const char * pos)
{
++column_length;
if (columns.empty())
return pos + 1;
for (auto & column : columns)
pos = column->deserializeAndInsertFromArena(pos);
@ -272,6 +320,7 @@ void ColumnTuple::updateHashFast(SipHash & hash) const
void ColumnTuple::insertRangeFrom(const IColumn & src, size_t start, size_t length)
{
column_length += length;
const size_t tuple_size = columns.size();
for (size_t i = 0; i < tuple_size; ++i)
columns[i]->insertRangeFrom(
@ -281,6 +330,12 @@ void ColumnTuple::insertRangeFrom(const IColumn & src, size_t start, size_t leng
ColumnPtr ColumnTuple::filter(const Filter & filt, ssize_t result_size_hint) const
{
if (columns.empty())
{
size_t bytes = countBytesInFilter(filt);
return cloneResized(bytes);
}
const size_t tuple_size = columns.size();
Columns new_columns(tuple_size);
@ -292,12 +347,29 @@ ColumnPtr ColumnTuple::filter(const Filter & filt, ssize_t result_size_hint) con
void ColumnTuple::expand(const Filter & mask, bool inverted)
{
if (columns.empty())
{
size_t bytes = countBytesInFilter(mask);
if (inverted)
bytes = mask.size() - bytes;
column_length = bytes;
return;
}
for (auto & column : columns)
column->expand(mask, inverted);
}
ColumnPtr ColumnTuple::permute(const Permutation & perm, size_t limit) const
{
if (columns.empty())
{
if (column_length != perm.size())
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of permutation doesn't match size of column");
return cloneResized(limit ? std::min(column_length, limit) : column_length);
}
const size_t tuple_size = columns.size();
Columns new_columns(tuple_size);
@ -309,6 +381,14 @@ ColumnPtr ColumnTuple::permute(const Permutation & perm, size_t limit) const
ColumnPtr ColumnTuple::index(const IColumn & indexes, size_t limit) const
{
if (columns.empty())
{
if (indexes.size() < limit)
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of indexes is less than required");
return cloneResized(limit ? limit : column_length);
}
const size_t tuple_size = columns.size();
Columns new_columns(tuple_size);
@ -320,6 +400,14 @@ ColumnPtr ColumnTuple::index(const IColumn & indexes, size_t limit) const
ColumnPtr ColumnTuple::replicate(const Offsets & offsets) const
{
if (columns.empty())
{
if (column_length != offsets.size())
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of offsets doesn't match size of column");
return cloneResized(offsets.back());
}
const size_t tuple_size = columns.size();
Columns new_columns(tuple_size);
@ -331,6 +419,22 @@ ColumnPtr ColumnTuple::replicate(const Offsets & offsets) const
MutableColumns ColumnTuple::scatter(ColumnIndex num_columns, const Selector & selector) const
{
if (columns.empty())
{
if (column_length != selector.size())
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of selector doesn't match size of column");
std::vector<size_t> counts(num_columns);
for (auto idx : selector)
++counts[idx];
MutableColumns res(num_columns);
for (size_t i = 0; i < num_columns; ++i)
res[i] = cloneResized(counts[i]);
return res;
}
const size_t tuple_size = columns.size();
std::vector<MutableColumns> scattered_tuple_elements(tuple_size);
@ -413,6 +517,9 @@ void ColumnTuple::getPermutationImpl(IColumn::PermutationSortDirection direction
res.resize(rows);
iota(res.data(), rows, IColumn::Permutation::value_type(0));
if (columns.empty())
return;
if (limit >= rows)
limit = 0;
@ -429,7 +536,7 @@ void ColumnTuple::updatePermutationImpl(IColumn::PermutationSortDirection direct
for (const auto & column : columns)
{
while (!equal_ranges.empty() && limit && limit <= equal_ranges.back().first)
while (!equal_ranges.empty() && limit && limit <= equal_ranges.back().from)
equal_ranges.pop_back();
if (collator && column->isCollationSupported())
@ -603,6 +710,9 @@ void ColumnTuple::takeDynamicStructureFromSourceColumns(const Columns & source_c
ColumnPtr ColumnTuple::compress() const
{
if (columns.empty())
return Ptr();
size_t byte_size = 0;
Columns compressed;
compressed.reserve(columns.size());

View File

@ -26,6 +26,13 @@ private:
explicit ColumnTuple(MutableColumns && columns);
ColumnTuple(const ColumnTuple &) = default;
/// An empty tuple needs a dedicated field to store its size.
/// This field is used *only* for zero-sized tuples.
/// Otherwise `columns[0].size()` should be used to get the size of the tuple column.
size_t column_length;
/// Dedicated constructor for empty tuples.
explicit ColumnTuple(size_t len);
public:
/** Create immutable column using immutable arguments. This arguments may be shared with other columns.
* Use IColumn::mutate in order to make mutable column and mutate shared nested columns.
@ -39,6 +46,8 @@ public:
requires std::is_rvalue_reference_v<Arg &&>
static MutablePtr create(Arg && arg) { return Base::create(std::forward<Arg>(arg)); }
static MutablePtr create(size_t len_) { return Base::create(len_); }
std::string getName() const override;
const char * getFamilyName() const override { return "Tuple"; }
TypeIndex getDataType() const override { return TypeIndex::Tuple; }
@ -46,10 +55,7 @@ public:
MutableColumnPtr cloneEmpty() const override;
MutableColumnPtr cloneResized(size_t size) const override;
size_t size() const override
{
return columns.at(0)->size();
}
size_t size() const override;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
@ -117,6 +123,9 @@ public:
bool hasDynamicStructure() const override;
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
/// Empty tuple needs a public method to manage its size.
void addSize(size_t delta) { column_length += delta; }
private:
int compareAtImpl(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator * collator=nullptr) const;

View File

@ -14,6 +14,7 @@
#include <Common/Arena.h>
#include <Common/Exception.h>
#include <Common/HashTable/Hash.h>
#include <Common/HashTable/StringHashSet.h>
#include <Common/NaNUtils.h>
#include <Common/RadixSort.h>
#include <Common/SipHash.h>
@ -413,6 +414,25 @@ void ColumnVector<T>::updatePermutation(IColumn::PermutationSortDirection direct
}
}
/// Counts the distinct values found at rows permutation[from] .. permutation[to - 1].
/// Values are compared through the StringRef that getDataAt() returns for each row.
template<typename T>
size_t ColumnVector<T>::estimateCardinalityInPermutedRange(const IColumn::Permutation & permutation, const EqualRange & equal_range) const
{
    const size_t rows_in_range = equal_range.size();
    if (rows_in_range < 2)
        return rows_in_range; /// 0 or 1 rows: nothing to deduplicate.

    /// TODO use sampling if the range is too large (e.g. 16k elements, but configurable)
    StringHashSet distinct_values;
    bool is_new_key = false; /// Out-parameter demanded by emplace(); its value is not inspected.
    size_t pos = equal_range.from;
    while (pos < equal_range.to)
    {
        const size_t row = permutation[pos];
        StringRef row_value = getDataAt(row);
        distinct_values.emplace(row_value, is_new_key);
        ++pos;
    }
    return distinct_values.size();
}
template <typename T>
MutableColumnPtr ColumnVector<T>::cloneResized(size_t size) const
{

View File

@ -161,6 +161,8 @@ public:
void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges& equal_ranges) const override;
size_t estimateCardinalityInPermutedRange(const IColumn::Permutation & permutation, const EqualRange & equal_range) const override;
void reserve(size_t n) override
{
data.reserve_exact(n);

View File

@ -83,6 +83,11 @@ ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const ColumnConst
return res;
}
/// Fallback implementation: ignores the permutation and reports the number of rows in the range.
size_t IColumn::estimateCardinalityInPermutedRange(const IColumn::Permutation & /*permutation*/, const EqualRange & equal_range) const
{
    const size_t rows_in_range = equal_range.size();
    return rows_in_range;
}
void IColumn::forEachSubcolumn(ColumnCallback callback) const
{
const_cast<IColumn*>(this)->forEachSubcolumn([&callback](WrappedPtr & subcolumn)

View File

@ -36,11 +36,19 @@ class Field;
class WeakHash32;
class ColumnConst;
/*
* Represents a set of equal ranges in previous column to perform sorting in current column.
* Used in sorting by tuples.
* */
using EqualRanges = std::vector<std::pair<size_t, size_t> >;
/// A range of column values between row indexes `from` and `to`. The name "equal range" is due to table sorting as its main use case: With
/// a PRIMARY KEY (c_pk1, c_pk2, ...), the first PK column is fully sorted. The second PK column is sorted within equal-value runs of the
/// first PK column, and so on. The number of runs (ranges) per column increases from one primary key column to the next. An "equal range"
/// is a run in a previous column, within which the values of the current column can be sorted.
struct EqualRange
{
size_t from; /// inclusive
size_t to; /// exclusive
/// Stores the bounds; `from <= to` is checked with chassert.
EqualRange(size_t from_, size_t to_) : from(from_), to(to_) { chassert(from <= to); }
/// Number of rows covered by the half-open interval [from, to).
size_t size() const { return to - from; }
};
using EqualRanges = std::vector<EqualRange>;
/// Declares interface to store columns in memory.
class IColumn : public COW<IColumn>
@ -399,6 +407,9 @@ public:
"or for Array or Tuple, containing them.");
}
/// Estimate the cardinality (number of unique values) of the values in 'equal_range' after permutation, formally: |{ column[permutation[r]] : r in equal_range }|.
virtual size_t estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const;
/** Copies each element according offsets parameter.
* (i-th element should be copied offsets[i] - offsets[i - 1] times.)
* It is necessary in ARRAY JOIN operation.

View File

@ -60,12 +60,9 @@ ColumnPtr IColumnDummy::filter(const Filter & filt, ssize_t /*result_size_hint*/
return cloneDummy(bytes);
}
void IColumnDummy::expand(const IColumn::Filter & mask, bool inverted)
void IColumnDummy::expand(const IColumn::Filter & mask, bool)
{
size_t bytes = countBytesInFilter(mask);
if (inverted)
bytes = mask.size() - bytes;
s = bytes;
s = mask.size();
}
ColumnPtr IColumnDummy::permute(const Permutation & perm, size_t limit) const

View File

@ -139,7 +139,7 @@ void IColumn::updatePermutationImpl(
if (equal_ranges.empty())
return;
if (limit >= size() || limit > equal_ranges.back().second)
if (limit >= size() || limit > equal_ranges.back().to)
limit = 0;
EqualRanges new_ranges;

View File

@ -77,7 +77,7 @@ INSTANTIATE(IPv6)
#undef INSTANTIATE
template <bool inverted, bool column_is_short, typename Container>
template <bool inverted, typename Container>
static size_t extractMaskNumericImpl(
PaddedPODArray<UInt8> & mask,
const Container & data,
@ -85,42 +85,27 @@ static size_t extractMaskNumericImpl(
const PaddedPODArray<UInt8> * null_bytemap,
PaddedPODArray<UInt8> * nulls)
{
if constexpr (!column_is_short)
{
if (data.size() != mask.size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "The size of a full data column is not equal to the size of a mask");
}
if (data.size() != mask.size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "The size of a full data column is not equal to the size of a mask");
size_t ones_count = 0;
size_t data_index = 0;
size_t mask_size = mask.size();
size_t data_size = data.size();
for (size_t i = 0; i != mask_size && data_index != data_size; ++i)
for (size_t i = 0; i != mask_size; ++i)
{
// Change mask only where value is 1.
if (!mask[i])
continue;
UInt8 value;
size_t index;
if constexpr (column_is_short)
{
index = data_index;
++data_index;
}
else
index = i;
if (null_bytemap && (*null_bytemap)[index])
if (null_bytemap && (*null_bytemap)[i])
{
value = null_value;
if (nulls)
(*nulls)[i] = 1;
}
else
value = static_cast<bool>(data[index]);
value = static_cast<bool>(data[i]);
if constexpr (inverted)
value = !value;
@ -131,12 +116,6 @@ static size_t extractMaskNumericImpl(
mask[i] = value;
}
if constexpr (column_is_short)
{
if (data_index != data_size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "The size of a short column is not equal to the number of ones in a mask");
}
return ones_count;
}
@ -155,10 +134,7 @@ static bool extractMaskNumeric(
const auto & data = numeric_column->getData();
size_t ones_count;
if (column->size() < mask.size())
ones_count = extractMaskNumericImpl<inverted, true>(mask, data, null_value, null_bytemap, nulls);
else
ones_count = extractMaskNumericImpl<inverted, false>(mask, data, null_value, null_bytemap, nulls);
ones_count = extractMaskNumericImpl<inverted>(mask, data, null_value, null_bytemap, nulls);
mask_info.has_ones = ones_count > 0;
mask_info.has_zeros = ones_count != mask.size();
@ -279,25 +255,32 @@ void maskedExecute(ColumnWithTypeAndName & column, const PaddedPODArray<UInt8> &
if (!column_function)
return;
size_t original_size = column.column->size();
ColumnWithTypeAndName result;
/// If mask contains only zeros, we can just create
/// an empty column with the execution result type.
if (!mask_info.has_ones)
{
/// If mask contains only zeros, we can just create a column with default values as it will be ignored
auto result_type = column_function->getResultType();
auto empty_column = result_type->createColumn();
result = {std::move(empty_column), result_type, ""};
auto default_column = result_type->createColumnConstWithDefaultValue(original_size)->convertToFullColumnIfConst();
column = {default_column, result_type, ""};
}
/// Filter column only if mask contains zeros.
else if (mask_info.has_zeros)
{
/// If it contains both zeros and ones, we need to execute the function only on the mask values
/// First we filter the column, which creates a new column, then we apply the column, and finally we expand it
/// Expanding is done to keep consistency in function calls (all columns the same size) and it's ok
/// since the values won't be used by `if`
auto filtered = column_function->filter(mask, -1);
result = typeid_cast<const ColumnFunction *>(filtered.get())->reduce();
auto filter_after_execution = typeid_cast<const ColumnFunction *>(filtered.get())->reduce();
auto mut_column = IColumn::mutate(std::move(filter_after_execution.column));
mut_column->expand(mask, false);
column.column = std::move(mut_column);
}
else
result = column_function->reduce();
column = column_function->reduce();
column = std::move(result);
chassert(column.column->size() == original_size);
}
void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty)

View File

@ -13,8 +13,9 @@ struct CopyableAtomic
: value(other.value.load())
{}
explicit CopyableAtomic(T && value_)
: value(std::forward<T>(value_))
template <std::convertible_to<T> U>
explicit CopyableAtomic(U && value_)
: value(std::forward<U>(value_))
{}
CopyableAtomic & operator=(const CopyableAtomic & other)
@ -23,9 +24,10 @@ struct CopyableAtomic
return *this;
}
CopyableAtomic & operator=(bool value_)
template <std::convertible_to<T> U>
CopyableAtomic & operator=(U && value_)
{
value = value_;
value = std::forward<U>(value_);
return *this;
}

View File

@ -586,7 +586,7 @@
M(705, TABLE_NOT_EMPTY) \
M(706, LIBSSH_ERROR) \
M(707, GCP_ERROR) \
M(708, ILLEGAL_STATISTIC) \
M(708, ILLEGAL_STATISTICS) \
M(709, CANNOT_GET_REPLICATED_DATABASE_SNAPSHOT) \
M(710, FAULT_INJECTED) \
M(711, FILECACHE_ACCESS_DENIED) \

View File

@ -43,12 +43,13 @@ namespace
endpoint,
proxy_scheme,
proxy_port,
cache_ttl
std::chrono::seconds {cache_ttl}
};
return std::make_shared<RemoteProxyConfigurationResolver>(
server_configuration,
request_protocol,
std::make_shared<RemoteProxyHostFetcherImpl>(),
isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration));
}

View File

@ -6,22 +6,47 @@
#include <Poco/Net/HTTPRequest.h>
#include <Poco/Net/HTTPResponse.h>
#include <Common/logger_useful.h>
#include <Common/DNSResolver.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int RECEIVED_ERROR_FROM_REMOTE_IO_SERVER;
}
/// Sends an HTTP GET for the path of `endpoint` and returns the whole response body as a string.
/// The caller treats the returned string as the proxy host.
/// Throws HTTPException with RECEIVED_ERROR_FROM_REMOTE_IO_SERVER when the response status is not 200 OK.
std::string RemoteProxyHostFetcherImpl::fetch(const Poco::URI & endpoint, const ConnectionTimeouts & timeouts)
{
    /// Only the path component of the endpoint is put on the request line.
    Poco::Net::HTTPRequest get_request(Poco::Net::HTTPRequest::HTTP_GET, endpoint.getPath(), Poco::Net::HTTPRequest::HTTP_1_1);

    auto http_session = makeHTTPSession(HTTPConnectionGroupType::HTTP, endpoint, timeouts);
    http_session->sendRequest(get_request);

    Poco::Net::HTTPResponse http_response;
    auto & body_stream = http_session->receiveResponse(http_response);

    const bool is_ok = http_response.getStatus() == Poco::Net::HTTPResponse::HTTP_OK;
    if (!is_ok)
    {
        throw HTTPException(
            ErrorCodes::RECEIVED_ERROR_FROM_REMOTE_IO_SERVER,
            endpoint.toString(),
            http_response.getStatus(),
            http_response.getReason(),
            "");
    }

    /// The body is consumed in full; no trimming or validation is applied here.
    std::string fetched_host;
    Poco::StreamCopier::copyToString(body_stream, fetched_host);
    return fetched_host;
}
RemoteProxyConfigurationResolver::RemoteProxyConfigurationResolver(
const RemoteServerConfiguration & remote_server_configuration_,
Protocol request_protocol_,
std::shared_ptr<RemoteProxyHostFetcher> fetcher_,
bool disable_tunneling_for_https_requests_over_http_proxy_
)
: ProxyConfigurationResolver(request_protocol_, disable_tunneling_for_https_requests_over_http_proxy_), remote_server_configuration(remote_server_configuration_)
: ProxyConfigurationResolver(request_protocol_, disable_tunneling_for_https_requests_over_http_proxy_),
remote_server_configuration(remote_server_configuration_), fetcher(fetcher_)
{
}
@ -29,9 +54,7 @@ ProxyConfiguration RemoteProxyConfigurationResolver::resolve()
{
auto logger = getLogger("RemoteProxyConfigurationResolver");
auto & [endpoint, proxy_protocol, proxy_port, cache_ttl_] = remote_server_configuration;
LOG_DEBUG(logger, "Obtain proxy using resolver: {}", endpoint.toString());
auto & [endpoint, proxy_protocol_string, proxy_port, cache_ttl] = remote_server_configuration;
std::lock_guard lock(cache_mutex);
@ -55,66 +78,26 @@ ProxyConfiguration RemoteProxyConfigurationResolver::resolve()
.withSendTimeout(1)
.withReceiveTimeout(1);
try
{
/// It should be just empty GET request.
Poco::Net::HTTPRequest request(Poco::Net::HTTPRequest::HTTP_GET, endpoint.getPath(), Poco::Net::HTTPRequest::HTTP_1_1);
const auto proxy_host = fetcher->fetch(endpoint, timeouts);
const auto & host = endpoint.getHost();
auto resolved_hosts = DNSResolver::instance().resolveHostAll(host);
LOG_DEBUG(logger, "Use proxy: {}://{}:{}", proxy_protocol_string, proxy_host, proxy_port);
HTTPSessionPtr session;
auto proxy_protocol = ProxyConfiguration::protocolFromString(proxy_protocol_string);
for (size_t i = 0; i < resolved_hosts.size(); ++i)
{
auto resolved_endpoint = endpoint;
resolved_endpoint.setHost(resolved_hosts[i].toString());
session = makeHTTPSession(HTTPConnectionGroupType::HTTP, resolved_endpoint, timeouts);
bool use_tunneling_for_https_requests_over_http_proxy = useTunneling(
request_protocol,
proxy_protocol,
disable_tunneling_for_https_requests_over_http_proxy);
try
{
session->sendRequest(request);
break;
}
catch (...)
{
if (i + 1 == resolved_hosts.size())
throw;
}
}
cached_config.protocol = proxy_protocol;
cached_config.host = proxy_host;
cached_config.port = proxy_port;
cached_config.tunneling = use_tunneling_for_https_requests_over_http_proxy;
cached_config.original_request_protocol = request_protocol;
cache_timestamp = std::chrono::system_clock::now();
cache_valid = true;
Poco::Net::HTTPResponse response;
auto & response_body_stream = session->receiveResponse(response);
if (response.getStatus() != Poco::Net::HTTPResponse::HTTP_OK)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Proxy resolver returned not OK status: {}", response.getReason());
String proxy_host;
/// Read proxy host as string from response body.
Poco::StreamCopier::copyToString(response_body_stream, proxy_host);
LOG_DEBUG(logger, "Use proxy: {}://{}:{}", proxy_protocol, proxy_host, proxy_port);
bool use_tunneling_for_https_requests_over_http_proxy = useTunneling(
request_protocol,
cached_config.protocol,
disable_tunneling_for_https_requests_over_http_proxy);
cached_config.protocol = ProxyConfiguration::protocolFromString(proxy_protocol);
cached_config.host = proxy_host;
cached_config.port = proxy_port;
cached_config.tunneling = use_tunneling_for_https_requests_over_http_proxy;
cached_config.original_request_protocol = request_protocol;
cache_timestamp = std::chrono::system_clock::now();
cache_valid = true;
return cached_config;
}
catch (...)
{
tryLogCurrentException("RemoteProxyConfigurationResolver", "Failed to obtain proxy");
return {};
}
return cached_config;
}
void RemoteProxyConfigurationResolver::errorReport(const ProxyConfiguration & config)
@ -124,7 +107,7 @@ void RemoteProxyConfigurationResolver::errorReport(const ProxyConfiguration & co
std::lock_guard lock(cache_mutex);
if (!cache_ttl.count() || !cache_valid)
if (!remote_server_configuration.cache_ttl_.count() || !cache_valid)
return;
if (std::tie(cached_config.protocol, cached_config.host, cached_config.port)

View File

@ -10,6 +10,19 @@
namespace DB
{
struct ConnectionTimeouts;
/// Abstract interface for obtaining a proxy host string for a given endpoint.
/// Being an interface, it allows the fetching strategy to be substituted by another implementation.
struct RemoteProxyHostFetcher
{
/// Virtual destructor so that implementations can be destroyed through a pointer to this interface.
virtual ~RemoteProxyHostFetcher() = default;
/// Returns the string obtained for `endpoint`; `timeouts` are the connection timeouts to use.
virtual std::string fetch(const Poco::URI & endpoint, const ConnectionTimeouts & timeouts) = 0;
};
/// Concrete RemoteProxyHostFetcher; fetch() is only declared here and defined out of line.
struct RemoteProxyHostFetcherImpl : public RemoteProxyHostFetcher
{
/// See RemoteProxyHostFetcher::fetch.
std::string fetch(const Poco::URI & endpoint, const ConnectionTimeouts & timeouts) override;
};
/*
* Makes an HTTP GET request to the specified endpoint to obtain a proxy host.
* */
@ -22,13 +35,14 @@ public:
Poco::URI endpoint;
String proxy_protocol;
unsigned proxy_port;
unsigned cache_ttl_;
const std::chrono::seconds cache_ttl_;
};
RemoteProxyConfigurationResolver(
const RemoteServerConfiguration & remote_server_configuration_,
Protocol request_protocol_,
bool disable_tunneling_for_https_requests_over_http_proxy_ = true);
std::shared_ptr<RemoteProxyHostFetcher> fetcher_,
bool disable_tunneling_for_https_requests_over_http_proxy_ = false);
ProxyConfiguration resolve() override;
@ -36,11 +50,11 @@ public:
private:
RemoteServerConfiguration remote_server_configuration;
std::shared_ptr<RemoteProxyHostFetcher> fetcher;
std::mutex cache_mutex;
bool cache_valid = false;
std::chrono::time_point<std::chrono::system_clock> cache_timestamp;
const std::chrono::seconds cache_ttl{0};
ProxyConfiguration cached_config;
};

View File

@ -280,6 +280,10 @@ public:
if (!initialized())
abort();
/// Thread cannot join itself.
if (state->thread_id == std::this_thread::get_id())
abort();
state->event.wait();
state.reset();
}
@ -293,12 +297,7 @@ public:
bool joinable() const
{
if (!state)
return false;
/// Thread cannot join itself.
if (state->thread_id == std::this_thread::get_id())
return false;
return true;
return initialized();
}
std::thread::id get_id() const

View File

@ -637,6 +637,9 @@ void TestKeeper::finalize(const String &)
expired = true;
}
/// Finish requests_queue to wake up the processing thread without waiting for the timeout
requests_queue.finish();
processing_thread.join();
try

View File

@ -1,5 +1,4 @@
#include "ZooKeeper.h"
#include "Coordination/KeeperConstants.h"
#include "Coordination/KeeperFeatureFlags.h"
#include "ZooKeeperImpl.h"
#include "KeeperException.h"
@ -376,11 +375,14 @@ void ZooKeeper::createAncestors(const std::string & path)
}
Coordination::Responses responses;
Coordination::Error code = multiImpl(create_ops, responses, /*check_session_valid*/ false);
const auto & [code, failure_reason] = multiImpl(create_ops, responses, /*check_session_valid*/ false);
if (code == Coordination::Error::ZOK)
return;
if (!failure_reason.empty())
throw KeeperException::fromMessage(code, failure_reason);
throw KeeperException::fromPath(code, path);
}
@ -676,17 +678,19 @@ Coordination::Error ZooKeeper::trySet(const std::string & path, const std::strin
}
Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid)
std::pair<Coordination::Error, std::string>
ZooKeeper::multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid)
{
if (requests.empty())
return Coordination::Error::ZOK;
return {Coordination::Error::ZOK, ""};
std::future<Coordination::MultiResponse> future_result;
Coordination::Requests requests_with_check_session;
if (check_session_valid)
{
Coordination::Requests new_requests = requests;
addCheckSessionOp(new_requests);
future_result = asyncTryMultiNoThrow(new_requests);
requests_with_check_session = requests;
addCheckSessionOp(requests_with_check_session);
future_result = asyncTryMultiNoThrow(requests_with_check_session);
}
else
{
@ -696,7 +700,7 @@ Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests
if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready)
{
impl->finalize(fmt::format("Operation timeout on {} {}", Coordination::OpNum::Multi, requests[0]->getPath()));
return Coordination::Error::ZOPERATIONTIMEOUT;
return {Coordination::Error::ZOPERATIONTIMEOUT, ""};
}
else
{
@ -704,11 +708,14 @@ Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests
Coordination::Error code = response.error;
responses = response.responses;
std::string reason;
if (check_session_valid)
{
if (code != Coordination::Error::ZOK && !Coordination::isHardwareError(code) && getFailedOpIndex(code, responses) == requests.size())
{
impl->finalize(fmt::format("Session was killed: {}", requests.back()->getPath()));
reason = fmt::format("Session was killed: {}", requests_with_check_session.back()->getPath());
impl->finalize(reason);
code = Coordination::Error::ZSESSIONMOVED;
}
responses.pop_back();
@ -717,23 +724,33 @@ Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests
chassert(code == Coordination::Error::ZOK || Coordination::isHardwareError(code) || responses.back()->error != Coordination::Error::ZOK);
}
return code;
return {code, std::move(reason)};
}
}
Coordination::Responses ZooKeeper::multi(const Coordination::Requests & requests, bool check_session_valid)
{
Coordination::Responses responses;
Coordination::Error code = multiImpl(requests, responses, check_session_valid);
const auto & [code, failure_reason] = multiImpl(requests, responses, check_session_valid);
if (!failure_reason.empty())
throw KeeperException::fromMessage(code, failure_reason);
KeeperMultiException::check(code, requests, responses);
return responses;
}
Coordination::Error ZooKeeper::tryMulti(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid)
{
Coordination::Error code = multiImpl(requests, responses, check_session_valid);
const auto & [code, failure_reason] = multiImpl(requests, responses, check_session_valid);
if (code != Coordination::Error::ZOK && !Coordination::isUserError(code))
{
if (!failure_reason.empty())
throw KeeperException::fromMessage(code, failure_reason);
throw KeeperException(code);
}
return code;
}
@ -1346,7 +1363,7 @@ Coordination::Error ZooKeeper::tryMultiNoThrow(const Coordination::Requests & re
{
try
{
return multiImpl(requests, responses, check_session_valid);
return multiImpl(requests, responses, check_session_valid).first;
}
catch (const Coordination::Exception & e)
{

View File

@ -2,10 +2,8 @@
#include "Types.h"
#include <Poco/Util/LayeredConfiguration.h>
#include <unordered_set>
#include <future>
#include <memory>
#include <mutex>
#include <string>
#include <Common/logger_useful.h>
#include <Common/ProfileEvents.h>
@ -18,7 +16,6 @@
#include <Common/thread_local_rng.h>
#include <Coordination/KeeperFeatureFlags.h>
#include <unistd.h>
#include <random>
namespace ProfileEvents
@ -644,7 +641,11 @@ private:
Coordination::Stat * stat,
Coordination::WatchCallbackPtr watch_callback,
Coordination::ListRequestType list_request_type);
Coordination::Error multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid);
/// returns error code with optional reason
std::pair<Coordination::Error, std::string>
multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid);
Coordination::Error existsImpl(const std::string & path, Coordination::Stat * stat_, Coordination::WatchCallback watch_callback);
Coordination::Error syncImpl(const std::string & path, std::string & returned_path);

View File

@ -0,0 +1,172 @@
#include <gtest/gtest.h>
#include <Common/RemoteProxyConfigurationResolver.h>
#include <Poco/URI.h>
#include <IO/ConnectionTimeouts.h>
#include <base/sleep.h>
namespace
{
struct RemoteProxyHostFetcherMock : public DB::RemoteProxyHostFetcher
{
explicit RemoteProxyHostFetcherMock(const std::string & return_mock_) : return_mock(return_mock_) {}
std::string fetch(const Poco::URI &, const DB::ConnectionTimeouts &) override
{
fetch_count++;
return return_mock;
}
std::string return_mock;
std::size_t fetch_count {0};
};
}
namespace DB
{
/// HTTP request via an HTTP proxy: the resolved configuration carries the fetched host,
/// the configured port/protocol, and tunneling stays off.
TEST(RemoteProxyConfigurationResolver, HTTPOverHTTP)
{
    const char * expected_host = "proxy1";

    const RemoteProxyConfigurationResolver::RemoteServerConfiguration server_config
    {
        Poco::URI("not_important"),
        "http",
        80,
        std::chrono::seconds {10}
    };

    auto fetcher = std::make_shared<RemoteProxyHostFetcherMock>(expected_host);
    RemoteProxyConfigurationResolver resolver(server_config, ProxyConfiguration::Protocol::HTTP, fetcher);

    const auto resolved = resolver.resolve();

    ASSERT_EQ(resolved.host, expected_host);
    ASSERT_EQ(resolved.port, 80);
    ASSERT_EQ(resolved.protocol, ProxyConfiguration::Protocol::HTTP);
    ASSERT_EQ(resolved.original_request_protocol, ProxyConfiguration::Protocol::HTTP);
    ASSERT_FALSE(resolved.tunneling);
}
/// HTTPS request via an HTTPS proxy: host/port/protocol come through unchanged and no tunneling is requested.
TEST(RemoteProxyConfigurationResolver, HTTPSOverHTTPS)
{
    using Protocol = ProxyConfiguration::Protocol;

    const char * expected_host = "proxy1";

    const RemoteProxyConfigurationResolver::RemoteServerConfiguration server_config
    {
        Poco::URI("not_important"),
        "https",
        443,
        std::chrono::seconds {10}
    };

    RemoteProxyConfigurationResolver resolver(
        server_config,
        Protocol::HTTPS,
        std::make_shared<RemoteProxyHostFetcherMock>(expected_host));

    const auto resolved = resolver.resolve();

    ASSERT_EQ(resolved.host, expected_host);
    ASSERT_EQ(resolved.port, 443);
    ASSERT_EQ(resolved.protocol, Protocol::HTTPS);
    ASSERT_EQ(resolved.original_request_protocol, Protocol::HTTPS);
    /// HTTPS over HTTPS: tunneling must not be used.
    ASSERT_FALSE(resolved.tunneling);
}
/// HTTPS request via an HTTP proxy with the resolver's default tunneling setting:
/// the resolved configuration must ask for tunneling.
TEST(RemoteProxyConfigurationResolver, HTTPSOverHTTP)
{
    const char * expected_host = "proxy1";

    const RemoteProxyConfigurationResolver::RemoteServerConfiguration server_config
    {
        Poco::URI("not_important"),
        "http",
        80,
        std::chrono::seconds {10}
    };

    auto fetcher = std::make_shared<RemoteProxyHostFetcherMock>(expected_host);
    RemoteProxyConfigurationResolver resolver(server_config, ProxyConfiguration::Protocol::HTTPS, fetcher);

    const auto resolved = resolver.resolve();

    ASSERT_EQ(resolved.host, expected_host);
    ASSERT_EQ(resolved.port, 80);
    /// The proxy itself speaks HTTP, while the original request is HTTPS.
    ASSERT_EQ(resolved.protocol, ProxyConfiguration::Protocol::HTTP);
    ASSERT_EQ(resolved.original_request_protocol, ProxyConfiguration::Protocol::HTTPS);
    /// HTTPS over HTTP: tunneling must be used.
    ASSERT_TRUE(resolved.tunneling);
}
/// HTTPS request via an HTTP proxy, but with tunneling explicitly disabled through the
/// last constructor argument: the resolved configuration must NOT ask for tunneling.
TEST(RemoteProxyConfigurationResolver, HTTPSOverHTTPNoTunneling)
{
    const char * proxy_server_mock = "proxy1";

    auto remote_server_configuration = RemoteProxyConfigurationResolver::RemoteServerConfiguration
    {
        Poco::URI("not_important"),
        "http",
        80,
        std::chrono::seconds {10}
    };

    RemoteProxyConfigurationResolver resolver(
        remote_server_configuration,
        ProxyConfiguration::Protocol::HTTPS,
        std::make_shared<RemoteProxyHostFetcherMock>(proxy_server_mock),
        true /* disable_tunneling_for_https_requests_over_http_proxy_ */
    );

    auto configuration = resolver.resolve();

    ASSERT_EQ(configuration.host, proxy_server_mock);
    ASSERT_EQ(configuration.port, 80);
    ASSERT_EQ(configuration.protocol, ProxyConfiguration::Protocol::HTTP);
    ASSERT_EQ(configuration.original_request_protocol, ProxyConfiguration::Protocol::HTTPS);
    // tunneling should not be used: it is https over http, but tunneling was explicitly disabled.
    ASSERT_EQ(configuration.tunneling, false);
}
/// Checks that the resolved configuration is cached: repeated resolve() calls within the TTL
/// hit the fetcher only once, and a call made after the TTL has passed fetches again.
TEST(RemoteProxyConfigurationResolver, SimpleCacheTest)
{
    const char * proxy_server_mock = "proxy1";

    /// Cache TTL in seconds.
    auto cache_ttl = 5u;

    auto remote_server_configuration = RemoteProxyConfigurationResolver::RemoteServerConfiguration
    {
        Poco::URI("not_important"),
        "http",
        80,
        std::chrono::seconds {cache_ttl}
    };

    /// Keep a handle to the mock so that its call counter can be inspected.
    auto fetcher_mock = std::make_shared<RemoteProxyHostFetcherMock>(proxy_server_mock);

    RemoteProxyConfigurationResolver resolver(
        remote_server_configuration,
        ProxyConfiguration::Protocol::HTTP,
        fetcher_mock
    );

    resolver.resolve();
    resolver.resolve();
    resolver.resolve();

    /// Only the first call should have reached the fetcher.
    ASSERT_EQ(fetcher_mock->fetch_count, 1u);

    /// Wait well past the TTL so that the cached entry is expired.
    sleepForSeconds(cache_ttl * 2);

    resolver.resolve();

    /// Unsigned literal: fetch_count is std::size_t, so compare like with like (as above).
    ASSERT_EQ(fetcher_mock->fetch_count, 2u);
}
}

View File

@ -305,7 +305,7 @@ String MonitorCommand::run()
print(ret, "ephemerals_count", state_machine.getTotalEphemeralNodesCount());
print(ret, "approximate_data_size", state_machine.getApproximateDataSize());
print(ret, "key_arena_size", state_machine.getKeyArenaSize());
print(ret, "latest_snapshot_size", state_machine.getLatestSnapshotBufSize());
print(ret, "latest_snapshot_size", state_machine.getLatestSnapshotSize());
#if defined(OS_LINUX) || defined(OS_DARWIN)
print(ret, "open_file_descriptor_count", getCurrentProcessFDCount());

View File

@ -332,9 +332,10 @@ void KeeperDispatcher::snapshotThread()
if (shutdown_called)
break;
if (snapshot_file_info.path.empty())
if (!snapshot_file_info)
continue;
chassert(snapshot_file_info->disk != nullptr);
if (isLeader())
snapshot_s3.uploadSnapshot(snapshot_file_info);
}

View File

@ -618,7 +618,7 @@ KeeperSnapshotManager::KeeperSnapshotManager(
LOG_TRACE(log, "Found {} on {}", snapshot_file, disk->getName());
size_t snapshot_up_to = getSnapshotPathUpToLogIdx(snapshot_file);
auto [_, inserted] = existing_snapshots.insert_or_assign(snapshot_up_to, SnapshotFileInfo{snapshot_file, disk});
auto [_, inserted] = existing_snapshots.insert_or_assign(snapshot_up_to, std::make_shared<SnapshotFileInfo>(snapshot_file, disk));
if (!inserted)
LOG_WARNING(
@ -651,7 +651,7 @@ KeeperSnapshotManager::KeeperSnapshotManager(
moveSnapshotsIfNeeded();
}
SnapshotFileInfo KeeperSnapshotManager::serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx)
SnapshotFileInfoPtr KeeperSnapshotManager::serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx)
{
ReadBufferFromNuraftBuffer reader(buffer);
@ -672,11 +672,12 @@ SnapshotFileInfo KeeperSnapshotManager::serializeSnapshotBufferToDisk(nuraft::bu
disk->removeFile(tmp_snapshot_file_name);
existing_snapshots.emplace(up_to_log_idx, SnapshotFileInfo{snapshot_file_name, disk});
auto snapshot_file_info = std::make_shared<SnapshotFileInfo>(snapshot_file_name, disk);
existing_snapshots.emplace(up_to_log_idx, snapshot_file_info);
removeOutdatedSnapshotsIfNeeded();
moveSnapshotsIfNeeded();
return {snapshot_file_name, disk};
return snapshot_file_info;
}
nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeLatestSnapshotBufferFromDisk()
@ -690,7 +691,7 @@ nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeLatestSnapshotBuff
}
catch (const DB::Exception &)
{
const auto & [path, disk] = latest_itr->second;
const auto & [path, disk, size] = *latest_itr->second;
disk->removeFile(path);
existing_snapshots.erase(latest_itr->first);
tryLogCurrentException(__PRETTY_FUNCTION__);
@ -702,7 +703,7 @@ nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeLatestSnapshotBuff
nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeSnapshotBufferFromDisk(uint64_t up_to_log_idx) const
{
const auto & [snapshot_path, snapshot_disk] = existing_snapshots.at(up_to_log_idx);
const auto & [snapshot_path, snapshot_disk, size] = *existing_snapshots.at(up_to_log_idx);
WriteBufferFromNuraftBuffer writer;
auto reader = snapshot_disk->readFile(snapshot_path);
copyData(*reader, writer);
@ -794,18 +795,18 @@ void KeeperSnapshotManager::moveSnapshotsIfNeeded()
{
if (idx == latest_snapshot_idx)
{
if (file_info.disk != latest_snapshot_disk)
if (file_info->disk != latest_snapshot_disk)
{
moveSnapshotBetweenDisks(file_info.disk, file_info.path, latest_snapshot_disk, file_info.path, keeper_context);
file_info.disk = latest_snapshot_disk;
moveSnapshotBetweenDisks(file_info->disk, file_info->path, latest_snapshot_disk, file_info->path, keeper_context);
file_info->disk = latest_snapshot_disk;
}
}
else
{
if (file_info.disk != disk)
if (file_info->disk != disk)
{
moveSnapshotBetweenDisks(file_info.disk, file_info.path, disk, file_info.path, keeper_context);
file_info.disk = disk;
moveSnapshotBetweenDisks(file_info->disk, file_info->path, disk, file_info->path, keeper_context);
file_info->disk = disk;
}
}
}
@ -817,12 +818,12 @@ void KeeperSnapshotManager::removeSnapshot(uint64_t log_idx)
auto itr = existing_snapshots.find(log_idx);
if (itr == existing_snapshots.end())
throw Exception(ErrorCodes::UNKNOWN_SNAPSHOT, "Unknown snapshot with log index {}", log_idx);
const auto & [path, disk] = itr->second;
const auto & [path, disk, size] = *itr->second;
disk->removeFileIfExists(path);
existing_snapshots.erase(itr);
}
SnapshotFileInfo KeeperSnapshotManager::serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot)
SnapshotFileInfoPtr KeeperSnapshotManager::serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot)
{
auto up_to_log_idx = snapshot.snapshot_meta->get_last_log_idx();
auto snapshot_file_name = getSnapshotFileName(up_to_log_idx, compress_snapshots_zstd);
@ -847,7 +848,8 @@ SnapshotFileInfo KeeperSnapshotManager::serializeSnapshotToDisk(const KeeperStor
disk->removeFile(tmp_snapshot_file_name);
existing_snapshots.emplace(up_to_log_idx, SnapshotFileInfo{snapshot_file_name, disk});
auto snapshot_file_info = std::make_shared<SnapshotFileInfo>(snapshot_file_name, disk);
existing_snapshots.emplace(up_to_log_idx, snapshot_file_info);
try
{
@ -859,7 +861,7 @@ SnapshotFileInfo KeeperSnapshotManager::serializeSnapshotToDisk(const KeeperStor
tryLogCurrentException(log, "Failed to cleanup and/or move older snapshots");
}
return {snapshot_file_name, disk};
return snapshot_file_info;
}
size_t KeeperSnapshotManager::getLatestSnapshotIndex() const
@ -869,23 +871,23 @@ size_t KeeperSnapshotManager::getLatestSnapshotIndex() const
return 0;
}
SnapshotFileInfo KeeperSnapshotManager::getLatestSnapshotInfo() const
SnapshotFileInfoPtr KeeperSnapshotManager::getLatestSnapshotInfo() const
{
if (!existing_snapshots.empty())
{
const auto & [path, disk] = existing_snapshots.at(getLatestSnapshotIndex());
const auto & [path, disk, size] = *existing_snapshots.at(getLatestSnapshotIndex());
try
{
if (disk->exists(path))
return {path, disk};
return std::make_shared<SnapshotFileInfo>(path, disk);
}
catch (...)
{
tryLogCurrentException(log);
}
}
return {"", nullptr};
return nullptr;
}
}

View File

@ -1,5 +1,6 @@
#pragma once
#include <Coordination/KeeperStorage.h>
#include <Common/CopyableAtomic.h>
#include <libnuraft/nuraft.hxx>
namespace DB
@ -93,12 +94,20 @@ public:
/// Describes a snapshot file: its path, the disk it is on, and a size field.
/// NOTE: the order of the data members matters - code unpacks this struct with
/// structured bindings as [path, disk, size].
struct SnapshotFileInfo
{
SnapshotFileInfo(std::string path_, DiskPtr disk_)
: path(std::move(path_))
, disk(std::move(disk_))
{}
/// Path of the snapshot file, as passed to disk operations.
std::string path;
/// Disk the snapshot file is located on.
DiskPtr disk;
/// Starts at 0; declared mutable and atomic, so it can be modified even through a const SnapshotFileInfo.
/// NOTE(review): presumably a lazily filled cache of the snapshot file size - confirm against the code that writes it.
mutable std::atomic<size_t> size{0};
};
using SnapshotFileInfoPtr = std::shared_ptr<SnapshotFileInfo>;
using KeeperStorageSnapshotPtr = std::shared_ptr<KeeperStorageSnapshot>;
using CreateSnapshotCallback = std::function<SnapshotFileInfo(KeeperStorageSnapshotPtr &&, bool)>;
using CreateSnapshotCallback = std::function<std::shared_ptr<SnapshotFileInfo>(KeeperStorageSnapshotPtr &&, bool)>;
using SnapshotMetaAndStorage = std::pair<SnapshotMetadataPtr, KeeperStoragePtr>;
@ -121,10 +130,10 @@ public:
nuraft::ptr<nuraft::buffer> serializeSnapshotToBuffer(const KeeperStorageSnapshot & snapshot) const;
/// Serialize already compressed snapshot to disk (returns info about the written snapshot file)
SnapshotFileInfo serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx);
SnapshotFileInfoPtr serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx);
/// Serialize snapshot directly to disk
SnapshotFileInfo serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot);
SnapshotFileInfoPtr serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot);
SnapshotDeserializationResult deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const;
@ -143,7 +152,7 @@ public:
/// The most fresh snapshot log index we have
size_t getLatestSnapshotIndex() const;
SnapshotFileInfo getLatestSnapshotInfo() const;
SnapshotFileInfoPtr getLatestSnapshotInfo() const;
private:
void removeOutdatedSnapshotsIfNeeded();
@ -159,7 +168,7 @@ private:
/// How many snapshots to keep before remove
const size_t snapshots_to_keep;
/// All existing snapshots in our path (log_index -> snapshot file info)
std::map<uint64_t, SnapshotFileInfo> existing_snapshots;
std::map<uint64_t, SnapshotFileInfoPtr> existing_snapshots;
/// Compress snapshots in common ZSTD format instead of custom ClickHouse block LZ4 format
const bool compress_snapshots_zstd;
/// Superdigest for deserialization of storage

View File

@ -147,7 +147,7 @@ std::shared_ptr<KeeperSnapshotManagerS3::S3Configuration> KeeperSnapshotManagerS
void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapshot_file_info)
{
const auto & [snapshot_path, snapshot_disk] = snapshot_file_info;
const auto & [snapshot_path, snapshot_disk, snapshot_size] = snapshot_file_info;
try
{
auto s3_client = getSnapshotS3Client();
@ -169,9 +169,9 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapsh
);
};
LOG_INFO(log, "Will try to upload snapshot on {} to S3", snapshot_file_info.path);
LOG_INFO(log, "Will try to upload snapshot on {} to S3", snapshot_path);
auto snapshot_file = snapshot_disk->readFile(snapshot_file_info.path);
auto snapshot_file = snapshot_disk->readFile(snapshot_path);
auto snapshot_name = fs::path(snapshot_path).filename().string();
auto lock_file = fmt::format(".{}_LOCK", snapshot_name);
@ -261,31 +261,33 @@ void KeeperSnapshotManagerS3::snapshotS3Thread()
while (!shutdown_called)
{
SnapshotFileInfo snapshot_file_info;
SnapshotFileInfoPtr snapshot_file_info;
if (!snapshots_s3_queue.pop(snapshot_file_info))
break;
if (shutdown_called)
break;
uploadSnapshotImpl(snapshot_file_info);
uploadSnapshotImpl(*snapshot_file_info);
}
}
void KeeperSnapshotManagerS3::uploadSnapshot(const SnapshotFileInfo & file_info, bool async_upload)
void KeeperSnapshotManagerS3::uploadSnapshot(const SnapshotFileInfoPtr & file_info, bool async_upload)
{
chassert(file_info);
if (getSnapshotS3Client() == nullptr)
return;
if (async_upload)
{
if (!snapshots_s3_queue.push(file_info))
LOG_WARNING(log, "Failed to add snapshot {} to S3 queue", file_info.path);
LOG_WARNING(log, "Failed to add snapshot {} to S3 queue", file_info->path);
return;
}
uploadSnapshotImpl(file_info);
uploadSnapshotImpl(*file_info);
}
void KeeperSnapshotManagerS3::startup(const Poco::Util::AbstractConfiguration & config, const MultiVersion<Macros>::Version & macros)

View File

@ -12,8 +12,6 @@
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ThreadPool.h>
#include <string>
#endif
namespace DB
@ -27,13 +25,13 @@ public:
/// 'macros' are used to substitute macros in endpoint of disks
void updateS3Configuration(const Poco::Util::AbstractConfiguration & config, const MultiVersion<Macros>::Version & macros);
void uploadSnapshot(const SnapshotFileInfo & file_info, bool async_upload = true);
void uploadSnapshot(const SnapshotFileInfoPtr & file_info, bool async_upload = true);
/// 'macros' are used to substitute macros in endpoint of disks
void startup(const Poco::Util::AbstractConfiguration & config, const MultiVersion<Macros>::Version & macros);
void shutdown();
private:
using SnapshotS3Queue = ConcurrentBoundedQueue<SnapshotFileInfo>;
using SnapshotS3Queue = ConcurrentBoundedQueue<SnapshotFileInfoPtr>;
SnapshotS3Queue snapshots_s3_queue;
/// Upload new snapshots to S3
@ -63,7 +61,7 @@ public:
KeeperSnapshotManagerS3() = default;
void updateS3Configuration(const Poco::Util::AbstractConfiguration &, const MultiVersion<Macros>::Version &) {}
void uploadSnapshot(const SnapshotFileInfo &, [[maybe_unused]] bool async_upload = true) {}
void uploadSnapshot(const SnapshotFileInfoPtr &, [[maybe_unused]] bool async_upload = true) {}
void startup(const Poco::Util::AbstractConfiguration &, const MultiVersion<Macros>::Version &) {}

View File

@ -1,3 +1,4 @@
#include <atomic>
#include <cerrno>
#include <Coordination/KeeperSnapshotManager.h>
#include <Coordination/KeeperStateMachine.h>
@ -90,8 +91,9 @@ void KeeperStateMachine::init()
latest_snapshot_buf = snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index);
auto snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_buf);
latest_snapshot_info = snapshot_manager.getLatestSnapshotInfo();
chassert(latest_snapshot_info);
if (isLocalDisk(*latest_snapshot_info.disk))
if (isLocalDisk(*latest_snapshot_info->disk))
latest_snapshot_buf = nullptr;
storage = std::move(snapshot_deserialization_result.storage);
@ -603,7 +605,11 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res
}
ProfileEvents::increment(ProfileEvents::KeeperSnapshotCreations);
LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), latest_snapshot_info.path);
LOG_DEBUG(
log,
"Created persistent snapshot {} with path {}",
latest_snapshot_meta->get_last_log_idx(),
latest_snapshot_info->path);
}
}
}
@ -627,7 +633,7 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res
when_done(ret, exception);
return ret ? latest_snapshot_info : SnapshotFileInfo{};
return ret ? latest_snapshot_info : nullptr;
};
if (keeper_context->getServerState() == KeeperContext::Phase::SHUTDOWN)
@ -635,9 +641,9 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res
LOG_INFO(log, "Creating a snapshot during shutdown because 'create_snapshot_on_exit' is enabled.");
auto snapshot_file_info = snapshot_task.create_snapshot(std::move(snapshot_task.snapshot), /*execute_only_cleanup=*/false);
if (!snapshot_file_info.path.empty() && snapshot_manager_s3)
if (snapshot_file_info && snapshot_manager_s3)
{
LOG_INFO(log, "Uploading snapshot {} during shutdown because 'upload_snapshot_on_exit' is enabled.", snapshot_file_info.path);
LOG_INFO(log, "Uploading snapshot {} during shutdown because 'upload_snapshot_on_exit' is enabled.", snapshot_file_info->path);
snapshot_manager_s3->uploadSnapshot(snapshot_file_info, /* async_upload */ false);
}
@ -672,7 +678,7 @@ void KeeperStateMachine::save_logical_snp_obj(
latest_snapshot_info = snapshot_manager.serializeSnapshotBufferToDisk(data, s.get_last_log_idx());
latest_snapshot_meta = cloned_meta;
latest_snapshot_buf = std::move(cloned_buffer);
LOG_DEBUG(log, "Saved snapshot {} to path {}", s.get_last_log_idx(), latest_snapshot_info.path);
LOG_DEBUG(log, "Saved snapshot {} to path {}", s.get_last_log_idx(), latest_snapshot_info->path);
obj_id++;
ProfileEvents::increment(ProfileEvents::KeeperSaveSnapshot);
}
@ -733,7 +739,7 @@ int KeeperStateMachine::read_logical_snp_obj(
return -1;
}
const auto & [path, disk] = latest_snapshot_info;
const auto & [path, disk, size] = *latest_snapshot_info;
if (isLocalDisk(*disk))
{
auto full_path = fs::path(disk->getPath()) / path;
@ -862,12 +868,27 @@ uint64_t KeeperStateMachine::getKeyArenaSize() const
return storage->getArenaDataSize();
}
/// Return the size of the latest snapshot, or 0 when there is none.
///
/// NOTE(review): this listing appears to interleave two variants of the function:
///  - getLatestSnapshotBufSize(): returns latest_snapshot_buf->size() under snapshots_lock, or 0 if the buffer is null;
///  - getLatestSnapshotSize(): returns the file size of the snapshot described by latest_snapshot_info,
///    caching it in the `size` atomic of that object.
/// Confirm against the real file which lines are current.
uint64_t KeeperStateMachine::getLatestSnapshotBufSize() const
uint64_t KeeperStateMachine::getLatestSnapshotSize() const
{
std::lock_guard lock(snapshots_lock);
if (latest_snapshot_buf)
return latest_snapshot_buf->size();
return 0;
/// Copy the shared pointer while holding snapshots_lock; the lock is released as soon as
/// the lambda returns, so the getFileSize() call below runs without it.
auto snapshot_info = [&]
{
std::lock_guard lock(snapshots_lock);
return latest_snapshot_info;
}();
/// No snapshot info, or no disk attached to it: report a size of 0.
if (snapshot_info == nullptr || snapshot_info->disk == nullptr)
return 0;
/// Several threads may observe size == 0 at the same time and each query the disk.
/// This can happen in rare cases and getting the size is not a heavy operation, so no extra locking is used.
/// A stored value of 0 is treated as "not computed yet", so a file whose size really is 0
/// would be queried on every call.
size_t size = snapshot_info->size.load(std::memory_order_relaxed);
if (size == 0)
{
size = snapshot_info->disk->getFileSize(snapshot_info->path);
snapshot_info->size.store(size, std::memory_order_relaxed);
}
return size;
}
ClusterConfigPtr KeeperStateMachine::getClusterConfig() const

View File

@ -124,7 +124,7 @@ public:
uint64_t getTotalEphemeralNodesCount() const;
uint64_t getApproximateDataSize() const;
uint64_t getKeyArenaSize() const;
uint64_t getLatestSnapshotBufSize() const;
uint64_t getLatestSnapshotSize() const;
void recalculateStorageStats();
@ -135,7 +135,7 @@ private:
/// In our state machine we always have a single snapshot which is stored
/// in memory in compressed (serialized) format.
SnapshotMetadataPtr latest_snapshot_meta = nullptr;
SnapshotFileInfo latest_snapshot_info;
std::shared_ptr<SnapshotFileInfo> latest_snapshot_info;
nuraft::ptr<nuraft::buffer> latest_snapshot_buf = nullptr;
/// Main state machine logic

Some files were not shown because too many files have changed in this diff Show More