Merge remote-tracking branch 'origin/master' into musysroot

This commit is contained in:
Michael Kolupaev 2024-08-15 21:39:38 +00:00
commit 82fc6bfa9e
291 changed files with 6514 additions and 2695 deletions

18
.github/actions/debug/action.yml vendored Normal file
View File

@ -0,0 +1,18 @@
name: DebugInfo
description: Prints workflow debug info
runs:
using: "composite"
steps:
- name: Print envs
shell: bash
run: |
echo "::group::Envs"
env
echo "::endgroup::"
- name: Print Event.json
shell: bash
run: |
echo "::group::Event.json"
python3 -m json.tool "$GITHUB_EVENT_PATH"
echo "::endgroup::"

109
.github/workflows/auto_releases.yml vendored Normal file
View File

@ -0,0 +1,109 @@
name: AutoReleases
env:
PYTHONUNBUFFERED: 1
concurrency:
group: autoreleases
on:
# schedule:
# - cron: '0 9 * * *'
workflow_dispatch:
inputs:
dry-run:
description: 'Dry run'
required: false
default: true
type: boolean
jobs:
AutoReleaseInfo:
runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.info.outputs.AUTO_RELEASE_PARAMS }}
dry_run: ${{ steps.info.outputs.DRY_RUN }}
steps:
- name: Debug Info
uses: ./.github/actions/debug
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
ROBOT_CLICKHOUSE_SSH_KEY<<RCSK
${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
RCSK
EOF
echo "DRY_RUN=true" >> "$GITHUB_ENV"
- name: Check out repository code
uses: ClickHouse/checkout@v1
- name: Prepare Info
id: info
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 auto_release.py --prepare
echo "::group::Auto Release Info"
python3 -m json.tool /tmp/autorelease_info.json
echo "::endgroup::"
{
echo 'AUTO_RELEASE_PARAMS<<EOF'
cat /tmp/autorelease_info.json
echo 'EOF'
} >> "$GITHUB_ENV"
{
echo 'AUTO_RELEASE_PARAMS<<EOF'
cat /tmp/autorelease_info.json
echo 'EOF'
} >> "$GITHUB_OUTPUT"
echo "DRY_RUN=true" >> "$GITHUB_OUTPUT"
- name: Post Release Branch statuses
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 auto_release.py --post-status
- name: Clean up
uses: ./.github/actions/clean
Release_0:
needs: AutoReleaseInfo
name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[0].release_branch }}
if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[0] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[0].ready }}
uses: ./.github/workflows/create_release.yml
with:
ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[0].commit_sha }}
type: patch
dry-run: ${{ needs.AutoReleaseInfo.outputs.dry_run }}
#
# Release_1:
# needs: [AutoReleaseInfo, Release_0]
# name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[1].release_branch }}
# if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[1] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[1].ready }}
# uses: ./.github/workflows/create_release.yml
# with:
# ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[1].commit_sha }}
# type: patch
# dry-run: ${{ env.DRY_RUN }}
#
# Release_2:
# needs: [AutoReleaseInfo, Release_1]
# name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[2].release_branch }}
# if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[0] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[2].ready }}
# uses: ./.github/workflow/create_release.yml
# with:
# ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[0].commit_sha }}
# type: patch
# dry-run: ${{ env.DRY_RUN }}
#
# Release_3:
# needs: [AutoReleaseInfo, Release_2]
# name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[3].release_branch }}
# if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[3] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[3].ready }}
# uses: ./.github/workflow/create_release.yml
# with:
# ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[3].commit_sha }}
# type: patch
# dry-run: ${{ env.DRY_RUN }}
# - name: Post Slack Message
# if: ${{ !cancelled() }}
# run: |
# cd "$GITHUB_WORKSPACE/tests/ci"
# python3 auto_release.py --post-auto-release-complete --wf-status ${{ job.status }}

View File

@ -2,6 +2,7 @@ name: CreateRelease
concurrency:
group: release
'on':
workflow_dispatch:
inputs:
@ -26,6 +27,26 @@ concurrency:
required: false
default: false
type: boolean
workflow_call:
inputs:
ref:
description: 'Git reference (branch or commit sha) from which to create the release'
required: true
type: string
type:
description: 'The type of release: "new" for a new release or "patch" for a patch release'
required: true
type: string
only-repo:
description: 'Run only repos updates including docker (repo-recovery, tests)'
required: false
default: false
type: boolean
dry-run:
description: 'Dry run'
required: false
default: false
type: boolean
jobs:
CreateRelease:
@ -101,6 +122,7 @@ jobs:
--volume=".:/wd" --workdir="/wd" \
clickhouse/style-test \
./tests/ci/changelog.py -v --debug-helpers \
--gh-user-or-token ${{ secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN }} \
--jobs=5 \
--output="./docs/changelogs/${{ env.RELEASE_TAG }}.md" ${{ env.RELEASE_TAG }}
git add ./docs/changelogs/${{ env.RELEASE_TAG }}.md
@ -129,9 +151,9 @@ jobs:
if: ${{ inputs.type == 'patch' && ! inputs.only-repo }}
shell: bash
run: |
python3 ./tests/ci/create_release.py --set-progress-completed
git reset --hard HEAD
git checkout "$GITHUB_REF_NAME"
python3 ./tests/ci/create_release.py --set-progress-completed
- name: Create GH Release
if: ${{ inputs.type == 'patch' && ! inputs.only-repo }}
shell: bash

3
.gitmodules vendored
View File

@ -345,9 +345,6 @@
[submodule "contrib/FP16"]
path = contrib/FP16
url = https://github.com/Maratyszcza/FP16.git
[submodule "contrib/robin-map"]
path = contrib/robin-map
url = https://github.com/Tessil/robin-map.git
[submodule "contrib/aklomp-base64"]
path = contrib/aklomp-base64
url = https://github.com/aklomp/base64.git

View File

@ -322,17 +322,21 @@ if (DISABLE_OMIT_FRAME_POINTER)
set (CMAKE_ASM_FLAGS_ADD "${CMAKE_ASM_FLAGS_ADD} -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer")
endif()
# Before you start hating your debugger because it refuses to show variables ('<optimized out>'), try building with -DDEBUG_O_LEVEL="0"
# https://stackoverflow.com/questions/63386189/whats-the-difference-between-a-compilers-o0-option-and-og-option/63386263#63386263
set(DEBUG_O_LEVEL "g" CACHE STRING "The -Ox level used for debug builds")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPILER_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og ${DEBUG_INFO_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O${DEBUG_O_LEVEL} ${DEBUG_INFO_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMPILER_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Og ${DEBUG_INFO_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O${DEBUG_O_LEVEL} ${DEBUG_INFO_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${COMPILER_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS_RELWITHDEBINFO "${CMAKE_ASM_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS_DEBUG "${CMAKE_ASM_FLAGS_DEBUG} -Og ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS_DEBUG "${CMAKE_ASM_FLAGS_DEBUG} -O${DEBUG_O_LEVEL} ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
if (OS_DARWIN)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")

View File

@ -27,27 +27,6 @@ bool cgroupsV2Enabled()
#endif
}
bool cgroupsV2MemoryControllerEnabled()
{
#if defined(OS_LINUX)
chassert(cgroupsV2Enabled());
/// According to https://docs.kernel.org/admin-guide/cgroup-v2.html, file "cgroup.controllers" defines which controllers are available
/// for the current + child cgroups. The set of available controllers can be restricted from level to level using file
/// "cgroups.subtree_control". It is therefore sufficient to check the bottom-most nested "cgroup.controllers" file.
fs::path cgroup_dir = cgroupV2PathOfProcess();
if (cgroup_dir.empty())
return false;
std::ifstream controllers_file(cgroup_dir / "cgroup.controllers");
if (!controllers_file.is_open())
return false;
std::string controllers;
std::getline(controllers_file, controllers);
return controllers.find("memory") != std::string::npos;
#else
return false;
#endif
}
fs::path cgroupV2PathOfProcess()
{
#if defined(OS_LINUX)
@ -71,3 +50,28 @@ fs::path cgroupV2PathOfProcess()
return {};
#endif
}
std::optional<std::string> getCgroupsV2PathContainingFile([[maybe_unused]] std::string_view file_name)
{
#if defined(OS_LINUX)
if (!cgroupsV2Enabled())
return {};
fs::path current_cgroup = cgroupV2PathOfProcess();
if (current_cgroup.empty())
return {};
/// Return the bottom-most nested file. If there is no such file at the current
/// level, try again at the parent level as settings are inherited.
while (current_cgroup != default_cgroups_mount.parent_path())
{
const auto path = current_cgroup / file_name;
if (fs::exists(path))
return {current_cgroup};
current_cgroup = current_cgroup.parent_path();
}
return {};
#else
return {};
#endif
}

View File

@ -1,6 +1,7 @@
#pragma once
#include <filesystem>
#include <string_view>
#if defined(OS_LINUX)
/// I think it is possible to mount the cgroups hierarchy somewhere else (e.g. when in containers).
@ -11,11 +12,11 @@ static inline const std::filesystem::path default_cgroups_mount = "/sys/fs/cgrou
/// Is cgroups v2 enabled on the system?
bool cgroupsV2Enabled();
/// Is the memory controller of cgroups v2 enabled on the system?
/// Assumes that cgroupsV2Enabled() is enabled.
bool cgroupsV2MemoryControllerEnabled();
/// Detects which cgroup v2 the process belongs to and returns the filesystem path to the cgroup.
/// Returns an empty path the cgroup cannot be determined.
/// Assumes that cgroupsV2Enabled() is enabled.
std::filesystem::path cgroupV2PathOfProcess();
/// Returns the most nested cgroup dir containing the specified file.
/// If cgroups v2 is not enabled - returns an empty optional.
std::optional<std::string> getCgroupsV2PathContainingFile([[maybe_unused]] std::string_view file_name);

View File

@ -19,9 +19,6 @@ std::optional<uint64_t> getCgroupsV2MemoryLimit()
if (!cgroupsV2Enabled())
return {};
if (!cgroupsV2MemoryControllerEnabled())
return {};
std::filesystem::path current_cgroup = cgroupV2PathOfProcess();
if (current_cgroup.empty())
return {};

View File

@ -2,11 +2,11 @@
# NOTE: VERSION_REVISION has nothing common with DBMS_TCP_PROTOCOL_VERSION,
# only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes.
SET(VERSION_REVISION 54489)
SET(VERSION_REVISION 54490)
SET(VERSION_MAJOR 24)
SET(VERSION_MINOR 8)
SET(VERSION_MINOR 9)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH 3f8b27d7accd2b5ec4afe7d0dd459115323304af)
SET(VERSION_DESCRIBE v24.8.1.1-testing)
SET(VERSION_STRING 24.8.1.1)
SET(VERSION_GITHASH e02b434d2fc0c4fbee29ca675deab7474d274608)
SET(VERSION_DESCRIBE v24.9.1.1-testing)
SET(VERSION_STRING 24.9.1.1)
# end of autochange

View File

@ -209,9 +209,8 @@ endif()
option(ENABLE_USEARCH "Enable USearch" ${ENABLE_LIBRARIES})
if (ENABLE_USEARCH)
add_contrib (FP16-cmake FP16)
add_contrib (robin-map-cmake robin-map)
add_contrib (SimSIMD-cmake SimSIMD)
add_contrib (usearch-cmake usearch) # requires: FP16, robin-map, SimdSIMD
add_contrib (usearch-cmake usearch) # requires: FP16, SimdSIMD
else ()
message(STATUS "Not using USearch")
endif ()

2
contrib/SimSIMD vendored

@ -1 +1 @@
Subproject commit de2cb75b9e9e3389d5e1e51fd9f8ed151f3c17cf
Subproject commit 91a76d1ac519b3b9dc8957734a3dabd985f00c26

2
contrib/libunwind vendored

@ -1 +1 @@
Subproject commit a89d904befea07814628c6ce0b44083c4e149c62
Subproject commit 601db0b0e03018c01710470a37703b618f9cf08b

1
contrib/robin-map vendored

@ -1 +0,0 @@
Subproject commit 851a59e0e3063ee0e23089062090a73fd3de482d

View File

@ -1 +0,0 @@
# See contrib/usearch-cmake/CMakeLists.txt

2
contrib/usearch vendored

@ -1 +1 @@
Subproject commit 30810452bec5d3d3aa0931bb5d761e2f09aa6356
Subproject commit e21a5778a0d4469ddaf38c94b7be0196bb701ee4

View File

@ -1,5 +1,4 @@
set(FP16_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/FP16")
set(ROBIN_MAP_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/robin-map")
set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD")
set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch")
@ -7,7 +6,6 @@ add_library(_usearch INTERFACE)
target_include_directories(_usearch SYSTEM INTERFACE
${FP16_PROJECT_DIR}/include
${ROBIN_MAP_PROJECT_DIR}/include
${SIMSIMD_PROJECT_DIR}/include
${USEARCH_PROJECT_DIR}/include)

View File

@ -0,0 +1,29 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v24.3.7.30-lts (c8a28cf4331) FIXME as compared to v24.3.6.48-lts (b2d33c3c45d)
#### Improvement
* Backported in [#68103](https://github.com/ClickHouse/ClickHouse/issues/68103): Distinguish booleans and integers while parsing values for custom settings: ``` SET custom_a = true; SET custom_b = 1; ```. [#62206](https://github.com/ClickHouse/ClickHouse/pull/62206) ([Vitaly Baranov](https://github.com/vitlibar)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Backported in [#67931](https://github.com/ClickHouse/ClickHouse/issues/67931): Fixing the `Not-ready Set` error after the `PREWHERE` optimization for StorageMerge. [#65057](https://github.com/ClickHouse/ClickHouse/pull/65057) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Backported in [#68062](https://github.com/ClickHouse/ClickHouse/issues/68062): Fix boolean literals in query sent to external database (for engines like `PostgreSQL`). [#66282](https://github.com/ClickHouse/ClickHouse/pull/66282) ([vdimir](https://github.com/vdimir)).
* Backported in [#67812](https://github.com/ClickHouse/ClickHouse/issues/67812): Only relevant to the experimental Variant data type. Fix crash with Variant + AggregateFunction type. [#67122](https://github.com/ClickHouse/ClickHouse/pull/67122) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#67848](https://github.com/ClickHouse/ClickHouse/issues/67848): Fixes [#66026](https://github.com/ClickHouse/ClickHouse/issues/66026). Avoid unresolved table function arguments traversal in `ReplaceTableNodeToDummyVisitor`. [#67522](https://github.com/ClickHouse/ClickHouse/pull/67522) ([Dmitry Novik](https://github.com/novikd)).
* Backported in [#68271](https://github.com/ClickHouse/ClickHouse/issues/68271): Fix inserting into stream like engines (Kafka, RabbitMQ, NATS) through HTTP interface. [#67554](https://github.com/ClickHouse/ClickHouse/pull/67554) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)).
* Backported in [#67806](https://github.com/ClickHouse/ClickHouse/issues/67806): Fix reloading SQL UDFs with UNION. Previously, restarting the server could make UDF invalid. [#67665](https://github.com/ClickHouse/ClickHouse/pull/67665) ([Antonio Andelic](https://github.com/antonio2368)).
* Backported in [#67834](https://github.com/ClickHouse/ClickHouse/issues/67834): Fix potential stack overflow in `JSONMergePatch` function. Renamed this function from `jsonMergePatch` to `JSONMergePatch` because the previous name was wrong. The previous name is still kept for compatibility. Improved diagnostic of errors in the function. This closes [#67304](https://github.com/ClickHouse/ClickHouse/issues/67304). [#67756](https://github.com/ClickHouse/ClickHouse/pull/67756) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Backported in [#68206](https://github.com/ClickHouse/ClickHouse/issues/68206): Fix wrong `count()` result when there is non-deterministic function in predicate. [#67922](https://github.com/ClickHouse/ClickHouse/pull/67922) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)).
* Backported in [#68089](https://github.com/ClickHouse/ClickHouse/issues/68089): Fixed the calculation of the maximum thread soft limit in containerized environments where the usable CPU count is limited. [#67963](https://github.com/ClickHouse/ClickHouse/pull/67963) ([Robert Schulze](https://github.com/rschu1ze)).
* Backported in [#68120](https://github.com/ClickHouse/ClickHouse/issues/68120): Fixed skipping of untouched parts in mutations with new analyzer. Previously with enabled analyzer data in part could be rewritten by mutation even if mutation doesn't affect this part according to predicate. [#68052](https://github.com/ClickHouse/ClickHouse/pull/68052) ([Anton Popov](https://github.com/CurtizJ)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Update version after release. [#67676](https://github.com/ClickHouse/ClickHouse/pull/67676) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Backported in [#68074](https://github.com/ClickHouse/ClickHouse/issues/68074): Add an explicit error for `ALTER MODIFY SQL SECURITY` on non-view tables. [#67953](https://github.com/ClickHouse/ClickHouse/pull/67953) ([pufit](https://github.com/pufit)).

View File

@ -59,6 +59,8 @@ Parameters:
- `ef_construction`: (optional, default: 128)
- `ef_search`: (optional, default: 64)
Value 0 for parameters `m`, `ef_construction`, and `ef_search` refers to the default value.
Example:
```sql

View File

@ -359,13 +359,14 @@ DESC format(JSONEachRow, '{"int" : 42, "float" : 42.42, "string" : "Hello, World
Dates, DateTimes:
```sql
DESC format(JSONEachRow, '{"date" : "2022-01-01", "datetime" : "2022-01-01 00:00:00"}')
DESC format(JSONEachRow, '{"date" : "2022-01-01", "datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}')
```
```response
┌─name─────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ date │ Nullable(Date) │ │ │ │ │ │
│ datetime │ Nullable(DateTime64(9)) │ │ │ │ │ │
└──────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
┌─name───────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ date │ Nullable(Date) │ │ │ │ │ │
│ datetime │ Nullable(DateTime) │ │ │ │ │ │
│ datetime64 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└────────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
Arrays:
@ -759,12 +760,13 @@ DESC format(CSV, 'Hello world!,World hello!')
Dates, DateTimes:
```sql
DESC format(CSV, '"2020-01-01","2020-01-01 00:00:00"')
DESC format(CSV, '"2020-01-01","2020-01-01 00:00:00","2022-01-01 00:00:00.000"')
```
```response
┌─name─┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ c1 │ Nullable(Date) │ │ │ │ │ │
│ c2 │ Nullable(DateTime64(9)) │ │ │ │ │ │
│ c2 │ Nullable(DateTime) │ │ │ │ │ │
│ c3 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└──────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
@ -956,12 +958,13 @@ DESC format(TSKV, 'int=42 float=42.42 bool=true string=Hello,World!\n')
Dates, DateTimes:
```sql
DESC format(TSV, '2020-01-01 2020-01-01 00:00:00')
DESC format(TSV, '2020-01-01 2020-01-01 00:00:00 2022-01-01 00:00:00.000')
```
```response
┌─name─┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ c1 │ Nullable(Date) │ │ │ │ │ │
│ c2 │ Nullable(DateTime64(9)) │ │ │ │ │ │
│ c2 │ Nullable(DateTime) │ │ │ │ │ │
│ c3 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└──────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
@ -1126,12 +1129,13 @@ DESC format(Values, $$(42, 42.42, true, 'Hello,World!')$$)
Dates, DateTimes:
```sql
DESC format(Values, $$('2020-01-01', '2020-01-01 00:00:00')$$)
```
DESC format(Values, $$('2020-01-01', '2020-01-01 00:00:00', '2022-01-01 00:00:00.000')$$)
```
```response
┌─name─┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ c1 │ Nullable(Date) │ │ │ │ │ │
│ c2 │ Nullable(DateTime64(9)) │ │ │ │ │ │
│ c2 │ Nullable(DateTime) │ │ │ │ │ │
│ c3 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└──────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
@ -1504,8 +1508,8 @@ DESC format(JSONEachRow, $$
#### input_format_try_infer_datetimes
If enabled, ClickHouse will try to infer type `DateTime64` from string fields in schema inference for text formats.
If all fields from a column in sample data were successfully parsed as datetimes, the result type will be `DateTime64(9)`,
If enabled, ClickHouse will try to infer type `DateTime` or `DateTime64` from string fields in schema inference for text formats.
If all fields from a column in sample data were successfully parsed as datetimes, the result type will be `DateTime` or `DateTime64(9)` (if any datetime had fractional part),
if at least one field was not parsed as datetime, the result type will be `String`.
Enabled by default.
@ -1513,39 +1517,66 @@ Enabled by default.
**Examples**
```sql
SET input_format_try_infer_datetimes = 0
SET input_format_try_infer_datetimes = 0;
DESC format(JSONEachRow, $$
{"datetime" : "2021-01-01 00:00:00.000"}
{"datetime" : "2022-01-01 00:00:00.000"}
{"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
{"datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}
$$)
```
```response
┌─name─────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(String) │ │ │ │ │ │
└──────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
┌─name───────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(String) │ │ │ │ │ │
│ datetime64 │ Nullable(String) │ │ │ │ │ │
└────────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
```sql
SET input_format_try_infer_datetimes = 1
SET input_format_try_infer_datetimes = 1;
DESC format(JSONEachRow, $$
{"datetime" : "2021-01-01 00:00:00.000"}
{"datetime" : "2022-01-01 00:00:00.000"}
{"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
{"datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}
$$)
```
```response
┌─name─────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(DateTime64(9)) │ │ │ │ │ │
└──────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
┌─name───────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(DateTime) │ │ │ │ │ │
│ datetime64 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└────────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
```sql
DESC format(JSONEachRow, $$
{"datetime" : "2021-01-01 00:00:00.000"}
{"datetime" : "unknown"}
{"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
{"datetime" : "unknown", "datetime64" : "unknown"}
$$)
```
```response
┌─name─────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(String) │ │ │ │ │ │
└──────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
┌─name───────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(String) │ │ │ │ │ │
│ datetime64 │ Nullable(String) │ │ │ │ │ │
└────────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
#### input_format_try_infer_datetimes_only_datetime64
If enabled, ClickHouse will always infer `DateTime64(9)` when `input_format_try_infer_datetimes` is enabled even if datetime values don't contain fractional part.
Disabled by default.
**Examples**
```sql
SET input_format_try_infer_datetimes = 1;
SET input_format_try_infer_datetimes_only_datetime64 = 1;
DESC format(JSONEachRow, $$
{"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
{"datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}
$$)
```
```text
┌─name───────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(DateTime64(9)) │ │ │ │ │ │
│ datetime64 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└────────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
Note: Parsing datetimes during schema inference respect setting [date_time_input_format](/docs/en/operations/settings/settings-formats.md#date_time_input_format)

View File

@ -1042,10 +1042,23 @@ Compression rates of LZ4 or ZSTD improve on average by 20-40%.
This setting works best for tables with no primary key or a low-cardinality primary key, i.e. a table with only few distinct primary key values.
High-cardinality primary keys, e.g. involving timestamp columns of type `DateTime64`, are not expected to benefit from this setting.
### deduplicate_merge_projection_mode
## lightweight_mutation_projection_mode
By default, lightweight delete `DELETE` does not work for tables with projections. This is because rows in a projection may be affected by a `DELETE` operation. So the default value would be `throw`.
However, this option can change the behavior. With the value either `drop` or `rebuild`, deletes will work with projections. `drop` would delete the projection so it might be fast in the current query as projection gets deleted but slow in future queries as no projection attached.
`rebuild` would rebuild the projection which might affect the performance of the current query, but might speedup for future queries. A good thing is that these options would only work in the part level,
which means projections in the part that don't get touched would stay intact instead of triggering any action like drop or rebuild.
Possible values:
- throw, drop, rebuild
Default value: throw
## deduplicate_merge_projection_mode
Whether to allow create projection for the table with non-classic MergeTree, that is not (Replicated, Shared) MergeTree. If allowed, what is the action when merge projections, either drop or rebuild. So classic MergeTree would ignore this setting.
It also controls `OPTIMIZE DEDUPLICATE` as well, but has effect on all MergeTree family members.
It also controls `OPTIMIZE DEDUPLICATE` as well, but has effect on all MergeTree family members. Similar to the option `lightweight_mutation_projection_mode`, it is also part level.
Possible values:

View File

@ -5654,3 +5654,9 @@ Possible values:
- 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled.
Default value: `0`.
## create_if_not_exists
Enable `IF NOT EXISTS` for `CREATE` statement by default. If either this setting or `IF NOT EXISTS` is specified and a table with the provided name already exists, no exception will be thrown.
Default value: `false`.

View File

@ -38,8 +38,7 @@ If you anticipate frequent deletes, consider using a [custom partitioning key](/
### Lightweight `DELETE`s with projections
By default, `DELETE` does not work for tables with projections. This is because rows in a projection may be affected by a `DELETE` operation and may require the projection to be rebuilt, negatively affecting `DELETE` performance.
However, there is an option to change this behavior. By changing setting `lightweight_mutation_projection_mode = 'drop'`, deletes will work with projections.
By default, `DELETE` does not work for tables with projections. This is because rows in a projection may be affected by a `DELETE` operation. But there is a [MergeTree setting](https://clickhouse.com/docs/en/operations/settings/merge-tree-settings) `lightweight_mutation_projection_mode` can change the behavior.
## Performance considerations when using lightweight `DELETE`

View File

@ -75,6 +75,8 @@ public:
const String & default_database_,
const String & user_,
const String & password_,
const String & proto_send_chunked_,
const String & proto_recv_chunked_,
const String & quota_key_,
const String & stage,
bool randomize_,
@ -128,7 +130,9 @@ public:
connections.emplace_back(std::make_unique<ConnectionPool>(
concurrency,
cur_host, cur_port,
default_database_, user_, password_, quota_key_,
default_database_, user_, password_,
proto_send_chunked_, proto_recv_chunked_,
quota_key_,
/* cluster_= */ "",
/* cluster_secret_= */ "",
/* client_name_= */ std::string(DEFAULT_CLIENT_NAME),
@ -662,6 +666,50 @@ int mainEntryClickHouseBenchmark(int argc, char ** argv)
Strings hosts = options.count("host") ? options["host"].as<Strings>() : Strings({"localhost"});
String proto_send_chunked {"notchunked"};
String proto_recv_chunked {"notchunked"};
if (options.count("proto_caps"))
{
std::string proto_caps_str = options["proto_caps"].as<std::string>();
std::vector<std::string_view> proto_caps;
splitInto<','>(proto_caps, proto_caps_str);
for (auto cap_str : proto_caps)
{
std::string direction;
if (cap_str.starts_with("send_"))
{
direction = "send";
cap_str = cap_str.substr(std::string_view("send_").size());
}
else if (cap_str.starts_with("recv_"))
{
direction = "recv";
cap_str = cap_str.substr(std::string_view("recv_").size());
}
if (cap_str != "chunked" && cap_str != "notchunked" && cap_str != "chunked_optional" && cap_str != "notchunked_optional")
throw Exception(ErrorCodes::BAD_ARGUMENTS, "proto_caps option is incorrect ({})", proto_caps_str);
if (direction.empty())
{
proto_send_chunked = cap_str;
proto_recv_chunked = cap_str;
}
else
{
if (direction == "send")
proto_send_chunked = cap_str;
else
proto_recv_chunked = cap_str;
}
}
}
Benchmark benchmark(
options["concurrency"].as<unsigned>(),
options["delay"].as<double>(),
@ -673,6 +721,8 @@ int mainEntryClickHouseBenchmark(int argc, char ** argv)
options["database"].as<std::string>(),
options["user"].as<std::string>(),
options["password"].as<std::string>(),
proto_send_chunked,
proto_recv_chunked,
options["quota_key"].as<std::string>(),
options["stage"].as<std::string>(),
options.count("randomize"),

View File

@ -38,6 +38,21 @@
<production>{display_name} \e[1;31m:)\e[0m </production> <!-- if it matched to the substring "production" in the server display name -->
</prompt_by_server_display_name>
<!-- Chunked capabilities for native protocol by client.
Can be enabled separately for send and receive channels.
Supported modes:
- chunked - client will only work with server supporting chunked protocol;
- chunked_optional - client prefer server to enable chunked protocol, but can switch to notchunked if server does not support this;
- notchunked - client will only work with server supporting notchunked protocol (current default);
- notchunked_optional - client prefer server notchunked protocol, but can switch to chunked if server does not support this.
-->
<!--
<proto_caps>
<send>chunked_optional</send>
<recv>chunked_optional</recv>
</proto_caps>
-->
<!--
Settings adjustable via command-line parameters
can take their defaults from that config file, see examples:

View File

@ -150,6 +150,21 @@
-->
<tcp_port>9000</tcp_port>
<!-- Chunked capabilities for native protocol by server.
Can be enabled separately for send and receive channels.
Supported modes:
- chunked - server requires from client to have chunked enabled;
- chunked_optional - server supports both chunked and notchunked protocol;
- notchunked - server requires from client notchunked protocol (current default);
- notchunked_optional - server supports both chunked and notchunked protocol.
-->
<!--
<proto_caps>
<send>notchunked_optional</send>
<recv>notchunked_optional</recv>
</proto_caps>
-->
<!-- Compatibility with MySQL protocol.
ClickHouse will pretend to be MySQL for applications connecting to this port.
-->

View File

@ -242,7 +242,8 @@ ASTPtr FunctionNode::toASTImpl(const ConvertToASTOptions & options) const
/// Avoid cast for `IN tuple(...)` expression.
/// Tuples could be quite big, and adding a type may significantly increase query size.
/// It should be safe because set type for `column IN tuple` is deduced from `column` type.
if (isNameOfInFunction(function_name) && argument_nodes.size() > 1 && argument_nodes[1]->getNodeType() == QueryTreeNodeType::CONSTANT)
if (isNameOfInFunction(function_name) && argument_nodes.size() > 1 && argument_nodes[1]->getNodeType() == QueryTreeNodeType::CONSTANT
&& !static_cast<const ConstantNode *>(argument_nodes[1].get())->hasSourceExpression())
new_options.add_cast_for_constants = false;
const auto & parameters = getParameters();

View File

@ -490,8 +490,6 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
/// process_list_element_holder is used to make an element in ProcessList live while BACKUP is working asynchronously.
auto process_list_element = context_in_use->getProcessListElement();
/// Update context to preserve query information in processlist (settings, current_database)
process_list_element->updateContext(context_in_use);
thread_pool.scheduleOrThrowOnError(
[this,
@ -855,8 +853,6 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
/// process_list_element_holder is used to make an element in ProcessList live while RESTORE is working asynchronously.
auto process_list_element = context_in_use->getProcessListElement();
/// Update context to preserve query information in processlist (settings, current_database)
process_list_element->updateContext(context_in_use);
thread_pool.scheduleOrThrowOnError(
[this,

View File

@ -158,6 +158,8 @@ void ClientApplicationBase::init(int argc, char ** argv)
("config-file,C", po::value<std::string>(), "config-file path")
("proto_caps", po::value<std::string>(), "enable/disable chunked protocol: chunked_optional, notchunked, notchunked_optional, send_chunked, send_chunked_optional, send_notchunked, send_notchunked_optional, recv_chunked, recv_chunked_optional, recv_notchunked, recv_notchunked_optional")
("query,q", po::value<std::vector<std::string>>()->multitoken(), R"(Query. Can be specified multiple times (--query "SELECT 1" --query "SELECT 2") or once with multiple comma-separated queries (--query "SELECT 1; SELECT 2;"). In the latter case, INSERT queries with non-VALUE format must be separated by empty lines.)")
("queries-file", po::value<std::vector<std::string>>()->multitoken(), "file path with queries to execute; multiple files can be specified (--queries-file file1 file2...)")
("multiquery,n", "Obsolete, does nothing")
@ -337,6 +339,41 @@ void ClientApplicationBase::init(int argc, char ** argv)
if (options.count("server_logs_file"))
server_logs_file = options["server_logs_file"].as<std::string>();
if (options.count("proto_caps"))
{
std::string proto_caps_str = options["proto_caps"].as<std::string>();
std::vector<std::string_view> proto_caps;
splitInto<','>(proto_caps, proto_caps_str);
for (auto cap_str : proto_caps)
{
std::string direction;
if (cap_str.starts_with("send_"))
{
direction = "send";
cap_str = cap_str.substr(std::string_view("send_").size());
}
else if (cap_str.starts_with("recv_"))
{
direction = "recv";
cap_str = cap_str.substr(std::string_view("recv_").size());
}
if (cap_str != "chunked" && cap_str != "notchunked" && cap_str != "chunked_optional" && cap_str != "notchunked_optional")
throw Exception(ErrorCodes::BAD_ARGUMENTS, "proto_caps option is incorrect ({})", proto_caps_str);
if (direction.empty())
{
config().setString("proto_caps.send", std::string(cap_str));
config().setString("proto_caps.recv", std::string(cap_str));
}
else
config().setString("proto_caps." + direction, std::string(cap_str));
}
}
query_processing_stage = QueryProcessingStage::fromString(options["stage"].as<std::string>());
query_kind = parseQueryKind(options["query_kind"].as<std::string>());
profile_events.print = options.count("print-profile-events");

View File

@ -73,9 +73,11 @@
#include <limits>
#include <map>
#include <memory>
#include <string_view>
#include <unordered_map>
#include <Common/config_version.h>
#include <base/find_symbols.h>
#include "config.h"
#include <IO/ReadHelpers.h>
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
@ -914,6 +916,8 @@ void ClientBase::processTextAsSingleQuery(const String & full_query)
}
catch (Exception & e)
{
if (server_exception)
server_exception->rethrow();
if (!is_interactive)
e.addMessage("(in query: {})", full_query);
throw;
@ -1032,19 +1036,28 @@ void ClientBase::processOrdinaryQuery(const String & query_to_execute, ASTPtr pa
query_interrupt_handler.start(signals_before_stop);
SCOPE_EXIT({ query_interrupt_handler.stop(); });
connection->sendQuery(
connection_parameters.timeouts,
query,
query_parameters,
client_context->getCurrentQueryId(),
query_processing_stage,
&client_context->getSettingsRef(),
&client_context->getClientInfo(),
true,
[&](const Progress & progress) { onProgress(progress); });
try {
connection->sendQuery(
connection_parameters.timeouts,
query,
query_parameters,
client_context->getCurrentQueryId(),
query_processing_stage,
&client_context->getSettingsRef(),
&client_context->getClientInfo(),
true,
[&](const Progress & progress) { onProgress(progress); });
if (send_external_tables)
sendExternalTables(parsed_query);
}
catch (const NetException &)
{
// We still want to attempt to process whatever we already received or can receive (socket receive buffer can be not empty)
receiveResult(parsed_query, signals_before_stop, settings.partial_result_on_first_cancel);
throw;
}
if (send_external_tables)
sendExternalTables(parsed_query);
receiveResult(parsed_query, signals_before_stop, settings.partial_result_on_first_cancel);
break;

View File

@ -5,8 +5,6 @@
#include <Core/Settings.h>
#include <Compression/CompressedReadBuffer.h>
#include <Compression/CompressedWriteBuffer.h>
#include <IO/ReadBufferFromPocoSocket.h>
#include <IO/WriteBufferFromPocoSocket.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
@ -85,6 +83,7 @@ Connection::~Connection()
Connection::Connection(const String & host_, UInt16 port_,
const String & default_database_,
const String & user_, const String & password_,
const String & proto_send_chunked_, const String & proto_recv_chunked_,
[[maybe_unused]] const SSHKey & ssh_private_key_,
const String & jwt_,
const String & quota_key_,
@ -95,6 +94,7 @@ Connection::Connection(const String & host_, UInt16 port_,
Protocol::Secure secure_)
: host(host_), port(port_), default_database(default_database_)
, user(user_), password(password_)
, proto_send_chunked(proto_send_chunked_), proto_recv_chunked(proto_recv_chunked_)
#if USE_SSH
, ssh_private_key(ssh_private_key_)
#endif
@ -211,10 +211,10 @@ void Connection::connect(const ConnectionTimeouts & timeouts)
, tcp_keep_alive_timeout_in_sec);
}
in = std::make_shared<ReadBufferFromPocoSocket>(*socket);
in = std::make_shared<ReadBufferFromPocoSocketChunked>(*socket);
in->setAsyncCallback(async_callback);
out = std::make_shared<WriteBufferFromPocoSocket>(*socket);
out = std::make_shared<WriteBufferFromPocoSocketChunked>(*socket);
out->setAsyncCallback(async_callback);
connected = true;
setDescription();
@ -222,9 +222,61 @@ void Connection::connect(const ConnectionTimeouts & timeouts)
sendHello();
receiveHello(timeouts.handshake_timeout);
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_CHUNKED_PACKETS)
{
/// Client side of chunked protocol negotiation.
/// Server advertises its protocol capabilities (separate for send and receive channels) by sending
/// in its 'Hello' response one of four types - chunked, notchunked, chunked_optional, notchunked_optional.
/// Not optional types are strict meaning that server only supports this type, optional means that
/// server prefer this type but capable to work in opposite.
/// Client selects which type it is going to communicate based on the settings from config or arguments,
/// and sends either "chunked" or "notchunked" protocol request in addendum section of handshake.
/// Client can detect if server's protocol capabilities are not compatible with client's settings (for example
/// server strictly requires chunked protocol but client's settings only allows notchunked protocol) - in such case
/// client should interrupt this connection. However if client continues with incompatible protocol type request, server
/// will send appropriate exception and disconnect client.
auto is_chunked = [](const String & chunked_srv_str, const String & chunked_cl_str, const String & direction)
{
bool chunked_srv = chunked_srv_str.starts_with("chunked");
bool optional_srv = chunked_srv_str.ends_with("_optional");
bool chunked_cl = chunked_cl_str.starts_with("chunked");
bool optional_cl = chunked_cl_str.ends_with("_optional");
if (optional_srv)
return chunked_cl;
if (optional_cl)
return chunked_srv;
if (chunked_cl != chunked_srv)
throw NetException(
ErrorCodes::NETWORK_ERROR,
"Incompatible protocol: {} set to {}, server requires {}",
direction,
chunked_cl ? "chunked" : "notchunked",
chunked_srv ? "chunked" : "notchunked");
return chunked_srv;
};
proto_send_chunked = is_chunked(proto_recv_chunked_srv, proto_send_chunked, "send") ? "chunked" : "notchunked";
proto_recv_chunked = is_chunked(proto_send_chunked_srv, proto_recv_chunked, "recv") ? "chunked" : "notchunked";
}
else
{
if (proto_send_chunked == "chunked" || proto_recv_chunked == "chunked")
throw NetException(
ErrorCodes::NETWORK_ERROR,
"Incompatible protocol: server's version is too old and doesn't support chunked protocol while client settings require it.");
}
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_ADDENDUM)
sendAddendum();
if (proto_send_chunked == "chunked")
out->enableChunked();
if (proto_recv_chunked == "chunked")
in->enableChunked();
LOG_TRACE(log_wrapper.get(), "Connected to {} server version {}.{}.{}.",
server_name, server_version_major, server_version_minor, server_version_patch);
}
@ -393,6 +445,13 @@ void Connection::sendAddendum()
{
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_QUOTA_KEY)
writeStringBinary(quota_key, *out);
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_CHUNKED_PACKETS)
{
writeStringBinary(proto_send_chunked, *out);
writeStringBinary(proto_recv_chunked, *out);
}
out->next();
}
@ -472,6 +531,12 @@ void Connection::receiveHello(const Poco::Timespan & handshake_timeout)
else
server_version_patch = server_revision;
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_CHUNKED_PACKETS)
{
readStringBinary(proto_send_chunked_srv, *in);
readStringBinary(proto_recv_chunked_srv, *in);
}
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_PASSWORD_COMPLEXITY_RULES)
{
UInt64 rules_size;
@ -611,6 +676,7 @@ bool Connection::ping(const ConnectionTimeouts & timeouts)
UInt64 pong = 0;
writeVarUInt(Protocol::Client::Ping, *out);
out->finishChunk();
out->next();
if (in->eof())
@ -660,6 +726,7 @@ TablesStatusResponse Connection::getTablesStatus(const ConnectionTimeouts & time
writeVarUInt(Protocol::Client::TablesStatusRequest, *out);
request.write(*out, server_revision);
out->finishChunk();
out->next();
UInt64 response_type = 0;
@ -813,6 +880,8 @@ void Connection::sendQuery(
block_profile_events_in.reset();
block_out.reset();
out->finishChunk();
/// Send empty block which means end of data.
if (!with_pending_data)
{
@ -829,6 +898,7 @@ void Connection::sendCancel()
return;
writeVarUInt(Protocol::Client::Cancel, *out);
out->finishChunk();
out->next();
}
@ -854,7 +924,10 @@ void Connection::sendData(const Block & block, const String & name, bool scalar)
size_t prev_bytes = out->count();
block_out->write(block);
maybe_compressed_out->next();
if (maybe_compressed_out != out)
maybe_compressed_out->next();
if (!block)
out->finishChunk();
out->next();
if (throttler)
@ -865,6 +938,7 @@ void Connection::sendIgnoredPartUUIDs(const std::vector<UUID> & uuids)
{
writeVarUInt(Protocol::Client::IgnoredPartUUIDs, *out);
writeVectorBinary(uuids, *out);
out->finishChunk();
out->next();
}
@ -874,6 +948,7 @@ void Connection::sendReadTaskResponse(const String & response)
writeVarUInt(Protocol::Client::ReadTaskResponse, *out);
writeVarUInt(DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION, *out);
writeStringBinary(response, *out);
out->finishChunk();
out->next();
}
@ -882,6 +957,7 @@ void Connection::sendMergeTreeReadTaskResponse(const ParallelReadResponse & resp
{
writeVarUInt(Protocol::Client::MergeTreeReadTaskResponse, *out);
response.serialize(*out);
out->finishChunk();
out->next();
}
@ -899,6 +975,8 @@ void Connection::sendPreparedData(ReadBuffer & input, size_t size, const String
copyData(input, *out);
else
copyData(input, *out, size);
out->finishChunk();
out->next();
}
@ -927,6 +1005,8 @@ void Connection::sendScalarsData(Scalars & data)
sendData(elem.second, elem.first, true /* scalar */);
}
out->finishChunk();
out_bytes = out->count() - out_bytes;
maybe_compressed_out_bytes = maybe_compressed_out->count() - maybe_compressed_out_bytes;
double elapsed = watch.elapsedSeconds();
@ -1069,13 +1149,13 @@ std::optional<Poco::Net::SocketAddress> Connection::getResolvedAddress() const
bool Connection::poll(size_t timeout_microseconds)
{
return static_cast<ReadBufferFromPocoSocket &>(*in).poll(timeout_microseconds);
return in->poll(timeout_microseconds);
}
bool Connection::hasReadPendingData() const
{
return last_input_packet_type.has_value() || static_cast<const ReadBufferFromPocoSocket &>(*in).hasPendingData();
return last_input_packet_type.has_value() || in->hasBufferedData();
}
@ -1349,6 +1429,8 @@ ServerConnectionPtr Connection::createConnection(const ConnectionParameters & pa
parameters.default_database,
parameters.user,
parameters.password,
parameters.proto_send_chunked,
parameters.proto_recv_chunked,
parameters.ssh_private_key,
parameters.jwt,
parameters.quota_key,

View File

@ -8,8 +8,8 @@
#include <Core/Defines.h>
#include <IO/ReadBufferFromPocoSocket.h>
#include <IO/WriteBufferFromPocoSocket.h>
#include <IO/ReadBufferFromPocoSocketChunked.h>
#include <IO/WriteBufferFromPocoSocketChunked.h>
#include <Interpreters/TablesStatus.h>
#include <Interpreters/Context_fwd.h>
@ -52,6 +52,7 @@ public:
Connection(const String & host_, UInt16 port_,
const String & default_database_,
const String & user_, const String & password_,
const String & proto_send_chunked_, const String & proto_recv_chunked_,
const SSHKey & ssh_private_key_,
const String & jwt_,
const String & quota_key_,
@ -170,6 +171,10 @@ private:
String default_database;
String user;
String password;
String proto_send_chunked;
String proto_recv_chunked;
String proto_send_chunked_srv;
String proto_recv_chunked_srv;
#if USE_SSH
SSHKey ssh_private_key;
#endif
@ -209,8 +214,8 @@ private:
String server_display_name;
std::unique_ptr<Poco::Net::StreamSocket> socket;
std::shared_ptr<ReadBufferFromPocoSocket> in;
std::shared_ptr<WriteBufferFromPocoSocket> out;
std::shared_ptr<ReadBufferFromPocoSocketChunked> in;
std::shared_ptr<WriteBufferFromPocoSocketChunked> out;
std::optional<UInt64> last_input_packet_type;
String query_id;

View File

@ -107,6 +107,9 @@ ConnectionParameters::ConnectionParameters(const Poco::Util::AbstractConfigurati
}
}
proto_send_chunked = config.getString("proto_caps.send", "notchunked");
proto_recv_chunked = config.getString("proto_caps.recv", "notchunked");
quota_key = config.getString("quota_key", "");
/// By default compression is disabled if address looks like localhost.

View File

@ -20,6 +20,8 @@ struct ConnectionParameters
std::string default_database;
std::string user;
std::string password;
std::string proto_send_chunked = "notchunked";
std::string proto_recv_chunked = "notchunked";
std::string quota_key;
SSHKey ssh_private_key;
std::string jwt;

View File

@ -13,6 +13,8 @@ ConnectionPoolPtr ConnectionPoolFactory::get(
String default_database,
String user,
String password,
String proto_send_chunked,
String proto_recv_chunked,
String quota_key,
String cluster,
String cluster_secret,
@ -22,7 +24,7 @@ ConnectionPoolPtr ConnectionPoolFactory::get(
Priority priority)
{
Key key{
max_connections, host, port, default_database, user, password, quota_key, cluster, cluster_secret, client_name, compression, secure, priority};
max_connections, host, port, default_database, user, password, proto_send_chunked, proto_recv_chunked, quota_key, cluster, cluster_secret, client_name, compression, secure, priority};
std::lock_guard lock(mutex);
auto [it, inserted] = pools.emplace(key, ConnectionPoolPtr{});
@ -39,6 +41,8 @@ ConnectionPoolPtr ConnectionPoolFactory::get(
default_database,
user,
password,
proto_send_chunked,
proto_recv_chunked,
quota_key,
cluster,
cluster_secret,

View File

@ -73,6 +73,8 @@ public:
const String & default_database_,
const String & user_,
const String & password_,
const String & proto_send_chunked_,
const String & proto_recv_chunked_,
const String & quota_key_,
const String & cluster_,
const String & cluster_secret_,
@ -85,6 +87,8 @@ public:
, default_database(default_database_)
, user(user_)
, password(password_)
, proto_send_chunked(proto_send_chunked_)
, proto_recv_chunked(proto_recv_chunked_)
, quota_key(quota_key_)
, cluster(cluster_)
, cluster_secret(cluster_secret_)
@ -116,7 +120,9 @@ protected:
{
return std::make_shared<Connection>(
host, port,
default_database, user, password, SSHKey(), /*jwt*/ "", quota_key,
default_database, user, password,
proto_send_chunked, proto_recv_chunked,
SSHKey(), /*jwt*/ "", quota_key,
cluster, cluster_secret,
client_name, compression, secure);
}
@ -125,6 +131,8 @@ private:
String default_database;
String user;
String password;
String proto_send_chunked;
String proto_recv_chunked;
String quota_key;
/// For inter-server authorization
@ -150,6 +158,8 @@ public:
String default_database;
String user;
String password;
String proto_send_chunked;
String proto_recv_chunked;
String quota_key;
String cluster;
String cluster_secret;
@ -173,6 +183,8 @@ public:
String default_database,
String user,
String password,
String proto_send_chunked,
String proto_recv_chunked,
String quota_key,
String cluster,
String cluster_secret,
@ -190,6 +202,7 @@ inline bool operator==(const ConnectionPoolFactory::Key & lhs, const ConnectionP
{
return lhs.max_connections == rhs.max_connections && lhs.host == rhs.host && lhs.port == rhs.port
&& lhs.default_database == rhs.default_database && lhs.user == rhs.user && lhs.password == rhs.password
&& lhs.proto_send_chunked == rhs.proto_send_chunked && lhs.proto_recv_chunked == rhs.proto_recv_chunked
&& lhs.quota_key == rhs.quota_key
&& lhs.cluster == rhs.cluster && lhs.cluster_secret == rhs.cluster_secret && lhs.client_name == rhs.client_name
&& lhs.compression == rhs.compression && lhs.secure == rhs.secure && lhs.priority == rhs.priority;

View File

@ -46,8 +46,8 @@ public:
return Base::create(std::move(column_unique), std::move(indexes), is_shared);
}
std::string getName() const override { return "ColumnLowCardinality"; }
const char * getFamilyName() const override { return "ColumnLowCardinality"; }
std::string getName() const override { return "LowCardinality(" + getDictionary().getNestedColumn()->getName() + ")"; }
const char * getFamilyName() const override { return "LowCardinality"; }
TypeIndex getDataType() const override { return TypeIndex::LowCardinality; }
ColumnPtr convertToFullColumn() const { return getDictionary().getNestedColumn()->index(getIndexes(), 0); }

View File

@ -48,6 +48,8 @@ private:
ColumnUnique(const ColumnUnique & other);
public:
std::string getName() const override { return "Unique(" + getNestedColumn()->getName() + ")"; }
MutableColumnPtr cloneEmpty() const override;
const ColumnPtr & getNestedColumn() const override;

View File

@ -73,7 +73,7 @@ public:
/// Returns dictionary hash which is SipHash is applied to each row of nested column.
virtual UInt128 getHash() const = 0;
const char * getFamilyName() const override { return "ColumnUnique"; }
const char * getFamilyName() const override { return "Unique"; }
TypeIndex getDataType() const override { return getNestedColumn()->getDataType(); }
void insert(const Field &) override

View File

@ -10,7 +10,7 @@ TEST(IColumn, dumpStructure)
{
auto type_lc = std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>());
ColumnPtr column_lc = type_lc->createColumn();
String expected_structure = "ColumnLowCardinality(size = 0, UInt8(size = 0), ColumnUnique(size = 1, String(size = 1)))";
String expected_structure = "LowCardinality(size = 0, UInt8(size = 0), Unique(size = 1, String(size = 1)))";
std::vector<std::thread> threads;
for (size_t i = 0; i < 6; ++i)

View File

@ -1,18 +1,24 @@
#include <Common/formatReadable.h>
#include <Common/AsynchronousMetrics.h>
#include <Common/Exception.h>
#include <Common/setThreadName.h>
#include <Common/CurrentMetrics.h>
#include <Common/filesystemHelpers.h>
#include <Common/logger_useful.h>
#include <IO/UncompressedCache.h>
#include <IO/MMappedFileCache.h>
#include <IO/ReadHelpers.h>
#include <IO/UncompressedCache.h>
#include <base/cgroupsv2.h>
#include <base/errnoToString.h>
#include <base/find_symbols.h>
#include <base/getPageSize.h>
#include <sys/resource.h>
#include <Common/CurrentMetrics.h>
#include <Common/Exception.h>
#include <Common/filesystemHelpers.h>
#include <Common/formatReadable.h>
#include <Common/logger_useful.h>
#include <Common/setThreadName.h>
#include <boost/locale/date_time_facet.hpp>
#include <chrono>
#include <string_view>
#include "config.h"
@ -52,6 +58,12 @@ static std::unique_ptr<ReadBufferFromFilePRead> openFileIfExists(const std::stri
return {};
}
static void openCgroupv2MetricFile(const std::string & filename, std::optional<ReadBufferFromFilePRead> & out)
{
if (auto path = getCgroupsV2PathContainingFile(filename))
openFileIfExists((path.value() + filename).c_str(), out);
};
#endif
@ -63,21 +75,15 @@ AsynchronousMetrics::AsynchronousMetrics(
, protocol_server_metrics_func(protocol_server_metrics_func_)
{
#if defined(OS_LINUX)
openFileIfExists("/proc/meminfo", meminfo);
openFileIfExists("/proc/loadavg", loadavg);
openFileIfExists("/proc/stat", proc_stat);
openFileIfExists("/proc/cpuinfo", cpuinfo);
openFileIfExists("/proc/sys/fs/file-nr", file_nr);
openFileIfExists("/proc/uptime", uptime);
openFileIfExists("/proc/net/dev", net_dev);
/// CGroups v2
openFileIfExists("/sys/fs/cgroup/memory.max", cgroupmem_limit_in_bytes);
if (cgroupmem_limit_in_bytes)
{
openFileIfExists("/sys/fs/cgroup/memory.current", cgroupmem_usage_in_bytes);
}
openFileIfExists("/sys/fs/cgroup/cpu.max", cgroupcpu_max);
openCgroupv2MetricFile("memory.max", cgroupmem_limit_in_bytes);
openCgroupv2MetricFile("memory.current", cgroupmem_usage_in_bytes);
openCgroupv2MetricFile("cpu.max", cgroupcpu_max);
openCgroupv2MetricFile("cpu.stat", cgroupcpu_stat);
/// CGroups v1
if (!cgroupmem_limit_in_bytes)
@ -90,6 +96,21 @@ AsynchronousMetrics::AsynchronousMetrics(
openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_period_us", cgroupcpu_cfs_period);
openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_quota_us", cgroupcpu_cfs_quota);
}
if (!cgroupcpu_stat)
openFileIfExists("/sys/fs/cgroup/cpuacct/cpuacct.stat", cgroupcpuacct_stat);
if (!cgroupcpu_stat && !cgroupcpuacct_stat)
{
/// The following metrics are not cgroup-aware and we've found cgroup-specific metric files for the similar metrics,
/// so we're better not reporting them at all to avoid confusion
openFileIfExists("/proc/loadavg", loadavg);
openFileIfExists("/proc/stat", proc_stat);
openFileIfExists("/proc/uptime", uptime);
}
/// The same story for memory metrics
if (!cgroupmem_limit_in_bytes)
openFileIfExists("/proc/meminfo", meminfo);
openFileIfExists("/proc/sys/vm/max_map_count", vm_max_map_count);
openFileIfExists("/proc/self/maps", vm_maps);
@ -570,6 +591,151 @@ AsynchronousMetrics::NetworkInterfaceStatValues::operator-(const AsynchronousMet
#endif
#if defined(OS_LINUX)
void AsynchronousMetrics::applyCPUMetricsUpdate(
AsynchronousMetricValues & new_values, const std::string & cpu_suffix, const ProcStatValuesCPU & delta_values, double multiplier)
{
new_values["OSUserTime" + cpu_suffix]
= {delta_values.user * multiplier,
"The ratio of time the CPU core was running userspace code. This is a system-wide metric, it includes all the processes on the "
"host machine, not just clickhouse-server."
" This includes also the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline "
"stalls, branch mispredictions, running another SMT core)."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSNiceTime" + cpu_suffix]
= {delta_values.nice * multiplier,
"The ratio of time the CPU core was running userspace code with higher priority. This is a system-wide metric, it includes all "
"the processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSSystemTime" + cpu_suffix]
= {delta_values.system * multiplier,
"The ratio of time the CPU core was running OS kernel (system) code. This is a system-wide metric, it includes all the "
"processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSIdleTime" + cpu_suffix]
= {delta_values.idle * multiplier,
"The ratio of time the CPU core was idle (not even ready to run a process waiting for IO) from the OS kernel standpoint. This "
"is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This does not include the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline "
"stalls, branch mispredictions, running another SMT core)."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSIOWaitTime" + cpu_suffix]
= {delta_values.iowait * multiplier,
"The ratio of time the CPU core was not running the code but when the OS kernel did not run any other process on this CPU as "
"the processes were waiting for IO. This is a system-wide metric, it includes all the processes on the host machine, not just "
"clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSIrqTime" + cpu_suffix]
= {delta_values.irq * multiplier,
"The ratio of time spent for running hardware interrupt requests on the CPU. This is a system-wide metric, it includes all the "
"processes on the host machine, not just clickhouse-server."
" A high number of this metric may indicate hardware misconfiguration or a very high network load."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSSoftIrqTime" + cpu_suffix]
= {delta_values.softirq * multiplier,
"The ratio of time spent for running software interrupt requests on the CPU. This is a system-wide metric, it includes all the "
"processes on the host machine, not just clickhouse-server."
" A high number of this metric may indicate inefficient software running on the system."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSStealTime" + cpu_suffix]
= {delta_values.steal * multiplier,
"The ratio of time spent in other operating systems by the CPU when running in a virtualized environment. This is a system-wide "
"metric, it includes all the processes on the host machine, not just clickhouse-server."
" Not every virtualized environments present this metric, and most of them don't."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSGuestTime" + cpu_suffix]
= {delta_values.guest * multiplier,
"The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel (See `man "
"procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This metric is irrelevant for ClickHouse, but still exists for completeness."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSGuestNiceTime" + cpu_suffix]
= {delta_values.guest_nice * multiplier,
"The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel, when a guest "
"was set to a higher priority (See `man procfs`). This is a system-wide metric, it includes all the processes on the host "
"machine, not just clickhouse-server."
" This metric is irrelevant for ClickHouse, but still exists for completeness."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
}
void AsynchronousMetrics::applyNormalizedCPUMetricsUpdate(
AsynchronousMetricValues & new_values, double num_cpus_to_normalize, const ProcStatValuesCPU & delta_values_all_cpus, double multiplier)
{
chassert(num_cpus_to_normalize);
new_values["OSUserTimeNormalized"]
= {delta_values_all_cpus.user * multiplier / num_cpus_to_normalize,
"The value is similar to `OSUserTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSNiceTimeNormalized"]
= {delta_values_all_cpus.nice * multiplier / num_cpus_to_normalize,
"The value is similar to `OSNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSSystemTimeNormalized"]
= {delta_values_all_cpus.system * multiplier / num_cpus_to_normalize,
"The value is similar to `OSSystemTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSIdleTimeNormalized"]
= {delta_values_all_cpus.idle * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIdleTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSIOWaitTimeNormalized"]
= {delta_values_all_cpus.iowait * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIOWaitTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSIrqTimeNormalized"]
= {delta_values_all_cpus.irq * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of "
"the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSSoftIrqTimeNormalized"]
= {delta_values_all_cpus.softirq * multiplier / num_cpus_to_normalize,
"The value is similar to `OSSoftIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval "
"regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSStealTimeNormalized"]
= {delta_values_all_cpus.steal * multiplier / num_cpus_to_normalize,
"The value is similar to `OSStealTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSGuestTimeNormalized"]
= {delta_values_all_cpus.guest * multiplier / num_cpus_to_normalize,
"The value is similar to `OSGuestTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSGuestNiceTimeNormalized"]
= {delta_values_all_cpus.guest_nice * multiplier / num_cpus_to_normalize,
"The value is similar to `OSGuestNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval "
"regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
}
#endif
void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
{
Stopwatch watch;
@ -831,7 +997,68 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
new_values["CGroupMaxCPU"] = { max_cpu_cgroups, "The maximum number of CPU cores according to CGroups."};
}
if (proc_stat)
if (cgroupcpu_stat || cgroupcpuacct_stat)
{
try
{
ReadBufferFromFilePRead & in = cgroupcpu_stat ? *cgroupcpu_stat : *cgroupcpuacct_stat;
ProcStatValuesCPU current_values{};
/// We re-read the file from the beginning each time
in.rewind();
while (!in.eof())
{
String name;
readStringUntilWhitespace(name, in);
skipWhitespaceIfAny(in);
/// `user_usec` for cgroup v2 and `user` for cgroup v1
if (name.starts_with("user"))
{
readText(current_values.user, in);
skipToNextLineOrEOF(in);
}
/// `system_usec` for cgroup v2 and `system` for cgroup v1
else if (name.starts_with("system"))
{
readText(current_values.system, in);
skipToNextLineOrEOF(in);
}
else
skipToNextLineOrEOF(in);
}
if (!first_run)
{
auto get_clock_ticks = [&]()
{
if (auto hz = sysconf(_SC_CLK_TCK); hz != -1)
return hz;
else
throw ErrnoException(ErrorCodes::CANNOT_SYSCONF, "Cannot call 'sysconf' to obtain system HZ");
};
const auto cgroup_version_specific_divisor = cgroupcpu_stat ? 1e6 : get_clock_ticks();
const double multiplier = 1.0 / cgroup_version_specific_divisor
/ (std::chrono::duration_cast<std::chrono::nanoseconds>(time_since_previous_update).count() / 1e9);
const ProcStatValuesCPU delta_values = current_values - proc_stat_values_all_cpus;
applyCPUMetricsUpdate(new_values, /*cpu_suffix=*/"", delta_values, multiplier);
if (max_cpu_cgroups > 0)
applyNormalizedCPUMetricsUpdate(new_values, max_cpu_cgroups, delta_values, multiplier);
}
proc_stat_values_all_cpus = current_values;
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
openCgroupv2MetricFile("cpu.stat", cgroupcpu_stat);
if (!cgroupcpu_stat)
openFileIfExists("/sys/fs/cgroup/cpuacct/cpuacct.stat", cgroupcpuacct_stat);
}
}
else if (proc_stat)
{
try
{
@ -886,43 +1113,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
else
delta_values_all_cpus = delta_values;
new_values["OSUserTime" + cpu_suffix] = { delta_values.user * multiplier,
"The ratio of time the CPU core was running userspace code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This includes also the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSNiceTime" + cpu_suffix] = { delta_values.nice * multiplier,
"The ratio of time the CPU core was running userspace code with higher priority. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSSystemTime" + cpu_suffix] = { delta_values.system * multiplier,
"The ratio of time the CPU core was running OS kernel (system) code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSIdleTime" + cpu_suffix] = { delta_values.idle * multiplier,
"The ratio of time the CPU core was idle (not even ready to run a process waiting for IO) from the OS kernel standpoint. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This does not include the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSIOWaitTime" + cpu_suffix] = { delta_values.iowait * multiplier,
"The ratio of time the CPU core was not running the code but when the OS kernel did not run any other process on this CPU as the processes were waiting for IO. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSIrqTime" + cpu_suffix] = { delta_values.irq * multiplier,
"The ratio of time spent for running hardware interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" A high number of this metric may indicate hardware misconfiguration or a very high network load."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSSoftIrqTime" + cpu_suffix] = { delta_values.softirq * multiplier,
"The ratio of time spent for running software interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" A high number of this metric may indicate inefficient software running on the system."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSStealTime" + cpu_suffix] = { delta_values.steal * multiplier,
"The ratio of time spent in other operating systems by the CPU when running in a virtualized environment. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" Not every virtualized environments present this metric, and most of them don't."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSGuestTime" + cpu_suffix] = { delta_values.guest * multiplier,
"The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This metric is irrelevant for ClickHouse, but still exists for completeness."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSGuestNiceTime" + cpu_suffix] = { delta_values.guest_nice * multiplier,
"The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel, when a guest was set to a higher priority (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This metric is irrelevant for ClickHouse, but still exists for completeness."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
applyCPUMetricsUpdate(new_values, cpu_suffix, delta_values, multiplier);
}
prev_values = current_values;
@ -978,38 +1169,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
Float64 num_cpus_to_normalize = max_cpu_cgroups > 0 ? max_cpu_cgroups : num_cpus;
if (num_cpus_to_normalize > 0)
{
new_values["OSUserTimeNormalized"] = { delta_values_all_cpus.user * multiplier / num_cpus_to_normalize,
"The value is similar to `OSUserTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSNiceTimeNormalized"] = { delta_values_all_cpus.nice * multiplier / num_cpus_to_normalize,
"The value is similar to `OSNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSSystemTimeNormalized"] = { delta_values_all_cpus.system * multiplier / num_cpus_to_normalize,
"The value is similar to `OSSystemTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSIdleTimeNormalized"] = { delta_values_all_cpus.idle * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIdleTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSIOWaitTimeNormalized"] = { delta_values_all_cpus.iowait * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIOWaitTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSIrqTimeNormalized"] = { delta_values_all_cpus.irq * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSSoftIrqTimeNormalized"] = { delta_values_all_cpus.softirq * multiplier / num_cpus_to_normalize,
"The value is similar to `OSSoftIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSStealTimeNormalized"] = { delta_values_all_cpus.steal * multiplier / num_cpus_to_normalize,
"The value is similar to `OSStealTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSGuestTimeNormalized"] = { delta_values_all_cpus.guest * multiplier / num_cpus_to_normalize,
"The value is similar to `OSGuestTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSGuestNiceTimeNormalized"] = { delta_values_all_cpus.guest_nice * multiplier / num_cpus_to_normalize,
"The value is similar to `OSGuestNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
}
applyNormalizedCPUMetricsUpdate(new_values, num_cpus_to_normalize, delta_values_all_cpus, multiplier);
}
proc_stat_values_other = current_other_values;
@ -1042,8 +1202,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
if (meminfo)
else if (meminfo)
{
try
{

View File

@ -126,6 +126,8 @@ private:
std::optional<ReadBufferFromFilePRead> cgroupcpu_cfs_period TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpu_cfs_quota TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpu_max TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpu_stat TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpuacct_stat TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> vm_max_map_count TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> vm_maps TSA_GUARDED_BY(data_mutex);
@ -221,6 +223,16 @@ private:
void openBlockDevices();
void openSensorsChips();
void openEDAC();
void applyCPUMetricsUpdate(
AsynchronousMetricValues & new_values, const std::string & cpu_suffix, const ProcStatValuesCPU & delta_values, double multiplier);
void applyNormalizedCPUMetricsUpdate(
AsynchronousMetricValues & new_values,
double num_cpus_to_normalize,
const ProcStatValuesCPU & delta_values_all_cpus,
double multiplier);
#endif
void run();

View File

@ -144,31 +144,6 @@ private:
/// - I did not test what happens if a host has v1 and v2 simultaneously enabled. I believe such
/// systems existed only for a short transition period.
std::optional<std::string> getCgroupsV2Path()
{
if (!cgroupsV2Enabled())
return {};
if (!cgroupsV2MemoryControllerEnabled())
return {};
fs::path current_cgroup = cgroupV2PathOfProcess();
if (current_cgroup.empty())
return {};
/// Return the bottom-most nested current memory file. If there is no such file at the current
/// level, try again at the parent level as memory settings are inherited.
while (current_cgroup != default_cgroups_mount.parent_path())
{
const auto current_path = current_cgroup / "memory.current";
const auto stat_path = current_cgroup / "memory.stat";
if (fs::exists(current_path) && fs::exists(stat_path))
return {current_cgroup};
current_cgroup = current_cgroup.parent_path();
}
return {};
}
std::optional<std::string> getCgroupsV1Path()
{
auto path = default_cgroups_mount / "memory/memory.stat";
@ -179,7 +154,7 @@ std::optional<std::string> getCgroupsV1Path()
std::pair<std::string, CgroupsMemoryUsageObserver::CgroupsVersion> getCgroupsPath()
{
auto v2_path = getCgroupsV2Path();
auto v2_path = getCgroupsV2PathContainingFile("memory.current");
if (v2_path.has_value())
return {*v2_path, CgroupsMemoryUsageObserver::CgroupsVersion::V2};

View File

@ -307,7 +307,7 @@
M(FilteringMarksWithPrimaryKey, "Number of threads currently doing filtering of mark ranges by the primary key") \
M(FilteringMarksWithSecondaryKeys, "Number of threads currently doing filtering of mark ranges by secondary keys") \
\
M(S3DiskNoKeyErrors, "The number of `NoSuchKey` errors that occur when reading data from S3 cloud storage through ClickHouse disks.") \
M(DiskS3NoSuchKeyErrors, "The number of `NoSuchKey` errors that occur when reading data from S3 cloud storage through ClickHouse disks.") \
#ifdef APPLY_FOR_EXTERNAL_METRICS
#define APPLY_FOR_METRICS(M) APPLY_FOR_BUILTIN_METRICS(M) APPLY_FOR_EXTERNAL_METRICS(M)

View File

@ -244,33 +244,43 @@ const char * analyzeImpl(
is_trivial = false;
if (!in_square_braces)
{
/// Check for case-insensitive flag.
if (pos + 1 < end && pos[1] == '?')
/// it means flag negation
/// there are various possible flags
/// actually only imsU are supported by re2
auto is_flag_char = [](char x)
{
for (size_t offset = 2; pos + offset < end; ++offset)
return x == '-' || x == 'i' || x == 'm' || x == 's' || x == 'U' || x == 'u';
};
/// Check for case-insensitive flag.
if (pos + 2 < end && pos[1] == '?' && is_flag_char(pos[2]))
{
size_t offset = 2;
for (; pos + offset < end; ++offset)
{
if (pos[offset] == '-' /// it means flag negation
/// various possible flags, actually only imsU are supported by re2
|| (pos[offset] >= 'a' && pos[offset] <= 'z')
|| (pos[offset] >= 'A' && pos[offset] <= 'Z'))
if (pos[offset] == 'i')
{
if (pos[offset] == 'i')
{
/// Actually it can be negated case-insensitive flag. But we don't care.
has_case_insensitive_flag = true;
break;
}
/// Actually it can be negated case-insensitive flag. But we don't care.
has_case_insensitive_flag = true;
}
else
else if (!is_flag_char(pos[offset]))
break;
}
pos += offset;
if (pos == end)
return pos;
/// if this group only contains flags, we have nothing to do.
if (*pos == ')')
{
++pos;
break;
}
}
/// (?:regex) means non-capturing parentheses group
if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
else if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
{
pos += 2;
}
if (pos + 3 < end && pos[1] == '?' && (pos[2] == '<' || pos[2] == '\'' || (pos[2] == 'P' && pos[3] == '<')))
else if (pos + 3 < end && pos[1] == '?' && (pos[2] == '<' || pos[2] == '\'' || (pos[2] == 'P' && pos[3] == '<')))
{
pos = skipNameCapturingGroup(pos, pos[2] == 'P' ? 3: 2, end);
}

View File

@ -209,8 +209,35 @@
\
M(Merge, "Number of launched background merges.") \
M(MergedRows, "Rows read for background merges. This is the number of rows before merge.") \
M(MergedColumns, "Number of columns merged during the horizontal stage of merges.") \
M(GatheredColumns, "Number of columns gathered during the vertical stage of merges.") \
M(MergedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for background merges. This is the number before merge.") \
M(MergesTimeMilliseconds, "Total time spent for background merges.")\
M(MergeTotalMilliseconds, "Total time spent for background merges") \
M(MergeExecuteMilliseconds, "Total busy time spent for execution of background merges") \
M(MergeHorizontalStageTotalMilliseconds, "Total time spent for horizontal stage of background merges") \
M(MergeHorizontalStageExecuteMilliseconds, "Total busy time spent for execution of horizontal stage of background merges") \
M(MergeVerticalStageTotalMilliseconds, "Total time spent for vertical stage of background merges") \
M(MergeVerticalStageExecuteMilliseconds, "Total busy time spent for execution of vertical stage of background merges") \
M(MergeProjectionStageTotalMilliseconds, "Total time spent for projection stage of background merges") \
M(MergeProjectionStageExecuteMilliseconds, "Total busy time spent for execution of projection stage of background merges") \
\
M(MergingSortedMilliseconds, "Total time spent while merging sorted columns") \
M(AggregatingSortedMilliseconds, "Total time spent while aggregating sorted columns") \
M(CollapsingSortedMilliseconds, "Total time spent while collapsing sorted columns") \
M(ReplacingSortedMilliseconds, "Total time spent while replacing sorted columns") \
M(SummingSortedMilliseconds, "Total time spent while summing sorted columns") \
M(VersionedCollapsingSortedMilliseconds, "Total time spent while version collapsing sorted columns") \
M(GatheringColumnMilliseconds, "Total time spent while gathering columns for vertical merge") \
\
M(MutationTotalParts, "Number of total parts for which mutations tried to be applied") \
M(MutationUntouchedParts, "Number of total parts for which mutations tried to be applied but which was completely skipped according to predicate") \
M(MutatedRows, "Rows read for mutations. This is the number of rows before mutation") \
M(MutatedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for mutations. This is the number before mutation.") \
M(MutationTotalMilliseconds, "Total time spent for mutations.") \
M(MutationExecuteMilliseconds, "Total busy time spent for execution of mutations.") \
M(MutationAllPartColumns, "Number of times when task to mutate all columns in part was created") \
M(MutationSomePartColumns, "Number of times when task to mutate some columns in part was created") \
M(MutateTaskProjectionsCalculationMicroseconds, "Time spent calculating projections in mutations.") \
\
M(MergeTreeDataWriterRows, "Number of rows INSERTed to MergeTree tables.") \
M(MergeTreeDataWriterUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) INSERTed to MergeTree tables.") \
@ -225,7 +252,6 @@
M(MergeTreeDataWriterProjectionsCalculationMicroseconds, "Time spent calculating projections") \
M(MergeTreeDataProjectionWriterSortingBlocksMicroseconds, "Time spent sorting blocks (for projection it might be a key different from table's sorting key)") \
M(MergeTreeDataProjectionWriterMergingBlocksMicroseconds, "Time spent merging blocks") \
M(MutateTaskProjectionsCalculationMicroseconds, "Time spent calculating projections") \
\
M(InsertedWideParts, "Number of parts inserted in Wide format.") \
M(InsertedCompactParts, "Number of parts inserted in Compact format.") \

View File

@ -184,14 +184,20 @@ void DynamicResourceManager::updateConfiguration(const Poco::Util::AbstractConfi
// Resource update leads to loss of runtime data of nodes and may lead to temporary violation of constraints (e.g. limits)
// Try to minimise this by reusing "equal" resources (initialized with the same configuration).
std::vector<State::ResourcePtr> resources_to_attach;
for (auto & [name, new_resource] : new_state->resources)
{
if (auto iter = state->resources.find(name); iter != state->resources.end()) // Resource update
{
State::ResourcePtr old_resource = iter->second;
if (old_resource->equals(*new_resource))
{
new_resource = old_resource; // Rewrite with older version to avoid loss of runtime data
continue;
}
}
// It is new or updated resource
resources_to_attach.emplace_back(new_resource);
}
// Commit new state
@ -199,17 +205,14 @@ void DynamicResourceManager::updateConfiguration(const Poco::Util::AbstractConfi
state = new_state;
// Attach new and updated resources to the scheduler
for (auto & [name, resource] : new_state->resources)
for (auto & resource : resources_to_attach)
{
const SchedulerNodePtr & root = resource->nodes.find("/")->second.ptr;
if (root->parent == nullptr)
resource->attached_to = &scheduler;
scheduler.event_queue->enqueue([this, root]
{
resource->attached_to = &scheduler;
scheduler.event_queue->enqueue([this, root]
{
scheduler.attachChild(root);
});
}
scheduler.attachChild(root);
});
}
// NOTE: after mutex unlock `state` became available for Classifier(s) and must be immutable

View File

@ -19,6 +19,9 @@ TEST(OptimizeRE, analyze)
};
test_f("abc", "abc", {}, true, true);
test_f("c([^k]*)de", "");
test_f("(?-s)bob", "bob", {}, false, true);
test_f("(?s)bob", "bob", {}, false, true);
test_f("(?ssss", "");
test_f("abc(de)fg", "abcdefg", {}, false, true);
test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"}, false, true);
test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"}, false, true);

View File

@ -83,6 +83,9 @@ static constexpr auto DBMS_MIN_REVISION_WITH_SYSTEM_KEYWORDS_TABLE = 54468;
static constexpr auto DBMS_MIN_REVISION_WITH_ROWS_BEFORE_AGGREGATION = 54469;
/// Packets size header
static constexpr auto DBMS_MIN_PROTOCOL_VERSION_WITH_CHUNKED_PACKETS = 54470;
/// Version of ClickHouse TCP protocol.
///
/// Should be incremented manually on protocol changes.
@ -90,6 +93,6 @@ static constexpr auto DBMS_MIN_REVISION_WITH_ROWS_BEFORE_AGGREGATION = 54469;
/// NOTE: DBMS_TCP_PROTOCOL_VERSION has nothing common with VERSION_REVISION,
/// later is just a number for server version (one number instead of commit SHA)
/// for simplicity (sometimes it may be more convenient in some use cases).
static constexpr auto DBMS_TCP_PROTOCOL_VERSION = 54469;
static constexpr auto DBMS_TCP_PROTOCOL_VERSION = 54470;
}

View File

@ -325,6 +325,7 @@ class IColumn;
\
M(Bool, join_use_nulls, false, "Use NULLs for non-joined rows of outer JOINs for types that can be inside Nullable. If false, use default value of corresponding columns data type.", IMPORTANT) \
\
M(Int32, join_output_by_rowlist_perkey_rows_threshold, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join.", 0) \
M(JoinStrictness, join_default_strictness, JoinStrictness::All, "Set default strictness in JOIN query. Possible values: empty string, 'ANY', 'ALL'. If empty, query without strictness will throw exception.", 0) \
M(Bool, any_join_distinct_right_table_keys, false, "Enable old ANY JOIN logic with many-to-one left-to-right table keys mapping for all ANY JOINs. It leads to confusing not equal results for 't1 ANY LEFT JOIN t2' and 't2 ANY RIGHT JOIN t1'. ANY RIGHT JOIN needs one-to-many keys mapping to be consistent with LEFT one.", IMPORTANT) \
M(Bool, single_join_prefer_left_table, true, "For single JOIN in case of identifier ambiguity prefer left table", IMPORTANT) \
@ -593,7 +594,6 @@ class IColumn;
M(UInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \
M(Bool, enable_lightweight_delete, true, "Enable lightweight DELETE mutations for mergetree tables.", 0) ALIAS(allow_experimental_lightweight_delete) \
M(UInt64, lightweight_deletes_sync, 2, "The same as 'mutation_sync', but controls only execution of lightweight deletes", 0) \
M(LightweightMutationProjectionMode, lightweight_mutation_projection_mode, LightweightMutationProjectionMode::THROW, "When lightweight delete happens on a table with projection(s), the possible operations include throw the exception as projection exists, or drop all projection related to this table then do lightweight delete.", 0) \
M(Bool, apply_deleted_mask, true, "Enables filtering out rows deleted with lightweight DELETE. If disabled, a query will be able to read those rows. This is useful for debugging and \"undelete\" scenarios", 0) \
M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \
M(Bool, optimize_injective_functions_inside_uniq, true, "Delete injective functions of one argument inside uniq*() functions.", 0) \
@ -897,6 +897,7 @@ class IColumn;
M(UInt64, extract_key_value_pairs_max_pairs_per_row, 1000, "Max number of pairs that can be produced by the `extractKeyValuePairs` function. Used as a safeguard against consuming too much memory.", 0) ALIAS(extract_kvp_max_pairs_per_row) \
M(Bool, restore_replace_external_engines_to_null, false, "Replace all the external table engines to Null on restore. Useful for testing purposes", 0) \
M(Bool, restore_replace_external_table_functions_to_null, false, "Replace all table functions to Null on restore. Useful for testing purposes", 0) \
M(Bool, create_if_not_exists, false, "Enable IF NOT EXISTS for CREATE statements by default", 0) \
\
\
/* ###################################### */ \
@ -1136,6 +1137,7 @@ class IColumn;
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_datetimes_only_datetime64, false, "When input_format_try_infer_datetimes is enabled, infer only DateTime64 but not DateTime types", 0) \
M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats (except JSON, where exponent numbers are always inferred)", 0) \
M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \
M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \

View File

@ -75,6 +75,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
},
{"24.8",
{
{"create_if_not_exists", false, false, "New setting."},
{"rows_before_aggregation", true, true, "Provide exact value for rows_before_aggregation statistic, represents the number of rows read before aggregation"},
{"restore_replace_external_table_functions_to_null", false, false, "New setting."},
{"restore_replace_external_engines_to_null", false, false, "New setting."},
@ -87,7 +88,9 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"allow_experimental_time_series_table", false, false, "Added new setting to allow the TimeSeries table engine"},
{"enable_analyzer", 1, 1, "Added an alias to a setting `allow_experimental_analyzer`."},
{"optimize_functions_to_subcolumns", false, true, "Enabled settings by default"},
{"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."},
{"allow_experimental_vector_similarity_index", false, false, "Added new setting to allow experimental vector similarity indexes"},
{"input_format_try_infer_datetimes_only_datetime64", true, false, "Allow to infer DateTime instead of DateTime64 in data formats"}
}
},
{"24.7",
@ -103,7 +106,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"dictionary_validate_primary_key_type", false, false, "Validate primary key type for dictionaries. By default id type for simple layouts will be implicitly converted to UInt64."},
{"collect_hash_table_stats_during_joins", false, true, "New setting."},
{"max_size_to_preallocate_for_joins", 0, 100'000'000, "New setting."},
{"input_format_orc_reader_time_zone_name", "GMT", "GMT", "The time zone name for ORC row reader, the default ORC row reader's time zone is GMT."}, {"lightweight_mutation_projection_mode", "throw", "throw", "When lightweight delete happens on a table with projection(s), the possible operations include throw the exception as projection exists, or drop all projection related to this table then do lightweight delete."},
{"input_format_orc_reader_time_zone_name", "GMT", "GMT", "The time zone name for ORC row reader, the default ORC row reader's time zone is GMT."},
{"database_replicated_allow_heavy_create", true, false, "Long-running DDL queries (CREATE AS SELECT and POPULATE) for Replicated database engine was forbidden"},
{"query_plan_merge_filters", false, false, "Allow to merge filters in the query plan"},
{"azure_sdk_max_retries", 10, 10, "Maximum number of retries in azure sdk"},

View File

@ -175,7 +175,8 @@ IMPLEMENT_SETTING_ENUM(ParallelReplicasCustomKeyFilterType, ErrorCodes::BAD_ARGU
IMPLEMENT_SETTING_ENUM(LightweightMutationProjectionMode, ErrorCodes::BAD_ARGUMENTS,
{{"throw", LightweightMutationProjectionMode::THROW},
{"drop", LightweightMutationProjectionMode::DROP}})
{"drop", LightweightMutationProjectionMode::DROP},
{"rebuild", LightweightMutationProjectionMode::REBUILD}})
IMPLEMENT_SETTING_ENUM(DeduplicateMergeProjectionMode, ErrorCodes::BAD_ARGUMENTS,
{{"throw", DeduplicateMergeProjectionMode::THROW},

View File

@ -311,6 +311,7 @@ enum class LightweightMutationProjectionMode : uint8_t
{
THROW,
DROP,
REBUILD,
};
DECLARE_SETTING_ENUM(LightweightMutationProjectionMode)

View File

@ -196,7 +196,7 @@ PostgreSQLTableStructure::ColumnsInfoPtr readNamesAndTypesList(
}
else
{
std::tuple<std::string, std::string, std::string, uint16_t, std::string, std::string, std::string> row;
std::tuple<std::string, std::string, std::string, uint16_t, std::string, std::string, std::string, std::string> row;
while (stream >> row)
{
const auto column_name = std::get<0>(row);
@ -206,13 +206,14 @@ PostgreSQLTableStructure::ColumnsInfoPtr readNamesAndTypesList(
std::get<3>(row));
columns.push_back(NameAndTypePair(column_name, data_type));
auto attgenerated = std::get<6>(row);
auto attgenerated = std::get<7>(row);
attributes.emplace(
column_name,
PostgreSQLTableStructure::PGAttribute{
.atttypid = parse<int>(std::get<4>(row)),
.atttypmod = parse<int>(std::get<5>(row)),
.attnum = parse<int>(std::get<6>(row)),
.atthasdef = false,
.attgenerated = attgenerated.empty() ? char{} : char(attgenerated[0]),
.attr_def = {}
@ -308,6 +309,7 @@ PostgreSQLTableStructure fetchPostgreSQLTableStructure(
"attndims AS dims, " /// array dimensions
"atttypid as type_id, "
"atttypmod as type_modifier, "
"attnum as att_num, "
"attgenerated as generated " /// if column has GENERATED
"FROM pg_attribute "
"WHERE attrelid = (SELECT oid FROM pg_class WHERE {}) "
@ -338,17 +340,29 @@ PostgreSQLTableStructure fetchPostgreSQLTableStructure(
"WHERE adrelid = (SELECT oid FROM pg_class WHERE {});", where);
pqxx::result result{tx.exec(attrdef_query)};
for (const auto row : result)
if (static_cast<uint64_t>(result.size()) > table.physical_columns->names.size())
{
size_t adnum = row[0].as<int>();
if (!adnum || adnum > table.physical_columns->names.size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Received {} attrdef, but currently fetched columns list has {} columns",
result.size(), table.physical_columns->attributes.size());
}
for (const auto & column_attrs : table.physical_columns->attributes)
{
if (column_attrs.second.attgenerated != 's') /// e.g. not a generated column
{
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Received adnum {}, but currently fetched columns list has {} columns",
adnum, table.physical_columns->attributes.size());
continue;
}
for (const auto row : result)
{
int adnum = row[0].as<int>();
if (column_attrs.second.attnum == adnum)
{
table.physical_columns->attributes.at(column_attrs.first).attr_def = row[1].as<std::string>();
break;
}
}
const auto column_name = table.physical_columns->names[adnum - 1];
table.physical_columns->attributes.at(column_name).attr_def = row[1].as<std::string>();
}
}

View File

@ -16,6 +16,7 @@ struct PostgreSQLTableStructure
{
Int32 atttypid;
Int32 atttypmod;
Int32 attnum;
bool atthasdef;
char attgenerated;
std::string attr_def;

View File

@ -51,6 +51,8 @@ namespace
configuration.db,
configuration.user,
configuration.password,
configuration.proto_send_chunked,
configuration.proto_recv_chunked,
configuration.quota_key,
"", /* cluster */
"", /* cluster_secret */
@ -222,7 +224,7 @@ void registerDictionarySourceClickHouse(DictionarySourceFactory & factory)
{
validateNamedCollection(
*named_collection, {}, ValidateKeysMultiset<ExternalDatabaseEqualKeysSet>{
"secure", "host", "hostname", "port", "user", "username", "password", "quota_key", "name",
"secure", "host", "hostname", "port", "user", "username", "password", "proto_send_chunked", "proto_recv_chunked", "quota_key", "name",
"db", "database", "table","query", "where", "invalidate_query", "update_field", "update_lag"});
const auto secure = named_collection->getOrDefault("secure", false);
@ -234,6 +236,8 @@ void registerDictionarySourceClickHouse(DictionarySourceFactory & factory)
.host = host,
.user = named_collection->getAnyOrDefault<String>({"user", "username"}, "default"),
.password = named_collection->getOrDefault<String>("password", ""),
.proto_send_chunked = named_collection->getOrDefault<String>("proto_send_chunked", "notchunked"),
.proto_recv_chunked = named_collection->getOrDefault<String>("proto_recv_chunked", "notchunked"),
.quota_key = named_collection->getOrDefault<String>("quota_key", ""),
.db = named_collection->getAnyOrDefault<String>({"db", "database"}, default_database),
.table = named_collection->getOrDefault<String>("table", ""),
@ -258,6 +262,8 @@ void registerDictionarySourceClickHouse(DictionarySourceFactory & factory)
.host = host,
.user = config.getString(settings_config_prefix + ".user", "default"),
.password = config.getString(settings_config_prefix + ".password", ""),
.proto_send_chunked = config.getString(settings_config_prefix + ".proto_caps.send", "notchunked"),
.proto_recv_chunked = config.getString(settings_config_prefix + ".proto_caps.recv", "notchunked"),
.quota_key = config.getString(settings_config_prefix + ".quota_key", ""),
.db = config.getString(settings_config_prefix + ".db", default_database),
.table = config.getString(settings_config_prefix + ".table", ""),

View File

@ -23,6 +23,8 @@ public:
const std::string host;
const std::string user;
const std::string password;
const std::string proto_send_chunked;
const std::string proto_recv_chunked;
const std::string quota_key;
const std::string db;
const std::string table;

View File

@ -645,8 +645,9 @@ void CachedOnDiskReadBufferFromFile::predownload(FileSegment & file_segment)
ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromSourceBytes, current_impl_buffer_size);
std::string failure_reason;
bool continue_predownload = file_segment.reserve(
current_predownload_size, settings.filesystem_cache_reserve_space_wait_lock_timeout_milliseconds);
current_predownload_size, settings.filesystem_cache_reserve_space_wait_lock_timeout_milliseconds, failure_reason);
if (continue_predownload)
{
LOG_TEST(log, "Left to predownload: {}, buffer size: {}", bytes_to_predownload, current_impl_buffer_size);
@ -1002,7 +1003,8 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep()
{
chassert(file_offset_of_buffer_end + size - 1 <= file_segment.range().right);
bool success = file_segment.reserve(size, settings.filesystem_cache_reserve_space_wait_lock_timeout_milliseconds);
std::string failure_reason;
bool success = file_segment.reserve(size, settings.filesystem_cache_reserve_space_wait_lock_timeout_milliseconds, failure_reason);
if (success)
{
chassert(file_segment.getCurrentWriteOffset() == static_cast<size_t>(implementation_buffer->getPosition()));
@ -1028,7 +1030,8 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep()
LOG_TRACE(log, "Bypassing cache because writeCache method failed");
}
else
LOG_TRACE(log, "No space left in cache to reserve {} bytes, will continue without cache download", size);
LOG_TRACE(log, "No space left in cache to reserve {} bytes, reason: {}, "
"will continue without cache download", failure_reason, size);
if (!success)
{

View File

@ -91,7 +91,8 @@ bool FileSegmentRangeWriter::write(char * data, size_t size, size_t offset, File
size_t size_to_write = std::min(available_size, size);
bool reserved = file_segment->reserve(size_to_write, reserve_space_lock_wait_timeout_milliseconds);
std::string failure_reason;
bool reserved = file_segment->reserve(size_to_write, reserve_space_lock_wait_timeout_milliseconds, failure_reason);
if (!reserved)
{
appendFilesystemCacheLog(*file_segment);

View File

@ -63,7 +63,7 @@ void throwIfError(const Aws::Utils::Outcome<Result, Error> & response)
{
const auto & err = response.GetError();
throw S3Exception(
fmt::format("{} (Code: {}, s3 exception: {})",
fmt::format("{} (Code: {}, S3 exception: '{}')",
err.GetMessage(), static_cast<size_t>(err.GetErrorType()), err.GetExceptionName()),
err.GetErrorType());
}

View File

@ -419,10 +419,11 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
String result = getAdditionalFormatInfoForAllRowBasedFormats(settings);
/// First, settings that are common for all text formats:
result += fmt::format(
", try_infer_integers={}, try_infer_dates={}, try_infer_datetimes={}",
", try_infer_integers={}, try_infer_dates={}, try_infer_datetimes={}, try_infer_datetimes_only_datetime64={}",
settings.try_infer_integers,
settings.try_infer_dates,
settings.try_infer_datetimes);
settings.try_infer_datetimes,
settings.try_infer_datetimes_only_datetime64);
/// Second, format-specific settings:
switch (escaping_rule)

View File

@ -266,6 +266,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.try_infer_integers = settings.input_format_try_infer_integers;
format_settings.try_infer_dates = settings.input_format_try_infer_dates;
format_settings.try_infer_datetimes = settings.input_format_try_infer_datetimes;
format_settings.try_infer_datetimes_only_datetime64 = settings.input_format_try_infer_datetimes_only_datetime64;
format_settings.try_infer_exponent_floats = settings.input_format_try_infer_exponent_floats;
format_settings.markdown.escape_special_characters = settings.output_format_markdown_escape_special_characters;
format_settings.bson.output_string_as_string = settings.output_format_bson_string_as_string;

View File

@ -46,6 +46,7 @@ struct FormatSettings
bool try_infer_integers = true;
bool try_infer_dates = true;
bool try_infer_datetimes = true;
bool try_infer_datetimes_only_datetime64 = false;
bool try_infer_exponent_floats = false;
enum class DateTimeInputFormat : uint8_t

View File

@ -306,37 +306,45 @@ namespace
type_indexes.erase(TypeIndex::UInt64);
}
/// If we have only Date and DateTime types, convert Date to DateTime,
/// otherwise, convert all Date and DateTime to String.
/// If we have only date/datetimes types (Date/DateTime/DateTime64), convert all of them to the common type,
/// otherwise, convert all Date, DateTime and DateTime64 to String.
void transformDatesAndDateTimes(DataTypes & data_types, TypeIndexesSet & type_indexes)
{
bool have_dates = type_indexes.contains(TypeIndex::Date);
bool have_datetimes = type_indexes.contains(TypeIndex::DateTime64);
bool all_dates_or_datetimes = (type_indexes.size() == (static_cast<size_t>(have_dates) + static_cast<size_t>(have_datetimes)));
bool have_datetimes = type_indexes.contains(TypeIndex::DateTime);
bool have_datetimes64 = type_indexes.contains(TypeIndex::DateTime64);
bool all_dates_or_datetimes = (type_indexes.size() == (static_cast<size_t>(have_dates) + static_cast<size_t>(have_datetimes) + static_cast<size_t>(have_datetimes64)));
if (!all_dates_or_datetimes && (have_dates || have_datetimes))
if (!all_dates_or_datetimes && (have_dates || have_datetimes || have_datetimes64))
{
for (auto & type : data_types)
{
if (isDate(type) || isDateTime64(type))
if (isDate(type) || isDateTime(type) || isDateTime64(type))
type = std::make_shared<DataTypeString>();
}
type_indexes.erase(TypeIndex::Date);
type_indexes.erase(TypeIndex::DateTime);
type_indexes.erase(TypeIndex::DateTime64);
type_indexes.insert(TypeIndex::String);
return;
}
if (have_dates && have_datetimes)
for (auto & type : data_types)
{
for (auto & type : data_types)
if (isDate(type) && (have_datetimes || have_datetimes64))
{
if (isDate(type))
if (have_datetimes64)
type = std::make_shared<DataTypeDateTime64>(9);
else
type = std::make_shared<DataTypeDateTime>();
type_indexes.erase(TypeIndex::Date);
}
else if (isDateTime(type) && have_datetimes64)
{
type = std::make_shared<DataTypeDateTime64>(9);
type_indexes.erase(TypeIndex::DateTime);
}
type_indexes.erase(TypeIndex::Date);
}
}
@ -697,55 +705,87 @@ namespace
bool tryInferDate(std::string_view field)
{
if (field.empty())
/// Minimum length of Date text representation is 8 (YYYY-M-D) and maximum is 10 (YYYY-MM-DD)
if (field.size() < 8 || field.size() > 10)
return false;
ReadBufferFromString buf(field);
Float64 tmp_float;
/// Check if it's just a number, and if so, don't try to infer Date from it,
/// because we can interpret this number as a Date (for example 20000101 will be 2000-01-01)
/// and it will lead to inferring Date instead of simple Int64/UInt64 in some cases.
if (tryReadFloatText(tmp_float, buf) && buf.eof())
return false;
buf.seek(0, SEEK_SET); /// Return position to the beginning
DayNum tmp;
return tryReadDateText(tmp, buf) && buf.eof();
}
bool tryInferDateTime(std::string_view field, const FormatSettings & settings)
{
if (field.empty())
if (std::all_of(field.begin(), field.end(), isNumericASCII))
return false;
ReadBufferFromString buf(field);
Float64 tmp_float;
DayNum tmp;
return tryReadDateText(tmp, buf, DateLUT::instance(), /*allowed_delimiters=*/"-/:") && buf.eof();
}
DataTypePtr tryInferDateTimeOrDateTime64(std::string_view field, const FormatSettings & settings)
{
/// Don't try to infer DateTime if string is too long.
/// It's difficult to say what is the real maximum length of
/// DateTime we can parse using BestEffort approach.
/// 50 symbols is more or less valid limit for date times that makes sense.
if (field.empty() || field.size() > 50)
return nullptr;
/// Check that we have at least one digit, don't infer datetime form strings like "Apr"/"May"/etc.
if (!std::any_of(field.begin(), field.end(), isNumericASCII))
return nullptr;
/// Check if it's just a number, and if so, don't try to infer DateTime from it,
/// because we can interpret this number as a timestamp and it will lead to
/// inferring DateTime instead of simple Int64/Float64 in some cases.
/// inferring DateTime instead of simple Int64 in some cases.
if (std::all_of(field.begin(), field.end(), isNumericASCII))
return nullptr;
ReadBufferFromString buf(field);
Float64 tmp_float;
/// Check if it's a float value, and if so, don't try to infer DateTime from it,
/// because it will lead to inferring DateTime instead of simple Float64 in some cases.
if (tryReadFloatText(tmp_float, buf) && buf.eof())
return false;
return nullptr;
buf.seek(0, SEEK_SET); /// Return position to the beginning
if (!settings.try_infer_datetimes_only_datetime64)
{
time_t tmp;
switch (settings.date_time_input_format)
{
case FormatSettings::DateTimeInputFormat::Basic:
if (tryReadDateTimeText(tmp, buf, DateLUT::instance(), /*allowed_date_delimiters=*/"-/:", /*allowed_time_delimiters=*/":") && buf.eof())
return std::make_shared<DataTypeDateTime>();
break;
case FormatSettings::DateTimeInputFormat::BestEffort:
if (tryParseDateTimeBestEffortStrict(tmp, buf, DateLUT::instance(), DateLUT::instance("UTC"), /*allowed_date_delimiters=*/"-/:") && buf.eof())
return std::make_shared<DataTypeDateTime>();
break;
case FormatSettings::DateTimeInputFormat::BestEffortUS:
if (tryParseDateTimeBestEffortUSStrict(tmp, buf, DateLUT::instance(), DateLUT::instance("UTC"), /*allowed_date_delimiters=*/"-/:") && buf.eof())
return std::make_shared<DataTypeDateTime>();
break;
}
}
buf.seek(0, SEEK_SET); /// Return position to the beginning
DateTime64 tmp;
switch (settings.date_time_input_format)
{
case FormatSettings::DateTimeInputFormat::Basic:
if (tryReadDateTime64Text(tmp, 9, buf) && buf.eof())
return true;
if (tryReadDateTime64Text(tmp, 9, buf, DateLUT::instance(), /*allowed_date_delimiters=*/"-/:", /*allowed_time_delimiters=*/":") && buf.eof())
return std::make_shared<DataTypeDateTime64>(9);
break;
case FormatSettings::DateTimeInputFormat::BestEffort:
if (tryParseDateTime64BestEffort(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
return true;
if (tryParseDateTime64BestEffortStrict(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC"), /*allowed_date_delimiters=*/"-/:") && buf.eof())
return std::make_shared<DataTypeDateTime64>(9);
break;
case FormatSettings::DateTimeInputFormat::BestEffortUS:
if (tryParseDateTime64BestEffortUS(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
return true;
if (tryParseDateTime64BestEffortUSStrict(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC"), /*allowed_date_delimiters=*/"-/:") && buf.eof())
return std::make_shared<DataTypeDateTime64>(9);
break;
}
return false;
return nullptr;
}
template <bool is_json>
@ -1439,8 +1479,11 @@ DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const Forma
if (settings.try_infer_dates && tryInferDate(field))
return std::make_shared<DataTypeDate>();
if (settings.try_infer_datetimes && tryInferDateTime(field, settings))
return std::make_shared<DataTypeDateTime64>(9);
if (settings.try_infer_datetimes)
{
if (auto type = tryInferDateTimeOrDateTime64(field, settings))
return type;
}
return nullptr;
}

58
src/IO/NetUtils.h Normal file
View File

@ -0,0 +1,58 @@
#pragma once
#include <concepts>
#include <bit>
namespace DB
{
template<std::integral T>
constexpr T netToHost(T value) noexcept
{
if constexpr (std::endian::native != std::endian::big)
return std::byteswap(value);
return value;
}
template<std::integral T>
constexpr T hostToNet(T value) noexcept
{
if constexpr (std::endian::native != std::endian::big)
return std::byteswap(value);
return value;
}
template<std::integral T>
constexpr T toLittleEndian(T value) noexcept
{
if constexpr (std::endian::native == std::endian::big)
return std::byteswap(value);
return value;
}
template<std::integral T>
constexpr T toBigEndian(T value) noexcept
{
if constexpr (std::endian::native != std::endian::big)
return std::byteswap(value);
return value;
}
template<std::integral T>
constexpr T fromLittleEndian(T value) noexcept
{
if constexpr (std::endian::native == std::endian::big)
return std::byteswap(value);
return value;
}
template<std::integral T>
constexpr T fromBigEndian(T value) noexcept
{
if constexpr (std::endian::native != std::endian::big)
return std::byteswap(value);
return value;
}
}

View File

@ -32,7 +32,7 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
bool ReadBufferFromPocoSocket::nextImpl()
ssize_t ReadBufferFromPocoSocketBase::socketReceiveBytesImpl(char * ptr, size_t size)
{
ssize_t bytes_read = 0;
Stopwatch watch;
@ -43,14 +43,11 @@ bool ReadBufferFromPocoSocket::nextImpl()
ProfileEvents::increment(ProfileEvents::NetworkReceiveBytes, bytes_read);
});
CurrentMetrics::Increment metric_increment(CurrentMetrics::NetworkReceive);
/// Add more details to exceptions.
try
{
CurrentMetrics::Increment metric_increment(CurrentMetrics::NetworkReceive);
if (internal_buffer.size() > INT_MAX)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Buffer overflow");
/// If async_callback is specified, set socket to non-blocking mode
/// and try to read data from it, if socket is not ready for reading,
/// run async_callback and try again later.
@ -61,7 +58,7 @@ bool ReadBufferFromPocoSocket::nextImpl()
socket.setBlocking(false);
SCOPE_EXIT(socket.setBlocking(true));
bool secure = socket.secure();
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), static_cast<int>(internal_buffer.size()));
bytes_read = socket.impl()->receiveBytes(ptr, static_cast<int>(size));
/// Check EAGAIN and ERR_SSL_WANT_READ/ERR_SSL_WANT_WRITE for secure socket (reading from secure socket can write too).
while (bytes_read < 0 && (errno == EAGAIN || (secure && (checkSSLWantRead(bytes_read) || checkSSLWantWrite(bytes_read)))))
@ -73,12 +70,12 @@ bool ReadBufferFromPocoSocket::nextImpl()
async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), AsyncEventTimeoutType::RECEIVE, socket_description, AsyncTaskExecutor::Event::READ | AsyncTaskExecutor::Event::ERROR);
/// Try to read again.
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), static_cast<int>(internal_buffer.size()));
bytes_read = socket.impl()->receiveBytes(ptr, static_cast<int>(size));
}
}
else
{
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), static_cast<int>(internal_buffer.size()));
bytes_read = socket.impl()->receiveBytes(ptr, static_cast<int>(size));
}
}
catch (const Poco::Net::NetException & e)
@ -99,6 +96,16 @@ bool ReadBufferFromPocoSocket::nextImpl()
if (bytes_read < 0)
throw NetException(ErrorCodes::CANNOT_READ_FROM_SOCKET, "Cannot read from socket (peer: {}, local: {})", peer_address.toString(), socket.address().toString());
return bytes_read;
}
bool ReadBufferFromPocoSocketBase::nextImpl()
{
if (internal_buffer.size() > INT_MAX)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Buffer overflow");
ssize_t bytes_read = socketReceiveBytesImpl(internal_buffer.begin(), internal_buffer.size());
if (read_event != ProfileEvents::end())
ProfileEvents::increment(read_event, bytes_read);
@ -110,7 +117,7 @@ bool ReadBufferFromPocoSocket::nextImpl()
return true;
}
ReadBufferFromPocoSocket::ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size)
ReadBufferFromPocoSocketBase::ReadBufferFromPocoSocketBase(Poco::Net::Socket & socket_, size_t buf_size)
: BufferWithOwnMemory<ReadBuffer>(buf_size)
, socket(socket_)
, peer_address(socket.peerAddress())
@ -119,19 +126,22 @@ ReadBufferFromPocoSocket::ReadBufferFromPocoSocket(Poco::Net::Socket & socket_,
{
}
ReadBufferFromPocoSocket::ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, const ProfileEvents::Event & read_event_, size_t buf_size)
: ReadBufferFromPocoSocket(socket_, buf_size)
ReadBufferFromPocoSocketBase::ReadBufferFromPocoSocketBase(Poco::Net::Socket & socket_, const ProfileEvents::Event & read_event_, size_t buf_size)
: ReadBufferFromPocoSocketBase(socket_, buf_size)
{
read_event = read_event_;
}
bool ReadBufferFromPocoSocket::poll(size_t timeout_microseconds) const
bool ReadBufferFromPocoSocketBase::poll(size_t timeout_microseconds) const
{
if (available())
/// For secure socket it is important to check if any remaining data available in underlying decryption buffer -
/// read always retrieves the whole encrypted frame from the wire and puts it into underlying buffer while returning only requested size -
/// further poll() can block though there is still data to read in the underlying decryption buffer.
if (available() || socket.impl()->available())
return true;
Stopwatch watch;
bool res = socket.poll(timeout_microseconds, Poco::Net::Socket::SELECT_READ | Poco::Net::Socket::SELECT_ERROR);
bool res = socket.impl()->poll(timeout_microseconds, Poco::Net::Socket::SELECT_READ | Poco::Net::Socket::SELECT_ERROR);
ProfileEvents::increment(ProfileEvents::NetworkReceiveElapsedMicroseconds, watch.elapsedMicroseconds());
return res;
}

View File

@ -9,7 +9,7 @@ namespace DB
{
/// Works with the ready Poco::Net::Socket. Blocking operations.
class ReadBufferFromPocoSocket : public BufferWithOwnMemory<ReadBuffer>
class ReadBufferFromPocoSocketBase : public BufferWithOwnMemory<ReadBuffer>
{
protected:
Poco::Net::Socket & socket;
@ -25,16 +25,29 @@ protected:
bool nextImpl() override;
public:
explicit ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
explicit ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, const ProfileEvents::Event & read_event_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
explicit ReadBufferFromPocoSocketBase(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
explicit ReadBufferFromPocoSocketBase(Poco::Net::Socket & socket_, const ProfileEvents::Event & read_event_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
bool poll(size_t timeout_microseconds) const;
void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); }
ssize_t socketReceiveBytesImpl(char * ptr, size_t size);
private:
AsyncCallback async_callback;
std::string socket_description;
};
class ReadBufferFromPocoSocket : public ReadBufferFromPocoSocketBase
{
public:
explicit ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE)
: ReadBufferFromPocoSocketBase(socket_, buf_size)
{}
explicit ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, const ProfileEvents::Event & read_event_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE)
: ReadBufferFromPocoSocketBase(socket_, read_event_, buf_size)
{}
};
}

View File

@ -0,0 +1,166 @@
#include <IO/ReadBufferFromPocoSocketChunked.h>
#include <Common/logger_useful.h>
#include <IO/NetUtils.h>
namespace DB::ErrorCodes
{
extern const int LOGICAL_ERROR;
}
namespace DB
{
ReadBufferFromPocoSocketChunked::ReadBufferFromPocoSocketChunked(Poco::Net::Socket & socket_, size_t buf_size)
: ReadBufferFromPocoSocketChunked(socket_, ProfileEvents::end(), buf_size)
{}
ReadBufferFromPocoSocketChunked::ReadBufferFromPocoSocketChunked(Poco::Net::Socket & socket_, const ProfileEvents::Event & read_event_, size_t buf_size)
: ReadBufferFromPocoSocketBase(
socket_, read_event_,
std::min(buf_size, static_cast<size_t>(std::numeric_limits<decltype(chunk_left)>::max()))),
our_address(socket_.address()), log(getLogger("Protocol"))
{}
void ReadBufferFromPocoSocketChunked::enableChunked()
{
if (chunked)
return;
chunked = 1;
data_end = buffer().end();
/// Resize working buffer so any next read will call nextImpl
working_buffer.resize(offset());
chunk_left = 0;
next_chunk = 0;
}
bool ReadBufferFromPocoSocketChunked::hasBufferedData() const
{
if (available())
return true;
return chunked && (static_cast<size_t>(data_end - working_buffer.end()) > sizeof(next_chunk));
}
bool ReadBufferFromPocoSocketChunked::poll(size_t timeout_microseconds) const
{
if (chunked)
if (available() || static_cast<size_t>(data_end - working_buffer.end()) > sizeof(next_chunk))
return true;
return ReadBufferFromPocoSocketBase::poll(timeout_microseconds);
}
bool ReadBufferFromPocoSocketChunked::loadNextChunk(Position c_pos, bool cont)
{
auto buffered = std::min(static_cast<size_t>(data_end - c_pos), sizeof(next_chunk));
if (buffered)
std::memcpy(&next_chunk, c_pos, buffered);
if (buffered < sizeof(next_chunk))
if (socketReceiveBytesImpl(reinterpret_cast<char *>(&next_chunk) + buffered, sizeof(next_chunk) - buffered) < static_cast<ssize_t>(sizeof(next_chunk) - buffered))
return false;
next_chunk = fromLittleEndian(next_chunk);
if (next_chunk)
{
if (cont)
LOG_TEST(log, "{} <- {} Chunk receive continued. Size {}", ourAddress().toString(), peerAddress().toString(), next_chunk);
}
else
LOG_TEST(log, "{} <- {} Chunk receive ended.", ourAddress().toString(), peerAddress().toString());
return true;
}
bool ReadBufferFromPocoSocketChunked::processChunkLeft(Position c_pos)
{
if (data_end - c_pos < chunk_left)
{
working_buffer.resize(data_end - buffer().begin());
nextimpl_working_buffer_offset = c_pos - buffer().begin();
chunk_left -= (data_end - c_pos);
return true;
}
nextimpl_working_buffer_offset = c_pos - buffer().begin();
working_buffer.resize(nextimpl_working_buffer_offset + chunk_left);
c_pos += chunk_left;
if (!loadNextChunk(c_pos, true))
return false;
chunk_left = 0;
return true;
}
bool ReadBufferFromPocoSocketChunked::nextImpl()
{
if (!chunked)
return ReadBufferFromPocoSocketBase::nextImpl();
auto * c_pos = pos;
if (chunk_left == 0)
{
if (next_chunk == 0)
{
if (chunked == 1)
chunked = 2; // first chunked block - no end marker
else
c_pos = pos + sizeof(next_chunk); // bypass chunk end marker
if (c_pos > data_end)
c_pos = data_end;
if (!loadNextChunk(c_pos))
return false;
chunk_left = next_chunk;
next_chunk = 0;
if (chunk_left == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Native protocol: empty chunk received");
c_pos += sizeof(next_chunk);
if (c_pos >= data_end)
{
if (!ReadBufferFromPocoSocketBase::nextImpl())
return false;
data_end = buffer().end();
c_pos = buffer().begin();
}
LOG_TEST(log, "{} <- {} Chunk receive started. Message {}, size {}", ourAddress().toString(), peerAddress().toString(), static_cast<unsigned int>(*c_pos), chunk_left);
}
else
{
c_pos += sizeof(next_chunk);
if (c_pos >= data_end)
{
if (!ReadBufferFromPocoSocketBase::nextImpl())
return false;
data_end = buffer().end();
c_pos = buffer().begin();
}
chunk_left = next_chunk;
next_chunk = 0;
}
}
else
{
if (!ReadBufferFromPocoSocketBase::nextImpl())
return false;
data_end = buffer().end();
c_pos = buffer().begin();
}
return processChunkLeft(c_pos);
}
}

View File

@ -0,0 +1,109 @@
#pragma once
#include <IO/ReadBuffer.h>
#include <IO/ReadBufferFromPocoSocket.h>
/*
Handshake +=============
| 'Hello' type
| handshake exchange
| chunked protocol negotiation
+=============
Basic chunk:
+=============
Chunk begins | 0x12345678 chunk size, 4 bytes little endian
+-------------
| Packet type always follows beginning of the chunk
| packet data
+-------------
Chunk ends | 0x00000000 4 zero bytes
+=============
Datastream chunk:
+=============
Chunk begins | 0x12345678
+-------------
| Packet type
| packet data
+-------------
| Packet type
| packet data
+-------------
...arbitrary number .....
of packets... .....
+-------------
| Packet type
| packet data
+-------------
Chunk ends | 0x00000000
+=============
Multipart chunk:
+=============
Chunk begins | 0x12345678 chunk part size, 4 bytes little endian
+-------------
| Packet type
| packet data
+-------------
| Packet type
| (partial) packet data
+=============
Chunk continues | 0x12345678 chunk next part size, 4 bytes little endian
+=============
| possibly previous packet's data
+-------------
| Packet type
| packet data
+-------------
...arbitrary number .....
of chunk parts... .....
+-------------
| Packet type
| packet data
+-------------
Chunk ends | 0x00000000
+=============
*/
namespace DB
{
class ReadBufferFromPocoSocketChunked: public ReadBufferFromPocoSocketBase
{
public:
using ReadBufferFromPocoSocketBase::setAsyncCallback;
explicit ReadBufferFromPocoSocketChunked(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
explicit ReadBufferFromPocoSocketChunked(Poco::Net::Socket & socket_, const ProfileEvents::Event & read_event_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
void enableChunked();
bool hasBufferedData() const;
bool poll(size_t timeout_microseconds) const;
Poco::Net::SocketAddress peerAddress() { return peer_address; }
Poco::Net::SocketAddress ourAddress() { return our_address; }
protected:
bool loadNextChunk(Position c_pos, bool cont = false);
bool processChunkLeft(Position c_pos);
bool nextImpl() override;
Poco::Net::SocketAddress our_address;
private:
LoggerPtr log;
Position data_end = nullptr; // end position of data in the internal_buffer
UInt32 chunk_left = 0; // chunk left to read from socket
UInt32 next_chunk = 0; // size of the next cnunk
UInt8 chunked = 0; // 0 - disabled; 1 - started; 2 - enabled;
};
}

View File

@ -1271,7 +1271,7 @@ template void readJSONArrayInto<PaddedPODArray<UInt8>, void>(PaddedPODArray<UInt
template bool readJSONArrayInto<PaddedPODArray<UInt8>, bool>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template <typename ReturnType>
ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf, const char * allowed_delimiters)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
@ -1318,6 +1318,9 @@ ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
}
else
{
if (!isSymbolIn(*buf.position(), allowed_delimiters))
return error();
++buf.position();
if (!append_digit(month))
@ -1325,7 +1328,11 @@ ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
append_digit(month);
if (!buf.eof() && !isNumericASCII(*buf.position()))
{
if (!isSymbolIn(*buf.position(), allowed_delimiters))
return error();
++buf.position();
}
else
return error();
@ -1338,12 +1345,12 @@ ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
return ReturnType(true);
}
template void readDateTextFallback<void>(LocalDate &, ReadBuffer &);
template bool readDateTextFallback<bool>(LocalDate &, ReadBuffer &);
template void readDateTextFallback<void>(LocalDate &, ReadBuffer &, const char * allowed_delimiters);
template bool readDateTextFallback<bool>(LocalDate &, ReadBuffer &, const char * allowed_delimiters);
template <typename ReturnType, bool dt64_mode>
ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut)
ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_date_delimiters, const char * allowed_time_delimiters)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
@ -1413,6 +1420,9 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[2]) || !isNumericASCII(s[3])
|| !isNumericASCII(s[5]) || !isNumericASCII(s[6]) || !isNumericASCII(s[8]) || !isNumericASCII(s[9]))
return false;
if (!isSymbolIn(s[4], allowed_date_delimiters) || !isSymbolIn(s[7], allowed_date_delimiters))
return false;
}
UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0');
@ -1443,6 +1453,9 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[3]) || !isNumericASCII(s[4])
|| !isNumericASCII(s[6]) || !isNumericASCII(s[7]))
return false;
if (!isSymbolIn(s[2], allowed_time_delimiters) || !isSymbolIn(s[5], allowed_time_delimiters))
return false;
}
hour = (s[0] - '0') * 10 + (s[1] - '0');
@ -1488,10 +1501,10 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
return ReturnType(true);
}
template void readDateTimeTextFallback<void, false>(time_t &, ReadBuffer &, const DateLUTImpl &);
template void readDateTimeTextFallback<void, true>(time_t &, ReadBuffer &, const DateLUTImpl &);
template bool readDateTimeTextFallback<bool, false>(time_t &, ReadBuffer &, const DateLUTImpl &);
template bool readDateTimeTextFallback<bool, true>(time_t &, ReadBuffer &, const DateLUTImpl &);
template void readDateTimeTextFallback<void, false>(time_t &, ReadBuffer &, const DateLUTImpl &, const char *, const char *);
template void readDateTimeTextFallback<void, true>(time_t &, ReadBuffer &, const DateLUTImpl &, const char *, const char *);
template bool readDateTimeTextFallback<bool, false>(time_t &, ReadBuffer &, const DateLUTImpl &, const char *, const char *);
template bool readDateTimeTextFallback<bool, true>(time_t &, ReadBuffer &, const DateLUTImpl &, const char *, const char *);
template <typename ReturnType>

View File

@ -703,13 +703,28 @@ struct NullOutput
};
template <typename ReturnType>
ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf);
ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf, const char * allowed_delimiters);
inline bool isSymbolIn(char symbol, const char * symbols)
{
if (symbols == nullptr)
return true;
const char * pos = symbols;
while (*pos)
{
if (*pos == symbol)
return true;
++pos;
}
return false;
}
/// In YYYY-MM-DD format.
/// For convenience, Month and Day parts can have single digit instead of two digits.
/// Any separators other than '-' are supported.
template <typename ReturnType = void>
inline ReturnType readDateTextImpl(LocalDate & date, ReadBuffer & buf)
inline ReturnType readDateTextImpl(LocalDate & date, ReadBuffer & buf, const char * allowed_delimiters = nullptr)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
@ -753,6 +768,9 @@ inline ReturnType readDateTextImpl(LocalDate & date, ReadBuffer & buf)
}
else
{
if (!isSymbolIn(pos[-1], allowed_delimiters))
return error();
if (!isNumericASCII(pos[0]))
return error();
@ -768,6 +786,9 @@ inline ReturnType readDateTextImpl(LocalDate & date, ReadBuffer & buf)
if (isNumericASCII(pos[-1]) || !isNumericASCII(pos[0]))
return error();
if (!isSymbolIn(pos[-1], allowed_delimiters))
return error();
day = pos[0] - '0';
if (isNumericASCII(pos[1]))
{
@ -783,7 +804,7 @@ inline ReturnType readDateTextImpl(LocalDate & date, ReadBuffer & buf)
return ReturnType(true);
}
else
return readDateTextFallback<ReturnType>(date, buf);
return readDateTextFallback<ReturnType>(date, buf, allowed_delimiters);
}
inline void convertToDayNum(DayNum & date, ExtendedDayNum & from)
@ -797,15 +818,15 @@ inline void convertToDayNum(DayNum & date, ExtendedDayNum & from)
}
template <typename ReturnType = void>
inline ReturnType readDateTextImpl(DayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut)
inline ReturnType readDateTextImpl(DayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_delimiters = nullptr)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
LocalDate local_date;
if constexpr (throw_exception)
readDateTextImpl<ReturnType>(local_date, buf);
else if (!readDateTextImpl<ReturnType>(local_date, buf))
readDateTextImpl<ReturnType>(local_date, buf, allowed_delimiters);
else if (!readDateTextImpl<ReturnType>(local_date, buf, allowed_delimiters))
return false;
ExtendedDayNum ret = date_lut.makeDayNum(local_date.year(), local_date.month(), local_date.day());
@ -814,15 +835,15 @@ inline ReturnType readDateTextImpl(DayNum & date, ReadBuffer & buf, const DateLU
}
template <typename ReturnType = void>
inline ReturnType readDateTextImpl(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut)
inline ReturnType readDateTextImpl(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_delimiters = nullptr)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
LocalDate local_date;
if constexpr (throw_exception)
readDateTextImpl<ReturnType>(local_date, buf);
else if (!readDateTextImpl<ReturnType>(local_date, buf))
readDateTextImpl<ReturnType>(local_date, buf, allowed_delimiters);
else if (!readDateTextImpl<ReturnType>(local_date, buf, allowed_delimiters))
return false;
/// When the parameter is out of rule or out of range, Date32 uses 1925-01-01 as the default value (-DateLUT::instance().getDayNumOffsetEpoch(), -16436) and Date uses 1970-01-01.
@ -846,19 +867,19 @@ inline void readDateText(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTI
readDateTextImpl<void>(date, buf, date_lut);
}
inline bool tryReadDateText(LocalDate & date, ReadBuffer & buf)
inline bool tryReadDateText(LocalDate & date, ReadBuffer & buf, const char * allowed_delimiters = nullptr)
{
return readDateTextImpl<bool>(date, buf);
return readDateTextImpl<bool>(date, buf, allowed_delimiters);
}
inline bool tryReadDateText(DayNum & date, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
inline bool tryReadDateText(DayNum & date, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance(), const char * allowed_delimiters = nullptr)
{
return readDateTextImpl<bool>(date, buf, time_zone);
return readDateTextImpl<bool>(date, buf, time_zone, allowed_delimiters);
}
inline bool tryReadDateText(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
inline bool tryReadDateText(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance(), const char * allowed_delimiters = nullptr)
{
return readDateTextImpl<bool>(date, buf, time_zone);
return readDateTextImpl<bool>(date, buf, time_zone, allowed_delimiters);
}
UUID parseUUID(std::span<const UInt8> src);
@ -975,13 +996,13 @@ inline T parseFromString(std::string_view str)
template <typename ReturnType = void, bool dt64_mode = false>
ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut);
ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_date_delimiters = nullptr, const char * allowed_time_delimiters = nullptr);
/** In YYYY-MM-DD hh:mm:ss or YYYY-MM-DD format, according to specified time zone.
* As an exception, also supported parsing of unix timestamp in form of decimal number.
*/
template <typename ReturnType = void, bool dt64_mode = false>
inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut)
inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_date_delimiters = nullptr, const char * allowed_time_delimiters = nullptr)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
@ -1014,6 +1035,9 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons
if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[2]) || !isNumericASCII(s[3])
|| !isNumericASCII(s[5]) || !isNumericASCII(s[6]) || !isNumericASCII(s[8]) || !isNumericASCII(s[9]))
return ReturnType(false);
if (!isSymbolIn(s[4], allowed_date_delimiters) || !isSymbolIn(s[7], allowed_date_delimiters))
return ReturnType(false);
}
UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0');
@ -1033,6 +1057,9 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons
if (!isNumericASCII(s[11]) || !isNumericASCII(s[12]) || !isNumericASCII(s[14]) || !isNumericASCII(s[15])
|| !isNumericASCII(s[17]) || !isNumericASCII(s[18]))
return ReturnType(false);
if (!isSymbolIn(s[13], allowed_time_delimiters) || !isSymbolIn(s[16], allowed_time_delimiters))
return ReturnType(false);
}
hour = (s[11] - '0') * 10 + (s[12] - '0');
@ -1057,11 +1084,11 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons
return readIntTextImpl<time_t, ReturnType, ReadIntTextCheckOverflow::CHECK_OVERFLOW>(datetime, buf);
}
else
return readDateTimeTextFallback<ReturnType, dt64_mode>(datetime, buf, date_lut);
return readDateTimeTextFallback<ReturnType, dt64_mode>(datetime, buf, date_lut, allowed_date_delimiters, allowed_time_delimiters);
}
template <typename ReturnType>
inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut)
inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_date_delimiters = nullptr, const char * allowed_time_delimiters = nullptr)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
@ -1075,7 +1102,7 @@ inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, Re
{
try
{
readDateTimeTextImpl<ReturnType, true>(whole, buf, date_lut);
readDateTimeTextImpl<ReturnType, true>(whole, buf, date_lut, allowed_date_delimiters, allowed_time_delimiters);
}
catch (const DB::Exception &)
{
@ -1085,7 +1112,7 @@ inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, Re
}
else
{
auto ok = readDateTimeTextImpl<ReturnType, true>(whole, buf, date_lut);
auto ok = readDateTimeTextImpl<ReturnType, true>(whole, buf, date_lut, allowed_date_delimiters, allowed_time_delimiters);
if (!ok && (buf.eof() || *buf.position() != '.'))
return ReturnType(false);
}
@ -1168,14 +1195,14 @@ inline void readDateTime64Text(DateTime64 & datetime64, UInt32 scale, ReadBuffer
readDateTimeTextImpl<void>(datetime64, scale, buf, date_lut);
}
inline bool tryReadDateTimeText(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
inline bool tryReadDateTimeText(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance(), const char * allowed_date_delimiters = nullptr, const char * allowed_time_delimiters = nullptr)
{
return readDateTimeTextImpl<bool>(datetime, buf, time_zone);
return readDateTimeTextImpl<bool>(datetime, buf, time_zone, allowed_date_delimiters, allowed_time_delimiters);
}
inline bool tryReadDateTime64Text(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut = DateLUT::instance())
inline bool tryReadDateTime64Text(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut = DateLUT::instance(), const char * allowed_date_delimiters = nullptr, const char * allowed_time_delimiters = nullptr)
{
return readDateTimeTextImpl<bool>(datetime64, scale, buf, date_lut);
return readDateTimeTextImpl<bool>(datetime64, scale, buf, date_lut, allowed_date_delimiters, allowed_time_delimiters);
}
inline void readDateTimeText(LocalDateTime & datetime, ReadBuffer & buf)

View File

@ -46,7 +46,7 @@ namespace ProfileEvents
namespace CurrentMetrics
{
extern const Metric S3DiskNoKeyErrors;
extern const Metric DiskS3NoSuchKeyErrors;
}
namespace DB
@ -701,7 +701,7 @@ RequestResult Client::processRequestResult(RequestResult && outcome) const
return std::forward<RequestResult>(outcome);
if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY)
CurrentMetrics::add(CurrentMetrics::S3DiskNoKeyErrors);
CurrentMetrics::add(CurrentMetrics::DiskS3NoSuchKeyErrors);
String enriched_message = fmt::format(
"{} {}",

View File

@ -145,12 +145,16 @@ Aws::String AWSEC2MetadataClient::getDefaultCredentialsSecurely() const
{
String user_agent_string = awsComputeUserAgentString();
auto [new_token, response_code] = getEC2MetadataToken(user_agent_string);
if (response_code == Aws::Http::HttpResponseCode::BAD_REQUEST)
if (response_code == Aws::Http::HttpResponseCode::BAD_REQUEST
|| response_code == Aws::Http::HttpResponseCode::REQUEST_NOT_MADE)
{
/// At least the host should be available and reply, otherwise neither IMDSv2 nor IMDSv1 are usable.
return {};
}
else if (response_code != Aws::Http::HttpResponseCode::OK || new_token.empty())
{
LOG_TRACE(logger, "Calling EC2MetadataService to get token failed, "
"falling back to less secure way. HTTP response code: {}", response_code);
"falling back to a less secure way. HTTP response code: {}", response_code);
return getDefaultCredentials();
}
@ -247,7 +251,7 @@ static Aws::String getAWSMetadataEndpoint()
return ec2_metadata_service_endpoint;
}
std::shared_ptr<AWSEC2MetadataClient> InitEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration)
std::shared_ptr<AWSEC2MetadataClient> createEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration)
{
auto endpoint = getAWSMetadataEndpoint();
return std::make_shared<AWSEC2MetadataClient>(client_configuration, endpoint.c_str());
@ -781,11 +785,13 @@ S3CredentialsProviderChain::S3CredentialsProviderChain(
/// EC2MetadataService throttles by delaying the response so the service client should set a large read timeout.
/// EC2MetadataService delay is in order of seconds so it only make sense to retry after a couple of seconds.
aws_client_configuration.connectTimeoutMs = 1000;
/// But the connection timeout should be small because there is the case when there is no IMDS at all,
/// like outside of the cloud, on your own machines.
aws_client_configuration.connectTimeoutMs = 10;
aws_client_configuration.requestTimeoutMs = 1000;
aws_client_configuration.retryStrategy = std::make_shared<Aws::Client::DefaultRetryStrategy>(1, 1000);
auto ec2_metadata_client = InitEC2MetadataClient(aws_client_configuration);
auto ec2_metadata_client = createEC2MetadataClient(aws_client_configuration);
auto config_loader = std::make_shared<AWSEC2InstanceProfileConfigLoader>(ec2_metadata_client, !credentials_configuration.use_insecure_imds_request);
AddProvider(std::make_shared<AWSInstanceProfileCredentialsProvider>(config_loader));

View File

@ -70,7 +70,7 @@ private:
LoggerPtr logger;
};
std::shared_ptr<AWSEC2MetadataClient> InitEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration);
std::shared_ptr<AWSEC2MetadataClient> createEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration);
class AWSEC2InstanceProfileConfigLoader : public Aws::Config::AWSProfileConfigLoader
{

View File

@ -128,7 +128,7 @@ void PocoHTTPClientConfiguration::updateSchemeAndRegion()
}
else
{
/// In global mode AWS C++ SDK send `us-east-1` but accept switching to another one if being suggested.
/// In global mode AWS C++ SDK sends `us-east-1` but accepts switching to another one if being suggested.
region = Aws::Region::AWS_GLOBAL;
}
}

View File

@ -1,8 +1,8 @@
#include <IO/S3/URI.h>
#include <Interpreters/Context.h>
#include <Storages/NamedCollectionsHelpers.h>
#include "Common/Macros.h"
#if USE_AWS_S3
#include <Interpreters/Context.h>
#include <Common/Macros.h>
#include <Common/Exception.h>
#include <Common/quoteString.h>
#include <Common/re2.h>
@ -10,6 +10,7 @@
#include <boost/algorithm/string/case_conv.hpp>
namespace DB
{
@ -40,21 +41,13 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
/// Case when AWS Private Link Interface is being used
/// E.g. (bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/bucket-name/key)
/// https://docs.aws.amazon.com/AmazonS3/latest/userguide/privatelink-interface-endpoints.html
static const RE2 aws_private_link_style_pattern(R"(bucket\.vpce\-([a-z0-9\-.]+)\.vpce.amazonaws.com(:\d{1,5})?)");
static const RE2 aws_private_link_style_pattern(R"(bucket\.vpce\-([a-z0-9\-.]+)\.vpce\.amazonaws\.com(:\d{1,5})?)");
/// Case when bucket name and key represented in path of S3 URL.
/// Case when bucket name and key represented in the path of S3 URL.
/// E.g. (https://s3.region.amazonaws.com/bucket-name/key)
/// https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html#path-style-access
static const RE2 path_style_pattern("^/([^/]*)/(.*)");
static constexpr auto S3 = "S3";
static constexpr auto S3EXPRESS = "S3EXPRESS";
static constexpr auto COSN = "COSN";
static constexpr auto COS = "COS";
static constexpr auto OBS = "OBS";
static constexpr auto OSS = "OSS";
static constexpr auto EOS = "EOS";
if (allow_archive_path_syntax)
std::tie(uri_str, archive_pattern) = getURIAndArchivePattern(uri_);
else
@ -85,7 +78,7 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
URIConverter::modifyURI(uri, mapper);
}
storage_name = S3;
storage_name = "S3";
if (uri.getHost().empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI.");
@ -93,11 +86,13 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
/// Extract object version ID from query string.
bool has_version_id = false;
for (const auto & [query_key, query_value] : uri.getQueryParameters())
{
if (query_key == "versionId")
{
version_id = query_value;
has_version_id = true;
}
}
/// Poco::URI will ignore '?' when parsing the path, but if there is a versionId in the http parameter,
/// '?' can not be used as a wildcard, otherwise it will be ambiguous.
@ -129,15 +124,8 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
}
boost::to_upper(name);
/// For S3Express it will look like s3express-eun1-az1, i.e. contain region and AZ info
if (name != S3 && !name.starts_with(S3EXPRESS) && name != COS && name != OBS && name != OSS && name != EOS)
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Object storage system name is unrecognized in virtual hosted style S3 URI: {}",
quoteString(name));
if (name == COS)
storage_name = COSN;
if (name == "COS")
storage_name = "COSN";
else
storage_name = name;
}
@ -148,13 +136,22 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
validateBucket(bucket, uri);
}
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI.");
{
/// Custom endpoint, e.g. a public domain of Cloudflare R2,
/// which could be served by a custom server-side code.
storage_name = "S3";
bucket = "default";
is_virtual_hosted_style = false;
endpoint = uri.getScheme() + "://" + uri.getAuthority();
if (!uri.getPath().empty())
key = uri.getPath().substr(1);
}
}
void URI::addRegionToURI(const std::string &region)
{
if (auto pos = endpoint.find("amazonaws.com"); pos != std::string::npos)
endpoint = endpoint.substr(0, pos) + region + "." + endpoint.substr(pos);
if (auto pos = endpoint.find(".amazonaws.com"); pos != std::string::npos)
endpoint = endpoint.substr(0, pos) + "." + region + endpoint.substr(pos);
}
void URI::validateBucket(const String & bucket, const Poco::URI & uri)

View File

@ -1,14 +1,14 @@
#pragma once
#include <optional>
#include <string>
#include "config.h"
#if USE_AWS_S3
#include <optional>
#include <string>
#include <Poco/URI.h>
namespace DB::S3
{
@ -23,7 +23,7 @@ namespace DB::S3
struct URI
{
Poco::URI uri;
// Custom endpoint if URI scheme is not S3.
// Custom endpoint if URI scheme, if not S3.
std::string endpoint;
std::string bucket;
std::string key;

View File

@ -64,7 +64,8 @@ public:
}
bytes += bytes_in_buffer;
pos = working_buffer.begin();
pos = working_buffer.begin() + nextimpl_working_buffer_offset;
nextimpl_working_buffer_offset = 0;
}
/// Calling finalize() in the destructor of derived classes is a bad practice.
@ -164,6 +165,11 @@ protected:
bool finalized = false;
bool canceled = false;
/// The number of bytes to preserve from the initial position of `working_buffer`
/// buffer. Apparently this is an additional out-parameter for nextImpl(),
/// not a real field.
size_t nextimpl_working_buffer_offset = 0;
private:
/** Write the data in the buffer (from the beginning of the buffer to the current position).
* Throw an exception if something is wrong.

View File

@ -183,6 +183,7 @@ WriteBufferFromPocoSocket::WriteBufferFromPocoSocket(Poco::Net::Socket & socket_
, socket(socket_)
, peer_address(socket.peerAddress())
, our_address(socket.address())
, write_event(ProfileEvents::end())
, socket_description("socket (" + peer_address.toString() + ")")
{
}

View File

@ -0,0 +1,210 @@
#include <IO/WriteBufferFromPocoSocketChunked.h>
#include <Common/logger_useful.h>
#include <IO/NetUtils.h>
namespace
{
template <typename T>
void setValue(T * typed_ptr, std::type_identity_t<T> val)
{
memcpy(static_cast<void*>(typed_ptr), &val, sizeof(T));
}
}
namespace DB
{
WriteBufferFromPocoSocketChunked::WriteBufferFromPocoSocketChunked(Poco::Net::Socket & socket_, size_t buf_size)
: WriteBufferFromPocoSocketChunked(socket_, ProfileEvents::end(), buf_size)
{}
WriteBufferFromPocoSocketChunked::WriteBufferFromPocoSocketChunked(Poco::Net::Socket & socket_, const ProfileEvents::Event & write_event_, size_t buf_size)
: WriteBufferFromPocoSocket(
socket_, write_event_,
std::clamp(buf_size, sizeof(*chunk_size_ptr) + 1, static_cast<size_t>(std::numeric_limits<std::remove_reference_t<decltype(*chunk_size_ptr)>>::max()))),
log(getLogger("Protocol"))
{}
void WriteBufferFromPocoSocketChunked::enableChunked()
{
chunked = true;
/// Initialize next chunk
chunk_size_ptr = reinterpret_cast<decltype(chunk_size_ptr)>(pos);
pos += std::min(available(), sizeof(*chunk_size_ptr));
/// Pretend finishChunk() was just called to prevent sending empty chunk if finishChunk() called immediately
last_finish_chunk = chunk_size_ptr;
}
void WriteBufferFromPocoSocketChunked::finishChunk()
{
if (!chunked)
return;
if (pos <= reinterpret_cast<Position>(chunk_size_ptr) + sizeof(*chunk_size_ptr))
{
/// Prevent duplicate finish chunk (and finish chunk right after enableChunked())
if (chunk_size_ptr == last_finish_chunk)
return;
/// If current chunk is empty it means we are finishing a chunk previously sent by next(),
/// we want to convert current chunk header into end-of-chunk marker and initialize next chunk.
/// We don't need to worry about if it's the end of the buffer because next() always sends the whole buffer
/// so it should be a beginning of the buffer.
chassert(reinterpret_cast<Position>(chunk_size_ptr) == working_buffer.begin());
setValue(chunk_size_ptr, 0);
/// Initialize next chunk
chunk_size_ptr = reinterpret_cast<decltype(chunk_size_ptr)>(pos);
pos += std::min(available(), sizeof(*chunk_size_ptr));
last_finish_chunk = chunk_size_ptr;
return;
}
/// Previously finished chunk wasn't sent yet
if (last_finish_chunk == chunk_size_ptr)
{
chunk_started = false;
LOG_TEST(log, "{} -> {} Chunk send ended.", ourAddress().toString(), peerAddress().toString());
}
/// Fill up current chunk size
setValue(chunk_size_ptr, toLittleEndian(static_cast<UInt32>(pos - reinterpret_cast<Position>(chunk_size_ptr) - sizeof(*chunk_size_ptr))));
if (!chunk_started)
LOG_TEST(log, "{} -> {} Chunk send started. Message {}, size {}",
ourAddress().toString(), peerAddress().toString(),
static_cast<unsigned int>(*(reinterpret_cast<char *>(chunk_size_ptr) + sizeof(*chunk_size_ptr))),
*chunk_size_ptr);
else
{
chunk_started = false;
LOG_TEST(log, "{} -> {} Chunk send continued. Size {}", ourAddress().toString(), peerAddress().toString(), *chunk_size_ptr);
}
LOG_TEST(log, "{} -> {} Chunk send ended.", ourAddress().toString(), peerAddress().toString());
if (available() < sizeof(*chunk_size_ptr))
{
finishing = available();
pos += available();
chunk_size_ptr = reinterpret_cast<decltype(chunk_size_ptr)>(pos);
last_finish_chunk = chunk_size_ptr;
return;
}
/// Buffer end-of-chunk
setValue(reinterpret_cast<decltype(chunk_size_ptr)>(pos), 0);
pos += sizeof(*chunk_size_ptr);
/// Initialize next chunk
chunk_size_ptr = reinterpret_cast<decltype(chunk_size_ptr)>(pos);
pos += std::min(available(), sizeof(*chunk_size_ptr));
last_finish_chunk = chunk_size_ptr;
}
WriteBufferFromPocoSocketChunked::~WriteBufferFromPocoSocketChunked()
{
try
{
finalize();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
void WriteBufferFromPocoSocketChunked::nextImpl()
{
if (!chunked)
{
WriteBufferFromPocoSocket::nextImpl();
return;
}
/// next() after finishChunk at the end of the buffer
if (finishing < sizeof(*chunk_size_ptr))
{
pos -= finishing;
/// Send current chunk
WriteBufferFromPocoSocket::nextImpl();
/// Send end-of-chunk directly
UInt32 s = 0;
socketSendBytes(reinterpret_cast<const char *>(&s), sizeof(s));
finishing = sizeof(*chunk_size_ptr);
/// Initialize next chunk
chunk_size_ptr = reinterpret_cast<decltype(chunk_size_ptr)>(working_buffer.begin());
nextimpl_working_buffer_offset = sizeof(*chunk_size_ptr);
last_finish_chunk = chunk_size_ptr;
return;
}
/// Prevent sending empty chunk
if (offset() == sizeof(*chunk_size_ptr))
{
nextimpl_working_buffer_offset = sizeof(*chunk_size_ptr);
return;
}
/// Finish chunk at the end of the buffer
if (working_buffer.end() - reinterpret_cast<Position>(chunk_size_ptr) <= static_cast<std::ptrdiff_t>(sizeof(*chunk_size_ptr)))
{
pos = reinterpret_cast<Position>(chunk_size_ptr);
/// Send current chunk
WriteBufferFromPocoSocket::nextImpl();
/// Initialize next chunk
chunk_size_ptr = reinterpret_cast<decltype(chunk_size_ptr)>(working_buffer.begin());
nextimpl_working_buffer_offset = sizeof(*chunk_size_ptr);
last_finish_chunk = nullptr;
return;
}
bool initialize_last_finish_chunk = false;
if (pos - reinterpret_cast<Position>(chunk_size_ptr) == sizeof(*chunk_size_ptr)) // next() after finishChunk
{
pos -= sizeof(*chunk_size_ptr);
initialize_last_finish_chunk = true;
}
else // fill up current chunk size
{
setValue(chunk_size_ptr, toLittleEndian(static_cast<UInt32>(pos - reinterpret_cast<Position>(chunk_size_ptr) - sizeof(*chunk_size_ptr))));
if (!chunk_started)
{
chunk_started = true;
LOG_TEST(log, "{} -> {} Chunk send started. Message {}, size {}",
ourAddress().toString(), peerAddress().toString(),
static_cast<unsigned int>(*(reinterpret_cast<char *>(chunk_size_ptr) + sizeof(*chunk_size_ptr))),
*chunk_size_ptr);
}
else
LOG_TEST(log, "{} -> {} Chunk send continued. Size {}", ourAddress().toString(), peerAddress().toString(), *chunk_size_ptr);
}
/// Send current chunk
WriteBufferFromPocoSocket::nextImpl();
/// Initialize next chunk
chunk_size_ptr = reinterpret_cast<decltype(chunk_size_ptr)>(working_buffer.begin());
nextimpl_working_buffer_offset = sizeof(*chunk_size_ptr);
last_finish_chunk = initialize_last_finish_chunk ? chunk_size_ptr : nullptr;
}
void WriteBufferFromPocoSocketChunked::finalizeImpl()
{
if (chunked && offset() == sizeof(*chunk_size_ptr))
pos -= sizeof(*chunk_size_ptr);
WriteBufferFromPocoSocket::finalizeImpl();
}
}

View File

@ -0,0 +1,36 @@
#pragma once
#include <Common/logger_useful.h>
#include <IO/WriteBufferFromPocoSocket.h>
#include <algorithm>
namespace DB
{
class WriteBufferFromPocoSocketChunked: public WriteBufferFromPocoSocket
{
public:
explicit WriteBufferFromPocoSocketChunked(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
explicit WriteBufferFromPocoSocketChunked(Poco::Net::Socket & socket_, const ProfileEvents::Event & write_event_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
void enableChunked();
void finishChunk();
~WriteBufferFromPocoSocketChunked() override;
protected:
void nextImpl() override;
void finalizeImpl() override;
Poco::Net::SocketAddress peerAddress() const { return peer_address; }
Poco::Net::SocketAddress ourAddress() const { return our_address; }
private:
LoggerPtr log;
bool chunked = false;
UInt32 * last_finish_chunk = nullptr; // pointer to the last chunk header created by finishChunk
bool chunk_started = false; // chunk started flag
UInt32 * chunk_size_ptr = nullptr; // pointer to the chunk size holder in the buffer
size_t finishing = sizeof(*chunk_size_ptr); // indicates not enough buffer for end-of-chunk marker
};
}

View File

@ -82,13 +82,14 @@ struct DateTimeSubsecondPart
UInt8 digits;
};
template <typename ReturnType, bool is_us_style>
template <typename ReturnType, bool is_us_style, bool strict = false, bool is_64 = false>
ReturnType parseDateTimeBestEffortImpl(
time_t & res,
ReadBuffer & in,
const DateLUTImpl & local_time_zone,
const DateLUTImpl & utc_time_zone,
DateTimeSubsecondPart * fractional)
DateTimeSubsecondPart * fractional,
const char * allowed_date_delimiters = nullptr)
{
auto on_error = [&]<typename... FmtArgs>(int error_code [[maybe_unused]],
FormatStringHelper<FmtArgs...> fmt_string [[maybe_unused]],
@ -170,22 +171,36 @@ ReturnType parseDateTimeBestEffortImpl(
fractional->digits = 3;
readDecimalNumber<3>(fractional->value, digits + 10);
}
else if constexpr (strict)
{
/// Fractional part is not allowed.
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected fractional part");
}
return ReturnType(true);
}
else if (num_digits == 10 && !year && !has_time)
{
if (strict && month)
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: month component is duplicated");
/// This is unix timestamp.
readDecimalNumber<10>(res, digits);
return ReturnType(true);
}
else if (num_digits == 9 && !year && !has_time)
{
if (strict && month)
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: month component is duplicated");
/// This is unix timestamp.
readDecimalNumber<9>(res, digits);
return ReturnType(true);
}
else if (num_digits == 14 && !year && !has_time)
{
if (strict && month)
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: month component is duplicated");
/// This is YYYYMMDDhhmmss
readDecimalNumber<4>(year, digits);
readDecimalNumber<2>(month, digits + 4);
@ -197,6 +212,9 @@ ReturnType parseDateTimeBestEffortImpl(
}
else if (num_digits == 8 && !year)
{
if (strict && month)
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: month component is duplicated");
/// This is YYYYMMDD
readDecimalNumber<4>(year, digits);
readDecimalNumber<2>(month, digits + 4);
@ -272,6 +290,9 @@ ReturnType parseDateTimeBestEffortImpl(
else
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits after year and month: {}", num_digits);
}
if (!isSymbolIn(delimiter_after_year, allowed_date_delimiters))
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: '{}' delimiter between date parts is not allowed", delimiter_after_year);
}
}
else if (num_digits == 2 || num_digits == 1)
@ -403,9 +424,16 @@ ReturnType parseDateTimeBestEffortImpl(
else
{
if (day_of_month)
{
if (strict && hour)
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: hour component is duplicated");
hour = hour_or_day_of_month_or_month;
}
else
{
day_of_month = hour_or_day_of_month_or_month;
}
}
}
else if (num_digits != 0)
@ -446,6 +474,11 @@ ReturnType parseDateTimeBestEffortImpl(
fractional->digits = num_digits;
readDecimalNumber(fractional->value, num_digits, digits);
}
else if (strict)
{
/// Fractional part is not allowed.
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected fractional part");
}
}
else if (c == '+' || c == '-')
{
@ -582,12 +615,24 @@ ReturnType parseDateTimeBestEffortImpl(
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: neither Date nor Time was parsed successfully");
if (!day_of_month)
{
if constexpr (strict)
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: day of month is required");
day_of_month = 1;
}
if (!month)
{
if constexpr (strict)
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: month is required");
month = 1;
}
if (!year)
{
if constexpr (strict)
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: year is required");
/// If year is not specified, it will be the current year if the date is unknown or not greater than today,
/// otherwise it will be the previous year.
/// This convoluted logic is needed to parse the syslog format, which looks as follows: "Mar 3 01:33:48".
@ -641,6 +686,20 @@ ReturnType parseDateTimeBestEffortImpl(
}
};
if constexpr (strict)
{
if constexpr (is_64)
{
if (year < 1900)
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime64: year {} is less than minimum supported year 1900", year);
}
else
{
if (year < 1970)
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: year {} is less than minimum supported year 1970", year);
}
}
if (has_time_zone_offset)
{
res = utc_time_zone.makeDateTime(year, month, day_of_month, hour, minute, second);
@ -654,20 +713,20 @@ ReturnType parseDateTimeBestEffortImpl(
return ReturnType(true);
}
template <typename ReturnType, bool is_us_style>
ReturnType parseDateTime64BestEffortImpl(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone)
template <typename ReturnType, bool is_us_style, bool strict = false>
ReturnType parseDateTime64BestEffortImpl(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters = nullptr)
{
time_t whole;
DateTimeSubsecondPart subsecond = {0, 0}; // needs to be explicitly initialized sine it could be missing from input string
if constexpr (std::is_same_v<ReturnType, bool>)
{
if (!parseDateTimeBestEffortImpl<bool, is_us_style>(whole, in, local_time_zone, utc_time_zone, &subsecond))
if (!parseDateTimeBestEffortImpl<bool, is_us_style, strict, true>(whole, in, local_time_zone, utc_time_zone, &subsecond, allowed_date_delimiters))
return false;
}
else
{
parseDateTimeBestEffortImpl<ReturnType, is_us_style>(whole, in, local_time_zone, utc_time_zone, &subsecond);
parseDateTimeBestEffortImpl<ReturnType, is_us_style, strict, true>(whole, in, local_time_zone, utc_time_zone, &subsecond, allowed_date_delimiters);
}
@ -730,4 +789,24 @@ bool tryParseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer &
return parseDateTime64BestEffortImpl<bool, true>(res, scale, in, local_time_zone, utc_time_zone);
}
bool tryParseDateTimeBestEffortStrict(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters)
{
return parseDateTimeBestEffortImpl<bool, false, true>(res, in, local_time_zone, utc_time_zone, nullptr, allowed_date_delimiters);
}
bool tryParseDateTimeBestEffortUSStrict(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters)
{
return parseDateTimeBestEffortImpl<bool, true, true>(res, in, local_time_zone, utc_time_zone, nullptr, allowed_date_delimiters);
}
bool tryParseDateTime64BestEffortStrict(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters)
{
return parseDateTime64BestEffortImpl<bool, false, true>(res, scale, in, local_time_zone, utc_time_zone, allowed_date_delimiters);
}
bool tryParseDateTime64BestEffortUSStrict(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters)
{
return parseDateTime64BestEffortImpl<bool, true, true>(res, scale, in, local_time_zone, utc_time_zone, allowed_date_delimiters);
}
}

View File

@ -63,4 +63,12 @@ void parseDateTime64BestEffort(DateTime64 & res, UInt32 scale, ReadBuffer & in,
bool tryParseDateTime64BestEffort(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
void parseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
bool tryParseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
/// More strict version of best effort parsing. Requires day, month and year to be present, checks for allowed
/// delimiters between date components, makes additional correctness checks. Used in schema inference if date times.
bool tryParseDateTimeBestEffortStrict(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters);
bool tryParseDateTimeBestEffortUSStrict(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters);
bool tryParseDateTime64BestEffortStrict(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters);
bool tryParseDateTime64BestEffortUSStrict(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters);
}

View File

@ -206,11 +206,6 @@ TEST(S3UriTest, validPatterns)
}
}
TEST_P(S3UriTest, invalidPatterns)
{
ASSERT_ANY_THROW(S3::URI new_uri(GetParam()));
}
TEST(S3UriTest, versionIdChecks)
{
for (const auto& test_case : TestCases)
@ -223,19 +218,5 @@ TEST(S3UriTest, versionIdChecks)
}
}
INSTANTIATE_TEST_SUITE_P(
S3,
S3UriTest,
testing::Values(
"https:///",
"https://.s3.amazonaws.com/key",
"https://s3.amazonaws.com/key",
"https://jokserfn.s3amazonaws.com/key",
"https://s3.amazonaws.com//",
"https://amazonaws.com/",
"https://amazonaws.com//",
"https://amazonaws.com//key"));
}
#endif

View File

@ -804,7 +804,8 @@ bool FileCache::tryReserve(
const size_t size,
FileCacheReserveStat & reserve_stat,
const UserInfo & user,
size_t lock_wait_timeout_milliseconds)
size_t lock_wait_timeout_milliseconds,
std::string & failure_reason)
{
ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::FilesystemCacheReserveMicroseconds);
@ -817,6 +818,7 @@ bool FileCache::tryReserve(
if (cache_is_being_resized.load(std::memory_order_relaxed))
{
ProfileEvents::increment(ProfileEvents::FilesystemCacheFailToReserveSpaceBecauseOfCacheResize);
failure_reason = "cache is being resized";
return false;
}
@ -824,6 +826,7 @@ bool FileCache::tryReserve(
if (!cache_lock)
{
ProfileEvents::increment(ProfileEvents::FilesystemCacheFailToReserveSpaceBecauseOfLockContention);
failure_reason = "cache contention";
return false;
}
@ -847,6 +850,7 @@ bool FileCache::tryReserve(
LOG_TEST(log, "Query limit exceeded, space reservation failed, "
"recache_on_query_limit_exceeded is disabled (while reserving for {}:{})",
file_segment.key(), file_segment.offset());
failure_reason = "query limit exceeded";
return false;
}
@ -877,6 +881,7 @@ bool FileCache::tryReserve(
if (!query_priority->collectCandidatesForEviction(
size, required_elements_num, reserve_stat, eviction_candidates, {}, user.user_id, cache_lock))
{
failure_reason = "cannot evict enough space for query limit";
return false;
}
@ -891,11 +896,15 @@ bool FileCache::tryReserve(
if (!main_priority->collectCandidatesForEviction(
size, required_elements_num, reserve_stat, eviction_candidates, queue_iterator, user.user_id, cache_lock))
{
failure_reason = "cannot evict enough space";
return false;
}
if (!file_segment.getKeyMetadata()->createBaseDirectory())
{
failure_reason = "not enough space on device";
return false;
}
if (eviction_candidates.size() > 0)
{

View File

@ -165,7 +165,8 @@ public:
size_t size,
FileCacheReserveStat & stat,
const UserInfo & user,
size_t lock_wait_timeout_milliseconds);
size_t lock_wait_timeout_milliseconds,
std::string & failure_reason);
std::vector<FileSegment::Info> getFileSegmentInfos(const UserID & user_id);

View File

@ -502,7 +502,11 @@ LockedKeyPtr FileSegment::lockKeyMetadata(bool assert_exists) const
return metadata->tryLock();
}
bool FileSegment::reserve(size_t size_to_reserve, size_t lock_wait_timeout_milliseconds, FileCacheReserveStat * reserve_stat)
bool FileSegment::reserve(
size_t size_to_reserve,
size_t lock_wait_timeout_milliseconds,
std::string & failure_reason,
FileCacheReserveStat * reserve_stat)
{
if (!size_to_reserve)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Zero space reservation is not allowed");
@ -554,7 +558,7 @@ bool FileSegment::reserve(size_t size_to_reserve, size_t lock_wait_timeout_milli
if (!reserve_stat)
reserve_stat = &dummy_stat;
bool reserved = cache->tryReserve(*this, size_to_reserve, *reserve_stat, getKeyMetadata()->user, lock_wait_timeout_milliseconds);
bool reserved = cache->tryReserve(*this, size_to_reserve, *reserve_stat, getKeyMetadata()->user, lock_wait_timeout_milliseconds, failure_reason);
if (!reserved)
setDownloadFailedUnlocked(lock());

View File

@ -201,7 +201,11 @@ public:
/// Try to reserve exactly `size` bytes (in addition to the getDownloadedSize() bytes already downloaded).
/// Returns true if reservation was successful, false otherwise.
bool reserve(size_t size_to_reserve, size_t lock_wait_timeout_milliseconds, FileCacheReserveStat * reserve_stat = nullptr);
bool reserve(
size_t size_to_reserve,
size_t lock_wait_timeout_milliseconds,
std::string & failure_reason,
FileCacheReserveStat * reserve_stat = nullptr);
/// Write data into reserved space.
void write(char * from, size_t size, size_t offset_in_file);

View File

@ -705,7 +705,8 @@ void CacheMetadata::downloadImpl(FileSegment & file_segment, std::optional<Memor
{
auto size = reader->available();
if (!file_segment.reserve(size, reserve_space_lock_wait_timeout_milliseconds))
std::string failure_reason;
if (!file_segment.reserve(size, reserve_space_lock_wait_timeout_milliseconds, failure_reason))
{
LOG_TEST(
log, "Failed to reserve space during background download "

View File

@ -75,7 +75,8 @@ void WriteBufferToFileSegment::nextImpl()
FileCacheReserveStat reserve_stat;
/// In case of an error, we don't need to finalize the file segment
/// because it will be deleted soon and completed in the holder's destructor.
bool ok = file_segment->reserve(bytes_to_write, reserve_space_lock_wait_timeout_milliseconds, &reserve_stat);
std::string failure_reason;
bool ok = file_segment->reserve(bytes_to_write, reserve_space_lock_wait_timeout_milliseconds, failure_reason, &reserve_stat);
if (!ok)
{
@ -84,9 +85,10 @@ void WriteBufferToFileSegment::nextImpl()
reserve_stat_msg += fmt::format("{} hold {}, can release {}; ",
toString(kind), ReadableSize(stat.non_releasable_size), ReadableSize(stat.releasable_size));
throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Failed to reserve {} bytes for {}: {}(segment info: {})",
throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Failed to reserve {} bytes for {}: reason {}, {}(segment info: {})",
bytes_to_write,
file_segment->getKind() == FileSegmentKind::Temporary ? "temporary file" : "the file in cache",
failure_reason,
reserve_stat_msg,
file_segment->getInfoForLog()
);

View File

@ -113,6 +113,9 @@ Cluster::Address::Address(
secure = ConfigHelper::getBool(config, config_prefix + ".secure", false, /* empty_as */true) ? Protocol::Secure::Enable : Protocol::Secure::Disable;
priority = Priority{config.getInt(config_prefix + ".priority", 1)};
proto_send_chunked = config.getString(config_prefix + ".proto_caps.send", "notchunked");
proto_recv_chunked = config.getString(config_prefix + ".proto_caps.recv", "notchunked");
const char * port_type = secure == Protocol::Secure::Enable ? "tcp_port_secure" : "tcp_port";
auto default_port = config.getInt(port_type, 0);
@ -425,7 +428,9 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config,
auto pool = ConnectionPoolFactory::instance().get(
static_cast<unsigned>(settings.distributed_connections_pool_size),
address.host_name, address.port,
address.default_database, address.user, address.password, address.quota_key,
address.default_database, address.user, address.password,
address.proto_send_chunked, address.proto_recv_chunked,
address.quota_key,
address.cluster, address.cluster_secret,
"server", address.compression,
address.secure, address.priority);
@ -589,6 +594,8 @@ void Cluster::addShard(
replica.default_database,
replica.user,
replica.password,
replica.proto_send_chunked,
replica.proto_recv_chunked,
replica.quota_key,
replica.cluster,
replica.cluster_secret,
@ -744,6 +751,8 @@ Cluster::Cluster(Cluster::ReplicasAsShardsTag, const Cluster & from, const Setti
address.default_database,
address.user,
address.password,
address.proto_send_chunked,
address.proto_recv_chunked,
address.quota_key,
address.cluster,
address.cluster_secret,

View File

@ -114,6 +114,8 @@ public:
UInt16 port{0};
String user;
String password;
String proto_send_chunked = "notchunked";
String proto_recv_chunked = "notchunked";
String quota_key;
/// For inter-server authorization

View File

@ -15,48 +15,115 @@ JoinOnKeyColumns::JoinOnKeyColumns(const Block & block, const Names & key_names_
{
}
template<> void AddedColumns<false>::buildOutput()
{
}
template<>
void AddedColumns<false>::buildOutput() {}
template<>
void AddedColumns<false>::buildJoinGetOutput() {}
template<>
template<bool from_row_list>
void AddedColumns<false>::buildOutputFromBlocks() {}
template<>
void AddedColumns<true>::buildOutput()
{
for (size_t i = 0; i < this->size(); ++i)
if (!output_by_row_list)
buildOutputFromBlocks<false>();
else
{
auto& col = columns[i];
size_t default_count = 0;
auto apply_default = [&]()
if (join_data_avg_perkey_rows < output_by_row_list_threshold)
buildOutputFromBlocks<true>();
else
{
if (default_count > 0)
for (size_t i = 0; i < this->size(); ++i)
{
JoinCommon::addDefaultValues(*col, type_name[i].type, default_count);
default_count = 0;
}
};
for (size_t j = 0; j < lazy_output.blocks.size(); ++j)
{
if (!lazy_output.blocks[j])
{
default_count++;
continue;
}
apply_default();
const auto & column_from_block = reinterpret_cast<const Block *>(lazy_output.blocks[j])->getByPosition(right_indexes[i]);
/// If it's joinGetOrNull, we need to wrap not-nullable columns in StorageJoin.
if (is_join_get)
{
if (auto * nullable_col = typeid_cast<ColumnNullable *>(col.get());
nullable_col && !column_from_block.column->isNullable())
auto & col = columns[i];
for (auto row_ref_i : lazy_output.row_refs)
{
nullable_col->insertFromNotNullable(*column_from_block.column, lazy_output.row_nums[j]);
continue;
if (row_ref_i)
{
const RowRefList * row_ref_list = reinterpret_cast<const RowRefList *>(row_ref_i);
for (auto it = row_ref_list->begin(); it.ok(); ++it)
col->insertFrom(*it->block->getByPosition(right_indexes[i]).column, it->row_num);
}
else
type_name[i].type->insertDefaultInto(*col);
}
}
col->insertFrom(*column_from_block.column, lazy_output.row_nums[j]);
}
apply_default();
}
}
template<>
void AddedColumns<true>::buildJoinGetOutput()
{
for (size_t i = 0; i < this->size(); ++i)
{
auto & col = columns[i];
for (auto row_ref_i : lazy_output.row_refs)
{
if (!row_ref_i)
{
type_name[i].type->insertDefaultInto(*col);
continue;
}
const auto * row_ref = reinterpret_cast<const RowRef *>(row_ref_i);
const auto & column_from_block = row_ref->block->getByPosition(right_indexes[i]);
if (auto * nullable_col = typeid_cast<ColumnNullable *>(col.get()); nullable_col && !column_from_block.column->isNullable())
nullable_col->insertFromNotNullable(*column_from_block.column, row_ref->row_num);
else
col->insertFrom(*column_from_block.column, row_ref->row_num);
}
}
}
template<>
template<bool from_row_list>
void AddedColumns<true>::buildOutputFromBlocks()
{
if (this->size() == 0)
return;
std::vector<const Block *> blocks;
std::vector<UInt32> row_nums;
blocks.reserve(lazy_output.row_refs.size());
row_nums.reserve(lazy_output.row_refs.size());
for (auto row_ref_i : lazy_output.row_refs)
{
if (row_ref_i)
{
if constexpr (from_row_list)
{
const RowRefList * row_ref_list = reinterpret_cast<const RowRefList *>(row_ref_i);
for (auto it = row_ref_list->begin(); it.ok(); ++it)
{
blocks.emplace_back(it->block);
row_nums.emplace_back(it->row_num);
}
}
else
{
const RowRef * row_ref = reinterpret_cast<const RowRefList *>(row_ref_i);
blocks.emplace_back(row_ref->block);
row_nums.emplace_back(row_ref->row_num);
}
}
else
{
blocks.emplace_back(nullptr);
row_nums.emplace_back(0);
}
}
for (size_t i = 0; i < this->size(); ++i)
{
auto & col = columns[i];
for (size_t j = 0; j < blocks.size(); ++j)
{
if (blocks[j])
col->insertFrom(*blocks[j]->getByPosition(right_indexes[i]).column, row_nums[j]);
else
type_name[i].type->insertDefaultInto(*col);
}
}
}
@ -72,29 +139,27 @@ void AddedColumns<false>::applyLazyDefaults()
}
template<>
void AddedColumns<true>::applyLazyDefaults()
{
}
void AddedColumns<true>::applyLazyDefaults() {}
template <>
void AddedColumns<false>::appendFromBlock(const Block & block, size_t row_num,const bool has_defaults)
void AddedColumns<false>::appendFromBlock(const RowRef * row_ref, const bool has_defaults)
{
if (has_defaults)
applyLazyDefaults();
#ifndef NDEBUG
checkBlock(block);
checkBlock(*row_ref->block);
#endif
if (is_join_get)
{
size_t right_indexes_size = right_indexes.size();
for (size_t j = 0; j < right_indexes_size; ++j)
{
const auto & column_from_block = block.getByPosition(right_indexes[j]);
const auto & column_from_block = row_ref->block->getByPosition(right_indexes[j]);
if (auto * nullable_col = nullable_column_ptrs[j])
nullable_col->insertFromNotNullable(*column_from_block.column, row_num);
nullable_col->insertFromNotNullable(*column_from_block.column, row_ref->row_num);
else
columns[j]->insertFrom(*column_from_block.column, row_num);
columns[j]->insertFrom(*column_from_block.column, row_ref->row_num);
}
}
else
@ -102,22 +167,21 @@ void AddedColumns<false>::appendFromBlock(const Block & block, size_t row_num,co
size_t right_indexes_size = right_indexes.size();
for (size_t j = 0; j < right_indexes_size; ++j)
{
const auto & column_from_block = block.getByPosition(right_indexes[j]);
columns[j]->insertFrom(*column_from_block.column, row_num);
const auto & column_from_block = row_ref->block->getByPosition(right_indexes[j]);
columns[j]->insertFrom(*column_from_block.column, row_ref->row_num);
}
}
}
template <>
void AddedColumns<true>::appendFromBlock(const Block & block, size_t row_num, bool)
void AddedColumns<true>::appendFromBlock(const RowRef * row_ref, bool)
{
#ifndef NDEBUG
checkBlock(block);
checkBlock(*row_ref->block);
#endif
if (has_columns_to_add)
{
lazy_output.blocks.emplace_back(reinterpret_cast<UInt64>(&block));
lazy_output.row_nums.emplace_back(static_cast<uint32_t>(row_num));
lazy_output.row_refs.emplace_back(reinterpret_cast<UInt64>(row_ref));
}
}
template<>
@ -131,8 +195,7 @@ void AddedColumns<true>::appendDefaultRow()
{
if (has_columns_to_add)
{
lazy_output.blocks.emplace_back(0);
lazy_output.row_nums.emplace_back(0);
lazy_output.row_refs.emplace_back(0);
}
}
}

View File

@ -50,8 +50,7 @@ public:
struct LazyOutput
{
PaddedPODArray<UInt64> blocks;
PaddedPODArray<UInt32> row_nums;
PaddedPODArray<UInt64> row_refs;
};
AddedColumns(
@ -76,8 +75,7 @@ public:
if constexpr (lazy)
{
has_columns_to_add = num_columns_to_add > 0;
lazy_output.blocks.reserve(rows_to_add);
lazy_output.row_nums.reserve(rows_to_add);
lazy_output.row_refs.reserve(rows_to_add);
}
columns.reserve(num_columns_to_add);
@ -115,18 +113,22 @@ public:
if (columns[j]->isNullable() && !saved_column->isNullable())
nullable_column_ptrs[j] = typeid_cast<ColumnNullable *>(columns[j].get());
}
join_data_avg_perkey_rows = join.getJoinedData()->avgPerKeyRows();
output_by_row_list_threshold = join.getTableJoin().outputByRowListPerkeyRowsThreshold();
}
size_t size() const { return columns.size(); }
void buildOutput();
void buildJoinGetOutput();
ColumnWithTypeAndName moveColumn(size_t i)
{
return ColumnWithTypeAndName(std::move(columns[i]), type_name[i].type, type_name[i].qualified_name);
}
void appendFromBlock(const Block & block, size_t row_num, bool has_default);
void appendFromBlock(const RowRef * row_ref, bool has_default);
void appendDefaultRow();
@ -134,6 +136,8 @@ public:
const IColumn & leftAsofKey() const { return *left_asof_key; }
static constexpr bool isLazy() { return lazy; }
Block left_block;
std::vector<JoinOnKeyColumns> join_on_keys;
ExpressionActionsPtr additional_filter_expression;
@ -142,6 +146,9 @@ public:
size_t rows_to_add;
std::unique_ptr<IColumn::Offsets> offsets_to_replicate;
bool need_filter = false;
bool output_by_row_list = false;
size_t join_data_avg_perkey_rows = 0;
size_t output_by_row_list_threshold = 0;
IColumn::Filter filter;
void reserve(bool need_replicate)
@ -212,15 +219,22 @@ private:
columns.back()->reserve(src_column.column->size());
type_name.emplace_back(src_column.type, src_column.name, qualified_name);
}
/** Build output from the blocks that extract from `RowRef` or `RowRefList`, to avoid block cache miss which may cause performance slow down.
* And This problem would happen it we directly build output from `RowRef` or `RowRefList`.
*/
template<bool from_row_list>
void buildOutputFromBlocks();
};
/// Adapter class to pass into addFoundRowAll
/// In joinRightColumnsWithAdditionalFilter we don't want to add rows directly into AddedColumns,
/// because they need to be filtered by additional_filter_expression.
class PreSelectedRows : public std::vector<RowRef>
class PreSelectedRows : public std::vector<const RowRef *>
{
public:
void appendFromBlock(const Block & block, size_t row_num, bool /* has_default */) { this->emplace_back(&block, row_num); }
void appendFromBlock(const RowRef * row_ref, bool /* has_default */) { this->emplace_back(row_ref); }
static constexpr bool isLazy() { return false; }
};
}

View File

@ -495,7 +495,7 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits)
}
size_t rows = source_block.rows();
data->rows_to_join += rows;
const auto & right_key_names = table_join->getAllNames(JoinTableSide::Right);
ColumnPtrMap all_key_columns(right_key_names.size());
for (const auto & column_name : right_key_names)
@ -647,7 +647,7 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits)
total_bytes = getTotalByteCount();
}
}
data->keys_to_join = total_rows;
shrinkStoredBlocksToFit(total_bytes);
return table_join->sizeLimits().check(total_rows, total_bytes, "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED);

View File

@ -345,6 +345,18 @@ public:
size_t blocks_allocated_size = 0;
size_t blocks_nullmaps_allocated_size = 0;
/// Number of rows of right table to join
size_t rows_to_join = 0;
/// Number of keys of right table to join
size_t keys_to_join = 0;
size_t avgPerKeyRows() const
{
if (keys_to_join == 0)
return 0;
return rows_to_join / keys_to_join;
}
};
using RightTableDataPtr = std::shared_ptr<RightTableData>;

View File

@ -83,6 +83,7 @@ public:
const Block & block_with_columns_to_add,
const MapsTemplateVector & maps_,
bool is_join_get = false);
private:
template <typename KeyGetter, bool is_asof_join>
static KeyGetter createKeyGetter(const ColumnRawPtrs & key_columns, const Sizes & key_sizes);
@ -128,7 +129,7 @@ private:
template <typename AddedColumns>
static ColumnPtr buildAdditionalFilter(
size_t left_start_row,
const std::vector<RowRef> & selected_rows,
const std::vector<const RowRef *> & selected_rows,
const std::vector<size_t> & row_replicate_offset,
AddedColumns & added_columns);

View File

@ -95,7 +95,10 @@ Block HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinBlockImpl(
added_columns.join_on_keys.clear();
Block remaining_block = sliceBlock(block, num_joined);
added_columns.buildOutput();
if (is_join_get)
added_columns.buildJoinGetOutput();
else
added_columns.buildOutput();
for (size_t i = 0; i < added_columns.size(); ++i)
block.insert(added_columns.moveColumn(i));
@ -339,6 +342,8 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumns(
size_t rows = added_columns.rows_to_add;
if constexpr (need_filter)
added_columns.filter = IColumn::Filter(rows, 0);
if constexpr (!flag_per_row && (STRICTNESS == JoinStrictness::All || (STRICTNESS == JoinStrictness::Semi && KIND == JoinKind::Right)))
added_columns.output_by_row_list = true;
Arena pool;
@ -354,8 +359,8 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumns(
{
if (unlikely(current_offset >= max_joined_block_rows))
{
added_columns.offsets_to_replicate->resize_assume_reserved(i);
added_columns.filter.resize_assume_reserved(i);
added_columns.offsets_to_replicate->resize(i);
added_columns.filter.resize(i);
break;
}
}
@ -381,15 +386,15 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumns(
const IColumn & left_asof_key = added_columns.leftAsofKey();
auto row_ref = mapped->findAsof(left_asof_key, i);
if (row_ref.block)
if (row_ref && row_ref->block)
{
setUsed<need_filter>(added_columns.filter, i);
if constexpr (flag_per_row)
used_flags.template setUsed<join_features.need_flags, flag_per_row>(row_ref.block, row_ref.row_num, 0);
used_flags.template setUsed<join_features.need_flags, flag_per_row>(row_ref->block, row_ref->row_num, 0);
else
used_flags.template setUsed<join_features.need_flags, flag_per_row>(find_result);
added_columns.appendFromBlock(*row_ref.block, row_ref.row_num, join_features.add_missing);
added_columns.appendFromBlock(row_ref, join_features.add_missing);
}
else
addNotFoundRow<join_features.add_missing, join_features.need_replication>(added_columns, current_offset);
@ -420,7 +425,7 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumns(
if (used_once)
{
setUsed<need_filter>(added_columns.filter, i);
added_columns.appendFromBlock(*mapped.block, mapped.row_num, join_features.add_missing);
added_columns.appendFromBlock(&mapped, join_features.add_missing);
}
break;
@ -438,7 +443,7 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumns(
{
setUsed<need_filter>(added_columns.filter, i);
used_flags.template setUsed<join_features.need_flags, flag_per_row>(find_result);
added_columns.appendFromBlock(*mapped.block, mapped.row_num, join_features.add_missing);
added_columns.appendFromBlock(&mapped, join_features.add_missing);
if (join_features.is_any_or_semi_join)
{
@ -477,7 +482,7 @@ template <JoinKind KIND, JoinStrictness STRICTNESS, typename MapsTemplate>
template <typename AddedColumns>
ColumnPtr HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::buildAdditionalFilter(
size_t left_start_row,
const std::vector<RowRef> & selected_rows,
const std::vector<const RowRef *> & selected_rows,
const std::vector<size_t> & row_replicate_offset,
AddedColumns & added_columns)
{
@ -489,7 +494,7 @@ ColumnPtr HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::buildAdditionalFilter
result_column = ColumnUInt8::create();
break;
}
const Block & sample_right_block = *selected_rows.begin()->block;
const Block & sample_right_block = *((*selected_rows.begin())->block);
if (!sample_right_block || !added_columns.additional_filter_expression)
{
auto filter = ColumnUInt8::create();
@ -519,8 +524,8 @@ ColumnPtr HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::buildAdditionalFilter
auto new_col = col.column->cloneEmpty();
for (const auto & selected_row : selected_rows)
{
const auto & src_col = selected_row.block->getByPosition(right_col_pos);
new_col->insertFrom(*src_col.column, selected_row.row_num);
const auto & src_col = selected_row->block->getByPosition(right_col_pos);
new_col->insertFrom(*src_col.column, selected_row->row_num);
}
executed_block.insert({std::move(new_col), col.type, col.name});
}
@ -700,26 +705,24 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumnsWithAddt
{
// For inner join, we need mark each right row'flag, because we only use each right row once.
auto used_once = used_flags.template setUsedOnce<join_features.need_flags, true>(
selected_right_row_it->block, selected_right_row_it->row_num, 0);
(*selected_right_row_it)->block, (*selected_right_row_it)->row_num, 0);
if (used_once)
{
any_matched = true;
total_added_rows += 1;
added_columns.appendFromBlock(
*selected_right_row_it->block, selected_right_row_it->row_num, join_features.add_missing);
added_columns.appendFromBlock(*selected_right_row_it, join_features.add_missing);
}
}
}
else
{
auto used_once = used_flags.template setUsedOnce<join_features.need_flags, true>(
selected_right_row_it->block, selected_right_row_it->row_num, 0);
(*selected_right_row_it)->block, (*selected_right_row_it)->row_num, 0);
if (used_once)
{
any_matched = true;
total_added_rows += 1;
added_columns.appendFromBlock(
*selected_right_row_it->block, selected_right_row_it->row_num, join_features.add_missing);
added_columns.appendFromBlock(*selected_right_row_it, join_features.add_missing);
}
}
}
@ -727,16 +730,14 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumnsWithAddt
{
any_matched = true;
if constexpr (join_features.right && join_features.need_flags)
used_flags.template setUsed<true, true>(selected_right_row_it->block, selected_right_row_it->row_num, 0);
used_flags.template setUsed<true, true>((*selected_right_row_it)->block, (*selected_right_row_it)->row_num, 0);
}
else
{
any_matched = true;
total_added_rows += 1;
added_columns.appendFromBlock(
*selected_right_row_it->block, selected_right_row_it->row_num, join_features.add_missing);
used_flags.template setUsed<join_features.need_flags, true>(
selected_right_row_it->block, selected_right_row_it->row_num, 0);
added_columns.appendFromBlock(*selected_right_row_it, join_features.add_missing);
used_flags.template setUsed<join_features.need_flags, true>((*selected_right_row_it)->block, (*selected_right_row_it)->row_num, 0);
}
}
@ -756,8 +757,7 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumnsWithAddt
if (filter_flags[replicated_row])
{
any_matched = true;
added_columns.appendFromBlock(
*selected_right_row_it->block, selected_right_row_it->row_num, join_features.add_missing);
added_columns.appendFromBlock(*selected_right_row_it, join_features.add_missing);
total_added_rows += 1;
}
++selected_right_row_it;
@ -767,8 +767,7 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumnsWithAddt
if (filter_flags[replicated_row])
{
any_matched = true;
added_columns.appendFromBlock(
*selected_right_row_it->block, selected_right_row_it->row_num, join_features.add_missing);
added_columns.appendFromBlock(*selected_right_row_it, join_features.add_missing);
total_added_rows += 1;
selected_right_row_it = selected_right_row_it + row_replicate_offset[i] - replicated_row;
break;

View File

@ -18,11 +18,25 @@ struct JoinFeatures
static constexpr bool inner = KIND == JoinKind::Inner;
static constexpr bool full = KIND == JoinKind::Full;
/** Whether we may need duplicate rows from the left table.
* For example, when we have row (key1, attr1) in left table
* and rows (key1, attr2), (key1, attr3) in right table,
* then we need to duplicate row (key1, attr1) for each of joined rows from right table, so result will be
* (key1, attr1, key1, attr2)
* (key1, attr1, key1, attr3)
*/
static constexpr bool need_replication = is_all_join || (is_any_join && right) || (is_semi_join && right);
/// Whether we need to filter rows from the left table that do not have matches in the right table.
static constexpr bool need_filter = !need_replication && (inner || right || (is_semi_join && left) || (is_anti_join && left));
/// Whether we need to add default values for columns from the left table.
static constexpr bool add_missing = (left || full) && !is_semi_join;
/// Whether we need to store flags for rows from the right table table
/// that indicates if they have matches in the left table.
static constexpr bool need_flags = MapGetter<KIND, STRICTNESS, std::is_same_v<std::decay_t<Map>, HashJoin::MapsAll>>::flagged;
static constexpr bool is_maps_all = std::is_same_v<std::decay_t<Map>, HashJoin::MapsAll>;
};

View File

@ -104,7 +104,7 @@ void addFoundRowAll(
{
if (!known_rows.isKnown(std::make_pair(it->block, it->row_num)))
{
added.appendFromBlock(*it->block, it->row_num, false);
added.appendFromBlock(*it, false);
++current_offset;
if (!new_known_rows_ptr)
{
@ -124,11 +124,16 @@ void addFoundRowAll(
known_rows.add(std::cbegin(*new_known_rows_ptr), std::cend(*new_known_rows_ptr));
}
}
else if constexpr (AddedColumns::isLazy())
{
added.appendFromBlock(&mapped, false);
current_offset += mapped.rows;
}
else
{
for (auto it = mapped.begin(); it.ok(); ++it)
{
added.appendFromBlock(*it->block, it->row_num, false);
added.appendFromBlock(*it, false);
++current_offset;
}
}

View File

@ -1944,6 +1944,8 @@ BlockIO InterpreterCreateQuery::execute()
FunctionNameNormalizer::visit(query_ptr.get());
auto & create = query_ptr->as<ASTCreateQuery &>();
create.if_not_exists |= getContext()->getSettingsRef().create_if_not_exists;
bool is_create_database = create.database && !create.table;
if (!create.cluster.empty() && !maybeRemoveOnCluster(query_ptr, getContext()))
{

Some files were not shown because too many files have changed in this diff Show More