Merge branch 'master' into fix-nullable-schema-inference

Kruglov Pavel 2024-08-16 15:39:34 +02:00 committed by GitHub
commit 6f7e4ce3aa
730 changed files with 31078 additions and 7267 deletions

18
.github/actions/debug/action.yml vendored Normal file
View File

@ -0,0 +1,18 @@
name: DebugInfo
description: Prints workflow debug info
runs:
using: "composite"
steps:
- name: Print envs
shell: bash
run: |
echo "::group::Envs"
env
echo "::endgroup::"
- name: Print Event.json
shell: bash
run: |
echo "::group::Event.json"
python3 -m json.tool "$GITHUB_EVENT_PATH"
echo "::endgroup::"

109
.github/workflows/auto_releases.yml vendored Normal file
View File

@ -0,0 +1,109 @@
name: AutoReleases
env:
PYTHONUNBUFFERED: 1
concurrency:
group: autoreleases
on:
# schedule:
# - cron: '0 9 * * *'
workflow_dispatch:
inputs:
dry-run:
description: 'Dry run'
required: false
default: true
type: boolean
jobs:
AutoReleaseInfo:
runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.info.outputs.AUTO_RELEASE_PARAMS }}
dry_run: ${{ steps.info.outputs.DRY_RUN }}
steps:
- name: Debug Info
uses: ./.github/actions/debug
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
ROBOT_CLICKHOUSE_SSH_KEY<<RCSK
${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
RCSK
EOF
echo "DRY_RUN=true" >> "$GITHUB_ENV"
- name: Check out repository code
uses: ClickHouse/checkout@v1
- name: Prepare Info
id: info
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 auto_release.py --prepare
echo "::group::Auto Release Info"
python3 -m json.tool /tmp/autorelease_info.json
echo "::endgroup::"
{
echo 'AUTO_RELEASE_PARAMS<<EOF'
cat /tmp/autorelease_info.json
echo 'EOF'
} >> "$GITHUB_ENV"
{
echo 'AUTO_RELEASE_PARAMS<<EOF'
cat /tmp/autorelease_info.json
echo 'EOF'
} >> "$GITHUB_OUTPUT"
echo "DRY_RUN=true" >> "$GITHUB_OUTPUT"
- name: Post Release Branch statuses
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 auto_release.py --post-status
- name: Clean up
uses: ./.github/actions/clean
Release_0:
needs: AutoReleaseInfo
name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[0].release_branch }}
if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[0] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[0].ready }}
uses: ./.github/workflows/create_release.yml
with:
ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[0].commit_sha }}
type: patch
dry-run: ${{ needs.AutoReleaseInfo.outputs.dry_run }}
#
# Release_1:
# needs: [AutoReleaseInfo, Release_0]
# name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[1].release_branch }}
# if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[1] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[1].ready }}
# uses: ./.github/workflows/create_release.yml
# with:
# ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[1].commit_sha }}
# type: patch
# dry-run: ${{ env.DRY_RUN }}
#
# Release_2:
# needs: [AutoReleaseInfo, Release_1]
# name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[2].release_branch }}
# if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[2] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[2].ready }}
# uses: ./.github/workflows/create_release.yml
# with:
# ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[2].commit_sha }}
# type: patch
# dry-run: ${{ env.DRY_RUN }}
#
# Release_3:
# needs: [AutoReleaseInfo, Release_2]
# name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[3].release_branch }}
# if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[3] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[3].ready }}
# uses: ./.github/workflows/create_release.yml
# with:
# ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[3].commit_sha }}
# type: patch
# dry-run: ${{ env.DRY_RUN }}
# - name: Post Slack Message
# if: ${{ !cancelled() }}
# run: |
# cd "$GITHUB_WORKSPACE/tests/ci"
# python3 auto_release.py --post-auto-release-complete --wf-status ${{ job.status }}

View File

@ -2,6 +2,7 @@ name: CreateRelease
concurrency:
group: release
'on':
workflow_dispatch:
inputs:
@ -26,6 +27,26 @@ concurrency:
required: false
default: false
type: boolean
workflow_call:
inputs:
ref:
description: 'Git reference (branch or commit sha) from which to create the release'
required: true
type: string
type:
description: 'The type of release: "new" for a new release or "patch" for a patch release'
required: true
type: string
only-repo:
description: 'Run only repos updates including docker (repo-recovery, tests)'
required: false
default: false
type: boolean
dry-run:
description: 'Dry run'
required: false
default: false
type: boolean
jobs:
CreateRelease:
@ -101,6 +122,7 @@ jobs:
--volume=".:/wd" --workdir="/wd" \
clickhouse/style-test \
./tests/ci/changelog.py -v --debug-helpers \
--gh-user-or-token ${{ secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN }} \
--jobs=5 \
--output="./docs/changelogs/${{ env.RELEASE_TAG }}.md" ${{ env.RELEASE_TAG }}
git add ./docs/changelogs/${{ env.RELEASE_TAG }}.md
@ -129,9 +151,9 @@ jobs:
if: ${{ inputs.type == 'patch' && ! inputs.only-repo }}
shell: bash
run: |
python3 ./tests/ci/create_release.py --set-progress-completed
git reset --hard HEAD
git checkout "$GITHUB_REF_NAME"
python3 ./tests/ci/create_release.py --set-progress-completed
- name: Create GH Release
if: ${{ inputs.type == 'patch' && ! inputs.only-repo }}
shell: bash

3
.gitmodules vendored
View File

@ -345,9 +345,6 @@
[submodule "contrib/FP16"]
path = contrib/FP16
url = https://github.com/Maratyszcza/FP16.git
[submodule "contrib/robin-map"]
path = contrib/robin-map
url = https://github.com/Tessil/robin-map.git
[submodule "contrib/aklomp-base64"]
path = contrib/aklomp-base64
url = https://github.com/aklomp/base64.git

View File

@ -322,17 +322,21 @@ if (DISABLE_OMIT_FRAME_POINTER)
set (CMAKE_ASM_FLAGS_ADD "${CMAKE_ASM_FLAGS_ADD} -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer")
endif()
# Before you start hating your debugger because it refuses to show variables ('<optimized out>'), try building with -DDEBUG_O_LEVEL="0"
# https://stackoverflow.com/questions/63386189/whats-the-difference-between-a-compilers-o0-option-and-og-option/63386263#63386263
set(DEBUG_O_LEVEL "g" CACHE STRING "The -Ox level used for debug builds")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPILER_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og ${DEBUG_INFO_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O${DEBUG_O_LEVEL} ${DEBUG_INFO_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMPILER_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Og ${DEBUG_INFO_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O${DEBUG_O_LEVEL} ${DEBUG_INFO_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${COMPILER_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS_RELWITHDEBINFO "${CMAKE_ASM_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS_DEBUG "${CMAKE_ASM_FLAGS_DEBUG} -Og ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS_DEBUG "${CMAKE_ASM_FLAGS_DEBUG} -O${DEBUG_O_LEVEL} ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
if (OS_DARWIN)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")

View File

@ -27,27 +27,6 @@ bool cgroupsV2Enabled()
#endif
}
bool cgroupsV2MemoryControllerEnabled()
{
#if defined(OS_LINUX)
chassert(cgroupsV2Enabled());
/// According to https://docs.kernel.org/admin-guide/cgroup-v2.html, file "cgroup.controllers" defines which controllers are available
/// for the current + child cgroups. The set of available controllers can be restricted from level to level using file
/// "cgroups.subtree_control". It is therefore sufficient to check the bottom-most nested "cgroup.controllers" file.
fs::path cgroup_dir = cgroupV2PathOfProcess();
if (cgroup_dir.empty())
return false;
std::ifstream controllers_file(cgroup_dir / "cgroup.controllers");
if (!controllers_file.is_open())
return false;
std::string controllers;
std::getline(controllers_file, controllers);
return controllers.find("memory") != std::string::npos;
#else
return false;
#endif
}
fs::path cgroupV2PathOfProcess()
{
#if defined(OS_LINUX)
@ -71,3 +50,28 @@ fs::path cgroupV2PathOfProcess()
return {};
#endif
}
std::optional<std::string> getCgroupsV2PathContainingFile([[maybe_unused]] std::string_view file_name)
{
#if defined(OS_LINUX)
if (!cgroupsV2Enabled())
return {};
fs::path current_cgroup = cgroupV2PathOfProcess();
if (current_cgroup.empty())
return {};
/// Return the bottom-most nested file. If there is no such file at the current
/// level, try again at the parent level as settings are inherited.
while (current_cgroup != default_cgroups_mount.parent_path())
{
const auto path = current_cgroup / file_name;
if (fs::exists(path))
return {current_cgroup};
current_cgroup = current_cgroup.parent_path();
}
return {};
#else
return {};
#endif
}

View File

@ -1,6 +1,7 @@
#pragma once
#include <filesystem>
#include <string_view>
#if defined(OS_LINUX)
/// I think it is possible to mount the cgroups hierarchy somewhere else (e.g. when in containers).
@ -11,11 +12,11 @@ static inline const std::filesystem::path default_cgroups_mount = "/sys/fs/cgrou
/// Is cgroups v2 enabled on the system?
bool cgroupsV2Enabled();
/// Is the memory controller of cgroups v2 enabled on the system?
/// Assumes that cgroupsV2Enabled() is enabled.
bool cgroupsV2MemoryControllerEnabled();
/// Detects which cgroup v2 the process belongs to and returns the filesystem path to the cgroup.
/// Returns an empty path if the cgroup cannot be determined.
/// Assumes that cgroupsV2Enabled() returns true.
std::filesystem::path cgroupV2PathOfProcess();
/// Returns the most nested cgroup dir containing the specified file.
/// If cgroups v2 is not enabled - returns an empty optional.
std::optional<std::string> getCgroupsV2PathContainingFile([[maybe_unused]] std::string_view file_name);

View File

@ -19,9 +19,6 @@ std::optional<uint64_t> getCgroupsV2MemoryLimit()
if (!cgroupsV2Enabled())
return {};
if (!cgroupsV2MemoryControllerEnabled())
return {};
std::filesystem::path current_cgroup = cgroupV2PathOfProcess();
if (current_cgroup.empty())
return {};

View File

@ -2,11 +2,11 @@
# NOTE: VERSION_REVISION has nothing common with DBMS_TCP_PROTOCOL_VERSION,
# only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes.
SET(VERSION_REVISION 54489)
SET(VERSION_REVISION 54490)
SET(VERSION_MAJOR 24)
SET(VERSION_MINOR 8)
SET(VERSION_MINOR 9)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH 3f8b27d7accd2b5ec4afe7d0dd459115323304af)
SET(VERSION_DESCRIBE v24.8.1.1-testing)
SET(VERSION_STRING 24.8.1.1)
SET(VERSION_GITHASH e02b434d2fc0c4fbee29ca675deab7474d274608)
SET(VERSION_DESCRIBE v24.9.1.1-testing)
SET(VERSION_STRING 24.9.1.1)
# end of autochange

View File

@ -209,9 +209,8 @@ endif()
option(ENABLE_USEARCH "Enable USearch" ${ENABLE_LIBRARIES})
if (ENABLE_USEARCH)
add_contrib (FP16-cmake FP16)
add_contrib (robin-map-cmake robin-map)
add_contrib (SimSIMD-cmake SimSIMD)
add_contrib (usearch-cmake usearch) # requires: FP16, robin-map, SimdSIMD
add_contrib (usearch-cmake usearch) # requires: FP16, SimdSIMD
else ()
message(STATUS "Not using USearch")
endif ()

2
contrib/SimSIMD vendored

@ -1 +1 @@
Subproject commit de2cb75b9e9e3389d5e1e51fd9f8ed151f3c17cf
Subproject commit 91a76d1ac519b3b9dc8957734a3dabd985f00c26

2
contrib/libunwind vendored

@ -1 +1 @@
Subproject commit a89d904befea07814628c6ce0b44083c4e149c62
Subproject commit 601db0b0e03018c01710470a37703b618f9cf08b

1
contrib/robin-map vendored

@ -1 +0,0 @@
Subproject commit 851a59e0e3063ee0e23089062090a73fd3de482d

View File

@ -1 +0,0 @@
# See contrib/usearch-cmake/CMakeLists.txt

2
contrib/usearch vendored

@ -1 +1 @@
Subproject commit 30810452bec5d3d3aa0931bb5d761e2f09aa6356
Subproject commit e21a5778a0d4469ddaf38c94b7be0196bb701ee4

View File

@ -1,5 +1,4 @@
set(FP16_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/FP16")
set(ROBIN_MAP_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/robin-map")
set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD")
set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch")
@ -7,8 +6,17 @@ add_library(_usearch INTERFACE)
target_include_directories(_usearch SYSTEM INTERFACE
${FP16_PROJECT_DIR}/include
${ROBIN_MAP_PROJECT_DIR}/include
${SIMSIMD_PROJECT_DIR}/include
${USEARCH_PROJECT_DIR}/include)
target_compile_definitions(_usearch INTERFACE USEARCH_USE_FP16LIB)
# target_compile_definitions(_usearch INTERFACE USEARCH_USE_SIMSIMD)
# ^^ simsimd is not enabled at the moment. Reasons:
# - Vectorization is important for raw scans but not so much for HNSW. We use usearch only for HNSW.
# - Simsimd does compile-time dispatch (choice of SIMD kernels determined by capabilities of the build machine) or dynamic dispatch (SIMD
# kernels chosen at runtime based on cpuid instruction). Since current builds are limited to SSE 4.2 (x86) and NEON (ARM), the speedup of
# the former would be moderate compared to AVX-512 / SVE. The latter is at the moment too fragile with respect to portability across x86
# and ARM machines ... certain combinations of quantizations / distance functions / SIMD instructions are not implemented at the moment.
add_library(ch_contrib::usearch ALIAS _usearch)

View File

@ -129,6 +129,7 @@ configure
# Check that all new/changed settings were added in the settings changes history.
# Some settings can be different for builds with sanitizers, so we check
# settings changes only for non-sanitizer builds.
# Also the automatic value of 'max_threads' and similar was displayed as "'auto(...)'" in previous versions instead of "auto(...)".
IS_SANITIZED=$(clickhouse-local --query "SELECT value LIKE '%-fsanitize=%' FROM system.build_options WHERE name = 'CXX_FLAGS'")
if [ "${IS_SANITIZED}" -eq "0" ]
@ -145,7 +146,9 @@ then
old_settings.value AS old_value
FROM new_settings
LEFT JOIN old_settings ON new_settings.name = old_settings.name
WHERE (new_settings.value != old_settings.value) AND (name NOT IN (
WHERE (new_value != old_value)
AND NOT (startsWith(new_value, 'auto(') AND old_value LIKE '%auto(%')
AND (name NOT IN (
SELECT arrayJoin(tupleElement(changes, 'name'))
FROM
(
@ -177,7 +180,7 @@ then
if [ -s changed_settings.txt ]
then
mv changed_settings.txt /test_output/
echo -e "Changed settings are not reflected in settings changes history (see changed_settings.txt)$FAIL$(head_escaped /test_output/changed_settings.txt)" >> /test_output/test_results.tsv
echo -e "Changed settings are not reflected in the settings changes history (see changed_settings.txt)$FAIL$(head_escaped /test_output/changed_settings.txt)" >> /test_output/test_results.tsv
else
echo -e "There are no changed settings or they are reflected in settings changes history$OK" >> /test_output/test_results.tsv
fi

View File

@ -0,0 +1,29 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v24.3.7.30-lts (c8a28cf4331) FIXME as compared to v24.3.6.48-lts (b2d33c3c45d)
#### Improvement
* Backported in [#68103](https://github.com/ClickHouse/ClickHouse/issues/68103): Distinguish booleans and integers while parsing values for custom settings: ``` SET custom_a = true; SET custom_b = 1; ```. [#62206](https://github.com/ClickHouse/ClickHouse/pull/62206) ([Vitaly Baranov](https://github.com/vitlibar)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Backported in [#67931](https://github.com/ClickHouse/ClickHouse/issues/67931): Fixing the `Not-ready Set` error after the `PREWHERE` optimization for StorageMerge. [#65057](https://github.com/ClickHouse/ClickHouse/pull/65057) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Backported in [#68062](https://github.com/ClickHouse/ClickHouse/issues/68062): Fix boolean literals in query sent to external database (for engines like `PostgreSQL`). [#66282](https://github.com/ClickHouse/ClickHouse/pull/66282) ([vdimir](https://github.com/vdimir)).
* Backported in [#67812](https://github.com/ClickHouse/ClickHouse/issues/67812): Only relevant to the experimental Variant data type. Fix crash with Variant + AggregateFunction type. [#67122](https://github.com/ClickHouse/ClickHouse/pull/67122) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#67848](https://github.com/ClickHouse/ClickHouse/issues/67848): Fixes [#66026](https://github.com/ClickHouse/ClickHouse/issues/66026). Avoid unresolved table function arguments traversal in `ReplaceTableNodeToDummyVisitor`. [#67522](https://github.com/ClickHouse/ClickHouse/pull/67522) ([Dmitry Novik](https://github.com/novikd)).
* Backported in [#68271](https://github.com/ClickHouse/ClickHouse/issues/68271): Fix inserting into stream like engines (Kafka, RabbitMQ, NATS) through HTTP interface. [#67554](https://github.com/ClickHouse/ClickHouse/pull/67554) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)).
* Backported in [#67806](https://github.com/ClickHouse/ClickHouse/issues/67806): Fix reloading SQL UDFs with UNION. Previously, restarting the server could make UDF invalid. [#67665](https://github.com/ClickHouse/ClickHouse/pull/67665) ([Antonio Andelic](https://github.com/antonio2368)).
* Backported in [#67834](https://github.com/ClickHouse/ClickHouse/issues/67834): Fix potential stack overflow in `JSONMergePatch` function. Renamed this function from `jsonMergePatch` to `JSONMergePatch` because the previous name was wrong. The previous name is still kept for compatibility. Improved diagnostic of errors in the function. This closes [#67304](https://github.com/ClickHouse/ClickHouse/issues/67304). [#67756](https://github.com/ClickHouse/ClickHouse/pull/67756) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Backported in [#68206](https://github.com/ClickHouse/ClickHouse/issues/68206): Fix wrong `count()` result when there is non-deterministic function in predicate. [#67922](https://github.com/ClickHouse/ClickHouse/pull/67922) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)).
* Backported in [#68089](https://github.com/ClickHouse/ClickHouse/issues/68089): Fixed the calculation of the maximum thread soft limit in containerized environments where the usable CPU count is limited. [#67963](https://github.com/ClickHouse/ClickHouse/pull/67963) ([Robert Schulze](https://github.com/rschu1ze)).
* Backported in [#68120](https://github.com/ClickHouse/ClickHouse/issues/68120): Fixed skipping of untouched parts in mutations with new analyzer. Previously with enabled analyzer data in part could be rewritten by mutation even if mutation doesn't affect this part according to predicate. [#68052](https://github.com/ClickHouse/ClickHouse/pull/68052) ([Anton Popov](https://github.com/CurtizJ)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Update version after release. [#67676](https://github.com/ClickHouse/ClickHouse/pull/67676) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Backported in [#68074](https://github.com/ClickHouse/ClickHouse/issues/68074): Add an explicit error for `ALTER MODIFY SQL SECURITY` on non-view tables. [#67953](https://github.com/ClickHouse/ClickHouse/pull/67953) ([pufit](https://github.com/pufit)).

View File

@ -22,10 +22,10 @@ ORDER BY Distance(vectors, Point)
LIMIT N
```
`vectors` contains N-dimensional values of type [Array(Float32)](../../../sql-reference/data-types/array.md), for example embeddings.
Function `Distance` computes the distance between two vectors. Often, the Euclidean (L2) distance is chosen as distance function but [other
distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point, e.g. `(0.17,
0.33, ...)`, and `N` limits the number of search results.
`vectors` contains N-dimensional values of type [Array(Float32)](../../../sql-reference/data-types/array.md) or Array(Float64), for example
embeddings. Function `Distance` computes the distance between two vectors. Often, the Euclidean (L2) distance is chosen as distance function
but [other distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point,
e.g. `(0.17, 0.33, ...)`, and `N` limits the number of search results.
This query returns the top-`N` closest points to the reference point. Parameter `N` limits the number of returned values which is useful for
situations where `MaxDistance` is difficult to determine in advance.
@ -59,6 +59,8 @@ Parameters:
- `ef_construction`: (optional, default: 128)
- `ef_search`: (optional, default: 64)
A value of `0` for the parameters `m`, `ef_construction`, and `ef_search` means the default value.
Example:
```sql

View File

@ -359,13 +359,14 @@ DESC format(JSONEachRow, '{"int" : 42, "float" : 42.42, "string" : "Hello, World
Dates, DateTimes:
```sql
DESC format(JSONEachRow, '{"date" : "2022-01-01", "datetime" : "2022-01-01 00:00:00"}')
DESC format(JSONEachRow, '{"date" : "2022-01-01", "datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}')
```
```response
┌─name─────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ date │ Nullable(Date) │ │ │ │ │ │
│ datetime │ Nullable(DateTime64(9)) │ │ │ │ │ │
└──────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
┌─name───────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ date │ Nullable(Date) │ │ │ │ │ │
│ datetime │ Nullable(DateTime) │ │ │ │ │ │
│ datetime64 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└────────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
Arrays:
@ -759,12 +760,13 @@ DESC format(CSV, 'Hello world!,World hello!')
Dates, DateTimes:
```sql
DESC format(CSV, '"2020-01-01","2020-01-01 00:00:00"')
DESC format(CSV, '"2020-01-01","2020-01-01 00:00:00","2022-01-01 00:00:00.000"')
```
```response
┌─name─┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ c1 │ Nullable(Date) │ │ │ │ │ │
│ c2 │ Nullable(DateTime64(9)) │ │ │ │ │ │
│ c2 │ Nullable(DateTime) │ │ │ │ │ │
│ c3 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└──────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
@ -956,12 +958,13 @@ DESC format(TSKV, 'int=42 float=42.42 bool=true string=Hello,World!\n')
Dates, DateTimes:
```sql
DESC format(TSV, '2020-01-01 2020-01-01 00:00:00')
DESC format(TSV, '2020-01-01 2020-01-01 00:00:00 2022-01-01 00:00:00.000')
```
```response
┌─name─┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ c1 │ Nullable(Date) │ │ │ │ │ │
│ c2 │ Nullable(DateTime64(9)) │ │ │ │ │ │
│ c2 │ Nullable(DateTime) │ │ │ │ │ │
│ c3 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└──────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
@ -1126,12 +1129,13 @@ DESC format(Values, $$(42, 42.42, true, 'Hello,World!')$$)
Dates, DateTimes:
```sql
DESC format(Values, $$('2020-01-01', '2020-01-01 00:00:00')$$)
```
DESC format(Values, $$('2020-01-01', '2020-01-01 00:00:00', '2022-01-01 00:00:00.000')$$)
```
```response
┌─name─┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ c1 │ Nullable(Date) │ │ │ │ │ │
│ c2 │ Nullable(DateTime64(9)) │ │ │ │ │ │
│ c2 │ Nullable(DateTime) │ │ │ │ │ │
│ c3 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└──────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
@ -1501,8 +1505,8 @@ DESC format(JSONEachRow, $$
#### input_format_try_infer_datetimes
If enabled, ClickHouse will try to infer type `DateTime64` from string fields in schema inference for text formats.
If all fields from a column in sample data were successfully parsed as datetimes, the result type will be `DateTime64(9)`,
If enabled, ClickHouse will try to infer type `DateTime` or `DateTime64` from string fields in schema inference for text formats.
If all fields from a column in sample data were successfully parsed as datetimes, the result type will be `DateTime` or `DateTime64(9)` (if any datetime value had a fractional part),
if at least one field was not parsed as datetime, the result type will be `String`.
Enabled by default.
@ -1510,39 +1514,66 @@ Enabled by default.
**Examples**
```sql
SET input_format_try_infer_datetimes = 0
SET input_format_try_infer_datetimes = 0;
DESC format(JSONEachRow, $$
{"datetime" : "2021-01-01 00:00:00.000"}
{"datetime" : "2022-01-01 00:00:00.000"}
{"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
{"datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}
$$)
```
```response
┌─name─────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(String) │ │ │ │ │ │
└──────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
┌─name───────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(String) │ │ │ │ │ │
│ datetime64 │ Nullable(String) │ │ │ │ │ │
└────────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
```sql
SET input_format_try_infer_datetimes = 1
SET input_format_try_infer_datetimes = 1;
DESC format(JSONEachRow, $$
{"datetime" : "2021-01-01 00:00:00.000"}
{"datetime" : "2022-01-01 00:00:00.000"}
{"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
{"datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}
$$)
```
```response
┌─name─────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(DateTime64(9)) │ │ │ │ │ │
└──────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
┌─name───────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(DateTime) │ │ │ │ │ │
│ datetime64 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└────────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
```sql
DESC format(JSONEachRow, $$
{"datetime" : "2021-01-01 00:00:00.000"}
{"datetime" : "unknown"}
{"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
{"datetime" : "unknown", "datetime64" : "unknown"}
$$)
```
```response
┌─name─────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(String) │ │ │ │ │ │
└──────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
┌─name───────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(String) │ │ │ │ │ │
│ datetime64 │ Nullable(String) │ │ │ │ │ │
└────────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
#### input_format_try_infer_datetimes_only_datetime64
If enabled, ClickHouse will always infer `DateTime64(9)` when `input_format_try_infer_datetimes` is enabled, even if the datetime values don't contain a fractional part.
Disabled by default.
**Examples**
```sql
SET input_format_try_infer_datetimes = 1;
SET input_format_try_infer_datetimes_only_datetime64 = 1;
DESC format(JSONEachRow, $$
{"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
{"datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}
$$)
```
```text
┌─name───────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ datetime │ Nullable(DateTime64(9)) │ │ │ │ │ │
│ datetime64 │ Nullable(DateTime64(9)) │ │ │ │ │ │
└────────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
Note: Parsing datetimes during schema inference respects the setting [date_time_input_format](/docs/en/operations/settings/settings-formats.md#date_time_input_format).
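Below is a hedged sketch of what that means in practice; the input value and the combination of settings are illustrative, and the exact inferred type depends on the server version and other format settings:
```sql
-- illustrative: schema inference parses datetime strings using the configured input mode
SET input_format_try_infer_datetimes = 1;
SET date_time_input_format = 'best_effort';
DESC format(JSONEachRow, '{"datetime" : "2021/01/01 00:00:00"}');
```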

View File

@ -307,8 +307,22 @@ SELECT dictGet('dict', 'B', 2);
## Named collections for accessing PostgreSQL database
The description of parameters see [postgresql](../sql-reference/table-functions/postgresql.md).
For a description of the parameters, see [postgresql](../sql-reference/table-functions/postgresql.md). Additionally, there are aliases:
- `username` for `user`
- `db` for `database`.
Parameter `addresses_expr` is used in a collection instead of `host:port`. The parameter is optional because there are other optional ones: `host`, `hostname`, `port`. The following pseudocode explains the priority:
```sql
CASE
WHEN collection['addresses_expr'] != '' THEN collection['addresses_expr']
WHEN collection['host'] != '' THEN collection['host'] || ':' || if(collection['port'] != '', collection['port'], '5432')
WHEN collection['hostname'] != '' THEN collection['hostname'] || ':' || if(collection['port'] != '', collection['port'], '5432')
END
```
Example of creation:
```sql
CREATE NAMED COLLECTION mypg AS
user = 'pguser',
@ -316,7 +330,7 @@ password = 'jw8s0F4',
host = '127.0.0.1',
port = 5432,
database = 'test',
schema = 'test_schema',
schema = 'test_schema'
```
Example of configuration:
@ -369,6 +383,10 @@ SELECT * FROM mypgtable;
└───┘
```
:::note
The PostgreSQL table engine copies the parameters from the named collection when the table is created. A change in the collection does not affect the existing tables.
:::
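For example (a hypothetical sketch reusing the `mypg` collection and `mypgtable` table from above), changing the collection afterwards does not change how the already created table connects:
```sql
-- hypothetical sketch: the existing table keeps the parameters it was created with
ALTER NAMED COLLECTION mypg SET host = '10.0.0.2';
SELECT * FROM mypgtable; -- still uses the parameters copied at CREATE TABLE time
```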
### Example of using named collections with database with engine PostgreSQL
```sql

View File

@ -1041,3 +1041,27 @@ Compression rates of LZ4 or ZSTD improve on average by 20-40%.
This setting works best for tables with no primary key or a low-cardinality primary key, i.e. a table with only few distinct primary key values.
High-cardinality primary keys, e.g. involving timestamp columns of type `DateTime64`, are not expected to benefit from this setting.
## lightweight_mutation_projection_mode
By default, lightweight `DELETE` does not work for tables with projections, because rows in a projection may be affected by a `DELETE` operation; hence the default value is `throw`.
However, this option can change the behavior. With the value `drop` or `rebuild`, deletes work with projections. `drop` deletes the projection, so the current query may be fast because the projection is removed, but future queries may be slow because no projection is attached.
`rebuild` rebuilds the projection, which may affect the performance of the current query but may speed up future queries. Importantly, both options apply only at the part level,
which means projections in parts that are not touched by the delete stay intact instead of being dropped or rebuilt.
Possible values:
- throw, drop, rebuild
Default value: throw
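A minimal sketch, assuming the setting is applied at the session or query level and using an illustrative table name:
```sql
-- illustrative: allow lightweight DELETE on a table that has a projection;
-- only projections in the parts touched by the delete are rebuilt
SET lightweight_mutation_projection_mode = 'rebuild';
DELETE FROM orders WHERE order_id = 42;
```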
## deduplicate_merge_projection_mode
Whether to allow creating a projection for a table with a non-classic MergeTree engine, that is, not a (Replicated, Shared) MergeTree. If allowed, also defines the action taken on projections when merging: either drop or rebuild. Classic MergeTree ignores this setting.
It also controls `OPTIMIZE DEDUPLICATE`, in which case it affects all MergeTree family members. Similar to the option `lightweight_mutation_projection_mode`, it also applies at the part level.
Possible values:
- throw, drop, rebuild
Default value: throw
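A hedged sketch, assuming the setting can be given as a table-level setting; the table, column, projection, and ZooKeeper path names are illustrative:
```sql
-- illustrative: allow a projection on a ReplicatedMergeTree table;
-- when merges deduplicate, the projection is dropped rather than throwing
CREATE TABLE events
(
    id UInt64,
    value String,
    PROJECTION by_value (SELECT * ORDER BY value)
)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/events', '{replica}')
ORDER BY id
SETTINGS deduplicate_merge_projection_mode = 'drop';
```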

View File

@ -5620,6 +5620,19 @@ Minimal size of block to compress in CROSS JOIN. Zero value means - disable this
Default value: `1GiB`.
## use_json_alias_for_old_object_type
When enabled, the `JSON` data type alias will create the old [Object('json')](../../sql-reference/data-types/json.md) type instead of the new [JSON](../../sql-reference/data-types/newjson.md) type.
This setting requires server restart to take effect when changed.
Default value: `false`.
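A hedged sketch of the intended effect; it assumes the setting has already been enabled in the server configuration (it requires a restart) and that the old experimental type is allowed, and the table name is illustrative:
```sql
-- illustrative: with use_json_alias_for_old_object_type enabled,
-- the JSON alias below resolves to the old Object('json') type
SET allow_experimental_object_type = 1;
CREATE TABLE legacy_json_tab (data JSON) ENGINE = Memory;
```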
## type_json_skip_duplicated_paths
When enabled, ClickHouse will skip duplicated paths during parsing of a [JSON](../../sql-reference/data-types/newjson.md) object. Only the value of the first occurrence of each path will be inserted.
Default value: `false`
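A minimal sketch of the effect on parsing; the input string is illustrative and the new JSON type has to be enabled:
```sql
-- illustrative: with the setting enabled, only the first occurrence of the duplicated path "a" should be kept
SET allow_experimental_json_type = 1, type_json_skip_duplicated_paths = 1;
SELECT '{"a" : 1, "a" : 2}'::JSON AS json;
```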
## restore_replace_external_engines_to_null
For testing purposes. Replaces all external engines with Null so that external connections are not initiated.
@ -5654,3 +5667,9 @@ Possible values:
- 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled.
Default value: `0`.
## create_if_not_exists
Enable `IF NOT EXISTS` for `CREATE` statements by default. If either this setting is enabled or `IF NOT EXISTS` is specified, and a table with the provided name already exists, no exception will be thrown.
Default value: `false`.
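A minimal sketch with an illustrative table definition:
```sql
SET create_if_not_exists = 1;
CREATE TABLE t (x UInt64) ENGINE = MergeTree ORDER BY x;
-- with the setting enabled, repeating the statement does not throw even though t already exists
CREATE TABLE t (x UInt64) ENGINE = MergeTree ORDER BY x;
```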

View File

@ -12,57 +12,59 @@ This specification describes the binary format that can be used for binary encod
The table below describes how each data type is represented in binary format. Each data type encoding consists of 1 byte that indicates the type, plus some optional additional information.
`var_uint` in the binary encoding means that the size is encoded using Variable-Length Quantity compression.
| ClickHouse data type | Binary encoding |
|--------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `Nothing` | `0x00` |
| `UInt8` | `0x01` |
| `UInt16` | `0x02` |
| `UInt32` | `0x03` |
| `UInt64` | `0x04` |
| `UInt128` | `0x05` |
| `UInt256` | `0x06` |
| `Int8` | `0x07` |
| `Int16` | `0x08` |
| `Int32` | `0x09` |
| `Int64` | `0x0A` |
| `Int128` | `0x0B` |
| `Int256` | `0x0C` |
| `Float32` | `0x0D` |
| `Float64` | `0x0E` |
| `Date` | `0x0F` |
| `Date32` | `0x10` |
| `DateTime` | `0x11` |
| `DateTime(time_zone)` | `0x12<var_uint_time_zone_name_size><time_zone_name_data>` |
| `DateTime64(P)` | `0x13<uint8_precision>` |
| `DateTime64(P, time_zone)` | `0x14<uint8_precision><var_uint_time_zone_name_size><time_zone_name_data>` |
| `String` | `0x15` |
| `FixedString(N)` | `0x16<var_uint_size>` |
| `Enum8` | `0x17<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int8_value_1>...<var_uint_name_size_N><name_data_N><int8_value_N>` |
| `Enum16` | `0x18<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int16_little_endian_value_1>...><var_uint_name_size_N><name_data_N><int16_little_endian_value_N>` |
| `Decimal32(P, S)` | `0x19<uint8_precision><uint8_scale>` |
| `Decimal64(P, S)` | `0x1A<uint8_precision><uint8_scale>` |
| `Decimal128(P, S)` | `0x1B<uint8_precision><uint8_scale>` |
| `Decimal256(P, S)` | `0x1C<uint8_precision><uint8_scale>` |
| `UUID` | `0x1D` |
| `Array(T)` | `0x1E<nested_type_encoding>` |
| `Tuple(T1, ..., TN)` | `0x1F<var_uint_number_of_elements><nested_type_encoding_1>...<nested_type_encoding_N>` |
| `Tuple(name1 T1, ..., nameN TN)` | `0x20<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N>` |
| `Set` | `0x21` |
| `Interval` | `0x22<interval_kind>` (see [interval kind binary encoding](#interval-kind-binary-encoding)) |
| `Nullable(T)` | `0x23<nested_type_encoding>` |
| `Function` | `0x24<var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N><return_type_encoding>` |
| `AggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN)` | `0x25<var_uint_version><var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N>` (see [aggregate function parameter binary encoding](#aggregate-function-parameter-binary-encoding)) |
| `LowCardinality(T)` | `0x26<nested_type_encoding>` |
| `Map(K, V)` | `0x27<key_type_encoding><value_type_encoding>` |
| `IPv4` | `0x28` |
| `IPv6` | `0x29` |
| `Variant(T1, ..., TN)` | `0x2A<var_uint_number_of_variants><variant_type_encoding_1>...<variant_type_encoding_N>` |
| `Dynamic(max_types=N)` | `0x2B<uint8_max_types>` |
| `Custom type` (`Ring`, `Polygon`, etc) | `0x2C<var_uint_type_name_size><type_name_data>` |
| `Bool` | `0x2D` |
| `SimpleAggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN)` | `0x2E<var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N>` (see [aggregate function parameter binary encoding](#aggregate-function-parameter-binary-encoding)) |
| `Nested(name1 T1, ..., nameN TN)` | `0x2F<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N>` |
| ClickHouse data type | Binary encoding |
|-----------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `Nothing` | `0x00` |
| `UInt8` | `0x01` |
| `UInt16` | `0x02` |
| `UInt32` | `0x03` |
| `UInt64` | `0x04` |
| `UInt128` | `0x05` |
| `UInt256` | `0x06` |
| `Int8` | `0x07` |
| `Int16` | `0x08` |
| `Int32` | `0x09` |
| `Int64` | `0x0A` |
| `Int128` | `0x0B` |
| `Int256` | `0x0C` |
| `Float32` | `0x0D` |
| `Float64` | `0x0E` |
| `Date` | `0x0F` |
| `Date32` | `0x10` |
| `DateTime` | `0x11` |
| `DateTime(time_zone)` | `0x12<var_uint_time_zone_name_size><time_zone_name_data>` |
| `DateTime64(P)` | `0x13<uint8_precision>` |
| `DateTime64(P, time_zone)` | `0x14<uint8_precision><var_uint_time_zone_name_size><time_zone_name_data>` |
| `String` | `0x15` |
| `FixedString(N)` | `0x16<var_uint_size>` |
| `Enum8` | `0x17<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int8_value_1>...<var_uint_name_size_N><name_data_N><int8_value_N>` |
| `Enum16` | `0x18<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int16_little_endian_value_1>...><var_uint_name_size_N><name_data_N><int16_little_endian_value_N>` |
| `Decimal32(P, S)` | `0x19<uint8_precision><uint8_scale>` |
| `Decimal64(P, S)` | `0x1A<uint8_precision><uint8_scale>` |
| `Decimal128(P, S)` | `0x1B<uint8_precision><uint8_scale>` |
| `Decimal256(P, S)` | `0x1C<uint8_precision><uint8_scale>` |
| `UUID` | `0x1D` |
| `Array(T)` | `0x1E<nested_type_encoding>` |
| `Tuple(T1, ..., TN)` | `0x1F<var_uint_number_of_elements><nested_type_encoding_1>...<nested_type_encoding_N>` |
| `Tuple(name1 T1, ..., nameN TN)` | `0x20<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N>` |
| `Set` | `0x21` |
| `Interval` | `0x22<interval_kind>` (see [interval kind binary encoding](#interval-kind-binary-encoding)) |
| `Nullable(T)` | `0x23<nested_type_encoding>` |
| `Function` | `0x24<var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N><return_type_encoding>` |
| `AggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN)` | `0x25<var_uint_version><var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N>` (see [aggregate function parameter binary encoding](#aggregate-function-parameter-binary-encoding)) |
| `LowCardinality(T)` | `0x26<nested_type_encoding>` |
| `Map(K, V)` | `0x27<key_type_encoding><value_type_encoding>` |
| `IPv4` | `0x28` |
| `IPv6` | `0x29` |
| `Variant(T1, ..., TN)` | `0x2A<var_uint_number_of_variants><variant_type_encoding_1>...<variant_type_encoding_N>` |
| `Dynamic(max_types=N)` | `0x2B<uint8_max_types>` |
| `Custom type` (`Ring`, `Polygon`, etc) | `0x2C<var_uint_type_name_size><type_name_data>` |
| `Bool` | `0x2D` |
| `SimpleAggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN)` | `0x2E<var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N>` (see [aggregate function parameter binary encoding](#aggregate-function-parameter-binary-encoding)) |
| `Nested(name1 T1, ..., nameN TN)` | `0x2F<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N>` |
| `JSON(max_dynamic_paths=N, max_dynamic_types=M, path Type, SKIP skip_path, SKIP REGEXP skip_path_regexp)` | `0x30<uint8_serialization_version><var_int_max_dynamic_paths><uint8_max_dynamic_types><var_uint_number_of_typed_paths><var_uint_path_name_size_1><path_name_data_1><encoded_type_1>...<var_uint_number_of_skip_paths><var_uint_skip_path_size_1><skip_path_data_1>...<var_uint_number_of_skip_path_regexps><var_uint_skip_path_regexp_size_1><skip_path_data_regexp_1>...` |
For the `JSON` type, the byte `uint8_serialization_version` indicates the version of the serialization. Right now the version is always 0, but it can change in the future if new arguments are introduced for the `JSON` type.
### Interval kind binary encoding

View File

@ -14,7 +14,7 @@ To declare a column of `Dynamic` type, use the following syntax:
<column_name> Dynamic(max_types=N)
```
Where `N` is an optional parameter between `1` and `255` indicating how many different data types can be stored inside a column with type `Dynamic` across single block of data that is stored separately (for example across single data part for MergeTree table). If this limit is exceeded, all new types will be converted to type `String`. Default value of `max_types` is `32`.
Where `N` is an optional parameter between `0` and `254` indicating how many different data types can be stored as separate subcolumns inside a column with type `Dynamic` across a single block of data that is stored separately (for example, across a single data part of a MergeTree table). If this limit is exceeded, all values with new types will be stored together in a special shared data structure in binary form. The default value of `max_types` is `32`.
:::note
The Dynamic data type is an experimental feature. To use it, set `allow_experimental_dynamic_type = 1`.
@ -224,41 +224,43 @@ SELECT d::Dynamic(max_types=5) as d2, dynamicType(d2) FROM test;
└───────┴────────────────┘
```
If `K < N`, then the values with the rarest types are converted to `String`:
If `K < N`, then the values with the rarest types will be inserted into a single special subcolumn, but will still be accessible:
```text
CREATE TABLE test (d Dynamic(max_types=4)) ENGINE = Memory;
INSERT INTO test VALUES (NULL), (42), (43), ('42.42'), (true), ([1, 2, 3]);
SELECT d, dynamicType(d), d::Dynamic(max_types=2) as d2, dynamicType(d2) FROM test;
SELECT d, dynamicType(d), d::Dynamic(max_types=2) as d2, dynamicType(d2), isDynamicElementInSharedData(d2) FROM test;
```
```text
┌─d───────┬─dynamicType(d)─┬─d2──────┬─dynamicType(d2)─┐
│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ None │
│ 42 │ Int64 │ 42 │ Int64 │
│ 43 │ Int64 │ 43 │ Int64 │
│ 42.42 │ String │ 42.42 │ String │
│ true │ Bool │ true │ String
│ [1,2,3] │ Array(Int64) │ [1,2,3] │ String
└─────────┴────────────────┴─────────┴─────────────────┘
┌─d───────┬─dynamicType(d)─┬─d2──────┬─dynamicType(d2)─┬─isDynamicElementInSharedData(d2)─
│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ None │ false │
│ 42 │ Int64 │ 42 │ Int64 │ false │
│ 43 │ Int64 │ 43 │ Int64 │ false │
│ 42.42 │ String │ 42.42 │ String │ false │
│ true │ Bool │ true │ Bool │ true
│ [1,2,3] │ Array(Int64) │ [1,2,3] │ Array(Int64) │ true
└─────────┴────────────────┴─────────┴─────────────────┴──────────────────────────────────
```
If `K=1`, all types are converted to `String`:
The function `isDynamicElementInSharedData` returns `true` for rows that are stored in the special shared data structure inside `Dynamic`, and, as we can see, the resulting column contains only 2 types that are not stored in the shared data structure.
If `K=0`, all types will be inserted into a single special subcolumn:
```text
CREATE TABLE test (d Dynamic(max_types=4)) ENGINE = Memory;
INSERT INTO test VALUES (NULL), (42), (43), ('42.42'), (true), ([1, 2, 3]);
SELECT d, dynamicType(d), d::Dynamic(max_types=1) as d2, dynamicType(d2) FROM test;
SELECT d, dynamicType(d), d::Dynamic(max_types=0) as d2, dynamicType(d2), isDynamicElementInSharedData(d2) FROM test;
```
```text
┌─d───────┬─dynamicType(d)─┬─d2──────┬─dynamicType(d2)─┐
│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ None │
│ 42 │ Int64 │ 42 │ String
│ 43 │ Int64 │ 43 │ String
│ 42.42 │ String │ 42.42 │ String │
│ true │ Bool │ true │ String
│ [1,2,3] │ Array(Int64) │ [1,2,3] │ String
└─────────┴────────────────┴─────────┴─────────────────┘
┌─d───────┬─dynamicType(d)─┬─d2──────┬─dynamicType(d2)─┬─isDynamicElementInSharedData(d2)─
│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ None │ false │
│ 42 │ Int64 │ 42 │ Int64 │ true
│ 43 │ Int64 │ 43 │ Int64 │ true
│ 42.42 │ String │ 42.42 │ String │ true │
│ true │ Bool │ true │ Bool │ true
│ [1,2,3] │ Array(Int64) │ [1,2,3] │ Array(Int64) │ true
└─────────┴────────────────┴─────────┴─────────────────┴──────────────────────────────────
```
## Reading Dynamic type from the data
@ -411,17 +413,17 @@ SELECT d, dynamicType(d) FROM test ORDER by d;
## Reaching the limit in number of different data types stored inside Dynamic
`Dynamic` data type can store only limited number of different data types inside. By default, this limit is 32, but you can change it in type declaration using syntax `Dynamic(max_types=N)` where N is between 1 and 255 (due to implementation details, it's impossible to have more than 255 different data types inside Dynamic).
When the limit is reached, all new data types inserted to `Dynamic` column will be casted to `String` and stored as `String` values.
The `Dynamic` data type can store only a limited number of different data types as separate subcolumns. By default, this limit is 32, but you can change it in the type declaration using the syntax `Dynamic(max_types=N)` where N is between 0 and 254 (due to implementation details, it's impossible to have more than 254 different data types stored as separate subcolumns inside Dynamic).
When the limit is reached, all new data types inserted into a `Dynamic` column will be inserted into a single shared data structure that stores values with different data types in binary form.
Let's see what happens when the limit is reached in different scenarios.
### Reaching the limit during data parsing
During parsing of `Dynamic` values from the data, when the limit is reached for current block of data, all new values will be inserted as `String` values:
During parsing of `Dynamic` values from the data, when the limit is reached for the current block of data, all new values will be inserted into the shared data structure:
```sql
SELECT d, dynamicType(d) FROM format(JSONEachRow, 'd Dynamic(max_types=3)', '
SELECT d, dynamicType(d), isDynamicElementInSharedData(d) FROM format(JSONEachRow, 'd Dynamic(max_types=3)', '
{"d" : 42}
{"d" : [1, 2, 3]}
{"d" : "Hello, World!"}
@ -432,22 +434,22 @@ SELECT d, dynamicType(d) FROM format(JSONEachRow, 'd Dynamic(max_types=3)', '
```
```text
┌─d──────────────────────────┬─dynamicType(d)─┐
│ 42 │ Int64 │
│ [1,2,3] │ Array(Int64) │
│ Hello, World! │ String │
│ 2020-01-01 │ String
│ ["str1", "str2", "str3"] │ String
{"a" : 1, "b" : [1, 2, 3]} │ String
└────────────────────────────┴────────────────┘
┌─d──────────────────────┬─dynamicType(d)─────────────────┬─isDynamicElementInSharedData(d)─┐
│ 42 │ Int64 │ false
│ [1,2,3] │ Array(Int64) │ false
│ Hello, World! │ String │ false
│ 2020-01-01 │ Date │ true
│ ['str1','str2','str3'] │ Array(String) │ true
(1,[1,2,3]) │ Tuple(a Int64, b Array(Int64)) │ true
└────────────────────────┴────────────────────────────────┴─────────────────────────────────┘
```
As we can see, after inserting 3 different data types `Int64`, `Array(Int64)` and `String` all new types were converted to `String`.
As we can see, after inserting 3 different data types (`Int64`, `Array(Int64)`, and `String`), all new types were inserted into the special shared data structure.
### During merges of data parts in MergeTree table engines
During merge of several data parts in MergeTree table the `Dynamic` column in the resulting data part can reach the limit of different data types inside and won't be able to store all types from source parts.
In this case ClickHouse chooses what types will remain after merge and what types will be casted to `String`. In most cases ClickHouse tries to keep the most frequent types and cast the rarest types to `String`, but it depends on the implementation.
During a merge of several data parts in a MergeTree table, the `Dynamic` column in the resulting data part can reach the limit of different data types that can be stored in separate subcolumns, and won't be able to store all types from the source parts as subcolumns.
In this case ClickHouse chooses which types will remain as separate subcolumns after the merge and which types will be inserted into the shared data structure. In most cases ClickHouse tries to keep the most frequent types and store the rarest types in the shared data structure, but it depends on the implementation.
Let's see an example of such merge. First, let's create a table with `Dynamic` column, set the limit of different data types to `3` and insert values with `5` different types:
@ -463,17 +465,17 @@ INSERT INTO test SELECT number, 'str_' || toString(number) FROM numbers(1);
Each insert will create a separate data part with a `Dynamic` column containing a single type:
```sql
SELECT count(), dynamicType(d), _part FROM test GROUP BY _part, dynamicType(d) ORDER BY _part;
SELECT count(), dynamicType(d), isDynamicElementInSharedData(d), _part FROM test GROUP BY _part, dynamicType(d), isDynamicElementInSharedData(d) ORDER BY _part, count();
```
```text
┌─count()─┬─dynamicType(d)──────┬─_part─────┐
│ 5 │ UInt64 │ all_1_1_0 │
│ 4 │ Array(UInt64) │ all_2_2_0 │
│ 3 │ Date │ all_3_3_0 │
│ 2 │ Map(UInt64, UInt64) │ all_4_4_0 │
│ 1 │ String │ all_5_5_0 │
└─────────┴─────────────────────┴───────────┘
┌─count()─┬─dynamicType(d)──────┬─isDynamicElementInSharedData(d)─┬─_part─────┐
│ 5 │ UInt64 │ false │ all_1_1_0 │
│ 4 │ Array(UInt64) │ false │ all_2_2_0 │
│ 3 │ Date │ false │ all_3_3_0 │
│ 2 │ Map(UInt64, UInt64) │ false │ all_4_4_0 │
│ 1 │ String │ false │ all_5_5_0 │
└─────────┴─────────────────────┴─────────────────────────────────┴───────────
```
Now, let's merge all parts into one and see what will happen:
@ -481,18 +483,20 @@ Now, let's merge all parts into one and see what will happen:
```sql
SYSTEM START MERGES test;
OPTIMIZE TABLE test FINAL;
SELECT count(), dynamicType(d), _part FROM test GROUP BY _part, dynamicType(d) ORDER BY _part;
SELECT count(), dynamicType(d), isDynamicElementInSharedData(d), _part FROM test GROUP BY _part, dynamicType(d), isDynamicElementInSharedData(d) ORDER BY _part, count() desc;
```
```text
┌─count()─┬─dynamicType(d)─┬─_part─────┐
│ 5 │ UInt64 │ all_1_5_2 │
│ 6 │ String │ all_1_5_2 │
│ 4 │ Array(UInt64) │ all_1_5_2 │
└─────────┴────────────────┴───────────┘
┌─count()─┬─dynamicType(d)──────┬─isDynamicElementInSharedData(d)─┬─_part─────┐
│ 5 │ UInt64 │ false │ all_1_5_2 │
│ 4 │ Array(UInt64) │ false │ all_1_5_2 │
│ 3 │ Date │ false │ all_1_5_2 │
│ 2 │ Map(UInt64, UInt64) │ true │ all_1_5_2 │
│ 1 │ String │ true │ all_1_5_2 │
└─────────┴─────────────────────┴─────────────────────────────────┴───────────┘
```
As we can see, ClickHouse kept the most frequent types `UInt64` and `Array(UInt64)` and casted all other types to `String`.
As we can see, ClickHouse kept the most frequent types `UInt64` and `Array(UInt64)` as subcolumns and inserted all other types into shared data.
## JSONExtract functions with Dynamic
@ -509,22 +513,23 @@ SELECT JSONExtract('{"a" : [1, 2, 3]}', 'a', 'Dynamic') AS dynamic, dynamicType(
```
```sql
SELECT JSONExtract('{"obj" : {"a" : 42, "b" : "Hello", "c" : [1,2,3]}}', 'obj', 'Map(String, Variant(UInt32, String, Array(UInt32)))') AS map_of_dynamics, mapApply((k, v) -> (k, variantType(v)), map_of_dynamics) AS map_of_dynamic_types```
SELECT JSONExtract('{"obj" : {"a" : 42, "b" : "Hello", "c" : [1,2,3]}}', 'obj', 'Map(String, Dynamic)') AS map_of_dynamics, mapApply((k, v) -> (k, dynamicType(v)), map_of_dynamics) AS map_of_dynamic_types
```
```text
┌─map_of_dynamics──────────────────┬─map_of_dynamic_types────────────────────────────┐
│ {'a':42,'b':'Hello','c':[1,2,3]} │ {'a':'UInt32','b':'String','c':'Array(UInt32)'} │
└──────────────────────────────────┴─────────────────────────────────────────────────┘
┌─map_of_dynamics──────────────────┬─map_of_dynamic_types────────────────────────────────────
│ {'a':42,'b':'Hello','c':[1,2,3]} │ {'a':'Int64','b':'String','c':'Array(Nullable(Int64))'} │
└──────────────────────────────────┴─────────────────────────────────────────────────────────
```
```sql
SELECT JSONExtractKeysAndValues('{"a" : 42, "b" : "Hello", "c" : [1,2,3]}', 'Variant(UInt32, String, Array(UInt32))') AS dynamics, arrayMap(x -> (x.1, variantType(x.2)), dynamics) AS dynamic_types
SELECT JSONExtractKeysAndValues('{"a" : 42, "b" : "Hello", "c" : [1,2,3]}', 'Dynamic') AS dynamics, arrayMap(x -> (x.1, dynamicType(x.2)), dynamics) AS dynamic_types
```
```text
┌─dynamics───────────────────────────────┬─dynamic_types─────────────────────────────────────────┐
│ [('a',42),('b','Hello'),('c',[1,2,3])] │ [('a','UInt32'),('b','String'),('c','Array(UInt32)')] │
└────────────────────────────────────────┴───────────────────────────────────────────────────────┘
┌─dynamics───────────────────────────────┬─dynamic_types─────────────────────────────────────────────────
│ [('a',42),('b','Hello'),('c',[1,2,3])] │ [('a','Int64'),('b','String'),('c','Array(Nullable(Int64))')] │
└────────────────────────────────────────┴───────────────────────────────────────────────────────────────
```
### Binary output format

View File

@ -19,7 +19,8 @@ ClickHouse data types include:
- **Boolean**: ClickHouse has a [`Boolean` type](./boolean.md)
- **Strings**: [`String`](./string.md) and [`FixedString`](./fixedstring.md)
- **Dates**: use [`Date`](./date.md) and [`Date32`](./date32.md) for days, and [`DateTime`](./datetime.md) and [`DateTime64`](./datetime64.md) for instances in time
- **JSON**: the [`JSON` object](./json.md) stores a JSON document in a single column
- **Object**: the [`Object`](./json.md) stores a JSON document in a single column (deprecated)
- **JSON**: the [`JSON` object](./newjson.md) stores a JSON document in a single column
- **UUID**: a performant option for storing [`UUID` values](./uuid.md)
- **Low cardinality types**: use an [`Enum`](./enum.md) when you have a handful of unique values, or use [`LowCardinality`](./lowcardinality.md) when you have up to 10,000 unique values of a column
- **Arrays**: any column can be defined as an [`Array` of values](./array.md)

View File

@ -13,7 +13,7 @@ keywords: [object, data type]
Stores JavaScript Object Notation (JSON) documents in a single column.
`JSON` is an alias for `Object('json')`.
`JSON` can be used as an alias for `Object('json')` when the setting [use_json_alias_for_old_object_type](../../operations/settings/settings.md#usejsonaliasforoldobjecttype) is enabled.
## Example
@ -81,3 +81,4 @@ SELECT * FROM json FORMAT JSONEachRow
- [Using JSON in ClickHouse](/docs/en/integrations/data-formats/json)
- [Getting Data Into ClickHouse - Part 2 - A JSON detour](https://clickhouse.com/blog/getting-data-into-clickhouse-part-2-json)
-

View File

@ -0,0 +1,516 @@
---
slug: /en/sql-reference/data-types/newjson
sidebar_position: 63
sidebar_label: JSON
keywords: [json, data type]
---
# JSON
Stores JavaScript Object Notation (JSON) documents in a single column.
:::note
This feature is experimental and is not production-ready. If you need to work with JSON documents, consider using [this guide](/docs/en/integrations/data-formats/json/overview) instead.
If you want to use JSON type, set `allow_experimental_json_type = 1`.
:::
To declare a column of `JSON` type, use the following syntax:
``` sql
<column_name> JSON(max_dynamic_paths=N, max_dynamic_types=M, some.path TypeName, SKIP path.to.skip, SKIP REGEXP 'paths_regexp')
```
Where:
- `max_dynamic_paths` is an optional parameter indicating how many paths can be stored separately as subcolumns across a single block of data that is stored separately (for example, across a single data part for a MergeTree table). If this limit is exceeded, all other paths will be stored together in a single structure. The default value of `max_dynamic_paths` is `1024`.
- `max_dynamic_types` is an optional parameter between `1` and `255` indicating how many different data types can be stored inside a single path column with type `Dynamic` across a single block of data that is stored separately (for example, across a single data part for a MergeTree table). If this limit is exceeded, all new types will be converted to type `String`. The default value of `max_dynamic_types` is `32`.
- `some.path TypeName` is an optional type hint for a particular path in the JSON. Such paths will always be stored as subcolumns with the specified type.
- `SKIP path.to.skip` is an optional hint for a particular path that should be skipped during JSON parsing. Such paths will never be stored in the JSON column. If the specified path is a nested JSON object, the whole nested object will be skipped.
- `SKIP REGEXP 'path_regexp'` is an optional hint with a regular expression used to skip paths during JSON parsing. All paths that match this regular expression will never be stored in the JSON column. A combined example is sketched below.
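For illustration, here is a minimal sketch of a declaration that combines these parameters and hints; the table name, paths, and values are hypothetical:
```sql
SET allow_experimental_json_type = 1;

CREATE TABLE events
(
    json JSON(
        max_dynamic_paths = 512,     -- store at most 512 paths as separate subcolumns per block
        max_dynamic_types = 16,      -- at most 16 data types per Dynamic path column per block
        user.id UInt64,              -- always store this path as a UInt64 subcolumn
        SKIP tmp.debug,              -- never store this path
        SKIP REGEXP 'raw\\..*'       -- skip every path under "raw"
    )
)
ENGINE = Memory;
```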
## Creating JSON
Using `JSON` type in table column definition:
```sql
CREATE TABLE test (json JSON) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : 42}, "c" : [1, 2, 3]}'), ('{"f" : "Hello, World!"}'), ('{"a" : {"b" : 43, "e" : 10}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json────────────────────────────────────────┐
│ {"a":{"b":"42"},"c":["1","2","3"]} │
│ {"f":"Hello, World!"} │
│ {"a":{"b":"43","e":"10"},"c":["4","5","6"]} │
└─────────────────────────────────────────────┘
```
```sql
CREATE TABLE test (json JSON(a.b UInt32, SKIP a.e)) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : 42}, "c" : [1, 2, 3]}'), ('{"f" : "Hello, World!"}'), ('{"a" : {"b" : 43, "e" : 10}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json──────────────────────────────┐
│ {"a":{"b":42},"c":[1,2,3]} │
│ {"a":{"b":0},"f":"Hello, World!"} │
│ {"a":{"b":43},"c":[4,5,6]} │
└───────────────────────────────────┘
```
Using CAST from 'String':
```sql
SELECT '{"a" : {"b" : 42},"c" : [1, 2, 3], "d" : "Hello, World!"}'::JSON as json;
```
```text
┌─json───────────────────────────────────────────┐
│ {"a":{"b":42},"c":[1,2,3],"d":"Hello, World!"} │
└────────────────────────────────────────────────┘
```
CAST from named `Tuple`, `Map` and `Object('json')` to `JSON` type will be supported later.
## Reading JSON paths as subcolumns
The JSON type supports reading every path as a separate subcolumn. If the type of the requested path was not specified in the JSON type declaration, the subcolumn of the path will always have type [Dynamic](/docs/en/sql-reference/data-types/dynamic.md).
For example:
```sql
CREATE TABLE test (json JSON(a.b UInt32, SKIP a.e)) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : 42, "g" : 42.42}, "c" : [1, 2, 3], "d" : "2020-01-01"}'), ('{"f" : "Hello, World!", "d" : "2020-01-02"}'), ('{"a" : {"b" : 43, "e" : 10, "g" : 43.43}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json──────────────────────────────────────────────────┐
│ {"a":{"b":42,"g":42.42},"c":[1,2,3],"d":"2020-01-01"} │
│ {"a":{"b":0},"d":"2020-01-02","f":"Hello, World!"} │
│ {"a":{"b":43,"g":43.43},"c":[4,5,6]} │
└───────────────────────────────────────────────────────┘
```
```sql
SELECT json.a.b, json.a.g, json.c, json.d FROM test;
```
```text
┌─json.a.b─┬─json.a.g─┬─json.c──┬─json.d─────┐
│ 42 │ 42.42 │ [1,2,3] │ 2020-01-01 │
│ 0 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2020-01-02 │
│ 43 │ 43.43 │ [4,5,6] │ ᴺᵁᴸᴸ │
└──────────┴──────────┴─────────┴────────────┘
```
If the requested path wasn't found in the data, it will be filled with `NULL` values:
```sql
SELECT json.non.existing.path FROM test;
```
```text
┌─json.non.existing.path─┐
│ ᴺᵁᴸᴸ │
│ ᴺᵁᴸᴸ │
│ ᴺᵁᴸᴸ │
└────────────────────────┘
```
Let's check the data types of returned subcolumns:
```sql
SELECT toTypeName(json.a.b), toTypeName(json.a.g), toTypeName(json.c), toTypeName(json.d) FROM test;
```
```text
┌─toTypeName(json.a.b)─┬─toTypeName(json.a.g)─┬─toTypeName(json.c)─┬─toTypeName(json.d)─┐
│ UInt32 │ Dynamic │ Dynamic │ Dynamic │
│ UInt32 │ Dynamic │ Dynamic │ Dynamic │
│ UInt32 │ Dynamic │ Dynamic │ Dynamic │
└──────────────────────┴──────────────────────┴────────────────────┴────────────────────┘
```
As we can see, for `a.b` the type is `UInt32` as we specified in the JSON type declaration, and for all other subcolumns the type is `Dynamic`.
It is also possible to read subcolumns of a `Dynamic` type using the special syntax `json.some.path.:TypeName`:
```sql
select json.a.g.:Float64, dynamicType(json.a.g), json.d.:Date, dynamicType(json.d) FROM test;
```
```text
┌─json.a.g.:`Float64`─┬─dynamicType(json.a.g)─┬─json.d.:`Date`─┬─dynamicType(json.d)─┐
│ 42.42 │ Float64 │ 2020-01-01 │ Date │
│ ᴺᵁᴸᴸ │ None │ 2020-01-02 │ Date │
│ 43.43 │ Float64 │ ᴺᵁᴸᴸ │ None │
└─────────────────────┴───────────────────────┴────────────────┴─────────────────────┘
```
`Dynamic` subcolumns can be cast to any data type. In this case, an exception will be thrown if the internal type inside `Dynamic` cannot be cast to the requested type:
```sql
select json.a.g::UInt64 as uint FROM test;
```
```text
┌─uint─┐
│ 42 │
│ 0 │
│ 43 │
└──────┘
```
```sql
select json.a.g::UUID as uuid FROM test;
```
```text
Received exception:
Code: 48. DB::Exception: Conversion between numeric types and UUID is not supported. Probably the passed UUID is unquoted: while executing 'FUNCTION CAST(__table1.json.a.g :: 2, 'UUID'_String :: 1) -> CAST(__table1.json.a.g, 'UUID'_String) UUID : 0'. (NOT_IMPLEMENTED)
```
## Reading JSON sub-objects as subcolumns
The JSON type supports reading nested objects as subcolumns with type `JSON` using the special syntax `json.^some.path`:
```sql
CREATE TABLE test (json JSON) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : {"c" : 42, "g" : 42.42}}, "c" : [1, 2, 3], "d" : {"e" : {"f" : {"g" : "Hello, World", "h" : [1, 2, 3]}}}}'), ('{"f" : "Hello, World!", "d" : {"e" : {"f" : {"h" : [4, 5, 6]}}}}'), ('{"a" : {"b" : {"c" : 43, "e" : 10, "g" : 43.43}}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json────────────────────────────────────────────────────────────────────────────────────────┐
│ {"a":{"b":{"c":42,"g":42.42}},"c":[1,2,3],"d":{"e":{"f":{"g":"Hello, World","h":[1,2,3]}}}} │
│ {"d":{"e":{"f":{"h":[4,5,6]}}},"f":"Hello, World!"} │
│ {"a":{"b":{"c":43,"e":10,"g":43.43}},"c":[4,5,6]} │
└─────────────────────────────────────────────────────────────────────────────────────────────┘
```
```sql
SELECT json.^a.b, json.^d.e.f FROM test;
```
```text
┌─json.^`a`.b───────────────┬─json.^`d`.e.f────────────────────┐
│ {"c":42,"g":42.42} │ {"g":"Hello, World","h":[1,2,3]} │
│ {} │ {"h":[4,5,6]} │
│ {"c":43,"e":10,"g":43.43} │ {} │
└───────────────────────────┴──────────────────────────────────┘
```
:::note
Reading sub-objects as subcolumns may be inefficient, as it may require an almost full scan of the JSON data.
:::
## Types inference for paths
During JSON parsing, ClickHouse tries to detect the most appropriate data type for each JSON path. It works similarly to [automatic schema inference from input data](/docs/en/interfaces/schema-inference.md) and is controlled by the same settings:
- [input_format_try_infer_integers](/docs/en/interfaces/schema-inference.md#inputformattryinferintegers)
- [input_format_try_infer_dates](/docs/en/interfaces/schema-inference.md#inputformattryinferdates)
- [input_format_try_infer_datetimes](/docs/en/interfaces/schema-inference.md#inputformattryinferdatetimes)
- [schema_inference_make_columns_nullable](/docs/en/interfaces/schema-inference.md#schemainferencemakecolumnsnullable)
- [input_format_json_try_infer_numbers_from_strings](/docs/en/interfaces/schema-inference.md#inputformatjsontryinfernumbersfromstrings)
- [input_format_json_infer_incomplete_types_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsoninferincompletetypesasstrings)
- [input_format_json_read_numbers_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsonreadnumbersasstrings)
- [input_format_json_read_bools_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsonreadboolsasstrings)
- [input_format_json_read_bools_as_numbers](/docs/en/interfaces/schema-inference.md#inputformatjsonreadboolsasnumbers)
- [input_format_json_read_arrays_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsonreadarraysasstrings)
Let's see some examples:
```sql
SELECT JSONAllPathsWithTypes('{"a" : "2020-01-01", "b" : "2020-01-01 10:00:00"}'::JSON) AS paths_with_types settings input_format_try_infer_dates=1, input_format_try_infer_datetimes=1;
```
```text
┌─paths_with_types─────────────────┐
│ {'a':'Date','b':'DateTime64(9)'} │
└──────────────────────────────────┘
```
```sql
SELECT JSONAllPathsWithTypes('{"a" : "2020-01-01", "b" : "2020-01-01 10:00:00"}'::JSON) AS paths_with_types settings input_format_try_infer_dates=0, input_format_try_infer_datetimes=0;
```
```text
┌─paths_with_types────────────┐
│ {'a':'String','b':'String'} │
└─────────────────────────────┘
```
```sql
SELECT JSONAllPathsWithTypes('{"a" : [1, 2, 3]}'::JSON) AS paths_with_types settings schema_inference_make_columns_nullable=1;
```
```text
┌─paths_with_types───────────────┐
│ {'a':'Array(Nullable(Int64))'} │
└────────────────────────────────┘
```
```sql
SELECT JSONAllPathsWithTypes('{"a" : [1, 2, 3]}'::JSON) AS paths_with_types settings schema_inference_make_columns_nullable=0;
```
```text
┌─paths_with_types─────┐
│ {'a':'Array(Int64)'} │
└──────────────────────┘
```
## Handling arrays of JSON objects
JSON paths that contain an array of objects are parsed as type `Array(JSON)` and inserted into the `Dynamic` column for this path. To read an array of objects, you can extract it from the `Dynamic` column as a subcolumn:
```sql
CREATE TABLE test (json JSON) ENGINE = Memory;
INSERT INTO test VALUES
('{"a" : {"b" : [{"c" : 42, "d" : "Hello", "f" : [[{"g" : 42.42}]], "k" : {"j" : 1000}}, {"c" : 43}, {"e" : [1, 2, 3], "d" : "My", "f" : [[{"g" : 43.43, "h" : "2020-01-01"}]], "k" : {"j" : 2000}}]}}'),
('{"a" : {"b" : [1, 2, 3]}}'),
('{"a" : {"b" : [{"c" : 44, "f" : [[{"h" : "2020-01-02"}]]}, {"e" : [4, 5, 6], "d" : "World", "f" : [[{"g" : 44.44}]], "k" : {"j" : 3000}}]}}');
SELECT json FROM test;
```
```text
┌─json────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ {"a":{"b":[{"c":"42","d":"Hello","f":[[{"g":42.42}]],"k":{"j":"1000"}},{"c":"43"},{"d":"My","e":["1","2","3"],"f":[[{"g":43.43,"h":"2020-01-01"}]],"k":{"j":"2000"}}]}} │
│ {"a":{"b":["1","2","3"]}} │
│ {"a":{"b":[{"c":"44","f":[[{"h":"2020-01-02"}]]},{"d":"World","e":["4","5","6"],"f":[[{"g":44.44}]],"k":{"j":"3000"}}]}} │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
```
```sql
SELECT json.a.b, dynamicType(json.a.b) FROM test;
```
```text
┌─json.a.b──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─dynamicType(json.a.b)────────────────────────────────────┐
│ ['{"c":"42","d":"Hello","f":[[{"g":42.42}]],"k":{"j":"1000"}}','{"c":"43"}','{"d":"My","e":["1","2","3"],"f":[[{"g":43.43,"h":"2020-01-01"}]],"k":{"j":"2000"}}'] │ Array(JSON(max_dynamic_types=16, max_dynamic_paths=256)) │
│ [1,2,3] │ Array(Nullable(Int64)) │
│ ['{"c":"44","f":[[{"h":"2020-01-02"}]]}','{"d":"World","e":["4","5","6"],"f":[[{"g":44.44}]],"k":{"j":"3000"}}'] │ Array(JSON(max_dynamic_types=16, max_dynamic_paths=256)) │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────┘
```
As you can see, the `max_dynamic_types`/`max_dynamic_paths` parameters of the nested `JSON` type were reduced compared to the default values. This is needed to keep the number of subcolumns from growing uncontrollably on nested arrays of JSON objects.
Let's try to read subcolumns from this nested `JSON` column:
```sql
SELECT json.a.b.:`Array(JSON)`.c, json.a.b.:`Array(JSON)`.f, json.a.b.:`Array(JSON)`.d FROM test;
```
```text
┌─json.a.b.:`Array(JSON)`.c─┬─json.a.b.:`Array(JSON)`.f───────────────────────────────────┬─json.a.b.:`Array(JSON)`.d─┐
│ [42,43,NULL] │ [[['{"g":42.42}']],NULL,[['{"g":43.43,"h":"2020-01-01"}']]] │ ['Hello',NULL,'My'] │
│ [] │ [] │ [] │
│ [44,NULL] │ [[['{"h":"2020-01-02"}']],[['{"g":44.44}']]] │ [NULL,'World'] │
└───────────────────────────┴─────────────────────────────────────────────────────────────┴───────────────────────────┘
```
We can avoid writing the `Array(JSON)` subcolumn name by using special syntax:
```sql
SELECT json.a.b[].c, json.a.b[].f, json.a.b[].d FROM test;
```
```text
┌─json.a.b.:`Array(JSON)`.c─┬─json.a.b.:`Array(JSON)`.f───────────────────────────────────┬─json.a.b.:`Array(JSON)`.d─┐
│ [42,43,NULL] │ [[['{"g":42.42}']],NULL,[['{"g":43.43,"h":"2020-01-01"}']]] │ ['Hello',NULL,'My'] │
│ [] │ [] │ [] │
│ [44,NULL] │ [[['{"h":"2020-01-02"}']],[['{"g":44.44}']]] │ [NULL,'World'] │
└───────────────────────────┴─────────────────────────────────────────────────────────────┴───────────────────────────┘
```
The number of `[]` after the path indicates the array level. For example, `json.path[][]` will be transformed to `json.path.:Array(Array(JSON))`.
Let's check the paths and types inside our `Array(JSON)`:
```sql
SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(json.a.b[]))) FROM test;
```
```text
┌─arrayJoin(JSONAllPathsWithTypes(arrayJoin(json.a.b.:`Array(JSON)`)))──┐
│ ('c','Int64') │
│ ('d','String') │
│ ('f','Array(Array(JSON(max_dynamic_types=8, max_dynamic_paths=64)))') │
│ ('k.j','Int64') │
│ ('e','Array(Nullable(Int64))') │
└───────────────────────────────────────────────────────────────────────┘
```
Let's read subcolumns from the `Array(JSON)` column:
```sql
SELECT json.a.b[].c.:Int64, json.a.b[].f[][].g.:Float64, json.a.b[].f[][].h.:Date FROM test;
```
```text
┌─json.a.b.:`Array(JSON)`.c.:`Int64`─┬─json.a.b.:`Array(JSON)`.f.:`Array(Array(JSON))`.g.:`Float64`─┬─json.a.b.:`Array(JSON)`.f.:`Array(Array(JSON))`.h.:`Date`─┐
│ [42,43,NULL] │ [[[42.42]],[],[[43.43]]] │ [[[NULL]],[],[['2020-01-01']]] │
│ [] │ [] │ [] │
│ [44,NULL] │ [[[NULL]],[[44.44]]] │ [[['2020-01-02']],[[NULL]]] │
└────────────────────────────────────┴──────────────────────────────────────────────────────────────┴───────────────────────────────────────────────────────────┘
```
We can also read sub-object subcolumns from the nested `JSON` column:
```sql
SELECT json.a.b[].^k FROM test
```
```text
┌─json.a.b.:`Array(JSON)`.^`k`─────────┐
│ ['{"j":"1000"}','{}','{"j":"2000"}'] │
│ [] │
│ ['{}','{"j":"3000"}'] │
└──────────────────────────────────────┘
```
## Reading JSON type from the data
All text formats (JSONEachRow, TSV, CSV, CustomSeparated, Values, etc.) support reading the `JSON` type.
Examples:
```sql
SELECT json FROM format(JSONEachRow, 'json JSON(a.b.c UInt32, SKIP a.b.d, SKIP d.e, SKIP REGEXP \'b.*\')', '
{"json" : {"a" : {"b" : {"c" : 1, "d" : [0, 1]}}, "b" : "2020-01-01", "c" : 42, "d" : {"e" : {"f" : ["s1", "s2"]}, "i" : [1, 2, 3]}}}
{"json" : {"a" : {"b" : {"c" : 2, "d" : [2, 3]}}, "b" : [1, 2, 3], "c" : null, "d" : {"e" : {"g" : 43}, "i" : [4, 5, 6]}}}
{"json" : {"a" : {"b" : {"c" : 3, "d" : [4, 5]}}, "b" : {"c" : 10}, "e" : "Hello, World!"}}
{"json" : {"a" : {"b" : {"c" : 4, "d" : [6, 7]}}, "c" : 43}}
{"json" : {"a" : {"b" : {"c" : 5, "d" : [8, 9]}}, "b" : {"c" : 11, "j" : [1, 2, 3]}, "d" : {"e" : {"f" : ["s3", "s4"], "g" : 44}, "h" : "2020-02-02 10:00:00"}}}
')
```
```text
┌─json──────────────────────────────────────────────────────────┐
│ {"a":{"b":{"c":1}},"c":"42","d":{"i":["1","2","3"]}} │
│ {"a":{"b":{"c":2}},"d":{"i":["4","5","6"]}} │
│ {"a":{"b":{"c":3}},"e":"Hello, World!"} │
│ {"a":{"b":{"c":4}},"c":"43"} │
│ {"a":{"b":{"c":5}},"d":{"h":"2020-02-02 10:00:00.000000000"}} │
└───────────────────────────────────────────────────────────────┘
```
For text formats like CSV/TSV/etc., `JSON` is parsed from a string containing a JSON object:
```sql
SELECT json FROM format(TSV, 'json JSON(a.b.c UInt32, SKIP a.b.d, SKIP REGEXP \'b.*\')',
'{"a" : {"b" : {"c" : 1, "d" : [0, 1]}}, "b" : "2020-01-01", "c" : 42, "d" : {"e" : {"f" : ["s1", "s2"]}, "i" : [1, 2, 3]}}
{"a" : {"b" : {"c" : 2, "d" : [2, 3]}}, "b" : [1, 2, 3], "c" : null, "d" : {"e" : {"g" : 43}, "i" : [4, 5, 6]}}
{"a" : {"b" : {"c" : 3, "d" : [4, 5]}}, "b" : {"c" : 10}, "e" : "Hello, World!"}
{"a" : {"b" : {"c" : 4, "d" : [6, 7]}}, "c" : 43}
{"a" : {"b" : {"c" : 5, "d" : [8, 9]}}, "b" : {"c" : 11, "j" : [1, 2, 3]}, "d" : {"e" : {"f" : ["s3", "s4"], "g" : 44}, "h" : "2020-02-02 10:00:00"}}')
```
```text
┌─json──────────────────────────────────────────────────────────┐
│ {"a":{"b":{"c":1}},"c":"42","d":{"i":["1","2","3"]}} │
│ {"a":{"b":{"c":2}},"d":{"i":["4","5","6"]}} │
│ {"a":{"b":{"c":3}},"e":"Hello, World!"} │
│ {"a":{"b":{"c":4}},"c":"43"} │
│ {"a":{"b":{"c":5}},"d":{"h":"2020-02-02 10:00:00.000000000"}} │
└───────────────────────────────────────────────────────────────┘
```
## Reaching the limit of dynamic paths inside JSON
The `JSON` data type can store only a limited number of paths as separate subcolumns. By default, this limit is 1024, but you can change it in the type declaration using the parameter `max_dynamic_paths`.
When the limit is reached, all new paths inserted into a `JSON` column will be stored in a single shared data structure. It's still possible to read such paths as subcolumns, but it requires reading the whole
shared data structure to extract the values of the path. This limit is needed to avoid an enormous number of different subcolumns that could make the table unusable.
Let's see what happens when the limit is reached in different scenarios.
### Reaching the limit during data parsing
During parsing of a `JSON` object from the data, when the limit is reached for the current block of data, all new paths will be stored in a shared data structure. We can check this using the introspection functions `JSONDynamicPaths` and `JSONSharedDataPaths`:
```sql
SELECT json, JSONDynamicPaths(json), JSONSharedDataPaths(json) FROM format(JSONEachRow, 'json JSON(max_dynamic_paths=3)', '
{"json" : {"a" : {"b" : 42}, "c" : [1, 2, 3]}}
{"json" : {"a" : {"b" : 43}, "d" : "2020-01-01"}}
{"json" : {"a" : {"b" : 44}, "c" : [4, 5, 6]}}
{"json" : {"a" : {"b" : 43}, "d" : "2020-01-02", "e" : "Hello", "f" : {"g" : 42.42}}}
{"json" : {"a" : {"b" : 43}, "c" : [7, 8, 9], "f" : {"g" : 43.43}, "h" : "World"}}
')
```
```text
┌─json───────────────────────────────────────────────────────────┬─JSONDynamicPaths(json)─┬─JSONSharedDataPaths(json)─┐
│ {"a":{"b":"42"},"c":["1","2","3"]} │ ['a.b','c','d'] │ [] │
│ {"a":{"b":"43"},"d":"2020-01-01"} │ ['a.b','c','d'] │ [] │
│ {"a":{"b":"44"},"c":["4","5","6"]} │ ['a.b','c','d'] │ [] │
│ {"a":{"b":"43"},"d":"2020-01-02","e":"Hello","f":{"g":42.42}} │ ['a.b','c','d'] │ ['e','f.g'] │
│ {"a":{"b":"43"},"c":["7","8","9"],"f":{"g":43.43},"h":"World"} │ ['a.b','c','d'] │ ['f.g','h'] │
└────────────────────────────────────────────────────────────────┴────────────────────────┴───────────────────────────┘
```
As we can see, after inserting paths `e` and `f.g` the limit was reached, and they were inserted into the shared data structure.
### During merges of data parts in MergeTree table engines
During a merge of several data parts in a MergeTree table, the `JSON` column in the resulting data part can reach the limit of dynamic paths and won't be able to store all paths from the source parts as subcolumns.
In this case ClickHouse chooses which paths will remain as subcolumns after the merge and which paths will be moved to the shared data structure. In most cases ClickHouse tries to keep the paths that contain
the largest number of non-null values and moves the rarest paths to the shared data structure, but it depends on the implementation.
Let's see an example of such a merge. First, let's create a table with a `JSON` column, set the limit of dynamic paths to `3`, and insert values with `5` different paths:
```sql
CREATE TABLE test (id UInt64, json JSON(max_dynamic_paths=3)) engine=MergeTree ORDER BY id;
SYSTEM STOP MERGES test;
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as a) FROM numbers(5);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as b) FROM numbers(4);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as c) FROM numbers(3);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as d) FROM numbers(2);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as e) FROM numbers(1);
```
Each insert will create a separate data part with a `JSON` column containing a single path:
```sql
SELECT count(), JSONDynamicPaths(json) AS dynamic_paths, JSONSharedDataPaths(json) AS shared_data_paths, _part FROM test GROUP BY _part, dynamic_paths, shared_data_paths ORDER BY _part ASC
```
```text
┌─count()─┬─dynamic_paths─┬─shared_data_paths─┬─_part─────┐
│ 5 │ ['a'] │ [] │ all_1_1_0 │
│ 4 │ ['b'] │ [] │ all_2_2_0 │
│ 3 │ ['c'] │ [] │ all_3_3_0 │
│ 2 │ ['d'] │ [] │ all_4_4_0 │
│ 1 │ ['e'] │ [] │ all_5_5_0 │
└─────────┴───────────────┴───────────────────┴───────────┘
```
Now, let's merge all parts into one and see what will happen:
```sql
SYSTEM START MERGES test;
OPTIMIZE TABLE test FINAL;
SELECT count(), JSONDynamicPaths(json) AS dynamic_paths, JSONSharedDataPaths(json) AS shared_data_paths, _part FROM test GROUP BY _part, dynamic_paths, shared_data_paths ORDER BY _part;
```
```text
┌─count()─┬─dynamic_paths─┬─shared_data_paths─┬─_part─────┐
│ 1 │ ['a','b','c'] │ ['e'] │ all_1_5_2 │
│ 2 │ ['a','b','c'] │ ['d'] │ all_1_5_2 │
│ 12 │ ['a','b','c'] │ [] │ all_1_5_2 │
└─────────┴───────────────┴───────────────────┴───────────┘
```
As we can see, ClickHouse kept the most frequent paths `a`, `b` and `c` and moved paths `e` and `d` to the shared data structure.
## Introspection functions
There are several functions that can help inspect the contents of a JSON column: [JSONAllPaths](../functions/json-functions.md#jsonallpaths), [JSONAllPathsWithTypes](../functions/json-functions.md#jsonallpathswithtypes), [JSONDynamicPaths](../functions/json-functions.md#jsondynamicpaths), [JSONDynamicPathsWithTypes](../functions/json-functions.md#jsondynamicpathswithtypes), [JSONSharedDataPaths](../functions/json-functions.md#jsonshareddatapaths), [JSONSharedDataPathsWithTypes](../functions/json-functions.md#jsonshareddatapathswithtypes).
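For a quick orientation before the detailed reference of these functions, here is a sketch of how a few of them could be combined on the `test` table from the examples above:
```sql
SELECT
    json,
    JSONDynamicPaths(json)      AS dynamic_paths,     -- paths stored as separate subcolumns
    JSONSharedDataPaths(json)   AS shared_data_paths, -- paths stored in the shared data structure
    JSONAllPathsWithTypes(json) AS all_paths_types    -- all paths with their data types
FROM test;
```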
## Tips for better usage of the JSON type
Before creating a `JSON` column and loading data into it, consider the following tips:
- Investigate your data and specify as many path hints with types as you can. It will make storage and reading much more efficient.
- Think about which paths you will need and which you will never need. Specify paths you won't need in the SKIP section, and use SKIP REGEXP if needed. This will improve storage (see the sketch below).
- Don't set the `max_dynamic_paths` parameter to very high values; it can make storage and reading less efficient.
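A sketch of this workflow, assuming the raw documents sit in a staging table `raw` as a `String` column `line` (both names are hypothetical):
```sql
-- 1. Investigate which paths actually occur and how often.
SELECT arrayJoin(mapKeys(JSONAllPathsWithTypes(line::JSON))) AS path, count() AS cnt
FROM raw
GROUP BY path
ORDER BY cnt DESC;

-- 2. Declare the column with type hints for hot paths and SKIP for paths you will never read.
CREATE TABLE events
(
    json JSON(user.id UInt64, event.time DateTime64(3), SKIP debug.payload)
)
ENGINE = MergeTree ORDER BY tuple();
```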

View File

@ -1155,3 +1155,207 @@ SELECT jsonMergePatch('{"a":1}', '{"name": "joey"}', '{"name": "tom"}', '{"name"
│ {"a":1,"name":"zoey"} │
└───────────────────────┘
```
### JSONAllPaths
Returns the list of all paths stored in each row of a [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONAllPaths(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- An array of paths. [Array(String)](../data-types/array.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONAllPaths(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONAllPaths(json)─┐
│ {"a":"42"} │ ['a'] │
│ {"b":"Hello"} │ ['b'] │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ ['a','c'] │
└──────────────────────────────────────┴────────────────────┘
```
### JSONAllPathsWithTypes
Returns the map of all paths and their data types stored in each row of a [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONAllPathsWithTypes(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- A map of paths and their data types. [Map(String, String)](../data-types/map.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONAllPathsWithTypes(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONAllPathsWithTypes(json)───────────────┐
│ {"a":"42"} │ {'a':'Int64'} │
│ {"b":"Hello"} │ {'b':'String'} │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ {'a':'Array(Nullable(Int64))','c':'Date'} │
└──────────────────────────────────────┴───────────────────────────────────────────┘
```
### JSONDynamicPaths
Returns the list of dynamic paths that are stored as separate subcolumns in a [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONDynamicPaths(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- An array of paths. [Array(String)](../data-types/array.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONDynamicPaths(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONDynamicPaths(json)─┐
| {"a":"42"} │ ['a'] │
│ {"b":"Hello"} │ [] │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ ['a'] │
└──────────────────────────────────────┴────────────────────────┘
```
### JSONDynamicPathsWithTypes
Returns the map of dynamic paths that are stored as separate subcolumns, and their types, in each row of a [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONDynamicPathsWithTypes(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- A map of paths and their data types. [Map(String, String)](../data-types/map.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONDynamicPathsWithTypes(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONDynamicPathsWithTypes(json)─┐
│ {"a":"42"} │ {'a':'Int64'} │
│ {"b":"Hello"} │ {} │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ {'a':'Array(Nullable(Int64))'} │
└──────────────────────────────────────┴─────────────────────────────────┘
```
### JSONSharedDataPaths
Returns the list of paths that are stored in the shared data structure in a [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONSharedDataPaths(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- An array of paths. [Array(String)](../data-types/array.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONSharedDataPaths(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONSharedDataPaths(json)─┐
│ {"a":"42"} │ [] │
│ {"b":"Hello"} │ ['b'] │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ ['c'] │
└──────────────────────────────────────┴───────────────────────────┘
```
### JSONSharedDataPathsWithTypes
Returns the map of paths that are stored in the shared data structure, and their types, in each row of a [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONSharedDataPathsWithTypes(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- A map of paths and their data types. [Map(String, String)](../data-types/map.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONSharedDataPathsWithTypes(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONSharedDataPathsWithTypes(json)─┐
│ {"a":"42"} │ {} │
│ {"b":"Hello"} │ {'b':'String'} │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ {'c':'Date'} │
└──────────────────────────────────────┴────────────────────────────────────┘
```

View File

@ -38,8 +38,7 @@ If you anticipate frequent deletes, consider using a [custom partitioning key](/
### Lightweight `DELETE`s with projections
By default, `DELETE` does not work for tables with projections. This is because rows in a projection may be affected by a `DELETE` operation and may require the projection to be rebuilt, negatively affecting `DELETE` performance.
However, there is an option to change this behavior. By changing setting `lightweight_mutation_projection_mode = 'drop'`, deletes will work with projections.
By default, `DELETE` does not work for tables with projections, because rows in a projection may be affected by a `DELETE` operation. However, there is a [MergeTree setting](https://clickhouse.com/docs/en/operations/settings/merge-tree-settings), `lightweight_mutation_projection_mode`, that can change this behavior.
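A hedged sketch of enabling this for a hypothetical `orders` table, assuming the setting is applied as a table-level MergeTree setting as described above:
```sql
-- Allow lightweight DELETE on this table even though it has projections (assumed table name).
ALTER TABLE orders MODIFY SETTING lightweight_mutation_projection_mode = 'drop';
DELETE FROM orders WHERE status = 'cancelled';
```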
## Performance considerations when using lightweight `DELETE`

View File

@ -103,7 +103,7 @@ LIMIT 2;
└─────────┴─────────┴─────────┘
```
### Inserting data from a file into a table:
### Inserting data from a file into a table
``` sql
INSERT INTO FUNCTION

View File

@ -146,7 +146,30 @@ SELECT dictGet('dict', 'B', 2);
## Example of using named connections with a PostgreSQL database
See [postgresql](../sql-reference/table-functions/postgresql.md) for a description of the parameters.
See [postgresql](../sql-reference/table-functions/postgresql.md) for a description of the parameters. The following aliases are also available:
- `username` for `user`
- `db` for `database`.
The `addresses_expr` parameter is used in a collection instead of `host:port`. The parameter is optional, because there are other parameters as well: `host`, `hostname`, `port`. The following pseudocode shows the priority:
```sql
CASE
WHEN collection['addresses_expr'] != '' THEN collection['addresses_expr']
WHEN collection['host'] != '' THEN collection['host'] || ':' || if(collection['port'] != '', collection['port'], '5432')
WHEN collection['hostname'] != '' THEN collection['hostname'] || ':' || if(collection['port'] != '', collection['port'], '5432')
END
```
Creation example:
```sql
CREATE NAMED COLLECTION mypg AS
user = 'pguser',
password = 'jw8s0F4',
host = '127.0.0.1',
port = 5432,
database = 'test',
schema = 'test_schema'
```
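For illustration, a hypothetical table could then be created from this collection and queried as follows (the table and remote table names are made up):
```sql
CREATE TABLE mypgtable (a Int64) ENGINE = PostgreSQL(mypg, table = 'test_table');
SELECT * FROM mypgtable;
```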
Configuration example:
```xml
@ -199,6 +222,10 @@ SELECT * FROM mypgtable;
└───┘
```
:::note
PostgreSQL copies the data from the named collection when the table is created. Changes to the collection do not affect the existing tables.
:::
### Example of using named connections with a database with the PostgreSQL engine
```sql

View File

@ -75,6 +75,8 @@ public:
const String & default_database_,
const String & user_,
const String & password_,
const String & proto_send_chunked_,
const String & proto_recv_chunked_,
const String & quota_key_,
const String & stage,
bool randomize_,
@ -128,7 +130,9 @@ public:
connections.emplace_back(std::make_unique<ConnectionPool>(
concurrency,
cur_host, cur_port,
default_database_, user_, password_, quota_key_,
default_database_, user_, password_,
proto_send_chunked_, proto_recv_chunked_,
quota_key_,
/* cluster_= */ "",
/* cluster_secret_= */ "",
/* client_name_= */ std::string(DEFAULT_CLIENT_NAME),
@ -662,6 +666,50 @@ int mainEntryClickHouseBenchmark(int argc, char ** argv)
Strings hosts = options.count("host") ? options["host"].as<Strings>() : Strings({"localhost"});
String proto_send_chunked {"notchunked"};
String proto_recv_chunked {"notchunked"};
if (options.count("proto_caps"))
{
std::string proto_caps_str = options["proto_caps"].as<std::string>();
std::vector<std::string_view> proto_caps;
splitInto<','>(proto_caps, proto_caps_str);
for (auto cap_str : proto_caps)
{
std::string direction;
if (cap_str.starts_with("send_"))
{
direction = "send";
cap_str = cap_str.substr(std::string_view("send_").size());
}
else if (cap_str.starts_with("recv_"))
{
direction = "recv";
cap_str = cap_str.substr(std::string_view("recv_").size());
}
if (cap_str != "chunked" && cap_str != "notchunked" && cap_str != "chunked_optional" && cap_str != "notchunked_optional")
throw Exception(ErrorCodes::BAD_ARGUMENTS, "proto_caps option is incorrect ({})", proto_caps_str);
if (direction.empty())
{
proto_send_chunked = cap_str;
proto_recv_chunked = cap_str;
}
else
{
if (direction == "send")
proto_send_chunked = cap_str;
else
proto_recv_chunked = cap_str;
}
}
}
Benchmark benchmark(
options["concurrency"].as<unsigned>(),
options["delay"].as<double>(),
@ -673,6 +721,8 @@ int mainEntryClickHouseBenchmark(int argc, char ** argv)
options["database"].as<std::string>(),
options["user"].as<std::string>(),
options["password"].as<std::string>(),
proto_send_chunked,
proto_recv_chunked,
options["quota_key"].as<std::string>(),
options["stage"].as<std::string>(),
options.count("randomize"),

View File

@ -38,6 +38,21 @@
<production>{display_name} \e[1;31m:)\e[0m </production> <!-- if it matched to the substring "production" in the server display name -->
</prompt_by_server_display_name>
<!-- Chunked capabilities for native protocol by client.
Can be enabled separately for send and receive channels.
Supported modes:
- chunked - client will only work with a server that supports the chunked protocol;
- chunked_optional - client prefers the server to enable the chunked protocol, but can switch to notchunked if the server does not support it;
- notchunked - client will only work with a server that supports the notchunked protocol (current default);
- notchunked_optional - client prefers the notchunked protocol, but can switch to chunked if the server does not support it.
-->
<!--
<proto_caps>
<send>chunked_optional</send>
<recv>chunked_optional</recv>
</proto_caps>
-->
<!--
Settings adjustable via command-line parameters
can take their defaults from that config file, see examples:

View File

@ -175,6 +175,11 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
hash_func.update(options["seed"].as<std::string>());
}
SharedContextHolder shared_context = Context::createShared();
auto context = Context::createGlobal(shared_context.get());
auto context_const = WithContext(context).getContext();
context->makeGlobalContext();
registerInterpreters();
registerFunctions();
registerAggregateFunctions();

View File

@ -150,6 +150,21 @@
-->
<tcp_port>9000</tcp_port>
<!-- Chunked capabilities for native protocol by server.
Can be enabled separately for send and receive channels.
Supported modes:
- chunked - server requires the client to have chunked enabled;
- chunked_optional - server supports both the chunked and notchunked protocols;
- notchunked - server requires the notchunked protocol from the client (current default);
- notchunked_optional - server supports both the chunked and notchunked protocols.
-->
<!--
<proto_caps>
<send>notchunked_optional</send>
<recv>notchunked_optional</recv>
</proto_caps>
-->
<!-- Compatibility with MySQL protocol.
ClickHouse will pretend to be MySQL for applications connecting to this port.
-->

View File

@ -93,7 +93,7 @@ namespace
break;
}
UUID id = parse<UUID>(line);
UUID id = parse<UUID>(line.substr(0, line.find('\t')));
line.clear();
String queries;

View File

@ -242,7 +242,8 @@ ASTPtr FunctionNode::toASTImpl(const ConvertToASTOptions & options) const
/// Avoid cast for `IN tuple(...)` expression.
/// Tuples could be quite big, and adding a type may significantly increase query size.
/// It should be safe because set type for `column IN tuple` is deduced from `column` type.
if (isNameOfInFunction(function_name) && argument_nodes.size() > 1 && argument_nodes[1]->getNodeType() == QueryTreeNodeType::CONSTANT)
if (isNameOfInFunction(function_name) && argument_nodes.size() > 1 && argument_nodes[1]->getNodeType() == QueryTreeNodeType::CONSTANT
&& !static_cast<const ConstantNode *>(argument_nodes[1].get())->hasSourceExpression())
new_options.add_cast_for_constants = false;
const auto & parameters = getParameters();

View File

@ -1,5 +1,5 @@
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/NestedUtils.h>
@ -452,10 +452,10 @@ QueryTreeNodePtr IdentifierResolver::tryResolveIdentifierFromCompoundExpression(
if (auto * column = compound_expression->as<ColumnNode>())
{
const DataTypePtr & column_type = column->getColumn().getTypeInStorage();
if (column_type->getTypeId() == TypeIndex::Object)
if (column_type->getTypeId() == TypeIndex::ObjectDeprecated)
{
const auto * object_type = checkAndGetDataType<DataTypeObject>(column_type.get());
if (object_type->getSchemaFormat() == "json" && object_type->hasNullableSubcolumns())
const auto & object_type = checkAndGetDataType<DataTypeObjectDeprecated>(*column_type);
if (object_type.getSchemaFormat() == "json" && object_type.hasNullableSubcolumns())
{
QueryTreeNodePtr constant_node_null = std::make_shared<ConstantNode>(Field());
return constant_node_null;
@ -1000,7 +1000,6 @@ QueryTreeNodePtr IdentifierResolver::tryResolveIdentifierFromJoin(const Identifi
if (!join_node_in_resolve_process && from_join_node.isUsingJoinExpression())
{
auto & join_using_list = from_join_node.getJoinExpression()->as<ListNode &>();
for (auto & join_using_node : join_using_list.getNodes())
{
auto & column_node = join_using_node->as<ColumnNode &>();

View File

@ -3,7 +3,7 @@
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeMap.h>

View File

@ -490,8 +490,6 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
/// process_list_element_holder is used to make an element in ProcessList live while BACKUP is working asynchronously.
auto process_list_element = context_in_use->getProcessListElement();
/// Update context to preserve query information in processlist (settings, current_database)
process_list_element->updateContext(context_in_use);
thread_pool.scheduleOrThrowOnError(
[this,
@ -855,8 +853,6 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
/// process_list_element_holder is used to make an element in ProcessList live while RESTORE is working asynchronously.
auto process_list_element = context_in_use->getProcessListElement();
/// Update context to preserve query information in processlist (settings, current_database)
process_list_element->updateContext(context_in_use);
thread_pool.scheduleOrThrowOnError(
[this,

View File

@ -158,6 +158,8 @@ void ClientApplicationBase::init(int argc, char ** argv)
("config-file,C", po::value<std::string>(), "config-file path")
("proto_caps", po::value<std::string>(), "enable/disable chunked protocol: chunked_optional, notchunked, notchunked_optional, send_chunked, send_chunked_optional, send_notchunked, send_notchunked_optional, recv_chunked, recv_chunked_optional, recv_notchunked, recv_notchunked_optional")
("query,q", po::value<std::vector<std::string>>()->multitoken(), R"(Query. Can be specified multiple times (--query "SELECT 1" --query "SELECT 2") or once with multiple comma-separated queries (--query "SELECT 1; SELECT 2;"). In the latter case, INSERT queries with non-VALUE format must be separated by empty lines.)")
("queries-file", po::value<std::vector<std::string>>()->multitoken(), "file path with queries to execute; multiple files can be specified (--queries-file file1 file2...)")
("multiquery,n", "Obsolete, does nothing")
@ -337,6 +339,41 @@ void ClientApplicationBase::init(int argc, char ** argv)
if (options.count("server_logs_file"))
server_logs_file = options["server_logs_file"].as<std::string>();
if (options.count("proto_caps"))
{
std::string proto_caps_str = options["proto_caps"].as<std::string>();
std::vector<std::string_view> proto_caps;
splitInto<','>(proto_caps, proto_caps_str);
for (auto cap_str : proto_caps)
{
std::string direction;
if (cap_str.starts_with("send_"))
{
direction = "send";
cap_str = cap_str.substr(std::string_view("send_").size());
}
else if (cap_str.starts_with("recv_"))
{
direction = "recv";
cap_str = cap_str.substr(std::string_view("recv_").size());
}
if (cap_str != "chunked" && cap_str != "notchunked" && cap_str != "chunked_optional" && cap_str != "notchunked_optional")
throw Exception(ErrorCodes::BAD_ARGUMENTS, "proto_caps option is incorrect ({})", proto_caps_str);
if (direction.empty())
{
config().setString("proto_caps.send", std::string(cap_str));
config().setString("proto_caps.recv", std::string(cap_str));
}
else
config().setString("proto_caps." + direction, std::string(cap_str));
}
}
query_processing_stage = QueryProcessingStage::fromString(options["stage"].as<std::string>());
query_kind = parseQueryKind(options["query_kind"].as<std::string>());
profile_events.print = options.count("print-profile-events");

View File

@ -73,9 +73,11 @@
#include <limits>
#include <map>
#include <memory>
#include <string_view>
#include <unordered_map>
#include <Common/config_version.h>
#include <base/find_symbols.h>
#include "config.h"
#include <IO/ReadHelpers.h>
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
@ -914,6 +916,8 @@ void ClientBase::processTextAsSingleQuery(const String & full_query)
}
catch (Exception & e)
{
if (server_exception)
server_exception->rethrow();
if (!is_interactive)
e.addMessage("(in query: {})", full_query);
throw;
@ -1032,19 +1036,28 @@ void ClientBase::processOrdinaryQuery(const String & query_to_execute, ASTPtr pa
query_interrupt_handler.start(signals_before_stop);
SCOPE_EXIT({ query_interrupt_handler.stop(); });
connection->sendQuery(
connection_parameters.timeouts,
query,
query_parameters,
client_context->getCurrentQueryId(),
query_processing_stage,
&client_context->getSettingsRef(),
&client_context->getClientInfo(),
true,
[&](const Progress & progress) { onProgress(progress); });
try {
connection->sendQuery(
connection_parameters.timeouts,
query,
query_parameters,
client_context->getCurrentQueryId(),
query_processing_stage,
&client_context->getSettingsRef(),
&client_context->getClientInfo(),
true,
[&](const Progress & progress) { onProgress(progress); });
if (send_external_tables)
sendExternalTables(parsed_query);
}
catch (const NetException &)
{
// We still want to attempt to process whatever we already received or can receive (socket receive buffer can be not empty)
receiveResult(parsed_query, signals_before_stop, settings.partial_result_on_first_cancel);
throw;
}
if (send_external_tables)
sendExternalTables(parsed_query);
receiveResult(parsed_query, signals_before_stop, settings.partial_result_on_first_cancel);
break;

View File

@ -5,8 +5,6 @@
#include <Core/Settings.h>
#include <Compression/CompressedReadBuffer.h>
#include <Compression/CompressedWriteBuffer.h>
#include <IO/ReadBufferFromPocoSocket.h>
#include <IO/WriteBufferFromPocoSocket.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
@ -85,6 +83,7 @@ Connection::~Connection()
Connection::Connection(const String & host_, UInt16 port_,
const String & default_database_,
const String & user_, const String & password_,
const String & proto_send_chunked_, const String & proto_recv_chunked_,
[[maybe_unused]] const SSHKey & ssh_private_key_,
const String & jwt_,
const String & quota_key_,
@ -95,6 +94,7 @@ Connection::Connection(const String & host_, UInt16 port_,
Protocol::Secure secure_)
: host(host_), port(port_), default_database(default_database_)
, user(user_), password(password_)
, proto_send_chunked(proto_send_chunked_), proto_recv_chunked(proto_recv_chunked_)
#if USE_SSH
, ssh_private_key(ssh_private_key_)
#endif
@ -211,10 +211,10 @@ void Connection::connect(const ConnectionTimeouts & timeouts)
, tcp_keep_alive_timeout_in_sec);
}
in = std::make_shared<ReadBufferFromPocoSocket>(*socket);
in = std::make_shared<ReadBufferFromPocoSocketChunked>(*socket);
in->setAsyncCallback(async_callback);
out = std::make_shared<WriteBufferFromPocoSocket>(*socket);
out = std::make_shared<WriteBufferFromPocoSocketChunked>(*socket);
out->setAsyncCallback(async_callback);
connected = true;
setDescription();
@ -222,9 +222,61 @@ void Connection::connect(const ConnectionTimeouts & timeouts)
sendHello();
receiveHello(timeouts.handshake_timeout);
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_CHUNKED_PACKETS)
{
/// Client side of chunked protocol negotiation.
/// Server advertises its protocol capabilities (separate for send and receive channels) by sending
/// in its 'Hello' response one of four types - chunked, notchunked, chunked_optional, notchunked_optional.
/// Non-optional types are strict, meaning that the server only supports that type; optional means that the
/// server prefers this type but is capable of working with the opposite.
/// Client selects which type it is going to communicate based on the settings from config or arguments,
/// and sends either "chunked" or "notchunked" protocol request in addendum section of handshake.
/// Client can detect if server's protocol capabilities are not compatible with client's settings (for example
/// server strictly requires chunked protocol but client's settings only allows notchunked protocol) - in such case
/// client should interrupt this connection. However if client continues with incompatible protocol type request, server
/// will send appropriate exception and disconnect client.
auto is_chunked = [](const String & chunked_srv_str, const String & chunked_cl_str, const String & direction)
{
bool chunked_srv = chunked_srv_str.starts_with("chunked");
bool optional_srv = chunked_srv_str.ends_with("_optional");
bool chunked_cl = chunked_cl_str.starts_with("chunked");
bool optional_cl = chunked_cl_str.ends_with("_optional");
if (optional_srv)
return chunked_cl;
if (optional_cl)
return chunked_srv;
if (chunked_cl != chunked_srv)
throw NetException(
ErrorCodes::NETWORK_ERROR,
"Incompatible protocol: {} set to {}, server requires {}",
direction,
chunked_cl ? "chunked" : "notchunked",
chunked_srv ? "chunked" : "notchunked");
return chunked_srv;
};
proto_send_chunked = is_chunked(proto_recv_chunked_srv, proto_send_chunked, "send") ? "chunked" : "notchunked";
proto_recv_chunked = is_chunked(proto_send_chunked_srv, proto_recv_chunked, "recv") ? "chunked" : "notchunked";
}
else
{
if (proto_send_chunked == "chunked" || proto_recv_chunked == "chunked")
throw NetException(
ErrorCodes::NETWORK_ERROR,
"Incompatible protocol: server's version is too old and doesn't support chunked protocol while client settings require it.");
}
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_ADDENDUM)
sendAddendum();
if (proto_send_chunked == "chunked")
out->enableChunked();
if (proto_recv_chunked == "chunked")
in->enableChunked();
LOG_TRACE(log_wrapper.get(), "Connected to {} server version {}.{}.{}.",
server_name, server_version_major, server_version_minor, server_version_patch);
}
@ -393,6 +445,13 @@ void Connection::sendAddendum()
{
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_QUOTA_KEY)
writeStringBinary(quota_key, *out);
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_CHUNKED_PACKETS)
{
writeStringBinary(proto_send_chunked, *out);
writeStringBinary(proto_recv_chunked, *out);
}
out->next();
}
@ -472,6 +531,12 @@ void Connection::receiveHello(const Poco::Timespan & handshake_timeout)
else
server_version_patch = server_revision;
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_CHUNKED_PACKETS)
{
readStringBinary(proto_send_chunked_srv, *in);
readStringBinary(proto_recv_chunked_srv, *in);
}
if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_PASSWORD_COMPLEXITY_RULES)
{
UInt64 rules_size;
@ -611,6 +676,7 @@ bool Connection::ping(const ConnectionTimeouts & timeouts)
UInt64 pong = 0;
writeVarUInt(Protocol::Client::Ping, *out);
out->finishChunk();
out->next();
if (in->eof())
@ -660,6 +726,7 @@ TablesStatusResponse Connection::getTablesStatus(const ConnectionTimeouts & time
writeVarUInt(Protocol::Client::TablesStatusRequest, *out);
request.write(*out, server_revision);
out->finishChunk();
out->next();
UInt64 response_type = 0;
@ -813,6 +880,8 @@ void Connection::sendQuery(
block_profile_events_in.reset();
block_out.reset();
out->finishChunk();
/// Send empty block which means end of data.
if (!with_pending_data)
{
@ -829,6 +898,7 @@ void Connection::sendCancel()
return;
writeVarUInt(Protocol::Client::Cancel, *out);
out->finishChunk();
out->next();
}
@ -854,7 +924,10 @@ void Connection::sendData(const Block & block, const String & name, bool scalar)
size_t prev_bytes = out->count();
block_out->write(block);
maybe_compressed_out->next();
if (maybe_compressed_out != out)
maybe_compressed_out->next();
if (!block)
out->finishChunk();
out->next();
if (throttler)
@ -865,6 +938,7 @@ void Connection::sendIgnoredPartUUIDs(const std::vector<UUID> & uuids)
{
writeVarUInt(Protocol::Client::IgnoredPartUUIDs, *out);
writeVectorBinary(uuids, *out);
out->finishChunk();
out->next();
}
@ -874,6 +948,7 @@ void Connection::sendReadTaskResponse(const String & response)
writeVarUInt(Protocol::Client::ReadTaskResponse, *out);
writeVarUInt(DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION, *out);
writeStringBinary(response, *out);
out->finishChunk();
out->next();
}
@ -882,6 +957,7 @@ void Connection::sendMergeTreeReadTaskResponse(const ParallelReadResponse & resp
{
writeVarUInt(Protocol::Client::MergeTreeReadTaskResponse, *out);
response.serialize(*out);
out->finishChunk();
out->next();
}
@ -899,6 +975,8 @@ void Connection::sendPreparedData(ReadBuffer & input, size_t size, const String
copyData(input, *out);
else
copyData(input, *out, size);
out->finishChunk();
out->next();
}
@ -927,6 +1005,8 @@ void Connection::sendScalarsData(Scalars & data)
sendData(elem.second, elem.first, true /* scalar */);
}
out->finishChunk();
out_bytes = out->count() - out_bytes;
maybe_compressed_out_bytes = maybe_compressed_out->count() - maybe_compressed_out_bytes;
double elapsed = watch.elapsedSeconds();
@ -1069,13 +1149,13 @@ std::optional<Poco::Net::SocketAddress> Connection::getResolvedAddress() const
bool Connection::poll(size_t timeout_microseconds)
{
return static_cast<ReadBufferFromPocoSocket &>(*in).poll(timeout_microseconds);
return in->poll(timeout_microseconds);
}
bool Connection::hasReadPendingData() const
{
return last_input_packet_type.has_value() || static_cast<const ReadBufferFromPocoSocket &>(*in).hasPendingData();
return last_input_packet_type.has_value() || in->hasBufferedData();
}
@ -1349,6 +1429,8 @@ ServerConnectionPtr Connection::createConnection(const ConnectionParameters & pa
parameters.default_database,
parameters.user,
parameters.password,
parameters.proto_send_chunked,
parameters.proto_recv_chunked,
parameters.ssh_private_key,
parameters.jwt,
parameters.quota_key,

View File

@ -8,8 +8,8 @@
#include <Core/Defines.h>
#include <IO/ReadBufferFromPocoSocket.h>
#include <IO/WriteBufferFromPocoSocket.h>
#include <IO/ReadBufferFromPocoSocketChunked.h>
#include <IO/WriteBufferFromPocoSocketChunked.h>
#include <Interpreters/TablesStatus.h>
#include <Interpreters/Context_fwd.h>
@ -52,6 +52,7 @@ public:
Connection(const String & host_, UInt16 port_,
const String & default_database_,
const String & user_, const String & password_,
const String & proto_send_chunked_, const String & proto_recv_chunked_,
const SSHKey & ssh_private_key_,
const String & jwt_,
const String & quota_key_,
@ -170,6 +171,10 @@ private:
String default_database;
String user;
String password;
String proto_send_chunked;
String proto_recv_chunked;
String proto_send_chunked_srv;
String proto_recv_chunked_srv;
#if USE_SSH
SSHKey ssh_private_key;
#endif
@ -209,8 +214,8 @@ private:
String server_display_name;
std::unique_ptr<Poco::Net::StreamSocket> socket;
std::shared_ptr<ReadBufferFromPocoSocket> in;
std::shared_ptr<WriteBufferFromPocoSocket> out;
std::shared_ptr<ReadBufferFromPocoSocketChunked> in;
std::shared_ptr<WriteBufferFromPocoSocketChunked> out;
std::optional<UInt64> last_input_packet_type;
String query_id;

View File

@ -107,6 +107,9 @@ ConnectionParameters::ConnectionParameters(const Poco::Util::AbstractConfigurati
}
}
proto_send_chunked = config.getString("proto_caps.send", "notchunked");
proto_recv_chunked = config.getString("proto_caps.recv", "notchunked");
quota_key = config.getString("quota_key", "");
/// By default compression is disabled if address looks like localhost.

View File

@ -20,6 +20,8 @@ struct ConnectionParameters
std::string default_database;
std::string user;
std::string password;
std::string proto_send_chunked = "notchunked";
std::string proto_recv_chunked = "notchunked";
std::string quota_key;
SSHKey ssh_private_key;
std::string jwt;

View File

@ -13,6 +13,8 @@ ConnectionPoolPtr ConnectionPoolFactory::get(
String default_database,
String user,
String password,
String proto_send_chunked,
String proto_recv_chunked,
String quota_key,
String cluster,
String cluster_secret,
@ -22,7 +24,7 @@ ConnectionPoolPtr ConnectionPoolFactory::get(
Priority priority)
{
Key key{
max_connections, host, port, default_database, user, password, quota_key, cluster, cluster_secret, client_name, compression, secure, priority};
max_connections, host, port, default_database, user, password, proto_send_chunked, proto_recv_chunked, quota_key, cluster, cluster_secret, client_name, compression, secure, priority};
std::lock_guard lock(mutex);
auto [it, inserted] = pools.emplace(key, ConnectionPoolPtr{});
@ -39,6 +41,8 @@ ConnectionPoolPtr ConnectionPoolFactory::get(
default_database,
user,
password,
proto_send_chunked,
proto_recv_chunked,
quota_key,
cluster,
cluster_secret,

View File

@ -73,6 +73,8 @@ public:
const String & default_database_,
const String & user_,
const String & password_,
const String & proto_send_chunked_,
const String & proto_recv_chunked_,
const String & quota_key_,
const String & cluster_,
const String & cluster_secret_,
@ -85,6 +87,8 @@ public:
, default_database(default_database_)
, user(user_)
, password(password_)
, proto_send_chunked(proto_send_chunked_)
, proto_recv_chunked(proto_recv_chunked_)
, quota_key(quota_key_)
, cluster(cluster_)
, cluster_secret(cluster_secret_)
@ -116,7 +120,9 @@ protected:
{
return std::make_shared<Connection>(
host, port,
default_database, user, password, SSHKey(), /*jwt*/ "", quota_key,
default_database, user, password,
proto_send_chunked, proto_recv_chunked,
SSHKey(), /*jwt*/ "", quota_key,
cluster, cluster_secret,
client_name, compression, secure);
}
@ -125,6 +131,8 @@ private:
String default_database;
String user;
String password;
String proto_send_chunked;
String proto_recv_chunked;
String quota_key;
/// For inter-server authorization
@ -150,6 +158,8 @@ public:
String default_database;
String user;
String password;
String proto_send_chunked;
String proto_recv_chunked;
String quota_key;
String cluster;
String cluster_secret;
@ -173,6 +183,8 @@ public:
String default_database,
String user,
String password,
String proto_send_chunked,
String proto_recv_chunked,
String quota_key,
String cluster,
String cluster_secret,
@ -190,6 +202,7 @@ inline bool operator==(const ConnectionPoolFactory::Key & lhs, const ConnectionP
{
return lhs.max_connections == rhs.max_connections && lhs.host == rhs.host && lhs.port == rhs.port
&& lhs.default_database == rhs.default_database && lhs.user == rhs.user && lhs.password == rhs.password
&& lhs.proto_send_chunked == rhs.proto_send_chunked && lhs.proto_recv_chunked == rhs.proto_recv_chunked
&& lhs.quota_key == rhs.quota_key
&& lhs.cluster == rhs.cluster && lhs.cluster_secret == rhs.cluster_secret && lhs.client_name == rhs.client_name
&& lhs.compression == rhs.compression && lhs.secure == rhs.secure && lhs.priority == rhs.priority;

View File

@ -452,6 +452,11 @@ void ColumnArray::reserve(size_t n)
getData().reserve(n); /// The average size of arrays is not taken into account here. Or it is considered to be no more than 1.
}
size_t ColumnArray::capacity() const
{
return getOffsets().capacity();
}
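capacity() follows the same convention as reserve(): both are measured in rows, so an array column reports the capacity of its offsets container rather than of the flat element data. A simplified sketch of that convention using assumed, standard-library-only types:

#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified array column: one offset per row plus a flat element buffer.
struct SimpleArrayColumn
{
    std::vector<uint64_t> offsets;  // one entry per row
    std::vector<int64_t> elements;  // a variable number of elements per row

    void reserve(size_t rows) { offsets.reserve(rows); }    // measured in rows, not elements
    size_t capacity() const { return offsets.capacity(); }  // likewise measured in rows
    size_t size() const { return offsets.size(); }
};

The other capacity() overrides added in this change follow the same rule: ColumnNullable and ColumnMap delegate to the null map and the nested column, ColumnFixedString divides chars.capacity() by the fixed value size, and ColumnTuple reports the capacity of its first column (or its size when it has no columns).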
void ColumnArray::prepareForSquashing(const Columns & source_columns)
{
size_t new_size = size();

View File

@ -118,6 +118,7 @@ public:
void updatePermutationWithCollation(const Collator & collator, PermutationSortDirection direction, PermutationSortStability stability,
size_t limit, int nan_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override;
void ensureOwnership() override;

View File

@ -53,6 +53,7 @@ public:
size_t allocatedBytes() const override { return data.allocated_bytes(); }
void protect() override { data.protect(); }
void reserve(size_t n) override { data.reserve_exact(n); }
size_t capacity() const override { return data.capacity(); }
void shrinkToFit() override { data.shrink_to_fit(); }
#if !defined(DEBUG_OR_SANITIZER_BUILD)

File diff suppressed because it is too large

View File

@ -3,6 +3,7 @@
#include <Columns/IColumn.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnVariant.h>
#include <Columns/ColumnString.h>
#include <DataTypes/IDataType.h>
#include <Common/WeakHash.h>
@ -19,11 +20,19 @@ namespace DB
*
* When new values are inserted into Dynamic column, the internal Variant
* type and column are extended if the inserted value has new type.
* When the limit on the number of dynamic types is exceeded, all values
* with new types are inserted into a special shared variant with type String
* that contains values and their types in binary format.
*/
class ColumnDynamic final : public COWHelper<IColumnHelper<ColumnDynamic>, ColumnDynamic>
{
public:
///
/// Maximum limit on dynamic types. We use ColumnVariant to store all the types,
/// so the limit cannot be greater than ColumnVariant::MAX_NESTED_COLUMNS.
/// We also always have reserved variant for shared variant.
static constexpr size_t MAX_DYNAMIC_TYPES_LIMIT = ColumnVariant::MAX_NESTED_COLUMNS - 1;
static constexpr const char * SHARED_VARIANT_TYPE_NAME = "SharedVariant";
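A conceptual sketch of the overflow rule described in the comment above, using only the standard library (hypothetical names; the real column keeps values in a ColumnVariant and serializes overflow values in ClickHouse's binary format):

#include <cstddef>
#include <set>
#include <string>
#include <utility>
#include <vector>

// Once the number of distinct types reaches the limit, further values are
// appended to a shared store as (type name, serialized value) pairs.
struct DynamicSketch
{
    size_t max_types;
    std::set<std::string> known_types;                               // types with a dedicated variant
    std::vector<std::pair<std::string, std::string>> shared_values;  // overflow storage

    explicit DynamicSketch(size_t max_types_) : max_types(max_types_) {}

    void insert(const std::string & type_name, const std::string & serialized_value)
    {
        if (known_types.count(type_name) || known_types.size() < max_types)
        {
            known_types.insert(type_name);
            // ... append the value to the column dedicated to type_name ...
        }
        else
        {
            // A new type over the limit: remember the value together with its type.
            shared_values.emplace_back(type_name, serialized_value);
        }
    }
};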
struct Statistics
{
enum class Source
@ -32,12 +41,27 @@ public:
MERGE, /// Statistics were calculated during merge of several MergeTree parts.
};
explicit Statistics(Source source_) : source(source_) {}
/// Source of the statistics.
Source source;
/// Statistics data: (variant name) -> (total variant size in data part).
std::unordered_map<String, size_t> data;
/// Statistics data for usual variants: (variant name) -> (total variant size in data part).
std::unordered_map<String, size_t> variants_statistics;
/// Statistics data for variants from shared variant: (variant name) -> (total variant size in data part).
/// For the shared variant we store statistics only for the first 256 variants (this should cover almost all cases and is not expensive).
static constexpr const size_t MAX_SHARED_VARIANT_STATISTICS_SIZE = 256;
std::unordered_map<String, size_t> shared_variants_statistics;
};
using StatisticsPtr = std::shared_ptr<const Statistics>;
struct ComparatorBase;
using ComparatorAscendingUnstable = ComparatorAscendingUnstableImpl<ComparatorBase>;
using ComparatorAscendingStable = ComparatorAscendingStableImpl<ComparatorBase>;
using ComparatorDescendingUnstable = ComparatorDescendingUnstableImpl<ComparatorBase>;
using ComparatorDescendingStable = ComparatorDescendingStableImpl<ComparatorBase>;
using ComparatorEqual = ComparatorEqualImpl<ComparatorBase>;
private:
friend class COWHelper<IColumnHelper<ColumnDynamic>, ColumnDynamic>;
@ -54,36 +78,40 @@ private:
};
explicit ColumnDynamic(size_t max_dynamic_types_);
ColumnDynamic(MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {});
ColumnDynamic(MutableColumnPtr variant_column_, const DataTypePtr & variant_type_, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_ = {});
ColumnDynamic(MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_ = {});
public:
/** Create immutable column using immutable arguments. This arguments may be shared with other columns.
* Use IColumn::mutate in order to make mutable column and mutate shared nested columns.
*/
using Base = COWHelper<IColumnHelper<ColumnDynamic>, ColumnDynamic>;
static Ptr create(const ColumnPtr & variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {})
static Ptr create(const ColumnPtr & variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_ = {})
{
return ColumnDynamic::create(variant_column_->assumeMutable(), variant_info_, max_dynamic_types_, statistics_);
return ColumnDynamic::create(variant_column_->assumeMutable(), variant_info_, max_dynamic_types_, global_max_dynamic_types_, statistics_);
}
static MutablePtr create(MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {})
static MutablePtr create(MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_ = {})
{
return Base::create(std::move(variant_column_), variant_info_, max_dynamic_types_, statistics_);
return Base::create(std::move(variant_column_), variant_info_, max_dynamic_types_, global_max_dynamic_types_, statistics_);
}
static MutablePtr create(MutableColumnPtr variant_column_, const DataTypePtr & variant_type, size_t max_dynamic_types_, const Statistics & statistics_ = {});
static ColumnPtr create(ColumnPtr variant_column_, const DataTypePtr & variant_type, size_t max_dynamic_types_, const Statistics & statistics_ = {})
static MutablePtr create(MutableColumnPtr variant_column_, const DataTypePtr & variant_type_, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_ = {})
{
return create(variant_column_->assumeMutable(), variant_type, max_dynamic_types_, statistics_);
return Base::create(std::move(variant_column_), variant_type_, max_dynamic_types_, global_max_dynamic_types_, statistics_);
}
static MutablePtr create(size_t max_dynamic_types_)
static ColumnPtr create(ColumnPtr variant_column_, const DataTypePtr & variant_type, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_ = {})
{
return create(variant_column_->assumeMutable(), variant_type, max_dynamic_types_, global_max_dynamic_types_, statistics_);
}
static MutablePtr create(size_t max_dynamic_types_ = MAX_DYNAMIC_TYPES_LIMIT)
{
return Base::create(max_dynamic_types_);
}
std::string getName() const override { return "Dynamic(max_types=" + std::to_string(max_dynamic_types) + ")"; }
std::string getName() const override { return "Dynamic(max_types=" + std::to_string(global_max_dynamic_types) + ")"; }
const char * getFamilyName() const override
{
@ -98,47 +126,41 @@ public:
MutableColumnPtr cloneEmpty() const override
{
/// Keep current dynamic structure
return Base::create(variant_column->cloneEmpty(), variant_info, max_dynamic_types, statistics);
return Base::create(variant_column->cloneEmpty(), variant_info, max_dynamic_types, global_max_dynamic_types, statistics);
}
MutableColumnPtr cloneResized(size_t size) const override
{
return Base::create(variant_column->cloneResized(size), variant_info, max_dynamic_types, statistics);
return Base::create(variant_column->cloneResized(size), variant_info, max_dynamic_types, global_max_dynamic_types, statistics);
}
size_t size() const override
{
return variant_column->size();
return variant_column_ptr->size();
}
Field operator[](size_t n) const override
{
return (*variant_column)[n];
}
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override
{
variant_column->get(n, res);
}
void get(size_t n, Field & res) const override;
bool isDefaultAt(size_t n) const override
{
return variant_column->isDefaultAt(n);
return variant_column_ptr->isDefaultAt(n);
}
bool isNullAt(size_t n) const override
{
return variant_column->isNullAt(n);
return variant_column_ptr->isNullAt(n);
}
StringRef getDataAt(size_t n) const override
{
return variant_column->getDataAt(n);
return variant_column_ptr->getDataAt(n);
}
void insertData(const char * pos, size_t length) override
{
variant_column->insertData(pos, length);
variant_column_ptr->insertData(pos, length);
}
void insert(const Field & x) override;
@ -156,17 +178,17 @@ public:
void insertDefault() override
{
variant_column->insertDefault();
variant_column_ptr->insertDefault();
}
void insertManyDefaults(size_t length) override
{
variant_column->insertManyDefaults(length);
variant_column_ptr->insertManyDefaults(length);
}
void popBack(size_t n) override
{
variant_column->popBack(n);
variant_column_ptr->popBack(n);
}
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
@ -177,46 +199,46 @@ public:
WeakHash32 getWeakHash32() const override
{
return variant_column->getWeakHash32();
return variant_column_ptr->getWeakHash32();
}
void updateHashFast(SipHash & hash) const override
{
variant_column->updateHashFast(hash);
variant_column_ptr->updateHashFast(hash);
}
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override
{
return create(variant_column->filter(filt, result_size_hint), variant_info, max_dynamic_types);
return create(variant_column_ptr->filter(filt, result_size_hint), variant_info, max_dynamic_types, global_max_dynamic_types);
}
void expand(const Filter & mask, bool inverted) override
{
variant_column->expand(mask, inverted);
variant_column_ptr->expand(mask, inverted);
}
ColumnPtr permute(const Permutation & perm, size_t limit) const override
{
return create(variant_column->permute(perm, limit), variant_info, max_dynamic_types);
return create(variant_column_ptr->permute(perm, limit), variant_info, max_dynamic_types, global_max_dynamic_types);
}
ColumnPtr index(const IColumn & indexes, size_t limit) const override
{
return create(variant_column->index(indexes, limit), variant_info, max_dynamic_types);
return create(variant_column_ptr->index(indexes, limit), variant_info, max_dynamic_types, global_max_dynamic_types);
}
ColumnPtr replicate(const Offsets & replicate_offsets) const override
{
return create(variant_column->replicate(replicate_offsets), variant_info, max_dynamic_types);
return create(variant_column_ptr->replicate(replicate_offsets), variant_info, max_dynamic_types, global_max_dynamic_types);
}
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override
{
MutableColumns scattered_variant_columns = variant_column->scatter(num_columns, selector);
MutableColumns scattered_variant_columns = variant_column_ptr->scatter(num_columns, selector);
MutableColumns scattered_columns;
scattered_columns.reserve(num_columns);
for (auto & scattered_variant_column : scattered_variant_columns)
scattered_columns.emplace_back(create(std::move(scattered_variant_column), variant_info, max_dynamic_types));
scattered_columns.emplace_back(create(std::move(scattered_variant_column), variant_info, max_dynamic_types, global_max_dynamic_types));
return scattered_columns;
}
@ -229,73 +251,76 @@ public:
bool hasEqualValues() const override
{
return variant_column->hasEqualValues();
return variant_column_ptr->hasEqualValues();
}
void getExtremes(Field & min, Field & max) const override
{
variant_column->getExtremes(min, max);
variant_column_ptr->getExtremes(min, max);
}
void getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override
{
variant_column->getPermutation(direction, stability, limit, nan_direction_hint, res);
}
size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override;
void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override
{
variant_column->updatePermutation(direction, stability, limit, nan_direction_hint, res, equal_ranges);
}
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;
void reserve(size_t n) override
{
variant_column->reserve(n);
variant_column_ptr->reserve(n);
}
size_t capacity() const override
{
return variant_column_ptr->capacity();
}
void prepareForSquashing(const Columns & source_columns) override;
/// Prepare only variants but not discriminators and offsets.
void prepareVariantsForSquashing(const Columns & source_columns);
void ensureOwnership() override
{
variant_column->ensureOwnership();
variant_column_ptr->ensureOwnership();
}
size_t byteSize() const override
{
return variant_column->byteSize();
return variant_column_ptr->byteSize();
}
size_t byteSizeAt(size_t n) const override
{
return variant_column->byteSizeAt(n);
return variant_column_ptr->byteSizeAt(n);
}
size_t allocatedBytes() const override
{
return variant_column->allocatedBytes();
return variant_column_ptr->allocatedBytes();
}
void protect() override
{
variant_column->protect();
variant_column_ptr->protect();
}
void forEachSubcolumn(MutableColumnCallback callback) override
{
callback(variant_column);
variant_column_ptr = assert_cast<ColumnVariant *>(variant_column.get());
}
void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override
{
callback(*variant_column);
variant_column_ptr = assert_cast<ColumnVariant *>(variant_column.get());
variant_column->forEachSubcolumnRecursively(callback);
}
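Both callbacks above may replace the owning pointer, so the cached raw pointer to the concrete ColumnVariant has to be re-derived right after the callback runs. A minimal sketch of that pattern with hypothetical Base/Derived types (the real code uses assert_cast instead of dynamic_cast):

#include <cassert>
#include <functional>
#include <memory>

struct Base { virtual ~Base() = default; };
struct Derived : Base { int payload = 0; };

struct Holder
{
    std::shared_ptr<Base> owning;  // may be swapped out by a callback
    Derived * cached = nullptr;    // non-owning shortcut that avoids repeated downcasts

    void forEach(const std::function<void(std::shared_ptr<Base> &)> & callback)
    {
        callback(owning);
        cached = dynamic_cast<Derived *>(owning.get());  // refresh after a possible replacement
        assert(cached != nullptr);
    }
};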
bool structureEquals(const IColumn & rhs) const override
{
if (const auto * rhs_concrete = typeid_cast<const ColumnDynamic *>(&rhs))
return max_dynamic_types == rhs_concrete->max_dynamic_types;
return global_max_dynamic_types == rhs_concrete->global_max_dynamic_types;
return false;
}
@ -303,27 +328,27 @@ public:
double getRatioOfDefaultRows(double sample_ratio) const override
{
return variant_column->getRatioOfDefaultRows(sample_ratio);
return variant_column_ptr->getRatioOfDefaultRows(sample_ratio);
}
UInt64 getNumberOfDefaultRows() const override
{
return variant_column->getNumberOfDefaultRows();
return variant_column_ptr->getNumberOfDefaultRows();
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
variant_column->getIndicesOfNonDefaultRows(indices, from, limit);
variant_column_ptr->getIndicesOfNonDefaultRows(indices, from, limit);
}
void finalize() override
{
variant_column->finalize();
variant_column_ptr->finalize();
}
bool isFinalized() const override
{
return variant_column->isFinalized();
return variant_column_ptr->isFinalized();
}
/// Apply null map to a nested Variant column.
@ -335,20 +360,79 @@ public:
const ColumnPtr & getVariantColumnPtr() const { return variant_column; }
ColumnPtr & getVariantColumnPtr() { return variant_column; }
const ColumnVariant & getVariantColumn() const { return assert_cast<const ColumnVariant &>(*variant_column); }
ColumnVariant & getVariantColumn() { return assert_cast<ColumnVariant &>(*variant_column); }
const ColumnVariant & getVariantColumn() const { return *variant_column_ptr; }
ColumnVariant & getVariantColumn() { return *variant_column_ptr; }
bool addNewVariant(const DataTypePtr & new_variant);
void addStringVariant();
bool addNewVariant(const DataTypePtr & new_variant, const String & new_variant_name);
bool addNewVariant(const DataTypePtr & new_variant) { return addNewVariant(new_variant, new_variant->getName()); }
bool hasDynamicStructure() const override { return true; }
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
const Statistics & getStatistics() const { return statistics; }
const StatisticsPtr & getStatistics() const { return statistics; }
void setStatistics(const StatisticsPtr & statistics_) { statistics = statistics_; }
size_t getMaxDynamicTypes() const { return max_dynamic_types; }
/// Check if we can add new variant types.
/// The shared variant doesn't count towards the limit but is always present,
/// so we subtract 1 from the total types count.
bool canAddNewVariants(size_t current_variants_count, size_t new_variants_count) const { return current_variants_count + new_variants_count - 1 <= max_dynamic_types; }
bool canAddNewVariant(size_t current_variants_count) const { return canAddNewVariants(current_variants_count, 1); }
bool canAddNewVariants(size_t new_variants_count) const { return canAddNewVariants(variant_info.variant_names.size(), new_variants_count); }
bool canAddNewVariant() const { return canAddNewVariants(variant_info.variant_names.size(), 1); }
void setVariantType(const DataTypePtr & variant_type);
void setMaxDynamicPaths(size_t max_dynamic_type_);
static const String & getSharedVariantTypeName()
{
static const String name = SHARED_VARIANT_TYPE_NAME;
return name;
}
static DataTypePtr getSharedVariantDataType();
ColumnVariant::Discriminator getSharedVariantDiscriminator() const
{
return variant_info.variant_name_to_discriminator.at(getSharedVariantTypeName());
}
ColumnString & getSharedVariant()
{
return assert_cast<ColumnString &>(getVariantColumn().getVariantByGlobalDiscriminator(getSharedVariantDiscriminator()));
}
const ColumnString & getSharedVariant() const
{
return assert_cast<const ColumnString &>(getVariantColumn().getVariantByGlobalDiscriminator(getSharedVariantDiscriminator()));
}
/// Serializes type and value in binary format into provided shared variant. Doesn't update Variant discriminators and offsets.
static void serializeValueIntoSharedVariant(ColumnString & shared_variant, const IColumn & src, const DataTypePtr & type, const SerializationPtr & serialization, size_t n);
/// Insert value into shared variant. Also updates Variant discriminators and offsets.
void insertValueIntoSharedVariant(const IColumn & src, const DataTypePtr & type, const String & type_name, size_t n);
const SerializationPtr & getVariantSerialization(const DataTypePtr & variant_type, const String & variant_name) const
{
/// Get serialization for provided data type.
/// To avoid calling type->getDefaultSerialization() every time, we use a simple cache with a maximum size.
/// When the maximum size is reached, just clear the cache.
if (serialization_cache.size() == SERIALIZATION_CACHE_MAX_SIZE)
serialization_cache.clear();
if (auto it = serialization_cache.find(variant_name); it != serialization_cache.end())
return it->second;
return serialization_cache.emplace(variant_name, variant_type->getDefaultSerialization()).first->second;
}
const SerializationPtr & getVariantSerialization(const DataTypePtr & variant_type) const { return getVariantSerialization(variant_type, variant_type->getName()); }
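The cache above trades precision for simplicity: there is no eviction policy, the map is simply wiped once it reaches its maximum size. A standalone sketch of the same idea, assuming a hypothetical and deliberately trivial makeSerialization factory:

#include <cstddef>
#include <string>
#include <unordered_map>

using Serialization = std::string;  // stand-in for a heavyweight serialization object

Serialization makeSerialization(const std::string & type_name)
{
    return "serialization-for-" + type_name;  // pretend this is expensive to construct
}

struct SerializationCache
{
    static constexpr size_t MAX_SIZE = 256;
    std::unordered_map<std::string, Serialization> cache;

    const Serialization & get(const std::string & type_name)
    {
        if (cache.size() == MAX_SIZE)
            cache.clear();  // no eviction policy: just start over
        if (auto it = cache.find(type_name); it != cache.end())
            return it->second;
        return cache.emplace(type_name, makeSerialization(type_name)).first->second;
    }
};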
private:
void createVariantInfo(const DataTypePtr & variant_type);
/// Combine current variant with the other variant and return global discriminators mapping
/// from other variant to the combined one. It's used for inserting from
/// different variants.
@ -358,15 +442,26 @@ private:
void updateVariantInfoAndExpandVariantColumn(const DataTypePtr & new_variant_type);
WrappedPtr variant_column;
/// Store and use pointer to ColumnVariant to avoid virtual calls.
/// ColumnDynamic is widely used inside ColumnObject for each path and
/// with hundreds of paths these virtual calls are noticeable.
ColumnVariant * variant_column_ptr;
/// Store the type of current variant with some additional information.
VariantInfo variant_info;
/// The maximum number of different types that can be stored in this Dynamic column.
/// If exceeded, all new variants will be converted to String.
/// If exceeded, all new variants will be added to a special shared variant with type String
/// in binary format. This limit can be different for different instances of Dynamic column.
/// When max_dynamic_types = 0, we will have only the shared variant and insert all values into it.
size_t max_dynamic_types;
/// The types limit specified by the user in the data type Dynamic(max_types=N).
/// max_dynamic_types in all column instances of this Dynamic type can only be smaller
/// (for example, max_dynamic_types can be reduced in takeDynamicStructureFromSourceColumns
/// before merge of different Dynamic columns).
size_t global_max_dynamic_types;
/// Size statistics of each variant from the MergeTree data part.
/// Used in takeDynamicStructureFromSourceColumns and set during deserialization.
Statistics statistics;
StatisticsPtr statistics;
/// Cache (Variant name) -> (global discriminators mapping from this variant to current variant in Dynamic column).
/// Used to avoid mappings recalculation in combineVariants for the same Variant types.
@ -374,6 +469,17 @@ private:
/// Cache of Variant types that couldn't be combined with current variant in Dynamic column.
/// Used to avoid checking if combination is possible for the same Variant types.
std::unordered_set<String> variants_with_failed_combination;
/// We can use serializations of different data types to serialize values into the shared variant.
/// To avoid creating the same serialization multiple times, use a simple cache.
static const size_t SERIALIZATION_CACHE_MAX_SIZE = 256;
mutable std::unordered_map<String, SerializationPtr> serialization_cache;
};
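The arithmetic behind canAddNewVariants and the relationship between the two limits, as a tiny worked example under assumed names: the shared variant occupies one slot of the underlying Variant column but is excluded from the user-visible limit, and the per-instance limit may only be lowered relative to the declared one.

#include <cassert>
#include <cstddef>

struct DynamicLimits
{
    size_t global_max_dynamic_types;  // declared in the type: Dynamic(max_types=N)
    size_t max_dynamic_types;         // effective limit for this column instance

    // current_variants_count includes the always-present shared variant,
    // hence the "- 1" when comparing against the limit.
    bool canAddNewVariants(size_t current_variants_count, size_t new_variants_count) const
    {
        return current_variants_count + new_variants_count - 1 <= max_dynamic_types;
    }
};

int main()
{
    DynamicLimits limits{/*global*/ 4, /*effective*/ 4};
    // Variants currently present: Int64, String and the shared variant -> 3 in total.
    assert(limits.canAddNewVariants(3, 1));   // 3 + 1 - 1 = 3 <= 4
    assert(!limits.canAddNewVariants(3, 3));  // 3 + 3 - 1 = 5 >  4
    // The effective limit may only be reduced, never raised above the declared one.
    limits.max_dynamic_types = 2;
    assert(limits.max_dynamic_types <= limits.global_max_dynamic_types);
    return 0;
}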
void extendVariantColumn(
IColumn & variant_column,
const DataTypePtr & old_variant_type,
const DataTypePtr & new_variant_type,
std::unordered_map<String, UInt8> old_variant_name_to_discriminator);
}

View File

@ -182,6 +182,11 @@ public:
chars.reserve_exact(n * size);
}
size_t capacity() const override
{
return chars.capacity() / n;
}
void shrinkToFit() override
{
chars.shrink_to_fit();

View File

@ -46,8 +46,8 @@ public:
return Base::create(std::move(column_unique), std::move(indexes), is_shared);
}
std::string getName() const override { return "ColumnLowCardinality"; }
const char * getFamilyName() const override { return "ColumnLowCardinality"; }
std::string getName() const override { return "LowCardinality(" + getDictionary().getNestedColumn()->getName() + ")"; }
const char * getFamilyName() const override { return "LowCardinality"; }
TypeIndex getDataType() const override { return TypeIndex::LowCardinality; }
ColumnPtr convertToFullColumn() const { return getDictionary().getNestedColumn()->index(getIndexes(), 0); }
@ -172,6 +172,7 @@ public:
}
void reserve(size_t n) override { idx.reserve(n); }
size_t capacity() const override { return idx.capacity(); }
void shrinkToFit() override { idx.shrinkToFit(); }
/// Don't count the dictionary size as it can be shared between different blocks.
@ -309,6 +310,7 @@ public:
void popBack(size_t n) { positions->popBack(n); }
void reserve(size_t n) { positions->reserve(n); }
size_t capacity() const { return positions->capacity(); }
void shrinkToFit() { positions->shrinkToFit(); }
UInt64 getMaxPositionForCurrentType() const;

View File

@ -249,6 +249,11 @@ void ColumnMap::reserve(size_t n)
nested->reserve(n);
}
size_t ColumnMap::capacity() const
{
return nested->capacity();
}
void ColumnMap::prepareForSquashing(const Columns & source_columns)
{
Columns nested_source_columns;

View File

@ -94,6 +94,7 @@ public:
void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override;
void ensureOwnership() override;

View File

@ -706,6 +706,11 @@ void ColumnNullable::reserve(size_t n)
getNullMapData().reserve(n);
}
size_t ColumnNullable::capacity() const
{
return getNullMapData().capacity();
}
void ColumnNullable::prepareForSquashing(const Columns & source_columns)
{
size_t new_size = size();

View File

@ -125,6 +125,7 @@ public:
size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override;
size_t estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override;
void ensureOwnership() override;

File diff suppressed because it is too large

View File

@ -1,216 +1,117 @@
#pragma once
#include <Columns/IColumn.h>
#include <Core/Field.h>
#include <Core/Names.h>
#include <DataTypes/Serializations/SubcolumnsTree.h>
#include <Common/PODArray.h>
#include <Common/WeakHash.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnString.h>
#include <DataTypes/IDataType.h>
#include <DataTypes/Serializations/SerializationDynamic.h>
#include <Formats/FormatSettings.h>
#include <Common/StringHashForHeterogeneousLookup.h>
#include <Common/WeakHash.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Info that represents a scalar or array field in a decomposed view.
/// It allows recreating the field with a different number
/// of dimensions or nullability.
struct FieldInfo
{
/// The common type of all scalars in the field.
DataTypePtr scalar_type;
/// Do we have a NULL scalar in the field.
bool have_nulls;
/// If true then we have scalars with different types in the array and
/// we need to convert scalars to the common type.
bool need_convert;
/// Number of dimensions in the array. 0 if the field is a scalar.
size_t num_dimensions;
/// If true then this field is an array of fields with different numbers of dimensions
/// and we need to normalize the dimensions.
bool need_fold_dimension;
};
FieldInfo getFieldInfo(const Field & field);
/** A column that represents object with dynamic set of subcolumns.
* Subcolumns are identified by paths in document and are stored in
* a trie-like structure. ColumnObject is not suitable for writing into tables
* and it should be converted to Tuple with fixed set of subcolumns before that.
*/
class ColumnObject final : public COWHelper<IColumnHelper<ColumnObject>, ColumnObject>
{
public:
/** Class that represents one subcolumn.
* It stores values in several parts of the column
* and keeps the current common type of all parts.
* We add a new column part with a new type when we insert a field
* that can't be converted to the current common type.
* After insertion of all values the subcolumn should be finalized
* for writing and other operations.
*/
class Subcolumn
struct Statistics
{
public:
Subcolumn() = default;
Subcolumn(size_t size_, bool is_nullable_);
Subcolumn(MutableColumnPtr && data_, bool is_nullable_);
size_t size() const;
size_t byteSize() const;
size_t allocatedBytes() const;
void get(size_t n, Field & res) const;
bool isFinalized() const;
const DataTypePtr & getLeastCommonType() const { return least_common_type.get(); }
const DataTypePtr & getLeastCommonTypeBase() const { return least_common_type.getBase(); }
size_t getNumberOfDimensions() const { return least_common_type.getNumberOfDimensions(); }
/// Checks the consistency of column's parts stored in @data.
void checkTypes() const;
/// Inserts a field whose scalars can be arbitrary, but the number of
/// dimensions should be consistent with the current common type.
void insert(Field field);
void insert(Field field, FieldInfo info);
void insertDefault();
void insertManyDefaults(size_t length);
void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
void popBack(size_t n);
Subcolumn cut(size_t start, size_t length) const;
/// Converts all column's parts to the common type and
/// creates a single column that stores all values.
void finalize();
/// Returns last inserted field.
Field getLastField() const;
FieldInfo getFieldInfo() const;
/// Recreates subcolumn with default scalar values and keeps sizes of arrays.
/// Used to create columns of type Nested with consistent array sizes.
Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;
/// Returns a single column if the subcolumn is finalized.
/// Otherwise -- undefined behaviour.
IColumn & getFinalizedColumn();
const IColumn & getFinalizedColumn() const;
const ColumnPtr & getFinalizedColumnPtr() const;
const std::vector<WrappedPtr> & getData() const { return data; }
size_t getNumberOfDefaultsInPrefix() const { return num_of_defaults_in_prefix; }
friend class ColumnObject;
private:
class LeastCommonType
enum class Source
{
public:
LeastCommonType();
explicit LeastCommonType(DataTypePtr type_);
const DataTypePtr & get() const { return type; }
const DataTypePtr & getBase() const { return base_type; }
size_t getNumberOfDimensions() const { return num_dimensions; }
private:
DataTypePtr type;
DataTypePtr base_type;
size_t num_dimensions = 0;
READ, /// Statistics were loaded into column during reading from MergeTree.
MERGE, /// Statistics were calculated during merge of several MergeTree parts.
};
void addNewColumnPart(DataTypePtr type);
explicit Statistics(Source source_) : source(source_) {}
/// Current least common type of all values inserted to this subcolumn.
LeastCommonType least_common_type;
/// If true then the common type of the subcolumn is Nullable
/// and default values are NULLs.
bool is_nullable = false;
/// Parts of column. Parts should be in increasing order in terms of subtypes/supertypes.
/// That means that the least common type for i-th prefix is the type of i-th part
/// and it's the supertype for the types of all parts from 0 to i-1.
std::vector<WrappedPtr> data;
/// Until we insert any non-default field we don't know the final
/// least common type, so we count the number of defaults in the prefix,
/// which will be converted to the default values of the final common type.
size_t num_of_defaults_in_prefix = 0;
size_t num_rows = 0;
/// Source of the statistics.
Source source;
/// Statistics for dynamic paths: (path) -> (total number of not-null values).
std::unordered_map<String, size_t> dynamic_paths_statistics;
/// Statistics for paths in shared data: (path) -> (total number of not-null values).
/// We don't store statistics for all paths in shared data but only for some subset of them
/// (is 10000 a good limit? It should not be expensive to store 10000 paths per part)
static const size_t MAX_SHARED_DATA_STATISTICS_SIZE = 10000;
std::unordered_map<String, size_t, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal> shared_data_paths_statistics;
};
using Subcolumns = SubcolumnsTree<Subcolumn>;
using StatisticsPtr = std::shared_ptr<const Statistics>;
private:
/// If true then all subcolumns are nullable.
const bool is_nullable;
friend class COWHelper<IColumnHelper<ColumnObject>, ColumnObject>;
Subcolumns subcolumns;
size_t num_rows;
ColumnObject(std::unordered_map<String, MutableColumnPtr> typed_paths_, size_t max_dynamic_paths_, size_t max_dynamic_types_);
ColumnObject(
std::unordered_map<String, MutableColumnPtr> typed_paths_,
std::unordered_map<String, MutableColumnPtr> dynamic_paths_,
MutableColumnPtr shared_data_,
size_t max_dynamic_paths_,
size_t global_max_dynamic_paths_,
size_t max_dynamic_types_,
const StatisticsPtr & statistics_ = {});
/// Use StringHashForHeterogeneousLookup hash for hash maps to be able to use std::string_view in find() method.
using PathToColumnMap = std::unordered_map<String, WrappedPtr, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal>;
using PathToDynamicColumnPtrMap = std::unordered_map<String, ColumnDynamic *, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal>;
public:
static constexpr auto COLUMN_NAME_DUMMY = "_dummy";
/** Create immutable column using immutable arguments. This arguments may be shared with other columns.
* Use mutate in order to make mutable column and mutate shared nested columns.
*/
using Base = COWHelper<IColumnHelper<ColumnObject>, ColumnObject>;
explicit ColumnObject(bool is_nullable_);
ColumnObject(Subcolumns && subcolumns_, bool is_nullable_);
static Ptr create(
const std::unordered_map<String, ColumnPtr> & typed_paths_,
const std::unordered_map<String, ColumnPtr> & dynamic_paths_,
const ColumnPtr & shared_data_,
size_t max_dynamic_paths_,
size_t global_max_dynamic_paths_,
size_t max_dynamic_types_,
const StatisticsPtr & statistics_ = {});
/// Checks that all subcolumns have consistent sizes.
void checkConsistency() const;
static MutablePtr create(
std::unordered_map<String, MutableColumnPtr> typed_paths_,
std::unordered_map<String, MutableColumnPtr> dynamic_paths_,
MutableColumnPtr shared_data_,
size_t max_dynamic_paths_,
size_t global_max_dynamic_paths_,
size_t max_dynamic_types_,
const StatisticsPtr & statistics_ = {});
bool hasSubcolumn(const PathInData & key) const;
static MutablePtr create(std::unordered_map<String, MutableColumnPtr> typed_paths_, size_t max_dynamic_paths_, size_t max_dynamic_types_);
const Subcolumn & getSubcolumn(const PathInData & key) const;
Subcolumn & getSubcolumn(const PathInData & key);
std::string getName() const override;
void incrementNumRows() { ++num_rows; }
const char * getFamilyName() const override
{
return "Object";
}
/// Adds a subcolumn from existing IColumn.
void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn);
TypeIndex getDataType() const override
{
return TypeIndex::Object;
}
/// Adds a subcolumn of specific size with default values.
void addSubcolumn(const PathInData & key, size_t new_size);
MutableColumnPtr cloneEmpty() const override;
MutableColumnPtr cloneResized(size_t size) const override;
/// Adds a subcolumn of type Nested of specific size with default values.
/// It cares about consistency of sizes of Nested arrays.
void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size);
size_t size() const override
{
return shared_data->size();
}
/// Finds a subcolumn from the same Nested type as @entry and inserts
/// an array with default values with consistent sizes as in Nested type.
bool tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const;
bool tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
const Subcolumns & getSubcolumns() const { return subcolumns; }
Subcolumns & getSubcolumns() { return subcolumns; }
PathsInData getKeys() const;
/// Part of interface
const char * getFamilyName() const override { return "Object"; }
TypeIndex getDataType() const override { return TypeIndex::Object; }
size_t size() const override;
size_t byteSize() const override;
size_t allocatedBytes() const override;
void forEachSubcolumn(MutableColumnCallback callback) override;
void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
void insert(const Field & field) override;
bool tryInsert(const Field & field) override;
void insertDefault() override;
bool isDefaultAt(size_t n) const override;
StringRef getDataAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
void insert(const Field & x) override;
bool tryInsert(const Field & x) override;
#if !defined(DEBUG_OR_SANITIZER_BUILD)
void insertFrom(const IColumn & src, size_t n) override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
@ -218,24 +119,31 @@ public:
void doInsertFrom(const IColumn & src, size_t n) override;
void doInsertRangeFrom(const IColumn & src, size_t start, size_t length) override;
#endif
/// TODO: implement more optimal insertManyFrom
void insertDefault() override;
void insertManyDefaults(size_t length) override;
void popBack(size_t length) override;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
void popBack(size_t n) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr filter(const Filter & filter, ssize_t result_size_hint) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
ColumnPtr replicate(const Offsets & offsets) const override;
MutableColumnPtr cloneResized(size_t new_size) const override;
ColumnPtr replicate(const Offsets & replicate_offsets) const override;
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;
/// Finalizes all subcolumns.
void finalize() override;
bool isFinalized() const override;
/// Order of rows in ColumnObject is undefined.
void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation & res) const override;
void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &) const override;
void updatePermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &, EqualRanges &) const override {}
/// Values of ColumnObject are not comparable.
#if !defined(DEBUG_OR_SANITIZER_BUILD)
int compareAt(size_t, size_t, const IColumn &, int) const override { return 0; }
#else
@ -243,35 +151,118 @@ public:
#endif
void getExtremes(Field & min, Field & max) const override;
/// All other methods throw an exception.
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const std::vector<ColumnPtr> & source_columns) override;
void ensureOwnership() override;
size_t byteSize() const override;
size_t byteSizeAt(size_t n) const override;
size_t allocatedBytes() const override;
void protect() override;
StringRef getDataAt(size_t) const override { throwMustBeConcrete(); }
bool isDefaultAt(size_t) const override { throwMustBeConcrete(); }
void insertData(const char *, size_t) override { throwMustBeConcrete(); }
StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeConcrete(); }
char * serializeValueIntoMemory(size_t, char *) const override { throwMustBeConcrete(); }
const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); }
const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
WeakHash32 getWeakHash32() const override { throwMustBeConcrete(); }
void updateHashFast(SipHash & hash) const override;
void expand(const Filter &, bool) override { throwMustBeConcrete(); }
bool hasEqualValues() const override { throwMustBeConcrete(); }
size_t byteSizeAt(size_t) const override { throwMustBeConcrete(); }
double getRatioOfDefaultRows(double) const override { throwMustBeConcrete(); }
UInt64 getNumberOfDefaultRows() const override { throwMustBeConcrete(); }
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeConcrete(); }
void forEachSubcolumn(MutableColumnCallback callback) override;
private:
[[noreturn]] static void throwMustBeConcrete()
void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
bool structureEquals(const IColumn & rhs) const override;
ColumnPtr compress() const override;
void finalize() override;
bool isFinalized() const override;
bool hasDynamicStructure() const override { return true; }
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
const PathToColumnMap & getTypedPaths() const { return typed_paths; }
PathToColumnMap & getTypedPaths() { return typed_paths; }
const PathToColumnMap & getDynamicPaths() const { return dynamic_paths; }
PathToColumnMap & getDynamicPaths() { return dynamic_paths; }
const PathToDynamicColumnPtrMap & getDynamicPathsPtrs() const { return dynamic_paths_ptrs; }
PathToDynamicColumnPtrMap & getDynamicPathsPtrs() { return dynamic_paths_ptrs; }
const StatisticsPtr & getStatistics() const { return statistics; }
const ColumnPtr & getSharedDataPtr() const { return shared_data; }
ColumnPtr & getSharedDataPtr() { return shared_data; }
IColumn & getSharedDataColumn() { return *shared_data; }
const ColumnArray & getSharedDataNestedColumn() const { return assert_cast<const ColumnArray &>(*shared_data); }
ColumnArray & getSharedDataNestedColumn() { return assert_cast<ColumnArray &>(*shared_data); }
ColumnArray::Offsets & getSharedDataOffsets() { return assert_cast<ColumnArray &>(*shared_data).getOffsets(); }
const ColumnArray::Offsets & getSharedDataOffsets() const { return assert_cast<const ColumnArray &>(*shared_data).getOffsets(); }
std::pair<ColumnString *, ColumnString *> getSharedDataPathsAndValues()
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ColumnObject must be converted to ColumnTuple before use");
auto & column_array = assert_cast<ColumnArray &>(*shared_data);
auto & column_tuple = assert_cast<ColumnTuple &>(column_array.getData());
return {assert_cast<ColumnString *>(&column_tuple.getColumn(0)), assert_cast<ColumnString *>(&column_tuple.getColumn(1))};
}
template <typename Func>
MutableColumnPtr applyForSubcolumns(Func && func) const;
std::pair<const ColumnString *, const ColumnString *> getSharedDataPathsAndValues() const
{
const auto & column_array = assert_cast<const ColumnArray &>(*shared_data);
const auto & column_tuple = assert_cast<const ColumnTuple &>(column_array.getData());
return {assert_cast<const ColumnString *>(&column_tuple.getColumn(0)), assert_cast<const ColumnString *>(&column_tuple.getColumn(1))};
}
/// It's used to get the shared size of Nested to insert correct default values.
const Subcolumns::Node * getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const;
size_t getMaxDynamicTypes() const { return max_dynamic_types; }
size_t getMaxDynamicPaths() const { return max_dynamic_paths; }
size_t getGlobalMaxDynamicPaths() const { return global_max_dynamic_paths; }
/// Try to add new dynamic path. Returns pointer to the new dynamic
/// path column or nullptr if limit on dynamic paths is reached.
ColumnDynamic * tryToAddNewDynamicPath(std::string_view path);
/// Throws an exception if cannot add.
void addNewDynamicPath(std::string_view path);
void setDynamicPaths(const std::vector<String> & paths);
void setMaxDynamicPaths(size_t max_dynamic_paths_);
void setStatistics(const StatisticsPtr & statistics_) { statistics = statistics_; }
void serializePathAndValueIntoSharedData(ColumnString * shared_data_paths, ColumnString * shared_data_values, std::string_view path, const IColumn & column, size_t n);
void deserializeValueFromSharedData(const ColumnString * shared_data_values, size_t n, IColumn & column) const;
/// Paths in shared data are sorted in each row. Use this method to find the lower bound for a specific path in the row.
static size_t findPathLowerBoundInSharedData(StringRef path, const ColumnString & shared_data_paths, size_t start, size_t end);
/// Insert all the data from shared data with specified path to dynamic column.
static void fillPathColumnFromSharedData(IColumn & path_column, StringRef path, const ColumnPtr & shared_data_column, size_t start, size_t end);
private:
void insertFromSharedDataAndFillRemainingDynamicPaths(const ColumnObject & src_object_column, std::vector<std::string_view> && src_dynamic_paths_for_shared_data, size_t start, size_t length);
void serializePathAndValueIntoArena(Arena & arena, const char *& begin, StringRef path, StringRef value, StringRef & res) const;
/// Map path -> column for paths with explicitly specified types.
/// This set of paths is constant and cannot be changed.
PathToColumnMap typed_paths;
/// Map path -> column for dynamically added paths. All columns
/// here are Dynamic columns. This set of paths can be extended
/// during inserts into the column.
PathToColumnMap dynamic_paths;
/// Store and use pointers to ColumnDynamic to avoid virtual calls.
/// With hundreds of dynamic paths these virtual calls are noticeable.
PathToDynamicColumnPtrMap dynamic_paths_ptrs;
/// Shared storage for all other paths and values. It's filled
/// when the number of dynamic paths reaches the limit.
/// It has type Array(Tuple(String, String)) and stores
/// an array of pairs (path, binary serialized dynamic value) for each row.
WrappedPtr shared_data;
/// Maximum number of dynamic paths. If this limit is reached, all new paths will be inserted into shared data.
/// This limit can be different for different instances of Object column. For example, we can decrease it
/// in takeDynamicStructureFromSourceColumns before merge.
size_t max_dynamic_paths;
/// Global limit on number of dynamic paths for all column instances of this Object type. It's the limit specified
/// in the type definition (for example 'JSON(max_dynamic_paths=N)'). max_dynamic_paths is always not greater than this limit.
size_t global_max_dynamic_paths;
/// Maximum number of dynamic types for each dynamic path. Used while creating Dynamic columns for new dynamic paths.
size_t max_dynamic_types;
/// Statistics on the number of non-null values for each dynamic path and for some shared data paths in the MergeTree data part.
/// Calculated during serializing of data part in MergeTree. Used to determine the set of dynamic paths for the merged part.
StatisticsPtr statistics;
};
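Putting the pieces of the class above together: the column keeps up to max_dynamic_paths dedicated Dynamic subcolumns, and paths beyond that budget are stored per row as a sorted list of (path, serialized value) pairs, which is what lets findPathLowerBoundInSharedData binary-search within a row. A much simplified standalone sketch with standard containers instead of ClickHouse columns (all names here are hypothetical):

#include <algorithm>
#include <cstddef>
#include <map>
#include <optional>
#include <string>
#include <utility>
#include <vector>

// Simplified model: a fixed budget of dedicated per-path columns; paths over the
// budget go, per row, into a sorted list of (path, serialized value) pairs.
struct ObjectSketch
{
    size_t max_dynamic_paths;
    std::map<std::string, std::vector<std::string>> dynamic_paths;               // dedicated columns
    std::vector<std::vector<std::pair<std::string, std::string>>> shared_rows;   // shared data, one list per row

    explicit ObjectSketch(size_t max_dynamic_paths_) : max_dynamic_paths(max_dynamic_paths_) {}

    void insertRow(std::vector<std::pair<std::string, std::string>> row)
    {
        std::sort(row.begin(), row.end());  // shared data keeps paths sorted within a row
        std::vector<std::pair<std::string, std::string>> overflow;
        for (auto & [path, value] : row)
        {
            auto it = dynamic_paths.find(path);
            if (it == dynamic_paths.end() && dynamic_paths.size() < max_dynamic_paths)
                it = dynamic_paths.emplace(path, std::vector<std::string>{}).first;
            if (it != dynamic_paths.end())
                it->second.push_back(value);
            else
                overflow.emplace_back(path, value);  // limit reached: spill into shared data
        }
        shared_rows.push_back(std::move(overflow));
    }

    // Binary search mirrors findPathLowerBoundInSharedData: paths are sorted within a row.
    std::optional<std::string> findInSharedData(size_t row, const std::string & path) const
    {
        const auto & pairs = shared_rows[row];
        auto it = std::lower_bound(pairs.begin(), pairs.end(), path,
            [](const auto & pair, const std::string & key) { return pair.first < key; });
        if (it != pairs.end() && it->first == path)
            return it->second;  // serialized value for this path in this row
        return std::nullopt;
    }
};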
}

File diff suppressed because it is too large

View File

@ -0,0 +1,275 @@
#pragma once
#include <Columns/IColumn.h>
#include <Core/Field.h>
#include <Core/Names.h>
#include <DataTypes/Serializations/SubcolumnsTree.h>
#include <Common/PODArray.h>
#include <Common/WeakHash.h>
#include <DataTypes/IDataType.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Info that represents a scalar or array field in a decomposed view.
/// It allows recreating the field with a different number
/// of dimensions or nullability.
struct FieldInfo
{
/// The common type of all scalars in the field.
DataTypePtr scalar_type;
/// Do we have a NULL scalar in the field.
bool have_nulls;
/// If true then we have scalars with different types in the array and
/// we need to convert scalars to the common type.
bool need_convert;
/// Number of dimensions in the array. 0 if the field is a scalar.
size_t num_dimensions;
/// If true then this field is an array of fields with different numbers of dimensions
/// and we need to normalize the dimensions.
bool need_fold_dimension;
};
FieldInfo getFieldInfo(const Field & field);
/** A column that represents object with dynamic set of subcolumns.
* Subcolumns are identified by paths in document and are stored in
* a trie-like structure. ColumnObjectDeprecated is not suitable for writing into tables
* and it should be converted to Tuple with fixed set of subcolumns before that.
*/
class ColumnObjectDeprecated final : public COWHelper<IColumnHelper<ColumnObjectDeprecated>, ColumnObjectDeprecated>
{
public:
/** Class that represents one subcolumn.
* It stores values in several parts of the column
* and keeps the current common type of all parts.
* We add a new column part with a new type when we insert a field
* that can't be converted to the current common type.
* After insertion of all values the subcolumn should be finalized
* for writing and other operations.
*/
class Subcolumn
{
public:
Subcolumn() = default;
Subcolumn(size_t size_, bool is_nullable_);
Subcolumn(MutableColumnPtr && data_, bool is_nullable_);
size_t size() const;
size_t byteSize() const;
size_t allocatedBytes() const;
void get(size_t n, Field & res) const;
bool isFinalized() const;
const DataTypePtr & getLeastCommonType() const { return least_common_type.get(); }
const DataTypePtr & getLeastCommonTypeBase() const { return least_common_type.getBase(); }
size_t getNumberOfDimensions() const { return least_common_type.getNumberOfDimensions(); }
/// Checks the consistency of column's parts stored in @data.
void checkTypes() const;
/// Inserts a field whose scalars can be arbitrary, but the number of
/// dimensions should be consistent with the current common type.
void insert(Field field);
void insert(Field field, FieldInfo info);
void insertDefault();
void insertManyDefaults(size_t length);
void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
void popBack(size_t n);
Subcolumn cut(size_t start, size_t length) const;
/// Converts all column's parts to the common type and
/// creates a single column that stores all values.
void finalize();
/// Returns last inserted field.
Field getLastField() const;
FieldInfo getFieldInfo() const;
/// Recreates subcolumn with default scalar values and keeps sizes of arrays.
/// Used to create columns of type Nested with consistent array sizes.
Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;
/// Returns a single column if the subcolumn is finalized.
/// Otherwise -- undefined behaviour.
IColumn & getFinalizedColumn();
const IColumn & getFinalizedColumn() const;
const ColumnPtr & getFinalizedColumnPtr() const;
const std::vector<WrappedPtr> & getData() const { return data; }
size_t getNumberOfDefaultsInPrefix() const { return num_of_defaults_in_prefix; }
friend class ColumnObjectDeprecated;
private:
class LeastCommonType
{
public:
LeastCommonType();
explicit LeastCommonType(DataTypePtr type_);
const DataTypePtr & get() const { return type; }
const DataTypePtr & getBase() const { return base_type; }
size_t getNumberOfDimensions() const { return num_dimensions; }
private:
DataTypePtr type;
DataTypePtr base_type;
size_t num_dimensions = 0;
};
void addNewColumnPart(DataTypePtr type);
/// Current least common type of all values inserted to this subcolumn.
LeastCommonType least_common_type;
/// If true then the common type of the subcolumn is Nullable
/// and default values are NULLs.
bool is_nullable = false;
/// Parts of column. Parts should be in increasing order in terms of subtypes/supertypes.
/// That means that the least common type for i-th prefix is the type of i-th part
/// and it's the supertype for the types of all parts from 0 to i-1.
std::vector<WrappedPtr> data;
/// Until we insert any non-default field we don't know the final
/// least common type, so we count the number of defaults in the prefix,
/// which will be converted to the default values of the final common type.
size_t num_of_defaults_in_prefix = 0;
size_t num_rows = 0;
};
using Subcolumns = SubcolumnsTree<Subcolumn>;
private:
/// If true then all subcolumns are nullable.
const bool is_nullable;
Subcolumns subcolumns;
size_t num_rows;
public:
static constexpr auto COLUMN_NAME_DUMMY = "_dummy";
explicit ColumnObjectDeprecated(bool is_nullable_);
ColumnObjectDeprecated(Subcolumns && subcolumns_, bool is_nullable_);
/// Checks that all subcolumns have consistent sizes.
void checkConsistency() const;
bool hasSubcolumn(const PathInData & key) const;
const Subcolumn & getSubcolumn(const PathInData & key) const;
Subcolumn & getSubcolumn(const PathInData & key);
void incrementNumRows() { ++num_rows; }
/// Adds a subcolumn from existing IColumn.
void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn);
/// Adds a subcolumn of specific size with default values.
void addSubcolumn(const PathInData & key, size_t new_size);
/// Adds a subcolumn of type Nested of specific size with default values.
/// It cares about consistency of sizes of Nested arrays.
void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size);
/// Finds a subcolumn from the same Nested type as @entry and inserts
/// an array with default values with consistent sizes as in Nested type.
bool tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const;
bool tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const;
const Subcolumns & getSubcolumns() const { return subcolumns; }
Subcolumns & getSubcolumns() { return subcolumns; }
PathsInData getKeys() const;
/// Part of interface
const char * getFamilyName() const override { return "Object"; }
TypeIndex getDataType() const override { return TypeIndex::ObjectDeprecated; }
size_t size() const override;
size_t byteSize() const override;
size_t allocatedBytes() const override;
void forEachSubcolumn(MutableColumnCallback callback) override;
void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
void insert(const Field & field) override;
bool tryInsert(const Field & field) override;
void insertDefault() override;
#if !defined(DEBUG_OR_SANITIZER_BUILD)
void insertFrom(const IColumn & src, size_t n) override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
#else
void doInsertFrom(const IColumn & src, size_t n) override;
void doInsertRangeFrom(const IColumn & src, size_t start, size_t length) override;
#endif
void popBack(size_t length) override;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr filter(const Filter & filter, ssize_t result_size_hint) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
ColumnPtr replicate(const Offsets & offsets) const override;
MutableColumnPtr cloneResized(size_t new_size) const override;
/// Finalizes all subcolumns.
void finalize() override;
bool isFinalized() const override;
/// Order of rows in ColumnObjectDeprecated is undefined.
void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation & res) const override;
void updatePermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &, EqualRanges &) const override {}
#if !defined(DEBUG_OR_SANITIZER_BUILD)
int compareAt(size_t, size_t, const IColumn &, int) const override { return 0; }
#else
int doCompareAt(size_t, size_t, const IColumn &, int) const override { return 0; }
#endif
void getExtremes(Field & min, Field & max) const override;
/// All other methods throw an exception.
StringRef getDataAt(size_t) const override { throwMustBeConcrete(); }
bool isDefaultAt(size_t) const override { throwMustBeConcrete(); }
void insertData(const char *, size_t) override { throwMustBeConcrete(); }
StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeConcrete(); }
char * serializeValueIntoMemory(size_t, char *) const override { throwMustBeConcrete(); }
const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); }
const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
WeakHash32 getWeakHash32() const override { throwMustBeConcrete(); }
void updateHashFast(SipHash &) const override;
void expand(const Filter &, bool) override { throwMustBeConcrete(); }
bool hasEqualValues() const override { throwMustBeConcrete(); }
size_t byteSizeAt(size_t) const override { throwMustBeConcrete(); }
double getRatioOfDefaultRows(double) const override { throwMustBeConcrete(); }
UInt64 getNumberOfDefaultRows() const override { throwMustBeConcrete(); }
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeConcrete(); }
private:
[[noreturn]] static void throwMustBeConcrete()
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ColumnObjectDeprecated must be converted to ColumnTuple before use");
}
template <typename Func>
MutableColumnPtr applyForSubcolumns(Func && func) const;
/// It's used to get the shared size of Nested to insert correct default values.
const Subcolumns::Node * getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const;
};
}

View File

@ -557,6 +557,11 @@ void ColumnString::reserve(size_t n)
offsets.reserve_exact(n);
}
size_t ColumnString::capacity() const
{
return offsets.capacity();
}
void ColumnString::prepareForSquashing(const Columns & source_columns)
{
size_t new_size = size();

View File

@ -283,6 +283,7 @@ public:
ColumnPtr compress() const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override;

View File

@ -595,6 +595,14 @@ void ColumnTuple::reserve(size_t n)
getColumn(i).reserve(n);
}
size_t ColumnTuple::capacity() const
{
if (columns.empty())
return size();
return getColumn(0).capacity();
}
void ColumnTuple::prepareForSquashing(const Columns & source_columns)
{
const size_t tuple_size = columns.size();

View File

@ -110,6 +110,7 @@ public:
void updatePermutationWithCollation(const Collator & collator, IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges& equal_ranges) const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override;
void ensureOwnership() override;

View File

@ -48,6 +48,8 @@ private:
ColumnUnique(const ColumnUnique & other);
public:
std::string getName() const override { return "Unique(" + getNestedColumn()->getName() + ")"; }
MutableColumnPtr cloneEmpty() const override;
const ColumnPtr & getNestedColumn() const override;

View File

@ -476,7 +476,7 @@ void ColumnVariant::insertFromImpl(const DB::IColumn & src_, size_t n, const std
}
}
void ColumnVariant::insertRangeFromImpl(const DB::IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping)
void ColumnVariant::insertRangeFromImpl(const DB::IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping, const Discriminator * skip_discriminator)
{
const size_t num_variants = variants.size();
const auto & src = assert_cast<const ColumnVariant &>(src_);
@ -557,9 +557,12 @@ void ColumnVariant::insertRangeFromImpl(const DB::IColumn & src_, size_t start,
Discriminator global_discr = src_global_discr;
if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR)
global_discr = (*global_discriminators_mapping)[src_global_discr];
Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
if (nested_length)
variants[local_discr]->insertRangeFrom(*src.variants[src_local_discr], nested_start, nested_length);
if (!skip_discriminator || global_discr != *skip_discriminator)
{
Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
if (nested_length)
variants[local_discr]->insertRangeFrom(*src.variants[src_local_discr], nested_start, nested_length);
}
}
}
@ -610,7 +613,7 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l
void ColumnVariant::doInsertRangeFrom(const IColumn & src_, size_t start, size_t length)
#endif
{
insertRangeFromImpl(src_, start, length, nullptr);
insertRangeFromImpl(src_, start, length, nullptr, nullptr);
}
#if !defined(DEBUG_OR_SANITIZER_BUILD)
@ -627,9 +630,9 @@ void ColumnVariant::insertFrom(const DB::IColumn & src_, size_t n, const std::ve
insertFromImpl(src_, n, &global_discriminators_mapping);
}
void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping)
void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping, Discriminator skip_discriminator)
{
insertRangeFromImpl(src_, start, length, &global_discriminators_mapping);
insertRangeFromImpl(src_, start, length, &global_discriminators_mapping, &skip_discriminator);
}
void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping)
@ -673,6 +676,14 @@ void ColumnVariant::insertManyIntoVariantFrom(DB::ColumnVariant::Discriminator g
variants[local_discr]->insertManyFrom(src_, position, length);
}
void ColumnVariant::deserializeBinaryIntoVariant(ColumnVariant::Discriminator global_discr, const SerializationPtr & serialization, ReadBuffer & buf, const FormatSettings & format_settings)
{
auto local_discr = localDiscriminatorByGlobal(global_discr);
serialization->deserializeBinary(*variants[local_discr], buf, format_settings);
getLocalDiscriminators().push_back(local_discr);
getOffsets().push_back(variants[local_discr]->size() - 1);
}
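A hypothetical usage sketch for deserializeBinaryIntoVariant() (the names shared_variant, n and global_discr are illustrative, not taken from this change): a value kept in binary form, for example in a Dynamic column's shared variant, is deserialized straight into the matching variant, and the method appends the corresponding local discriminator and offset for the new row.

    StringRef value = shared_variant.getDataAt(n);   /// binary-encoded value
    ReadBufferFromMemory buf(value.data, value.size);
    variant_column.deserializeBinaryIntoVariant(global_discr, serialization, buf, FormatSettings{});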
void ColumnVariant::insertDefault()
{
getLocalDiscriminators().push_back(NULL_DISCRIMINATOR);
@ -1213,9 +1224,7 @@ struct ColumnVariant::ComparatorBase
ALWAYS_INLINE int compare(size_t lhs, size_t rhs) const
{
int res = parent.compareAt(lhs, rhs, parent, nan_direction_hint);
return res;
return parent.compareAt(lhs, rhs, parent, nan_direction_hint);
}
};
@ -1268,6 +1277,11 @@ void ColumnVariant::prepareForSquashing(const Columns & source_columns)
}
}
size_t ColumnVariant::capacity() const
{
return local_discriminators->capacity();
}
void ColumnVariant::ensureOwnership()
{
const size_t num_variants = variants.size();

View File

@ -2,6 +2,8 @@
#include <Columns/IColumn.h>
#include <Columns/ColumnVector.h>
#include <Formats/FormatSettings.h>
#include <DataTypes/Serializations/ISerialization.h>
namespace DB
@ -196,13 +198,15 @@ public:
/// Methods for insertion from another Variant but with known mapping between global discriminators.
void insertFrom(const IColumn & src_, size_t n, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping);
void insertRangeFrom(const IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping);
/// Don't insert data into the variant with the skip_discriminator global discriminator; it will be processed separately.
void insertRangeFrom(const IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping, Discriminator skip_discriminator);
void insertManyFrom(const IColumn & src_, size_t position, size_t length, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping);
/// Methods for insertion into a specific variant.
void insertIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t n);
void insertRangeIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t start, size_t length);
void insertManyIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t position, size_t length);
void deserializeBinaryIntoVariant(Discriminator global_discr, const SerializationPtr & serialization, ReadBuffer & buf, const FormatSettings & format_settings);
void insertDefault() override;
void insertManyDefaults(size_t length) override;
@ -237,6 +241,7 @@ public:
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void ensureOwnership() override;
size_t byteSize() const override;
@ -264,6 +269,7 @@ public:
ColumnPtr & getVariantPtrByGlobalDiscriminator(size_t discr) { return variants[global_to_local_discriminators.at(discr)]; }
const NestedColumns & getVariants() const { return variants; }
NestedColumns & getVariants() { return variants; }
const IColumn & getLocalDiscriminatorsColumn() const { return *local_discriminators; }
IColumn & getLocalDiscriminatorsColumn() { return *local_discriminators; }
@ -303,6 +309,8 @@ public:
return true;
}
std::vector<Discriminator> getLocalToGlobalDiscriminatorsMapping() const { return local_to_global_discriminators; }
/// Check if we have only 1 non-empty variant and no NULL values,
/// and if so, return the discriminator of this non-empty column.
std::optional<Discriminator> getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls() const;
@ -323,7 +331,7 @@ public:
private:
void insertFromImpl(const IColumn & src_, size_t n, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping);
void insertRangeFromImpl(const IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping);
void insertRangeFromImpl(const IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping, const Discriminator * skip_discriminator);
void insertManyFromImpl(const IColumn & src_, size_t position, size_t length, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping);
void initIdentityGlobalToLocalDiscriminatorsMapping();
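A minimal sketch of the intended call pattern for the new skip_discriminator overload (caller and variable names are assumptions, e.g. a Dynamic-like column that keeps one shared variant): all variants except the skipped one are copied in bulk, and the skipped rows are then routed individually, for instance via deserializeBinaryIntoVariant() above.

    /// Bulk-copy every variant except the shared one.
    variant_to.insertRangeFrom(variant_from, start, length,
                               global_discriminators_mapping,
                               /*skip_discriminator=*/ shared_variant_global_discr);
    /// Rows whose global discriminator equals shared_variant_global_discr are
    /// then processed separately (deserialized and inserted one by one).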

View File

@ -180,6 +180,11 @@ public:
data.reserve_exact(n);
}
size_t capacity() const override
{
return data.capacity();
}
void shrinkToFit() override
{
data.shrink_to_fit();

View File

@ -11,12 +11,13 @@
#include <Columns/ColumnLowCardinality.h>
#include <Columns/ColumnMap.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnObjectDeprecated.h>
#include <Columns/ColumnSparse.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnVariant.h>
#include <Columns/ColumnDynamic.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnVector.h>
#include <Core/Field.h>
#include <DataTypes/Serializations/SerializationInfo.h>
@ -466,12 +467,13 @@ template class IColumnHelper<ColumnArray, IColumn>;
template class IColumnHelper<ColumnTuple, IColumn>;
template class IColumnHelper<ColumnMap, IColumn>;
template class IColumnHelper<ColumnSparse, IColumn>;
template class IColumnHelper<ColumnObject, IColumn>;
template class IColumnHelper<ColumnObjectDeprecated, IColumn>;
template class IColumnHelper<ColumnAggregateFunction, IColumn>;
template class IColumnHelper<ColumnFunction, IColumn>;
template class IColumnHelper<ColumnCompressed, IColumn>;
template class IColumnHelper<ColumnVariant, IColumn>;
template class IColumnHelper<ColumnDynamic, IColumn>;
template class IColumnHelper<ColumnObject, IColumn>;
template class IColumnHelper<IColumnDummy, IColumn>;

View File

@ -475,6 +475,9 @@ public:
/// It affects performance only (not correctness).
virtual void reserve(size_t /*n*/) {}
/// Returns the number of elements for which memory has been allocated (e.g. by reserve()).
virtual size_t capacity() const { return size(); }
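A short sketch of how the new capacity() hook and the prepareForSquashing() hook declared below are meant to work together (the helper name squashColumns is illustrative, not part of this change): prepareForSquashing() reserves space for the combined size once, the subsequent inserts append without reallocation, and capacity() lets callers observe how much was reserved.

    MutableColumnPtr squashColumns(MutableColumnPtr result, const Columns & sources)
    {
        result->prepareForSquashing(sources);   /// reserve for size() + sum of source sizes
        for (const auto & source : sources)
            result->insertRangeFrom(*source, 0, source->size());
        /// result->capacity() >= result->size() is expected to hold here.
        return result;
    }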
/// Reserve memory before squashing all specified source columns into this column.
virtual void prepareForSquashing(const std::vector<Ptr> & source_columns)
{

View File

@ -73,7 +73,7 @@ public:
/// Returns the dictionary hash, which is SipHash applied to each row of the nested column.
virtual UInt128 getHash() const = 0;
const char * getFamilyName() const override { return "ColumnUnique"; }
const char * getFamilyName() const override { return "Unique"; }
TypeIndex getDataType() const override { return getNestedColumn()->getDataType(); }
void insert(const Field &) override

View File

@ -10,7 +10,7 @@ TEST(IColumn, dumpStructure)
{
auto type_lc = std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>());
ColumnPtr column_lc = type_lc->createColumn();
String expected_structure = "ColumnLowCardinality(size = 0, UInt8(size = 0), ColumnUnique(size = 1, String(size = 1)))";
String expected_structure = "LowCardinality(size = 0, UInt8(size = 0), Unique(size = 1, String(size = 1)))";
std::vector<std::thread> threads;
for (size_t i = 0; i < 6; ++i)

View File

@ -7,28 +7,34 @@ using namespace DB;
TEST(ColumnDynamic, CreateEmpty)
{
auto column = ColumnDynamic::create(255);
auto column = ColumnDynamic::create(254);
ASSERT_TRUE(column->empty());
ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant()");
ASSERT_TRUE(column->getVariantInfo().variant_names.empty());
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.empty());
ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant(SharedVariant)");
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 1);
ASSERT_EQ(column->getVariantInfo().variant_names[0], "SharedVariant");
ASSERT_EQ(column->getVariantInfo().variant_name_to_discriminator.size(), 1);
ASSERT_EQ(column->getVariantInfo().variant_name_to_discriminator.at("SharedVariant"), 0);
ASSERT_TRUE(column->getVariantColumn().getVariantByGlobalDiscriminator(0).empty());
}
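The updated expectations reflect that a ColumnDynamic now always carries a SharedVariant, so the limit passed to create() counts regular variants only (at most 254, since one of the 255 usable discriminators is taken by SharedVariant). A minimal sketch of the overflow behaviour the later tests assert, under that assumption:

    auto column = ColumnDynamic::create(1);          /// 1 regular variant + SharedVariant
    column->insert(Field(42));                       /// creates the Int8 variant
    column->insert(Field("overflow"));               /// no room left: stored in SharedVariant
    ASSERT_EQ(column->getSharedVariant().size(), 1);
    ASSERT_EQ((*column)[1], Field("overflow"));      /// the value still round-trips unchanged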
TEST(ColumnDynamic, InsertDefault)
{
auto column = ColumnDynamic::create(255);
auto column = ColumnDynamic::create(254);
column->insertDefault();
ASSERT_TRUE(column->size() == 1);
ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant()");
ASSERT_TRUE(column->getVariantInfo().variant_names.empty());
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.empty());
ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant(SharedVariant)");
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 1);
ASSERT_EQ(column->getVariantInfo().variant_names[0], "SharedVariant");
ASSERT_EQ(column->getVariantInfo().variant_name_to_discriminator.size(), 1);
ASSERT_EQ(column->getVariantInfo().variant_name_to_discriminator.at("SharedVariant"), 0);
ASSERT_TRUE(column->getVariantColumn().getVariantByGlobalDiscriminator(0).empty());
ASSERT_TRUE(column->isNullAt(0));
ASSERT_EQ((*column)[0], Field(Null()));
}
TEST(ColumnDynamic, InsertFields)
{
auto column = ColumnDynamic::create(255);
auto column = ColumnDynamic::create(254);
column->insert(Field(42));
column->insert(Field(-42));
column->insert(Field("str1"));
@ -41,16 +47,16 @@ TEST(ColumnDynamic, InsertFields)
column->insert(Field(43.43));
ASSERT_TRUE(column->size() == 10);
ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant(Float64, Int8, String)");
std::vector<String> expected_names = {"Float64", "Int8", "String"};
ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant(Float64, Int8, SharedVariant, String)");
std::vector<String> expected_names = {"Float64", "Int8", "SharedVariant", "String"};
ASSERT_EQ(column->getVariantInfo().variant_names, expected_names);
std::unordered_map<String, UInt8> expected_variant_name_to_discriminator = {{"Float64", 0}, {"Int8", 1}, {"String", 2}};
std::unordered_map<String, UInt8> expected_variant_name_to_discriminator = {{"Float64", 0}, {"Int8", 1}, {"SharedVariant", 2}, {"String", 3}};
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator);
}
ColumnDynamic::MutablePtr getDynamicWithManyVariants(size_t num_variants, Field tuple_element = Field(42))
{
auto column = ColumnDynamic::create(255);
auto column = ColumnDynamic::create(254);
for (size_t i = 0; i != num_variants; ++i)
{
Tuple tuple;
@ -66,61 +72,71 @@ TEST(ColumnDynamic, InsertFieldsOverflow1)
{
auto column = getDynamicWithManyVariants(253);
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 253);
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 254);
column->insert(Field(42.42));
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 254);
ASSERT_EQ(column->size(), 254);
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
column->insert(Field(42));
ASSERT_EQ(column->size(), 255);
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column->getSharedVariant().size(), 1);
Field field = (*column)[column->size() - 1];
ASSERT_EQ(field, "42");
ASSERT_EQ(field, 42);
column->insert(Field(43));
ASSERT_EQ(column->size(), 256);
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column->getSharedVariant().size(), 2);
field = (*column)[column->size() - 1];
ASSERT_EQ(field, "43");
ASSERT_EQ(field, 43);
column->insert(Field("str1"));
ASSERT_EQ(column->size(), 257);
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column->getSharedVariant().size(), 3);
field = (*column)[column->size() - 1];
ASSERT_EQ(field, "str1");
column->insert(Field(Array({Field(42), Field(43)})));
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)"));
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column->getSharedVariant().size(), 4);
field = (*column)[column->size() - 1];
ASSERT_EQ(field, "[42, 43]");
ASSERT_EQ(field, Field(Array({Field(42), Field(43)})));
}
TEST(ColumnDynamic, InsertFieldsOverflow2)
{
auto column = getDynamicWithManyVariants(254);
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 254);
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
column->insert(Field("str1"));
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column->getSharedVariant().size(), 1);
Field field = (*column)[column->size() - 1];
ASSERT_EQ(field, "str1");
column->insert(Field(42));
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
Field field = (*column)[column->size() - 1];
ASSERT_EQ(field, "42");
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column->getSharedVariant().size(), 2);
field = (*column)[column->size() - 1];
ASSERT_EQ(field, 42);
}
ColumnDynamic::MutablePtr getInsertFromColumn(size_t num = 1)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
for (size_t i = 0; i != num; ++i)
{
column_from->insert(Field(42));
@ -154,41 +170,41 @@ void checkInsertFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynami
TEST(ColumnDynamic, InsertFrom1)
{
auto column_to = ColumnDynamic::create(255);
checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
auto column_to = ColumnDynamic::create(254);
checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, SharedVariant, String)", {"Float64", "Int8", "SharedVariant", "String"}, {{"Float64", 0}, {"Int8", 1}, {"SharedVariant", 2}, {"String", 3}});
}
TEST(ColumnDynamic, InsertFrom2)
{
auto column_to = ColumnDynamic::create(255);
auto column_to = ColumnDynamic::create(254);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str"));
checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, SharedVariant, String)", {"Float64", "Int8", "SharedVariant", "String"}, {{"Float64", 0}, {"Int8", 1}, {"SharedVariant", 2}, {"String", 3}});
}
TEST(ColumnDynamic, InsertFrom3)
{
auto column_to = ColumnDynamic::create(255);
auto column_to = ColumnDynamic::create(254);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str"));
column_to->insert(Array({Field(42)}));
checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}});
checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Array(Int8), Float64, Int8, SharedVariant, String)", {"Array(Int8)", "Float64", "Int8", "SharedVariant", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"SharedVariant", 3}, {"String", 4}});
}
TEST(ColumnDynamic, InsertFromOverflow1)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
auto column_to = getDynamicWithManyVariants(253);
column_to->insertFrom(*column_from, 0);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
auto field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42);
@ -196,20 +212,22 @@ TEST(ColumnDynamic, InsertFromOverflow1)
column_to->insertFrom(*column_from, 1);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column_to->getSharedVariant().size(), 1);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "42.42");
ASSERT_EQ(field, 42.42);
column_to->insertFrom(*column_from, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column_to->getSharedVariant().size(), 2);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "str");
}
TEST(ColumnDynamic, InsertFromOverflow2)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
@ -221,9 +239,32 @@ TEST(ColumnDynamic, InsertFromOverflow2)
column_to->insertFrom(*column_from, 1);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column_to->getSharedVariant().size(), 1);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "42.42");
ASSERT_EQ(field, 42.42);
}
TEST(ColumnDynamic, InsertFromOverflow3)
{
auto column_from = ColumnDynamic::create(1);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
auto column_to = ColumnDynamic::create(254);
column_to->insert(Field(41));
column_to->insertFrom(*column_from, 0);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_EQ(column_to->getSharedVariant().size(), 0);
auto field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42);
column_to->insertFrom(*column_from, 1);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_EQ(column_to->getSharedVariant().size(), 1);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42.42);
}
void checkInsertManyFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynamic::MutablePtr & column_to, const std::string & expected_variant, const std::vector<String> & expected_names, const std::unordered_map<String, UInt8> & expected_variant_name_to_discriminator)
@ -256,42 +297,43 @@ void checkInsertManyFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDy
TEST(ColumnDynamic, InsertManyFrom1)
{
auto column_to = ColumnDynamic::create(255);
checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
auto column_to = ColumnDynamic::create(254);
checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, SharedVariant, String)", {"Float64", "Int8", "SharedVariant", "String"}, {{"Float64", 0}, {"Int8", 1}, {"SharedVariant", 2}, {"String", 3}});
}
TEST(ColumnDynamic, InsertManyFrom2)
{
auto column_to = ColumnDynamic::create(255);
auto column_to = ColumnDynamic::create(254);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str"));
checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, SharedVariant, String)", {"Float64", "Int8", "SharedVariant", "String"}, {{"Float64", 0}, {"Int8", 1}, {"SharedVariant", 2}, {"String", 3}});
}
TEST(ColumnDynamic, InsertManyFrom3)
{
auto column_to = ColumnDynamic::create(255);
auto column_to = ColumnDynamic::create(254);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str"));
column_to->insert(Array({Field(42)}));
checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}});
checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Array(Int8), Float64, Int8, SharedVariant, String)", {"Array(Int8)", "Float64", "Int8", "SharedVariant", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"SharedVariant", 3}, {"String", 4}});
}
TEST(ColumnDynamic, InsertManyFromOverflow1)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
auto column_to = getDynamicWithManyVariants(253);
column_to->insertManyFrom(*column_from, 0, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_EQ(column_to->getSharedVariant().size(), 0);
auto field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, 42);
field = (*column_to)[column_to->size() - 1];
@ -300,15 +342,17 @@ TEST(ColumnDynamic, InsertManyFromOverflow1)
column_to->insertManyFrom(*column_from, 1, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column_to->getSharedVariant().size(), 2);
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, "42.42");
ASSERT_EQ(field, 42.42);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "42.42");
ASSERT_EQ(field, 42.42);
column_to->insertManyFrom(*column_from, 2, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column_to->getSharedVariant().size(), 4);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "str");
field = (*column_to)[column_to->size() - 2];
@ -317,14 +361,15 @@ TEST(ColumnDynamic, InsertManyFromOverflow1)
TEST(ColumnDynamic, InsertManyFromOverflow2)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
auto column_to = getDynamicWithManyVariants(253);
column_to->insertManyFrom(*column_from, 0, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_EQ(column_to->getSharedVariant().size(), 0);
auto field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, 42);
field = (*column_to)[column_to->size() - 1];
@ -333,11 +378,39 @@ TEST(ColumnDynamic, InsertManyFromOverflow2)
column_to->insertManyFrom(*column_from, 1, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column_to->getSharedVariant().size(), 2);
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, "42.42");
ASSERT_EQ(field, 42.42);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "42.42");
ASSERT_EQ(field, 42.42);
}
TEST(ColumnDynamic, InsertManyFromOverflow3)
{
auto column_from = ColumnDynamic::create(1);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
auto column_to = ColumnDynamic::create(254);
column_to->insert(Field(41));
column_to->insertManyFrom(*column_from, 0, 2);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_EQ(column_to->getSharedVariant().size(), 0);
auto field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, 42);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42);
column_to->insertManyFrom(*column_from, 1, 2);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_EQ(column_to->getSharedVariant().size(), 2);
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, 42.42);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42.42);
}
void checkInsertRangeFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynamic::MutablePtr & column_to, const std::string & expected_variant, const std::vector<String> & expected_names, const std::unordered_map<String, UInt8> & expected_variant_name_to_discriminator)
@ -368,34 +441,34 @@ void checkInsertRangeFrom(const ColumnDynamic::MutablePtr & column_from, ColumnD
TEST(ColumnDynamic, InsertRangeFrom1)
{
auto column_to = ColumnDynamic::create(255);
checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
auto column_to = ColumnDynamic::create(254);
checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Float64, Int8, SharedVariant, String)", {"Float64", "Int8", "SharedVariant", "String"}, {{"Float64", 0}, {"Int8", 1}, {"SharedVariant", 2}, {"String", 3}});
}
TEST(ColumnDynamic, InsertRangeFrom2)
{
auto column_to = ColumnDynamic::create(255);
auto column_to = ColumnDynamic::create(254);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str1"));
checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Float64, Int8, SharedVariant, String)", {"Float64", "Int8", "SharedVariant", "String"}, {{"Float64", 0}, {"Int8", 1}, {"SharedVariant", 2}, {"String", 3}});
}
TEST(ColumnDynamic, InsertRangeFrom3)
{
auto column_to = ColumnDynamic::create(255);
auto column_to = ColumnDynamic::create(254);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str1"));
column_to->insert(Array({Field(42)}));
checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}});
checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Array(Int8), Float64, Int8, SharedVariant, String)", {"Array(Int8)", "Float64", "Int8", "SharedVariant", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"SharedVariant", 3}, {"String", 4}});
}
TEST(ColumnDynamic, InsertRangeFromOverflow1)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(43));
column_from->insert(Field(42.42));
@ -403,23 +476,25 @@ TEST(ColumnDynamic, InsertRangeFromOverflow1)
auto column_to = getDynamicWithManyVariants(253);
column_to->insertRangeFrom(*column_from, 0, 4);
ASSERT_EQ(column_to->size(), 257);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_EQ(column_to->getSharedVariant().size(), 2);
auto field = (*column_to)[column_to->size() - 4];
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(43));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field("42.42"));
ASSERT_EQ(field, Field(42.42));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("str"));
}
TEST(ColumnDynamic, InsertRangeFromOverflow2)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(43));
column_from->insert(Field(42.42));
@ -428,19 +503,20 @@ TEST(ColumnDynamic, InsertRangeFromOverflow2)
column_to->insertRangeFrom(*column_from, 0, 3);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_EQ(column_to->getSharedVariant().size(), 1);
auto field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field(43));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("42.42"));
ASSERT_EQ(field, Field(42.42));
}
TEST(ColumnDynamic, InsertRangeFromOverflow3)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(43));
column_from->insert(Field(42.42));
@ -449,20 +525,21 @@ TEST(ColumnDynamic, InsertRangeFromOverflow3)
column_to->insert(Field("Str"));
column_to->insertRangeFrom(*column_from, 0, 3);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_EQ(column_to->getSharedVariant().size(), 3);
auto field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field(43));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("42.42"));
ASSERT_EQ(field, Field(42.42));
}
TEST(ColumnDynamic, InsertRangeFromOverflow4)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
@ -471,19 +548,20 @@ TEST(ColumnDynamic, InsertRangeFromOverflow4)
column_to->insertRangeFrom(*column_from, 0, 3);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_EQ(column_to->getSharedVariant().size(), 3);
auto field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field("42"));
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field("42.42"));
ASSERT_EQ(field, Field(42.42));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("str"));
}
TEST(ColumnDynamic, InsertRangeFromOverflow5)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(43));
column_from->insert(Field(42.42));
@ -493,22 +571,23 @@ TEST(ColumnDynamic, InsertRangeFromOverflow5)
column_to->insert(Field("str"));
column_to->insertRangeFrom(*column_from, 0, 4);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_EQ(column_to->getSharedVariant().size(), 3);
auto field = (*column_to)[column_to->size() - 4];
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(43));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field("42.42"));
ASSERT_EQ(field, Field(42.42));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("str"));
}
TEST(ColumnDynamic, InsertRangeFromOverflow6)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(43));
column_from->insert(Field(44));
@ -520,13 +599,14 @@ TEST(ColumnDynamic, InsertRangeFromOverflow6)
auto column_to = getDynamicWithManyVariants(253);
column_to->insertRangeFrom(*column_from, 2, 5);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)"));
ASSERT_EQ(column_to->getSharedVariant().size(), 4);
auto field = (*column_to)[column_to->size() - 5];
ASSERT_EQ(field, Field("44"));
ASSERT_EQ(field, Field(44));
field = (*column_to)[column_to->size() - 4];
ASSERT_EQ(field, Field(42.42));
field = (*column_to)[column_to->size() - 3];
@ -534,12 +614,136 @@ TEST(ColumnDynamic, InsertRangeFromOverflow6)
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field("str"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("[42]"));
ASSERT_EQ(field, Field(Array({Field(42)})));
}
TEST(ColumnDynamic, InsertRangeFromOverflow7)
{
auto column_from = ColumnDynamic::create(2);
column_from->insert(Field(42.42));
column_from->insert(Field("str1"));
column_from->insert(Field(42));
column_from->insert(Field(43.43));
column_from->insert(Field(Array({Field(41)})));
column_from->insert(Field(43));
column_from->insert(Field("str2"));
column_from->insert(Field(Array({Field(42)})));
auto column_to = ColumnDynamic::create(254);
column_to->insert(Field(42));
column_to->insertRangeFrom(*column_from, 0, 8);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 4);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)"));
ASSERT_EQ(column_to->getSharedVariant().size(), 2);
auto field = (*column_to)[column_to->size() - 8];
ASSERT_EQ(field, Field(42.42));
field = (*column_to)[column_to->size() - 7];
ASSERT_EQ(field, Field("str1"));
field = (*column_to)[column_to->size() - 6];
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 5];
ASSERT_EQ(field, Field(43.43));
field = (*column_to)[column_to->size() - 4];
ASSERT_EQ(field, Field(Array({Field(41)})));
field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(43));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field("str2"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field(Array({Field(42)})));
}
TEST(ColumnDynamic, InsertRangeFromOverflow8)
{
auto column_from = ColumnDynamic::create(2);
column_from->insert(Field(42.42));
column_from->insert(Field("str1"));
column_from->insert(Field(42));
column_from->insert(Field(43.43));
column_from->insert(Field(Array({Field(41)})));
column_from->insert(Field(43));
column_from->insert(Field("str2"));
column_from->insert(Field(Array({Field(42)})));
auto column_to = ColumnDynamic::create(2);
column_to->insert(Field(42));
column_from->insert(Field("str1"));
column_to->insertRangeFrom(*column_from, 0, 8);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 3);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)"));
ASSERT_EQ(column_to->getSharedVariant().size(), 4);
auto field = (*column_to)[column_to->size() - 8];
ASSERT_EQ(field, Field(42.42));
field = (*column_to)[column_to->size() - 7];
ASSERT_EQ(field, Field("str1"));
field = (*column_to)[column_to->size() - 6];
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 5];
ASSERT_EQ(field, Field(43.43));
field = (*column_to)[column_to->size() - 4];
ASSERT_EQ(field, Field(Array({Field(41)})));
field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(43));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field("str2"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field(Array({Field(42)})));
}
TEST(ColumnDynamic, InsertRangeFromOverflow9)
{
auto column_from = ColumnDynamic::create(3);
column_from->insert(Field("str1"));
column_from->insert(Field(42.42));
column_from->insert(Field("str2"));
column_from->insert(Field(42));
column_from->insert(Field(43.43));
column_from->insert(Field(Array({Field(41)})));
column_from->insert(Field(43));
column_from->insert(Field("str2"));
column_from->insert(Field(Array({Field(42)})));
auto column_to = ColumnDynamic::create(2);
column_to->insert(Field(42));
column_to->insertRangeFrom(*column_from, 0, 9);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 3);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)"));
ASSERT_EQ(column_to->getSharedVariant().size(), 4);
auto field = (*column_to)[column_to->size() - 9];
ASSERT_EQ(field, Field("str1"));
field = (*column_to)[column_to->size() - 8];
ASSERT_EQ(field, Field(42.42));
field = (*column_to)[column_to->size() - 7];
ASSERT_EQ(field, Field("str2"));
field = (*column_to)[column_to->size() - 6];
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 5];
ASSERT_EQ(field, Field(43.43));
field = (*column_to)[column_to->size() - 4];
ASSERT_EQ(field, Field(Array({Field(41)})));
field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(43));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field("str2"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field(Array({Field(42)})));
}
TEST(ColumnDynamic, SerializeDeserializeFromArena1)
{
auto column = ColumnDynamic::create(255);
auto column = ColumnDynamic::create(254);
column->insert(Field(42));
column->insert(Field(42.42));
column->insert(Field("str"));
@ -564,7 +768,7 @@ TEST(ColumnDynamic, SerializeDeserializeFromArena1)
TEST(ColumnDynamic, SerializeDeserializeFromArena2)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
@ -577,26 +781,26 @@ TEST(ColumnDynamic, SerializeDeserializeFromArena2)
column_from->serializeValueIntoArena(2, arena, pos);
column_from->serializeValueIntoArena(3, arena, pos);
auto column_to = ColumnDynamic::create(255);
auto column_to = ColumnDynamic::create(254);
pos = column_to->deserializeAndInsertFromArena(ref1.data);
pos = column_to->deserializeAndInsertFromArena(pos);
pos = column_to->deserializeAndInsertFromArena(pos);
column_to->deserializeAndInsertFromArena(pos);
ASSERT_EQ((*column_from)[column_from->size() - 4], 42);
ASSERT_EQ((*column_from)[column_from->size() - 3], 42.42);
ASSERT_EQ((*column_from)[column_from->size() - 2], "str");
ASSERT_EQ((*column_from)[column_from->size() - 1], Null());
ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), "Variant(Float64, Int8, String)");
std::vector<String> expected_names = {"Float64", "Int8", "String"};
ASSERT_EQ((*column_to)[column_to->size() - 4], 42);
ASSERT_EQ((*column_to)[column_to->size() - 3], 42.42);
ASSERT_EQ((*column_to)[column_to->size() - 2], "str");
ASSERT_EQ((*column_to)[column_to->size() - 1], Null());
ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), "Variant(Float64, Int8, SharedVariant, String)");
std::vector<String> expected_names = {"Float64", "Int8", "SharedVariant", "String"};
ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names);
std::unordered_map<String, UInt8> expected_variant_name_to_discriminator = {{"Float64", 0}, {"Int8", 1}, {"String", 2}};
std::unordered_map<String, UInt8> expected_variant_name_to_discriminator = {{"Float64", 0}, {"Int8", 1}, {"SharedVariant", 2}, {"String", 3}};
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator);
}
TEST(ColumnDynamic, SerializeDeserializeFromArenaOverflow)
TEST(ColumnDynamic, SerializeDeserializeFromArenaOverflow1)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(254);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
@ -615,18 +819,56 @@ TEST(ColumnDynamic, SerializeDeserializeFromArenaOverflow)
pos = column_to->deserializeAndInsertFromArena(pos);
column_to->deserializeAndInsertFromArena(pos);
ASSERT_EQ((*column_from)[column_from->size() - 4], 42);
ASSERT_EQ((*column_from)[column_from->size() - 3], 42.42);
ASSERT_EQ((*column_from)[column_from->size() - 2], "str");
ASSERT_EQ((*column_from)[column_from->size() - 1], Null());
ASSERT_EQ((*column_to)[column_to->size() - 4], 42);
ASSERT_EQ((*column_to)[column_to->size() - 3], 42.42);
ASSERT_EQ((*column_to)[column_to->size() - 2], "str");
ASSERT_EQ((*column_to)[column_to->size() - 1], Null());
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_EQ(column_to->getSharedVariant().size(), 2);
}
TEST(ColumnDynamic, SerializeDeserializeFromArenaOverflow2)
{
auto column_from = ColumnDynamic::create(2);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
column_from->insert(Field(Null()));
column_from->insert(Field(Array({Field(42)})));
Arena arena;
const char * pos = nullptr;
auto ref1 = column_from->serializeValueIntoArena(0, arena, pos);
column_from->serializeValueIntoArena(1, arena, pos);
column_from->serializeValueIntoArena(2, arena, pos);
column_from->serializeValueIntoArena(3, arena, pos);
column_from->serializeValueIntoArena(4, arena, pos);
auto column_to = ColumnDynamic::create(2);
column_to->insert(Field(42.42));
pos = column_to->deserializeAndInsertFromArena(ref1.data);
pos = column_to->deserializeAndInsertFromArena(pos);
pos = column_to->deserializeAndInsertFromArena(pos);
pos = column_to->deserializeAndInsertFromArena(pos);
column_to->deserializeAndInsertFromArena(pos);
ASSERT_EQ((*column_to)[column_to->size() - 5], 42);
ASSERT_EQ((*column_to)[column_to->size() - 4], 42.42);
ASSERT_EQ((*column_to)[column_to->size() - 3], "str");
ASSERT_EQ((*column_to)[column_to->size() - 2], Null());
ASSERT_EQ((*column_to)[column_to->size() - 1], Field(Array({Field(42)})));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)"));
ASSERT_EQ(column_to->getSharedVariant().size(), 2);
}
TEST(ColumnDynamic, skipSerializedInArena)
{
auto column_from = ColumnDynamic::create(255);
auto column_from = ColumnDynamic::create(3);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
@ -640,13 +882,41 @@ TEST(ColumnDynamic, skipSerializedInArena)
auto ref4 = column_from->serializeValueIntoArena(3, arena, pos);
const char * end = ref4.data + ref4.size;
auto column_to = ColumnDynamic::create(255);
auto column_to = ColumnDynamic::create(254);
pos = column_to->skipSerializedInArena(ref1.data);
pos = column_to->skipSerializedInArena(pos);
pos = column_to->skipSerializedInArena(pos);
pos = column_to->skipSerializedInArena(pos);
ASSERT_EQ(pos, end);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.empty());
ASSERT_TRUE(column_to->getVariantInfo().variant_names.empty());
ASSERT_EQ(column_to->getVariantInfo().variant_name_to_discriminator.at("SharedVariant"), 0);
ASSERT_EQ(column_to->getVariantInfo().variant_names, Names{"SharedVariant"});
}
TEST(ColumnDynamic, compare)
{
auto column_from = ColumnDynamic::create(3);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
column_from->insert(Field(Null()));
column_from->insert(Field(Array({Field(42)})));
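/// Expected ordering (inferred from the assertions below, assuming rows of the same
/// variant are compared by value): NULL is the smallest; rows of different variants
/// are ordered by their data type names.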
ASSERT_EQ(column_from->compareAt(0, 0, *column_from, -1), 0);
ASSERT_EQ(column_from->compareAt(0, 1, *column_from, -1), 1);
ASSERT_EQ(column_from->compareAt(1, 1, *column_from, -1), 0);
ASSERT_EQ(column_from->compareAt(0, 2, *column_from, -1), -1);
ASSERT_EQ(column_from->compareAt(2, 0, *column_from, -1), 1);
ASSERT_EQ(column_from->compareAt(2, 4, *column_from, -1), 1);
ASSERT_EQ(column_from->compareAt(4, 2, *column_from, -1), -1);
ASSERT_EQ(column_from->compareAt(4, 4, *column_from, -1), 0);
ASSERT_EQ(column_from->compareAt(0, 3, *column_from, -1), 1);
ASSERT_EQ(column_from->compareAt(1, 3, *column_from, -1), 1);
ASSERT_EQ(column_from->compareAt(2, 3, *column_from, -1), 1);
ASSERT_EQ(column_from->compareAt(3, 3, *column_from, -1), 0);
ASSERT_EQ(column_from->compareAt(4, 3, *column_from, -1), 1);
ASSERT_EQ(column_from->compareAt(3, 0, *column_from, -1), -1);
ASSERT_EQ(column_from->compareAt(3, 1, *column_from, -1), -1);
ASSERT_EQ(column_from->compareAt(3, 2, *column_from, -1), -1);
ASSERT_EQ(column_from->compareAt(3, 4, *column_from, -1), -1);
}

View File

@ -0,0 +1,351 @@
#include <Columns/ColumnString.h>
#include <Columns/ColumnObject.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/Serializations/SerializationDynamic.h>
#include <Formats/FormatSettings.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/WriteBufferFromString.h>
#include <Common/Arena.h>
#include <gtest/gtest.h>
using namespace DB;
TEST(ColumnObject, CreateEmpty)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=20, a.b UInt32, a.c Array(String))");
auto col = type->createColumn();
const auto & col_object = assert_cast<const ColumnObject &>(*col);
const auto & typed_paths = col_object.getTypedPaths();
ASSERT_TRUE(typed_paths.contains("a.b"));
ASSERT_EQ(typed_paths.at("a.b")->getName(), "UInt32");
ASSERT_TRUE(typed_paths.contains("a.c"));
ASSERT_EQ(typed_paths.at("a.c")->getName(), "Array(String)");
ASSERT_TRUE(col_object.getDynamicPaths().empty());
ASSERT_TRUE(col_object.getSharedDataOffsets().empty());
ASSERT_TRUE(col_object.getSharedDataPathsAndValues().first->empty());
ASSERT_TRUE(col_object.getSharedDataPathsAndValues().second->empty());
ASSERT_EQ(col_object.getMaxDynamicTypes(), 10);
ASSERT_EQ(col_object.getMaxDynamicPaths(), 20);
}
TEST(ColumnObject, GetName)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=20, b.d UInt32, a.b Array(String))");
auto col = type->createColumn();
ASSERT_EQ(col->getName(), "Object(max_dynamic_paths=20, max_dynamic_types=10, a.b Array(String), b.d UInt32)");
}
Field deserializeFieldFromSharedData(ColumnString * values, size_t n)
{
auto data = values->getDataAt(n);
ReadBufferFromMemory buf(data.data, data.size);
Field res;
std::make_shared<SerializationDynamic>()->deserializeBinary(res, buf, FormatSettings());
return res;
}
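For reference, a sketch of the inverse helper (an assumption that mirrors the helper above, not code from this change): shared-data values are binary-serialized with SerializationDynamic, so writing one would look roughly like this.

    void serializeFieldIntoSharedData(ColumnString * values, const Field & field)
    {
        WriteBufferFromOwnString buf;
        std::make_shared<SerializationDynamic>()->serializeBinary(field, buf, FormatSettings());
        const std::string & data = buf.str();
        values->insertData(data.data(), data.size());
    }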
TEST(ColumnObject, InsertField)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
auto col = type->createColumn();
auto & col_object = assert_cast<ColumnObject &>(*col);
const auto & typed_paths = col_object.getTypedPaths();
const auto & dynamic_paths = col_object.getDynamicPaths();
const auto & shared_data_nested_column = col_object.getSharedDataNestedColumn();
const auto & shared_data_offsets = col_object.getSharedDataOffsets();
const auto [shared_data_paths, shared_data_values] = col_object.getSharedDataPathsAndValues();
Object empty_object;
col_object.insert(empty_object);
ASSERT_EQ(col_object[0], (Object{{"a.b", Array{}}, {"b.d", Field(0u)}}));
ASSERT_EQ(typed_paths.at("a.b")->size(), 1);
ASSERT_TRUE(typed_paths.at("a.b")->isDefaultAt(0));
ASSERT_EQ(typed_paths.at("b.d")->size(), 1);
ASSERT_TRUE(typed_paths.at("b.d")->isDefaultAt(0));
ASSERT_TRUE(dynamic_paths.empty());
ASSERT_EQ(shared_data_nested_column.size(), 1);
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(0));
Object object1 = {{"a.b", Array{String("Hello"), String("World")}}, {"a.c", Field(42)}};
col_object.insert(object1);
ASSERT_EQ(col_object[1], (Object{{"a.b", Array{String("Hello"), String("World")}}, {"b.d", Field(0u)}, {"a.c", Field(42)}}));
ASSERT_EQ(typed_paths.at("a.b")->size(), 2);
ASSERT_EQ((*typed_paths.at("a.b"))[1], (Array{String("Hello"), String("World")}));
ASSERT_EQ(typed_paths.at("b.d")->size(), 2);
ASSERT_TRUE(typed_paths.at("b.d")->isDefaultAt(1));
ASSERT_EQ(dynamic_paths.size(), 1);
ASSERT_TRUE(dynamic_paths.contains("a.c"));
ASSERT_EQ(dynamic_paths.at("a.c")->size(), 2);
ASSERT_TRUE(dynamic_paths.at("a.c")->isDefaultAt(0));
ASSERT_EQ((*dynamic_paths.at("a.c"))[1], Field(42));
ASSERT_EQ(shared_data_nested_column.size(), 2);
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(1));
Object object2 = {{"b.d", Field(142u)}, {"a.c", Field(43)}, {"a.d", Field("str")}, {"a.e", Field(242)}, {"a.f", Array{Field(42), Field(43)}}};
col_object.insert(object2);
ASSERT_EQ(col_object[2], (Object{{"a.b", Array{}}, {"b.d", Field(142u)}, {"a.c", Field(43)}, {"a.d", Field("str")}, {"a.e", Field(242)}, {"a.f", Array{Field(42), Field(43)}}}));
ASSERT_EQ(typed_paths.at("a.b")->size(), 3);
ASSERT_TRUE(typed_paths.at("a.b")->isDefaultAt(2));
ASSERT_EQ(typed_paths.at("b.d")->size(), 3);
ASSERT_EQ((*typed_paths.at("b.d"))[2], Field(142u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_TRUE(dynamic_paths.contains("a.c"));
ASSERT_EQ(dynamic_paths.at("a.c")->size(), 3);
ASSERT_EQ((*dynamic_paths.at("a.c"))[2], Field(43));
ASSERT_TRUE(dynamic_paths.contains("a.d"));
ASSERT_EQ(dynamic_paths.at("a.d")->size(), 3);
ASSERT_EQ((*dynamic_paths.at("a.d"))[2], Field("str"));
ASSERT_EQ(shared_data_nested_column.size(), 3);
ASSERT_EQ(shared_data_offsets[2] - shared_data_offsets[1], 2);
ASSERT_EQ((*shared_data_paths)[0], "a.e");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 0), Field(242));
ASSERT_EQ((*shared_data_paths)[1], "a.f");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 1), (Array({Field(42), Field(43)})));
Object object3 = {{"b.a", Field("Str")}, {"b.b", Field(2)}, {"b.c", Field(Tuple{Field(42), Field("Str")})}};
col_object.insert(object3);
ASSERT_EQ(col_object[3], (Object{{"a.b", Array{}}, {"b.d", Field(0u)}, {"b.a", Field("Str")}, {"b.b", Field(2)}, {"b.c", Field(Tuple{Field(42), Field("Str")})}}));
ASSERT_EQ(typed_paths.at("a.b")->size(), 4);
ASSERT_TRUE(typed_paths.at("a.b")->isDefaultAt(3));
ASSERT_EQ(typed_paths.at("b.d")->size(), 4);
ASSERT_TRUE(typed_paths.at("b.d")->isDefaultAt(3));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ(dynamic_paths.at("a.c")->size(), 4);
ASSERT_TRUE(dynamic_paths.at("a.c")->isDefaultAt(3));
ASSERT_EQ(dynamic_paths.at("a.d")->size(), 4);
ASSERT_TRUE(dynamic_paths.at("a.d")->isDefaultAt(3));
ASSERT_EQ(shared_data_nested_column.size(), 4);
ASSERT_EQ(shared_data_offsets[3] - shared_data_offsets[2], 3);
ASSERT_EQ((*shared_data_paths)[2], "b.a");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 2), Field("Str"));
ASSERT_EQ((*shared_data_paths)[3], "b.b");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 3), Field(2));
ASSERT_EQ((*shared_data_paths)[4], "b.c");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 4), Field(Tuple{Field(42), Field("Str")}));
Object object4 = {{"c.c", Field(Null())}, {"c.d", Field(Null())}};
col_object.insert(object4);
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(4));
}
TEST(ColumnObject, InsertFrom)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
auto col = type->createColumn();
auto & col_object = assert_cast<ColumnObject &>(*col);
col_object.insert(Object{{"a.a", Field(42)}});
const auto & typed_paths = col_object.getTypedPaths();
const auto & dynamic_paths = col_object.getDynamicPaths();
const auto & shared_data_nested_column = col_object.getSharedDataNestedColumn();
const auto & shared_data_offsets = col_object.getSharedDataOffsets();
const auto [shared_data_paths, shared_data_values] = col_object.getSharedDataPathsAndValues();
auto src_col1 = type->createColumn();
auto & src_col_object1 = assert_cast<ColumnObject &>(*src_col1);
src_col_object1.insert(Object{{"b.d", Field(43u)}, {"a.c", Field("Str1")}});
col_object.insertFrom(src_col_object1, 0);
ASSERT_EQ((*typed_paths.at("a.b"))[1], Field(Array{}));
ASSERT_EQ((*typed_paths.at("b.d"))[1], Field(43u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[1], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[1], Field("Str1"));
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(1));
auto src_col2 = type->createColumn();
auto & src_col_object2 = assert_cast<ColumnObject &>(*src_col2);
src_col_object2.insert(Object{{"a.b", Array{"Str4", "Str5"}}, {"b.d", Field(44u)}, {"a.d", Field("Str2")}, {"a.e", Field("Str3")}});
col_object.insertFrom(src_col_object2, 0);
ASSERT_EQ((*typed_paths.at("a.b"))[2], Field(Array{"Str4", "Str5"}));
ASSERT_EQ((*typed_paths.at("b.d"))[2], Field(44u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[2], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[2], Field(Null()));
ASSERT_EQ(shared_data_offsets[2] - shared_data_offsets[1], 2);
ASSERT_EQ((*shared_data_paths)[0], "a.d");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 0), Field("Str2"));
ASSERT_EQ((*shared_data_paths)[1], "a.e");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 1), Field("Str3"));
auto src_col3 = type->createColumn();
auto & src_col_object3 = assert_cast<ColumnObject &>(*src_col3);
src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"h.h", Field("Str7")}});
src_col_object3.insert(Object{{"a.a", Field("Str10")}, {"a.c", Field(45u)}, {"a.h", Field("Str6")}, {"h.h", Field("Str7")}, {"a.f", Field("Str8")}, {"a.g", Field("Str9")}, {"a.i", Field("Str11")}, {"a.u", Field(Null())}});
col_object.insertFrom(src_col_object3, 1);
ASSERT_EQ((*typed_paths.at("a.b"))[3], Field(Array{}));
ASSERT_EQ((*typed_paths.at("b.d"))[3], Field(0u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[3], Field("Str10"));
ASSERT_EQ((*dynamic_paths.at("a.c"))[3], Field(45u));
ASSERT_EQ(shared_data_offsets[3] - shared_data_offsets[2], 5);
ASSERT_EQ((*shared_data_paths)[2], "a.f");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 2), Field("Str8"));
ASSERT_EQ((*shared_data_paths)[3], "a.g");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 3), Field("Str9"));
ASSERT_EQ((*shared_data_paths)[4], "a.h");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 4), Field("Str6"));
ASSERT_EQ((*shared_data_paths)[5], "a.i");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 5), Field("Str11"));
ASSERT_EQ((*shared_data_paths)[6], "h.h");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 6), Field("Str7"));
}
TEST(ColumnObject, InsertRangeFrom)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
auto col = type->createColumn();
auto & col_object = assert_cast<ColumnObject &>(*col);
col_object.insert(Object{{"a.a", Field(42)}});
const auto & typed_paths = col_object.getTypedPaths();
const auto & dynamic_paths = col_object.getDynamicPaths();
const auto & shared_data_nested_column = col_object.getSharedDataNestedColumn();
const auto & shared_data_offsets = col_object.getSharedDataOffsets();
const auto [shared_data_paths, shared_data_values] = col_object.getSharedDataPathsAndValues();
auto src_col1 = type->createColumn();
auto & src_col_object1 = assert_cast<ColumnObject &>(*src_col1);
src_col_object1.insert(Object{{"b.d", Field(43u)}, {"a.c", Field("Str1")}});
src_col_object1.insert(Object{{"a.b", Field(Array{"Str1", "Str2"})}, {"a.a", Field("Str1")}});
src_col_object1.insert(Object{{"b.d", Field(45u)}, {"a.c", Field("Str2")}});
col_object.insertRangeFrom(src_col_object1, 0, 3);
ASSERT_EQ((*typed_paths.at("a.b"))[1], Field(Array{}));
ASSERT_EQ((*typed_paths.at("a.b"))[2], Field(Array{"Str1", "Str2"}));
ASSERT_EQ((*typed_paths.at("a.b"))[3], Field(Array{}));
ASSERT_EQ((*typed_paths.at("b.d"))[1], Field(43u));
ASSERT_EQ((*typed_paths.at("b.d"))[2], Field(0u));
ASSERT_EQ((*typed_paths.at("b.d"))[3], Field(45u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[1], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.a"))[2], Field("Str1"));
ASSERT_EQ((*dynamic_paths.at("a.a"))[3], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[1], Field("Str1"));
ASSERT_EQ((*dynamic_paths.at("a.c"))[2], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[3], Field("Str2"));
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(1));
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(2));
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(3));
auto src_col2 = type->createColumn();
auto & src_col_object2 = assert_cast<ColumnObject &>(*src_col2);
src_col_object2.insert(Object{{"a.b", Array{"Str4", "Str5"}}, {"a.d", Field("Str2")}, {"a.e", Field("Str3")}});
src_col_object2.insert(Object{{"b.d", Field(44u)}, {"a.d", Field("Str22")}, {"a.e", Field("Str33")}});
src_col_object2.insert(Object{{"a.b", Array{"Str44", "Str55"}}, {"a.d", Field("Str222")}, {"a.e", Field("Str333")}});
col_object.insertRangeFrom(src_col_object2, 0, 3);
ASSERT_EQ((*typed_paths.at("a.b"))[4], Field(Array{"Str4", "Str5"}));
ASSERT_EQ((*typed_paths.at("a.b"))[5], Field(Array{}));
ASSERT_EQ((*typed_paths.at("a.b"))[6], Field(Array{"Str44", "Str55"}));
ASSERT_EQ((*typed_paths.at("b.d"))[4], Field(0u));
ASSERT_EQ((*typed_paths.at("b.d"))[5], Field(44u));
ASSERT_EQ((*typed_paths.at("b.d"))[6], Field(0u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[4], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.a"))[5], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.a"))[6], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[4], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[5], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[6], Field(Null()));
ASSERT_EQ(shared_data_offsets[4] - shared_data_offsets[3], 2);
ASSERT_EQ((*shared_data_paths)[0], "a.d");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 0), Field("Str2"));
ASSERT_EQ((*shared_data_paths)[1], "a.e");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 1), Field("Str3"));
ASSERT_EQ(shared_data_offsets[5] - shared_data_offsets[4], 2);
ASSERT_EQ((*shared_data_paths)[2], "a.d");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 2), Field("Str22"));
ASSERT_EQ((*shared_data_paths)[3], "a.e");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 3), Field("Str33"));
ASSERT_EQ(shared_data_offsets[6] - shared_data_offsets[5], 2);
ASSERT_EQ((*shared_data_paths)[4], "a.d");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 4), Field("Str222"));
ASSERT_EQ((*shared_data_paths)[5], "a.e");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 5), Field("Str333"));
auto src_col3 = type->createColumn();
auto & src_col_object3 = assert_cast<ColumnObject &>(*src_col3);
src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"h.h", Field("Str7")}});
src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"h.h", Field("Str7")}, {"a.f", Field("Str8")}, {"a.g", Field("Str9")}, {"a.i", Field("Str11")}});
src_col_object3.insert(Object{{"a.a", Field("Str10")}});
src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"a.c", Field(45u)}, {"h.h", Field("Str7")}, {"a.i", Field("Str11")}});
col_object.insertRangeFrom(src_col_object3, 1, 3);
ASSERT_EQ((*typed_paths.at("a.b"))[7], Field(Array{}));
ASSERT_EQ((*typed_paths.at("a.b"))[8], Field(Array{}));
ASSERT_EQ((*typed_paths.at("a.b"))[9], Field(Array{}));
ASSERT_EQ((*typed_paths.at("b.d"))[7], Field(0u));
ASSERT_EQ((*typed_paths.at("b.d"))[8], Field(0u));
ASSERT_EQ((*typed_paths.at("b.d"))[9], Field(0u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[7], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.a"))[8], Field("Str10"));
ASSERT_EQ((*dynamic_paths.at("a.a"))[9], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[7], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[8], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[9], Field(45u));
ASSERT_EQ(shared_data_offsets[7] - shared_data_offsets[6], 5);
ASSERT_EQ((*shared_data_paths)[6], "a.f");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 6), Field("Str8"));
ASSERT_EQ((*shared_data_paths)[7], "a.g");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 7), Field("Str9"));
ASSERT_EQ((*shared_data_paths)[8], "a.h");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 8), Field("Str6"));
ASSERT_EQ((*shared_data_paths)[9], "a.i");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 9), Field("Str11"));
ASSERT_EQ((*shared_data_paths)[10], "h.h");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 10), Field("Str7"));
ASSERT_EQ(shared_data_offsets[8] - shared_data_offsets[7], 0);
ASSERT_EQ(shared_data_offsets[9] - shared_data_offsets[8], 3);
ASSERT_EQ((*shared_data_paths)[11], "a.h");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 11), Field("Str6"));
ASSERT_EQ((*shared_data_paths)[12], "a.i");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 12), Field("Str11"));
}
TEST(ColumnObject, SerializeDeserializerFromArena)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
auto col = type->createColumn();
auto & col_object = assert_cast<ColumnObject &>(*col);
col_object.insert(Object{{"b.d", Field(42u)}, {"a.b", Array{"Str1", "Str2"}}, {"a.a", Tuple{"Str3", 441u}}, {"a.c", Field("Str4")}, {"a.d", Array{Field(45), Field(46)}}, {"a.e", Field(47)}});
col_object.insert(Object{{"b.a", Field(48)}, {"b.b", Array{Field(49), Field(50)}}});
col_object.insert(Object{{"b.d", Field(442u)}, {"a.b", Array{"Str11", "Str22"}}, {"a.a", Tuple{"Str33", 444u}}, {"a.c", Field("Str44")}, {"a.d", Array{Field(445), Field(446)}}, {"a.e", Field(447)}});
Arena arena;
const char * pos = nullptr;
auto ref1 = col_object.serializeValueIntoArena(0, arena, pos);
col_object.serializeValueIntoArena(1, arena, pos);
col_object.serializeValueIntoArena(2, arena, pos);
auto col2 = type->createColumn();
auto & col_object2 = assert_cast<ColumnObject &>(*col2);
pos = col_object2.deserializeAndInsertFromArena(ref1.data);
pos = col_object2.deserializeAndInsertFromArena(pos);
col_object2.deserializeAndInsertFromArena(pos);
ASSERT_EQ(col_object2[0], (Object{{"b.d", Field(42u)}, {"a.b", Array{"Str1", "Str2"}}, {"a.a", Tuple{"Str3", 441u}}, {"a.c", Field("Str4")}, {"a.d", Array{Field(45), Field(46)}}, {"a.e", Field(47)}}));
ASSERT_EQ(col_object2[1], (Object{{"b.d", Field{0u}}, {"a.b", Array{}}, {"b.a", Field(48)}, {"b.b", Array{Field(49), Field(50)}}}));
ASSERT_EQ(col_object2[2], (Object{{"b.d", Field(442u)}, {"a.b", Array{"Str11", "Str22"}}, {"a.a", Tuple{"Str33", 444u}}, {"a.c", Field("Str44")}, {"a.d", Array{Field(445), Field(446)}}, {"a.e", Field(447)}}));
}
TEST(ColumnObject, SkipSerializedInArena)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
auto col = type->createColumn();
auto & col_object = assert_cast<ColumnObject &>(*col);
col_object.insert(Object{{"b.d", Field(42u)}, {"a.b", Array{"Str1", "Str2"}}, {"a.a", Tuple{"Str3", 441u}}, {"a.c", Field("Str4")}, {"a.d", Array{Field(45), Field(46)}}, {"a.e", Field(47)}});
col_object.insert(Object{{"b.a", Field(48)}, {"b.b", Array{Field(49), Field(50)}}});
col_object.insert(Object{{"b.d", Field(442u)}, {"a.b", Array{"Str11", "Str22"}}, {"a.a", Tuple{"Str33", 444u}}, {"a.c", Field("Str44")}, {"a.d", Array{Field(445), Field(446)}}, {"a.e", Field(447)}});
Arena arena;
const char * pos = nullptr;
auto ref1 = col_object.serializeValueIntoArena(0, arena, pos);
col_object.serializeValueIntoArena(1, arena, pos);
auto ref3 = col_object.serializeValueIntoArena(2, arena, pos);
const char * end = ref3.data + ref3.size;
auto col2 = type->createColumn();
pos = col2->skipSerializedInArena(ref1.data);
pos = col2->skipSerializedInArena(pos);
pos = col2->skipSerializedInArena(pos);
ASSERT_EQ(pos, end);
}
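/// A minimal additional sketch, assuming only the overflow rule exercised above: once max_dynamic_paths
/// dynamic paths exist, any further path is stored in the shared data, sorted by path name.
TEST(ColumnObject, SharedDataOverflowSketch)
{
    auto type = DataTypeFactory::instance().get("JSON(max_dynamic_paths=1)");
    auto col = type->createColumn();
    auto & col_object = assert_cast<ColumnObject &>(*col);
    /// "a" takes the single dynamic slot, "b" is assumed to overflow into shared data.
    col_object.insert(Object{{"a", Field(1)}, {"b", Field(2)}});
    const auto [shared_data_paths, shared_data_values] = col_object.getSharedDataPathsAndValues();
    ASSERT_EQ(col_object.getDynamicPaths().size(), 1);
    ASSERT_EQ((*shared_data_paths)[0], "b");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 0), Field(2));
}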

View File

@ -1,18 +1,24 @@
#include <Common/formatReadable.h>
#include <Common/AsynchronousMetrics.h>
#include <Common/Exception.h>
#include <Common/setThreadName.h>
#include <Common/CurrentMetrics.h>
#include <Common/filesystemHelpers.h>
#include <Common/logger_useful.h>
#include <IO/UncompressedCache.h>
#include <IO/MMappedFileCache.h>
#include <IO/ReadHelpers.h>
#include <IO/UncompressedCache.h>
#include <base/cgroupsv2.h>
#include <base/errnoToString.h>
#include <base/find_symbols.h>
#include <base/getPageSize.h>
#include <sys/resource.h>
#include <Common/CurrentMetrics.h>
#include <Common/Exception.h>
#include <Common/filesystemHelpers.h>
#include <Common/formatReadable.h>
#include <Common/logger_useful.h>
#include <Common/setThreadName.h>
#include <boost/locale/date_time_facet.hpp>
#include <chrono>
#include <string_view>
#include "config.h"
@ -52,6 +58,12 @@ static std::unique_ptr<ReadBufferFromFilePRead> openFileIfExists(const std::stri
return {};
}
static void openCgroupv2MetricFile(const std::string & filename, std::optional<ReadBufferFromFilePRead> & out)
{
if (auto path = getCgroupsV2PathContainingFile(filename))
openFileIfExists((path.value() + filename).c_str(), out);
};
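/// A hedged usage sketch: openCgroupv2MetricFile("memory.max", cgroupmem_limit_in_bytes) asks
/// getCgroupsV2PathContainingFile for the bottom-most cgroup v2 directory that contains "memory.max"
/// (for example, a path like "/sys/fs/cgroup/user.slice/") and opens the file from there if it exists;
/// otherwise the out parameter is left unset.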
#endif
@ -63,21 +75,15 @@ AsynchronousMetrics::AsynchronousMetrics(
, protocol_server_metrics_func(protocol_server_metrics_func_)
{
#if defined(OS_LINUX)
openFileIfExists("/proc/meminfo", meminfo);
openFileIfExists("/proc/loadavg", loadavg);
openFileIfExists("/proc/stat", proc_stat);
openFileIfExists("/proc/cpuinfo", cpuinfo);
openFileIfExists("/proc/sys/fs/file-nr", file_nr);
openFileIfExists("/proc/uptime", uptime);
openFileIfExists("/proc/net/dev", net_dev);
/// CGroups v2
openFileIfExists("/sys/fs/cgroup/memory.max", cgroupmem_limit_in_bytes);
if (cgroupmem_limit_in_bytes)
{
openFileIfExists("/sys/fs/cgroup/memory.current", cgroupmem_usage_in_bytes);
}
openFileIfExists("/sys/fs/cgroup/cpu.max", cgroupcpu_max);
openCgroupv2MetricFile("memory.max", cgroupmem_limit_in_bytes);
openCgroupv2MetricFile("memory.current", cgroupmem_usage_in_bytes);
openCgroupv2MetricFile("cpu.max", cgroupcpu_max);
openCgroupv2MetricFile("cpu.stat", cgroupcpu_stat);
/// CGroups v1
if (!cgroupmem_limit_in_bytes)
@ -90,6 +96,21 @@ AsynchronousMetrics::AsynchronousMetrics(
openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_period_us", cgroupcpu_cfs_period);
openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_quota_us", cgroupcpu_cfs_quota);
}
if (!cgroupcpu_stat)
openFileIfExists("/sys/fs/cgroup/cpuacct/cpuacct.stat", cgroupcpuacct_stat);
if (!cgroupcpu_stat && !cgroupcpuacct_stat)
{
/// The following metrics are not cgroup-aware, and we have found cgroup-specific metric files for similar metrics,
/// so it's better not to report them at all to avoid confusion
openFileIfExists("/proc/loadavg", loadavg);
openFileIfExists("/proc/stat", proc_stat);
openFileIfExists("/proc/uptime", uptime);
}
/// The same story for memory metrics
if (!cgroupmem_limit_in_bytes)
openFileIfExists("/proc/meminfo", meminfo);
openFileIfExists("/proc/sys/vm/max_map_count", vm_max_map_count);
openFileIfExists("/proc/self/maps", vm_maps);
@ -570,6 +591,151 @@ AsynchronousMetrics::NetworkInterfaceStatValues::operator-(const AsynchronousMet
#endif
#if defined(OS_LINUX)
void AsynchronousMetrics::applyCPUMetricsUpdate(
AsynchronousMetricValues & new_values, const std::string & cpu_suffix, const ProcStatValuesCPU & delta_values, double multiplier)
{
new_values["OSUserTime" + cpu_suffix]
= {delta_values.user * multiplier,
"The ratio of time the CPU core was running userspace code. This is a system-wide metric, it includes all the processes on the "
"host machine, not just clickhouse-server."
" This also includes the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline "
"stalls, branch mispredictions, running another SMT core)."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSNiceTime" + cpu_suffix]
= {delta_values.nice * multiplier,
"The ratio of time the CPU core was running userspace code with higher priority. This is a system-wide metric, it includes all "
"the processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSSystemTime" + cpu_suffix]
= {delta_values.system * multiplier,
"The ratio of time the CPU core was running OS kernel (system) code. This is a system-wide metric, it includes all the "
"processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSIdleTime" + cpu_suffix]
= {delta_values.idle * multiplier,
"The ratio of time the CPU core was idle (not even ready to run a process waiting for IO) from the OS kernel standpoint. This "
"is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This does not include the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline "
"stalls, branch mispredictions, running another SMT core)."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSIOWaitTime" + cpu_suffix]
= {delta_values.iowait * multiplier,
"The ratio of time the CPU core was not running the code but when the OS kernel did not run any other process on this CPU as "
"the processes were waiting for IO. This is a system-wide metric, it includes all the processes on the host machine, not just "
"clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSIrqTime" + cpu_suffix]
= {delta_values.irq * multiplier,
"The ratio of time spent for running hardware interrupt requests on the CPU. This is a system-wide metric, it includes all the "
"processes on the host machine, not just clickhouse-server."
" A high number of this metric may indicate hardware misconfiguration or a very high network load."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSSoftIrqTime" + cpu_suffix]
= {delta_values.softirq * multiplier,
"The ratio of time spent for running software interrupt requests on the CPU. This is a system-wide metric, it includes all the "
"processes on the host machine, not just clickhouse-server."
" A high number of this metric may indicate inefficient software running on the system."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSStealTime" + cpu_suffix]
= {delta_values.steal * multiplier,
"The ratio of time spent in other operating systems by the CPU when running in a virtualized environment. This is a system-wide "
"metric, it includes all the processes on the host machine, not just clickhouse-server."
" Not every virtualized environment presents this metric, and most of them don't."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSGuestTime" + cpu_suffix]
= {delta_values.guest * multiplier,
"The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel (See `man "
"procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This metric is irrelevant for ClickHouse, but still exists for completeness."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
new_values["OSGuestNiceTime" + cpu_suffix]
= {delta_values.guest_nice * multiplier,
"The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel, when a guest "
"was set to a higher priority (See `man procfs`). This is a system-wide metric, it includes all the processes on the host "
"machine, not just clickhouse-server."
" This metric is irrelevant for ClickHouse, but still exists for completeness."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across "
"them [0..num cores]."};
}
void AsynchronousMetrics::applyNormalizedCPUMetricsUpdate(
AsynchronousMetricValues & new_values, double num_cpus_to_normalize, const ProcStatValuesCPU & delta_values_all_cpus, double multiplier)
{
chassert(num_cpus_to_normalize);
new_values["OSUserTimeNormalized"]
= {delta_values_all_cpus.user * multiplier / num_cpus_to_normalize,
"The value is similar to `OSUserTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSNiceTimeNormalized"]
= {delta_values_all_cpus.nice * multiplier / num_cpus_to_normalize,
"The value is similar to `OSNiceTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSSystemTimeNormalized"]
= {delta_values_all_cpus.system * multiplier / num_cpus_to_normalize,
"The value is similar to `OSSystemTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSIdleTimeNormalized"]
= {delta_values_all_cpus.idle * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIdleTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSIOWaitTimeNormalized"]
= {delta_values_all_cpus.iowait * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIOWaitTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSIrqTimeNormalized"]
= {delta_values_all_cpus.irq * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIrqTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless of "
"the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSSoftIrqTimeNormalized"]
= {delta_values_all_cpus.softirq * multiplier / num_cpus_to_normalize,
"The value is similar to `OSSoftIrqTime` but divided by the number of CPU cores to be measured in the [0..1] interval "
"regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSStealTimeNormalized"]
= {delta_values_all_cpus.steal * multiplier / num_cpus_to_normalize,
"The value is similar to `OSStealTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSGuestTimeNormalized"]
= {delta_values_all_cpus.guest * multiplier / num_cpus_to_normalize,
"The value is similar to `OSGuestTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless "
"of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
new_values["OSGuestNiceTimeNormalized"]
= {delta_values_all_cpus.guest_nice * multiplier / num_cpus_to_normalize,
"The value is similar to `OSGuestNiceTime` but divided by the number of CPU cores to be measured in the [0..1] interval "
"regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is "
"non-uniform, and still get the average resource utilization metric."};
}
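/// A hedged worked example (numbers are illustrative): on an 8-core host whose cores averaged 50% user
/// time over the interval, OSUserTime is about 4.0, while OSUserTimeNormalized is about 4.0 / 8 = 0.5,
/// which stays in [0..1] and is comparable across hosts with different core counts.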
#endif
void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
{
Stopwatch watch;
@ -831,7 +997,68 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
new_values["CGroupMaxCPU"] = { max_cpu_cgroups, "The maximum number of CPU cores according to CGroups."};
}
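/// For reference, a hedged sketch of the inputs the cgroup branch below expects (values are made up):
///   cgroup v2, /sys/fs/cgroup/cpu.stat:             "usage_usec 183139\nuser_usec 124079\nsystem_usec 59060"
///   cgroup v1, /sys/fs/cgroup/cpuacct/cpuacct.stat: "user 5342\nsystem 1824"
/// Only the lines starting with "user" and "system" are consumed; v2 reports microseconds (divisor 1e6),
/// v1 reports clock ticks (divisor sysconf(_SC_CLK_TCK)).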
if (proc_stat)
if (cgroupcpu_stat || cgroupcpuacct_stat)
{
try
{
ReadBufferFromFilePRead & in = cgroupcpu_stat ? *cgroupcpu_stat : *cgroupcpuacct_stat;
ProcStatValuesCPU current_values{};
/// We re-read the file from the beginning each time
in.rewind();
while (!in.eof())
{
String name;
readStringUntilWhitespace(name, in);
skipWhitespaceIfAny(in);
/// `user_usec` for cgroup v2 and `user` for cgroup v1
if (name.starts_with("user"))
{
readText(current_values.user, in);
skipToNextLineOrEOF(in);
}
/// `system_usec` for cgroup v2 and `system` for cgroup v1
else if (name.starts_with("system"))
{
readText(current_values.system, in);
skipToNextLineOrEOF(in);
}
else
skipToNextLineOrEOF(in);
}
if (!first_run)
{
auto get_clock_ticks = [&]()
{
if (auto hz = sysconf(_SC_CLK_TCK); hz != -1)
return hz;
else
throw ErrnoException(ErrorCodes::CANNOT_SYSCONF, "Cannot call 'sysconf' to obtain system HZ");
};
const auto cgroup_version_specific_divisor = cgroupcpu_stat ? 1e6 : get_clock_ticks();
const double multiplier = 1.0 / cgroup_version_specific_divisor
/ (std::chrono::duration_cast<std::chrono::nanoseconds>(time_since_previous_update).count() / 1e9);
const ProcStatValuesCPU delta_values = current_values - proc_stat_values_all_cpus;
applyCPUMetricsUpdate(new_values, /*cpu_suffix=*/"", delta_values, multiplier);
if (max_cpu_cgroups > 0)
applyNormalizedCPUMetricsUpdate(new_values, max_cpu_cgroups, delta_values, multiplier);
}
proc_stat_values_all_cpus = current_values;
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
openCgroupv2MetricFile("cpu.stat", cgroupcpu_stat);
if (!cgroupcpu_stat)
openFileIfExists("/sys/fs/cgroup/cpuacct/cpuacct.stat", cgroupcpuacct_stat);
}
}
else if (proc_stat)
{
try
{
@ -886,43 +1113,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
else
delta_values_all_cpus = delta_values;
new_values["OSUserTime" + cpu_suffix] = { delta_values.user * multiplier,
"The ratio of time the CPU core was running userspace code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This also includes the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSNiceTime" + cpu_suffix] = { delta_values.nice * multiplier,
"The ratio of time the CPU core was running userspace code with higher priority. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSSystemTime" + cpu_suffix] = { delta_values.system * multiplier,
"The ratio of time the CPU core was running OS kernel (system) code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSIdleTime" + cpu_suffix] = { delta_values.idle * multiplier,
"The ratio of time the CPU core was idle (not even ready to run a process waiting for IO) from the OS kernel standpoint. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This does not include the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSIOWaitTime" + cpu_suffix] = { delta_values.iowait * multiplier,
"The ratio of time the CPU core was not running the code but when the OS kernel did not run any other process on this CPU as the processes were waiting for IO. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSIrqTime" + cpu_suffix] = { delta_values.irq * multiplier,
"The ratio of time spent for running hardware interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" A high number of this metric may indicate hardware misconfiguration or a very high network load."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSSoftIrqTime" + cpu_suffix] = { delta_values.softirq * multiplier,
"The ratio of time spent for running software interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" A high number of this metric may indicate inefficient software running on the system."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSStealTime" + cpu_suffix] = { delta_values.steal * multiplier,
"The ratio of time spent in other operating systems by the CPU when running in a virtualized environment. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" Not every virtualized environment presents this metric, and most of them don't."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSGuestTime" + cpu_suffix] = { delta_values.guest * multiplier,
"The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This metric is irrelevant for ClickHouse, but still exists for completeness."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
new_values["OSGuestNiceTime" + cpu_suffix] = { delta_values.guest_nice * multiplier,
"The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel, when a guest was set to a higher priority (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
" This metric is irrelevant for ClickHouse, but still exists for completeness."
" The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
applyCPUMetricsUpdate(new_values, cpu_suffix, delta_values, multiplier);
}
prev_values = current_values;
@ -978,38 +1169,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
Float64 num_cpus_to_normalize = max_cpu_cgroups > 0 ? max_cpu_cgroups : num_cpus;
if (num_cpus_to_normalize > 0)
{
new_values["OSUserTimeNormalized"] = { delta_values_all_cpus.user * multiplier / num_cpus_to_normalize,
"The value is similar to `OSUserTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSNiceTimeNormalized"] = { delta_values_all_cpus.nice * multiplier / num_cpus_to_normalize,
"The value is similar to `OSNiceTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSSystemTimeNormalized"] = { delta_values_all_cpus.system * multiplier / num_cpus_to_normalize,
"The value is similar to `OSSystemTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSIdleTimeNormalized"] = { delta_values_all_cpus.idle * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIdleTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSIOWaitTimeNormalized"] = { delta_values_all_cpus.iowait * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIOWaitTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSIrqTimeNormalized"] = { delta_values_all_cpus.irq * multiplier / num_cpus_to_normalize,
"The value is similar to `OSIrqTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSSoftIrqTimeNormalized"] = { delta_values_all_cpus.softirq * multiplier / num_cpus_to_normalize,
"The value is similar to `OSSoftIrqTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSStealTimeNormalized"] = { delta_values_all_cpus.steal * multiplier / num_cpus_to_normalize,
"The value is similar to `OSStealTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSGuestTimeNormalized"] = { delta_values_all_cpus.guest * multiplier / num_cpus_to_normalize,
"The value is similar to `OSGuestTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
new_values["OSGuestNiceTimeNormalized"] = { delta_values_all_cpus.guest_nice * multiplier / num_cpus_to_normalize,
"The value is similar to `OSGuestNiceTime` but divided by the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
" This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
}
applyNormalizedCPUMetricsUpdate(new_values, num_cpus_to_normalize, delta_values_all_cpus, multiplier);
}
proc_stat_values_other = current_other_values;
@ -1042,8 +1202,7 @@ void AsynchronousMetrics::update(TimePoint update_time, bool force_update)
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
if (meminfo)
else if (meminfo)
{
try
{

View File

@ -126,6 +126,8 @@ private:
std::optional<ReadBufferFromFilePRead> cgroupcpu_cfs_period TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpu_cfs_quota TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpu_max TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpu_stat TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> cgroupcpuacct_stat TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> vm_max_map_count TSA_GUARDED_BY(data_mutex);
std::optional<ReadBufferFromFilePRead> vm_maps TSA_GUARDED_BY(data_mutex);
@ -221,6 +223,16 @@ private:
void openBlockDevices();
void openSensorsChips();
void openEDAC();
void applyCPUMetricsUpdate(
AsynchronousMetricValues & new_values, const std::string & cpu_suffix, const ProcStatValuesCPU & delta_values, double multiplier);
void applyNormalizedCPUMetricsUpdate(
AsynchronousMetricValues & new_values,
double num_cpus_to_normalize,
const ProcStatValuesCPU & delta_values_all_cpus,
double multiplier);
#endif
void run();

View File

@ -144,31 +144,6 @@ private:
/// - I did not test what happens if a host has v1 and v2 simultaneously enabled. I believe such
/// systems existed only for a short transition period.
std::optional<std::string> getCgroupsV2Path()
{
if (!cgroupsV2Enabled())
return {};
if (!cgroupsV2MemoryControllerEnabled())
return {};
fs::path current_cgroup = cgroupV2PathOfProcess();
if (current_cgroup.empty())
return {};
/// Return the bottom-most nested current memory file. If there is no such file at the current
/// level, try again at the parent level as memory settings are inherited.
while (current_cgroup != default_cgroups_mount.parent_path())
{
const auto current_path = current_cgroup / "memory.current";
const auto stat_path = current_cgroup / "memory.stat";
if (fs::exists(current_path) && fs::exists(stat_path))
return {current_cgroup};
current_cgroup = current_cgroup.parent_path();
}
return {};
}
std::optional<std::string> getCgroupsV1Path()
{
auto path = default_cgroups_mount / "memory/memory.stat";
@ -179,7 +154,7 @@ std::optional<std::string> getCgroupsV1Path()
std::pair<std::string, CgroupsMemoryUsageObserver::CgroupsVersion> getCgroupsPath()
{
auto v2_path = getCgroupsV2Path();
auto v2_path = getCgroupsV2PathContainingFile("memory.current");
if (v2_path.has_value())
return {*v2_path, CgroupsMemoryUsageObserver::CgroupsVersion::V2};

View File

@ -307,7 +307,7 @@
M(FilteringMarksWithPrimaryKey, "Number of threads currently doing filtering of mark ranges by the primary key") \
M(FilteringMarksWithSecondaryKeys, "Number of threads currently doing filtering of mark ranges by secondary keys") \
\
M(S3DiskNoKeyErrors, "The number of `NoSuchKey` errors that occur when reading data from S3 cloud storage through ClickHouse disks.") \
M(DiskS3NoSuchKeyErrors, "The number of `NoSuchKey` errors that occur when reading data from S3 cloud storage through ClickHouse disks.") \
#ifdef APPLY_FOR_EXTERNAL_METRICS
#define APPLY_FOR_METRICS(M) APPLY_FOR_BUILTIN_METRICS(M) APPLY_FOR_EXTERNAL_METRICS(M)

View File

@ -244,33 +244,43 @@ const char * analyzeImpl(
is_trivial = false;
if (!in_square_braces)
{
/// Check for case-insensitive flag.
if (pos + 1 < end && pos[1] == '?')
/// it means flag negation
/// there are various possible flags
/// actually only imsU are supported by re2
auto is_flag_char = [](char x)
{
for (size_t offset = 2; pos + offset < end; ++offset)
return x == '-' || x == 'i' || x == 'm' || x == 's' || x == 'U' || x == 'u';
};
/// Check for case-insensitive flag.
if (pos + 2 < end && pos[1] == '?' && is_flag_char(pos[2]))
{
size_t offset = 2;
for (; pos + offset < end; ++offset)
{
if (pos[offset] == '-' /// it means flag negation
/// various possible flags, actually only imsU are supported by re2
|| (pos[offset] >= 'a' && pos[offset] <= 'z')
|| (pos[offset] >= 'A' && pos[offset] <= 'Z'))
if (pos[offset] == 'i')
{
if (pos[offset] == 'i')
{
/// Actually it can be negated case-insensitive flag. But we don't care.
has_case_insensitive_flag = true;
break;
}
/// Actually it can be negated case-insensitive flag. But we don't care.
has_case_insensitive_flag = true;
}
else
else if (!is_flag_char(pos[offset]))
break;
}
pos += offset;
if (pos == end)
return pos;
/// if this group only contains flags, we have nothing to do.
if (*pos == ')')
{
++pos;
break;
}
}
/// (?:regex) means non-capturing parentheses group
if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
else if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
{
pos += 2;
}
if (pos + 3 < end && pos[1] == '?' && (pos[2] == '<' || pos[2] == '\'' || (pos[2] == 'P' && pos[3] == '<')))
else if (pos + 3 < end && pos[1] == '?' && (pos[2] == '<' || pos[2] == '\'' || (pos[2] == 'P' && pos[3] == '<')))
{
pos = skipNameCapturingGroup(pos, pos[2] == 'P' ? 3: 2, end);
}

View File

@ -209,8 +209,35 @@
\
M(Merge, "Number of launched background merges.") \
M(MergedRows, "Rows read for background merges. This is the number of rows before merge.") \
M(MergedColumns, "Number of columns merged during the horizontal stage of merges.") \
M(GatheredColumns, "Number of columns gathered during the vertical stage of merges.") \
M(MergedUncompressedBytes, "Uncompressed bytes (for columns as they are stored in memory) that were read for background merges. This is the number before merge.") \
M(MergesTimeMilliseconds, "Total time spent for background merges.")\
M(MergeTotalMilliseconds, "Total time spent for background merges") \
M(MergeExecuteMilliseconds, "Total busy time spent for execution of background merges") \
M(MergeHorizontalStageTotalMilliseconds, "Total time spent for horizontal stage of background merges") \
M(MergeHorizontalStageExecuteMilliseconds, "Total busy time spent for execution of horizontal stage of background merges") \
M(MergeVerticalStageTotalMilliseconds, "Total time spent for vertical stage of background merges") \
M(MergeVerticalStageExecuteMilliseconds, "Total busy time spent for execution of vertical stage of background merges") \
M(MergeProjectionStageTotalMilliseconds, "Total time spent for projection stage of background merges") \
M(MergeProjectionStageExecuteMilliseconds, "Total busy time spent for execution of projection stage of background merges") \
\
M(MergingSortedMilliseconds, "Total time spent while merging sorted columns") \
M(AggregatingSortedMilliseconds, "Total time spent while aggregating sorted columns") \
M(CollapsingSortedMilliseconds, "Total time spent while collapsing sorted columns") \
M(ReplacingSortedMilliseconds, "Total time spent while replacing sorted columns") \
M(SummingSortedMilliseconds, "Total time spent while summing sorted columns") \
M(VersionedCollapsingSortedMilliseconds, "Total time spent while version collapsing sorted columns") \
M(GatheringColumnMilliseconds, "Total time spent while gathering columns for vertical merge") \
\
M(MutationTotalParts, "Total number of parts for which mutations were attempted") \
M(MutationUntouchedParts, "Total number of parts for which mutations were attempted but which were completely skipped according to the predicate") \
M(MutatedRows, "Rows read for mutations. This is the number of rows before mutation") \
M(MutatedUncompressedBytes, "Uncompressed bytes (for columns as they are stored in memory) that were read for mutations. This is the number before mutation.") \
M(MutationTotalMilliseconds, "Total time spent for mutations.") \
M(MutationExecuteMilliseconds, "Total busy time spent for execution of mutations.") \
M(MutationAllPartColumns, "Number of times when task to mutate all columns in part was created") \
M(MutationSomePartColumns, "Number of times when task to mutate some columns in part was created") \
M(MutateTaskProjectionsCalculationMicroseconds, "Time spent calculating projections in mutations.") \
\
M(MergeTreeDataWriterRows, "Number of rows INSERTed to MergeTree tables.") \
M(MergeTreeDataWriterUncompressedBytes, "Uncompressed bytes (for columns as they are stored in memory) INSERTed to MergeTree tables.") \
@ -225,7 +252,6 @@
M(MergeTreeDataWriterProjectionsCalculationMicroseconds, "Time spent calculating projections") \
M(MergeTreeDataProjectionWriterSortingBlocksMicroseconds, "Time spent sorting blocks (for projection it might be a key different from table's sorting key)") \
M(MergeTreeDataProjectionWriterMergingBlocksMicroseconds, "Time spent merging blocks") \
M(MutateTaskProjectionsCalculationMicroseconds, "Time spent calculating projections") \
\
M(InsertedWideParts, "Number of parts inserted in Wide format.") \
M(InsertedCompactParts, "Number of parts inserted in Compact format.") \

View File

@ -110,7 +110,7 @@ namespace
errno = saved_errno;
}
[[maybe_unused]] constexpr UInt32 TIMER_PRECISION = 1e9;
[[maybe_unused]] constexpr UInt64 TIMER_PRECISION = 1e9;
}
namespace ErrorCodes
@ -167,18 +167,18 @@ void Timer::createIfNecessary(UInt64 thread_id, int clock_type, int pause_signal
}
}
void Timer::set(UInt32 period)
void Timer::set(UInt64 period)
{
/// Too high a frequency can introduce an infinite busy loop of signal handlers. We will limit the maximum frequency (to 1000 signals per second).
period = std::max<UInt32>(period, 1000000);
period = std::max<UInt64>(period, 1000000);
/// Randomize the offset as a uniform random value from 0 to period - 1.
/// It allows sampling short queries even if the timer period is large.
/// (For example, with a period of 1 second, a query of 50 ms duration will be sampled with probability 1/20).
/// It also helps to avoid interference (moire).
UInt32 period_rand = std::uniform_int_distribution<UInt32>(0, period)(thread_local_rng);
UInt64 period_rand = std::uniform_int_distribution<UInt64>(0, period)(thread_local_rng);
struct timespec interval{.tv_sec = period / TIMER_PRECISION, .tv_nsec = period % TIMER_PRECISION};
struct timespec offset{.tv_sec = period_rand / TIMER_PRECISION, .tv_nsec = period_rand % TIMER_PRECISION};
struct timespec interval{.tv_sec = time_t(period / TIMER_PRECISION), .tv_nsec = int64_t(period % TIMER_PRECISION)};
struct timespec offset{.tv_sec = time_t(period_rand / TIMER_PRECISION), .tv_nsec = int64_t(period_rand % TIMER_PRECISION)};
struct itimerspec timer_spec = {.it_interval = interval, .it_value = offset};
if (timer_settime(*timer_id, 0, &timer_spec, nullptr))
@ -229,7 +229,7 @@ void Timer::cleanup()
template <typename ProfilerImpl>
QueryProfilerBase<ProfilerImpl>::QueryProfilerBase(
[[maybe_unused]] UInt64 thread_id, [[maybe_unused]] int clock_type, [[maybe_unused]] UInt32 period, [[maybe_unused]] int pause_signal_)
[[maybe_unused]] UInt64 thread_id, [[maybe_unused]] int clock_type, [[maybe_unused]] UInt64 period, [[maybe_unused]] int pause_signal_)
: log(getLogger("QueryProfiler")), pause_signal(pause_signal_)
{
#if defined(SANITIZER)
@ -270,7 +270,7 @@ QueryProfilerBase<ProfilerImpl>::QueryProfilerBase(
template <typename ProfilerImpl>
void QueryProfilerBase<ProfilerImpl>::setPeriod([[maybe_unused]] UInt32 period_)
void QueryProfilerBase<ProfilerImpl>::setPeriod([[maybe_unused]] UInt64 period_)
{
#if defined(SANITIZER)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "QueryProfiler disabled because they cannot work under sanitizers");
@ -307,7 +307,7 @@ void QueryProfilerBase<ProfilerImpl>::cleanup()
template class QueryProfilerBase<QueryProfilerReal>;
template class QueryProfilerBase<QueryProfilerCPU>;
QueryProfilerReal::QueryProfilerReal(UInt64 thread_id, UInt32 period)
QueryProfilerReal::QueryProfilerReal(UInt64 thread_id, UInt64 period)
: QueryProfilerBase(thread_id, CLOCK_MONOTONIC, period, SIGUSR1)
{}
@ -320,7 +320,7 @@ void QueryProfilerReal::signalHandler(int sig, siginfo_t * info, void * context)
writeTraceInfo(TraceType::Real, sig, info, context);
}
QueryProfilerCPU::QueryProfilerCPU(UInt64 thread_id, UInt32 period)
QueryProfilerCPU::QueryProfilerCPU(UInt64 thread_id, UInt64 period)
: QueryProfilerBase(thread_id, CLOCK_THREAD_CPUTIME_ID, period, SIGUSR2)
{}

View File

@ -40,7 +40,7 @@ public:
~Timer();
void createIfNecessary(UInt64 thread_id, int clock_type, int pause_signal);
void set(UInt32 period);
void set(UInt64 period);
void stop();
void cleanup();
@ -54,10 +54,10 @@ template <typename ProfilerImpl>
class QueryProfilerBase
{
public:
QueryProfilerBase(UInt64 thread_id, int clock_type, UInt32 period, int pause_signal_);
QueryProfilerBase(UInt64 thread_id, int clock_type, UInt64 period, int pause_signal_);
~QueryProfilerBase();
void setPeriod(UInt32 period_);
void setPeriod(UInt64 period_);
private:
void cleanup();
@ -76,7 +76,7 @@ private:
class QueryProfilerReal : public QueryProfilerBase<QueryProfilerReal>
{
public:
QueryProfilerReal(UInt64 thread_id, UInt32 period); /// NOLINT
QueryProfilerReal(UInt64 thread_id, UInt64 period); /// NOLINT
static void signalHandler(int sig, siginfo_t * info, void * context);
};
@ -85,7 +85,7 @@ public:
class QueryProfilerCPU : public QueryProfilerBase<QueryProfilerCPU>
{
public:
QueryProfilerCPU(UInt64 thread_id, UInt32 period); /// NOLINT
QueryProfilerCPU(UInt64 thread_id, UInt64 period); /// NOLINT
static void signalHandler(int sig, siginfo_t * info, void * context);
};

View File

@ -184,14 +184,20 @@ void DynamicResourceManager::updateConfiguration(const Poco::Util::AbstractConfi
// Resource update leads to loss of runtime data of nodes and may lead to temporary violation of constraints (e.g. limits)
// Try to minimise this by reusing "equal" resources (initialized with the same configuration).
std::vector<State::ResourcePtr> resources_to_attach;
for (auto & [name, new_resource] : new_state->resources)
{
if (auto iter = state->resources.find(name); iter != state->resources.end()) // Resource update
{
State::ResourcePtr old_resource = iter->second;
if (old_resource->equals(*new_resource))
{
new_resource = old_resource; // Rewrite with older version to avoid loss of runtime data
continue;
}
}
// It is a new or updated resource
resources_to_attach.emplace_back(new_resource);
}
// Commit new state
@ -199,17 +205,14 @@ void DynamicResourceManager::updateConfiguration(const Poco::Util::AbstractConfi
state = new_state;
// Attach new and updated resources to the scheduler
for (auto & [name, resource] : new_state->resources)
for (auto & resource : resources_to_attach)
{
const SchedulerNodePtr & root = resource->nodes.find("/")->second.ptr;
if (root->parent == nullptr)
resource->attached_to = &scheduler;
scheduler.event_queue->enqueue([this, root]
{
resource->attached_to = &scheduler;
scheduler.event_queue->enqueue([this, root]
{
scheduler.attachChild(root);
});
}
scheduler.attachChild(root);
});
}
// NOTE: after mutex unlock `state` became available for Classifier(s) and must be immutable

View File

@ -0,0 +1,30 @@
#pragma once
#include <base/StringRef.h>
namespace DB
{
/// See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0919r3.html
struct StringHashForHeterogeneousLookup
{
using hash_type = std::hash<std::string_view>;
using transparent_key_equal = std::equal_to<>;
using is_transparent = void; // required to make find() work with a type different from key_type
auto operator()(const std::string_view view) const
{
return hash_type()(view);
}
auto operator()(const std::string & str) const
{
return hash_type()(str);
}
auto operator()(const char * data) const
{
return hash_type()(data);
}
};
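/// A hedged usage sketch (container and mapped type are illustrative):
///     std::unordered_map<std::string, UInt64,
///                        StringHashForHeterogeneousLookup,
///                        StringHashForHeterogeneousLookup::transparent_key_equal> map;
///     map.find(std::string_view{"some_key"});  /// heterogeneous lookup, no temporary std::string is built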
}

View File

@ -44,7 +44,7 @@ namespace ErrorCodes
namespace zkutil
{
/// Preferred size of multi() command (in number of ops)
/// Preferred size of multi command (in the number of operations)
constexpr size_t MULTI_BATCH_SIZE = 100;
struct ShuffleHost

View File

@ -79,11 +79,16 @@ std::vector<String> parseRemoteDescription(
/// Look for the corresponding closing bracket
for (m = i + 1; m < r; ++m)
{
if (description[m] == '{') ++cnt;
if (description[m] == '}') --cnt;
if (description[m] == '.' && description[m-1] == '.') last_dot = m;
if (description[m] == separator) have_splitter = true;
if (cnt == 0) break;
if (description[m] == '{')
++cnt;
if (description[m] == '}')
--cnt;
if (description[m] == '.' && description[m-1] == '.')
last_dot = m;
if (description[m] == separator)
have_splitter = true;
if (cnt == 0)
break;
}
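/// Hedged example: for a description like "host{1..3}.example.com" this loop stops at the matching '}',
/// remembering the position of the last ".." (last_dot) and whether a separator was seen (have_splitter),
/// so that the pattern can later be expanded into host1.example.com, host2.example.com and host3.example.com.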
if (cnt != 0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table function '{}': incorrect brace sequence in first argument", func_name);

View File

@ -19,6 +19,9 @@ TEST(OptimizeRE, analyze)
};
test_f("abc", "abc", {}, true, true);
test_f("c([^k]*)de", "");
test_f("(?-s)bob", "bob", {}, false, true);
test_f("(?s)bob", "bob", {}, false, true);
test_f("(?ssss", "");
test_f("abc(de)fg", "abcdefg", {}, false, true);
test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"}, false, true);
test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"}, false, true);

View File

@ -54,7 +54,7 @@ namespace
std::filesystem::path path(snapshot_path);
std::string filename = path.stem();
Strings name_parts;
splitInto<'_'>(name_parts, filename);
splitInto<'_', '.'>(name_parts, filename);
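/// Hedged example: for "snapshot_100.bin.zstd" the stem is "snapshot_100.bin"; splitting on both '_' and '.'
/// yields {"snapshot", "100", "bin"}, so name_parts[1] parses to 100.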
return parse<uint64_t>(name_parts[1]);
}

View File

@ -26,12 +26,16 @@ std::optional<RaftServerConfig> RaftServerConfig::parse(std::string_view server)
if (!with_id_endpoint && !with_server_type && !with_priority)
return std::nullopt;
const std::string_view id_str = parts[0];
std::string_view id_str = parts[0];
if (!id_str.starts_with("server."))
return std::nullopt;
id_str = id_str.substr(7);
if (auto eq_pos = id_str.find('='); std::string_view::npos != eq_pos)
id_str = id_str.substr(0, eq_pos);
Int32 id;
if (!tryParse(id, std::next(id_str.begin(), 7)))
if (!tryParse(id, id_str))
return std::nullopt;
if (id <= 0)
return std::nullopt;
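A small sketch of the revised id parsing (illustrative only; std::from_chars stands in for ClickHouse's tryParse): strip the "server." prefix, cut everything from '=' onwards, then parse the remaining digits as the server id.

#include <charconv>
#include <iostream>
#include <optional>
#include <string_view>

// Parse the id out of a keeper raft config key such as "server.12=host:9234".
std::optional<int> parseServerId(std::string_view id_str)
{
    if (!id_str.starts_with("server."))
        return std::nullopt;
    id_str = id_str.substr(7);

    if (auto eq_pos = id_str.find('='); eq_pos != std::string_view::npos)
        id_str = id_str.substr(0, eq_pos);

    int id = 0;
    auto [ptr, ec] = std::from_chars(id_str.data(), id_str.data() + id_str.size(), id);
    if (ec != std::errc() || ptr != id_str.data() + id_str.size() || id <= 0)
        return std::nullopt;
    return id;
}

int main()
{
    std::cout << parseServerId("server.12=host:9234").value_or(-1) << '\n'; // 12
    std::cout << parseServerId("server.0=host:9234").value_or(-1) << '\n';  // -1 (invalid id)
}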


@ -24,9 +24,7 @@ void GTIDSet::tryMerge(size_t i)
void GTIDSets::parse(String gtid_format)
{
if (gtid_format.empty())
{
return;
}
std::vector<String> gtid_sets;
boost::split(gtid_sets, gtid_format, [](char c) { return c == ','; });


@ -10,20 +10,19 @@ GTEST_TEST(GTIDSetsContains, Tests)
contained1, contained2, contained3, contained4, contained5,
not_contained1, not_contained2, not_contained3, not_contained4, not_contained5, not_contained6;
gtid_set.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
gtid_set.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
contained2.parse("2174B383-5441-11E8-B90A-C80AA9429562:2-3:11:47-49");
contained3.parse("2174B383-5441-11E8-B90A-C80AA9429562:11");
contained4.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:2-16:47-49:60");
contained5.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:60");
contained4.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:2-16:47-49:60");
contained5.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:60");
not_contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-50, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
not_contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-50, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
not_contained2.parse("2174B383-5441-11E8-B90A-C80AA9429562:0-3:11:47-49");
not_contained3.parse("2174B383-5441-11E8-B90A-C80AA9429562:99");
not_contained4.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:2-16:46-49:60");
not_contained5.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:99");
not_contained6.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60, 00000000-0000-0000-0000-000000000000");
not_contained4.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:2-16:46-49:60");
not_contained5.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:99");
not_contained6.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60, 00000000-0000-0000-0000-000000000000");
ASSERT_TRUE(gtid_set.contains(contained1));
ASSERT_TRUE(gtid_set.contains(contained2));
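For readers unfamiliar with GTID sets, a compact sketch of the containment relation being asserted here (intervals are plain pairs; the real GTIDSets additionally groups intervals per source UUID and merges adjacent ranges):

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

using Interval = std::pair<uint64_t, uint64_t>; // inclusive [first, second]

// True if every interval of `needle` lies inside some interval of `haystack`.
bool contains(const std::vector<Interval> & haystack, const std::vector<Interval> & needle)
{
    for (const auto & [from, to] : needle)
    {
        bool covered = false;
        for (const auto & [start, end] : haystack)
            if (start <= from && to <= end)
            {
                covered = true;
                break;
            }
        if (!covered)
            return false;
    }
    return true;
}

int main()
{
    std::vector<Interval> gtid_set = {{1, 3}, {11, 11}, {47, 49}}; // "1-3:11:47-49"
    std::cout << contains(gtid_set, {{2, 3}, {11, 11}}) << '\n';   // 1: "2-3:11" is contained
    std::cout << contains(gtid_set, {{47, 50}}) << '\n';           // 0: "47-50" is not
}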


@ -83,6 +83,9 @@ static constexpr auto DBMS_MIN_REVISION_WITH_SYSTEM_KEYWORDS_TABLE = 54468;
static constexpr auto DBMS_MIN_REVISION_WITH_ROWS_BEFORE_AGGREGATION = 54469;
/// Packets size header
static constexpr auto DBMS_MIN_PROTOCOL_VERSION_WITH_CHUNKED_PACKETS = 54470;
/// Version of ClickHouse TCP protocol.
///
/// Should be incremented manually on protocol changes.
@ -90,6 +93,6 @@ static constexpr auto DBMS_MIN_REVISION_WITH_ROWS_BEFORE_AGGREGATION = 54469;
/// NOTE: DBMS_TCP_PROTOCOL_VERSION has nothing in common with VERSION_REVISION,
/// the latter is just a number for server version (one number instead of commit SHA)
/// for simplicity (sometimes it may be more convenient in some use cases).
static constexpr auto DBMS_TCP_PROTOCOL_VERSION = 54469;
static constexpr auto DBMS_TCP_PROTOCOL_VERSION = 54470;
}
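A hedged sketch of how such minimum-revision constants are typically consumed (the negotiate() helper is a stand-in, not the actual handshake code): both sides settle on the lower of their protocol revisions and enable chunked packets only if that effective revision reaches the threshold.

#include <algorithm>
#include <cstdint>
#include <iostream>

constexpr uint64_t DBMS_TCP_PROTOCOL_VERSION = 54470;
constexpr uint64_t DBMS_MIN_PROTOCOL_VERSION_WITH_CHUNKED_PACKETS = 54470;

// Both sides speak the lower of the two protocol revisions they support.
uint64_t negotiate(uint64_t client_revision)
{
    return std::min(client_revision, DBMS_TCP_PROTOCOL_VERSION);
}

int main()
{
    for (uint64_t client : {uint64_t{54469}, uint64_t{54470}})
    {
        uint64_t effective = negotiate(client);
        bool chunked = effective >= DBMS_MIN_PROTOCOL_VERSION_WITH_CHUNKED_PACKETS;
        std::cout << "client " << client << " -> chunked packets: " << (chunked ? "yes" : "no") << '\n';
    }
}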


@ -325,6 +325,7 @@ class IColumn;
\
M(Bool, join_use_nulls, false, "Use NULLs for non-joined rows of outer JOINs for types that can be inside Nullable. If false, use default value of corresponding columns data type.", IMPORTANT) \
\
M(Int32, join_output_by_rowlist_perkey_rows_threshold, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join.", 0) \
M(JoinStrictness, join_default_strictness, JoinStrictness::All, "Set default strictness in JOIN query. Possible values: empty string, 'ANY', 'ALL'. If empty, query without strictness will throw exception.", 0) \
M(Bool, any_join_distinct_right_table_keys, false, "Enable old ANY JOIN logic with many-to-one left-to-right table keys mapping for all ANY JOINs. It leads to confusing not equal results for 't1 ANY LEFT JOIN t2' and 't2 ANY RIGHT JOIN t1'. ANY RIGHT JOIN needs one-to-many keys mapping to be consistent with LEFT one.", IMPORTANT) \
M(Bool, single_join_prefer_left_table, true, "For single JOIN in case of identifier ambiguity prefer left table", IMPORTANT) \
@ -593,7 +594,6 @@ class IColumn;
M(UInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \
M(Bool, enable_lightweight_delete, true, "Enable lightweight DELETE mutations for mergetree tables.", 0) ALIAS(allow_experimental_lightweight_delete) \
M(UInt64, lightweight_deletes_sync, 2, "The same as 'mutation_sync', but controls only execution of lightweight deletes", 0) \
M(LightweightMutationProjectionMode, lightweight_mutation_projection_mode, LightweightMutationProjectionMode::THROW, "When lightweight delete happens on a table with projection(s), the possible operations include throw the exception as projection exists, or drop all projection related to this table then do lightweight delete.", 0) \
M(Bool, apply_deleted_mask, true, "Enables filtering out rows deleted with lightweight DELETE. If disabled, a query will be able to read those rows. This is useful for debugging and \"undelete\" scenarios", 0) \
M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \
M(Bool, optimize_injective_functions_inside_uniq, true, "Delete injective functions of one argument inside uniq*() functions.", 0) \
@ -878,6 +878,7 @@ class IColumn;
M(Bool, allow_get_client_http_header, false, "Allow to use the function `getClientHTTPHeader` which lets you obtain a value of the current HTTP request's header. It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function.", 0) \
M(Bool, cast_string_to_dynamic_use_inference, false, "Use types inference during String to Dynamic conversion", 0) \
M(Bool, enable_blob_storage_log, true, "Write information about blob storage operations to system.blob_storage_log table", 0) \
M(Bool, use_json_alias_for_old_object_type, false, "When enabled, JSON type alias will create old experimental Object type instead of a new JSON type", 0) \
M(Bool, allow_create_index_without_type, false, "Allow CREATE INDEX query without TYPE. Query will be ignored. Made for SQL compatibility tests.", 0) \
M(Bool, create_index_ignore_unique, false, "Ignore UNIQUE keyword in CREATE UNIQUE INDEX. Made for SQL compatibility tests.", 0) \
M(Bool, print_pretty_type_names, true, "Print pretty type names in DESCRIBE query and toTypeName() function", 0) \
@ -897,6 +898,7 @@ class IColumn;
M(UInt64, extract_key_value_pairs_max_pairs_per_row, 1000, "Max number of pairs that can be produced by the `extractKeyValuePairs` function. Used as a safeguard against consuming too much memory.", 0) ALIAS(extract_kvp_max_pairs_per_row) \
M(Bool, restore_replace_external_engines_to_null, false, "Replace all the external table engines to Null on restore. Useful for testing purposes", 0) \
M(Bool, restore_replace_external_table_functions_to_null, false, "Replace all table functions to Null on restore. Useful for testing purposes", 0) \
M(Bool, create_if_not_exists, false, "Enable IF NOT EXISTS for CREATE statements by default", 0) \
\
\
/* ###################################### */ \
@ -911,6 +913,7 @@ class IColumn;
M(Bool, allow_experimental_vector_similarity_index, false, "Allow experimental vector similarity index", 0) \
M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \
M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \
M(Bool, allow_experimental_json_type, false, "Allow JSON data type", 0) \
M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \
M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \
M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \
@ -1132,10 +1135,12 @@ class IColumn;
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \
M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \
M(Bool, type_json_skip_duplicated_paths, false, "When enabled, during parsing JSON object into JSON type duplicated paths will be ignored and only the first one will be inserted instead of an exception", 0) \
M(UInt64, input_format_json_max_depth, 1000, "Maximum depth of a field in JSON. This is not a strict limit, it does not have to be applied precisely.", 0) \
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_datetimes_only_datetime64, false, "When input_format_try_infer_datetimes is enabled, infer only DateTime64 but not DateTime types", 0) \
M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats (except JSON, where exponent numbers are always inferred)", 0) \
M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \
M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \


@ -75,6 +75,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
},
{"24.8",
{
{"create_if_not_exists", false, false, "New setting."},
{"rows_before_aggregation", true, true, "Provide exact value for rows_before_aggregation statistic, represents the number of rows read before aggregation"},
{"restore_replace_external_table_functions_to_null", false, false, "New setting."},
{"restore_replace_external_engines_to_null", false, false, "New setting."},
@ -87,7 +88,12 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"allow_experimental_time_series_table", false, false, "Added new setting to allow the TimeSeries table engine"},
{"enable_analyzer", 1, 1, "Added an alias to a setting `allow_experimental_analyzer`."},
{"optimize_functions_to_subcolumns", false, true, "Enabled settings by default"},
{"allow_experimental_json_type", false, false, "Add new experimental JSON type"},
{"use_json_alias_for_old_object_type", true, false, "Use JSON type alias to create new JSON type"},
{"type_json_skip_duplicated_paths", false, false, "Allow to skip duplicated paths during JSON parsing"},
{"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."},
{"allow_experimental_vector_similarity_index", false, false, "Added new setting to allow experimental vector similarity indexes"},
{"input_format_try_infer_datetimes_only_datetime64", true, false, "Allow to infer DateTime instead of DateTime64 in data formats"}
}
},
{"24.7",
@ -103,7 +109,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"dictionary_validate_primary_key_type", false, false, "Validate primary key type for dictionaries. By default id type for simple layouts will be implicitly converted to UInt64."},
{"collect_hash_table_stats_during_joins", false, true, "New setting."},
{"max_size_to_preallocate_for_joins", 0, 100'000'000, "New setting."},
{"input_format_orc_reader_time_zone_name", "GMT", "GMT", "The time zone name for ORC row reader, the default ORC row reader's time zone is GMT."}, {"lightweight_mutation_projection_mode", "throw", "throw", "When lightweight delete happens on a table with projection(s), the possible operations include throw the exception as projection exists, or drop all projection related to this table then do lightweight delete."},
{"input_format_orc_reader_time_zone_name", "GMT", "GMT", "The time zone name for ORC row reader, the default ORC row reader's time zone is GMT."},
{"database_replicated_allow_heavy_create", true, false, "Long-running DDL queries (CREATE AS SELECT and POPULATE) for Replicated database engine was forbidden"},
{"query_plan_merge_filters", false, false, "Allow to merge filters in the query plan"},
{"azure_sdk_max_retries", 10, 10, "Maximum number of retries in azure sdk"},


@ -175,7 +175,8 @@ IMPLEMENT_SETTING_ENUM(ParallelReplicasCustomKeyFilterType, ErrorCodes::BAD_ARGU
IMPLEMENT_SETTING_ENUM(LightweightMutationProjectionMode, ErrorCodes::BAD_ARGUMENTS,
{{"throw", LightweightMutationProjectionMode::THROW},
{"drop", LightweightMutationProjectionMode::DROP}})
{"drop", LightweightMutationProjectionMode::DROP},
{"rebuild", LightweightMutationProjectionMode::REBUILD}})
IMPLEMENT_SETTING_ENUM(DeduplicateMergeProjectionMode, ErrorCodes::BAD_ARGUMENTS,
{{"throw", DeduplicateMergeProjectionMode::THROW},


@ -311,6 +311,7 @@ enum class LightweightMutationProjectionMode : uint8_t
{
THROW,
DROP,
REBUILD,
};
DECLARE_SETTING_ENUM(LightweightMutationProjectionMode)
