Merge branch 'master' into no-hardlinks-while-making-backup-of-mergetree-in-atomic-db

This commit is contained in:
Alexander Tokmakov 2022-09-09 14:24:44 +03:00 committed by GitHub
commit 48927ba0ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
117 changed files with 2554 additions and 629 deletions

View File

@ -923,6 +923,53 @@ jobs:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
# CI job that runs tests/ci/build_check.py for the `binary_amd64sse2` build configuration.
# Starts only after the DockerHubPush job and runs on a self-hosted `builder` runner.
BuilderBinAmd64SSE2:
needs: [DockerHubPush]
runs-on: [self-hosted, builder]
steps:
# Export the working paths and the build name to later steps through $GITHUB_ENV.
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/build_check
IMAGES_PATH=${{runner.temp}}/images_path
REPO_COPY=${{runner.temp}}/build_check/ClickHouse
CACHES_PATH=${{runner.temp}}/../ccaches
BUILD_NAME=binary_amd64sse2
EOF
# Fetch the `changed_images` artifact into IMAGES_PATH.
- name: Download changed images
uses: actions/download-artifact@v2
with:
name: changed_images
path: ${{ env.IMAGES_PATH }}
# Start from an empty workspace: remove whatever is there and recreate the directory.
- name: Clear repository
run: |
sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
# Full-history checkout (fetch-depth: 0).
- name: Check out repository code
uses: actions/checkout@v2
with:
fetch-depth: 0 # otherwise we will have no info about contributors
# Sync and shallow-update the submodules, copy the workspace into a fresh TEMP_PATH,
# then run build_check.py from the copy with BUILD_NAME as its argument.
- name: Build
run: |
git -C "$GITHUB_WORKSPACE" submodule sync --recursive
git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
# Runs when the previous steps succeeded or failed.
# NOTE(review): BUILD_URLS is not set by the `Set envs` step above; presumably it is
# exported by build_check.py - verify.
- name: Upload build URLs to artifacts
if: ${{ success() || failure() }}
uses: actions/upload-artifact@v2
with:
name: ${{ env.BUILD_URLS }}
path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
# Always runs: kill and remove all docker containers on the runner, then delete
# TEMP_PATH and CACHES_PATH. `||:` keeps the step going when a docker command fails
# (for example when there are no containers to act on).
- name: Cleanup
if: always()
run: |
# shellcheck disable=SC2046
docker kill $(docker ps -q) ||:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
############################################################################################
##################################### Docker images #######################################
############################################################################################
@ -1011,6 +1058,7 @@ jobs:
- BuilderBinFreeBSD
# - BuilderBinGCC
- BuilderBinPPC64
- BuilderBinAmd64SSE2
- BuilderBinClangTidy
- BuilderDebShared
runs-on: [self-hosted, style-checker]

View File

@ -935,6 +935,51 @@ jobs:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
# CI job that runs tests/ci/build_check.py for the `binary_amd64sse2` build configuration.
# Starts only after the DockerHubPush, FastTest and StyleCheck jobs and runs on a
# self-hosted `builder` runner.
BuilderBinAmd64SSE2:
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
# Export the working paths and the build name to later steps through $GITHUB_ENV.
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/build_check
IMAGES_PATH=${{runner.temp}}/images_path
REPO_COPY=${{runner.temp}}/build_check/ClickHouse
CACHES_PATH=${{runner.temp}}/../ccaches
BUILD_NAME=binary_amd64sse2
EOF
# Fetch the `changed_images` artifact into IMAGES_PATH.
- name: Download changed images
uses: actions/download-artifact@v2
with:
name: changed_images
path: ${{ env.IMAGES_PATH }}
# Start from an empty workspace: remove whatever is there and recreate the directory.
- name: Clear repository
run: |
sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
# Checkout with the action's default options (no `with:` overrides here).
- name: Check out repository code
uses: actions/checkout@v2
# Sync and shallow-update the submodules, copy the workspace into a fresh TEMP_PATH,
# then run build_check.py from the copy with BUILD_NAME as its argument.
- name: Build
run: |
git -C "$GITHUB_WORKSPACE" submodule sync --recursive
git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
# Runs when the previous steps succeeded or failed.
# NOTE(review): BUILD_URLS is not set by the `Set envs` step above; presumably it is
# exported by build_check.py - verify.
- name: Upload build URLs to artifacts
if: ${{ success() || failure() }}
uses: actions/upload-artifact@v2
with:
name: ${{ env.BUILD_URLS }}
path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
# Always runs: kill and remove all docker containers on the runner, then delete
# TEMP_PATH and CACHES_PATH. `||:` keeps the step going when a docker command fails
# (for example when there are no containers to act on).
- name: Cleanup
if: always()
run: |
# shellcheck disable=SC2046
docker kill $(docker ps -q) ||:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
############################################################################################
##################################### Docker images #######################################
############################################################################################
@ -1023,6 +1068,7 @@ jobs:
- BuilderBinFreeBSD
# - BuilderBinGCC
- BuilderBinPPC64
- BuilderBinAmd64SSE2
- BuilderBinClangTidy
- BuilderDebShared
runs-on: [self-hosted, style-checker]

View File

@ -143,6 +143,8 @@ include (cmake/add_warning.cmake)
if (COMPILER_CLANG)
# generate ranges for fast "addr2line" search
if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE")
# NOTE: clang has a bug: it does not emit .debug_aranges with ThinLTO,
# so a custom ld.lld wrapper is shipped in docker images.
set(COMPILER_FLAGS "${COMPILER_FLAGS} -gdwarf-aranges")
endif ()

View File

@ -15,4 +15,5 @@ ClickHouse® is an open-source column-oriented database management system that a
* [Contacts](https://clickhouse.com/company/contact) can help to get your questions answered if there are any.
## Upcoming events
* [**v22.8 Release Webinar**](https://clickhouse.com/company/events/v22-8-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap.
* [**v22.9 Release Webinar**](https://clickhouse.com/company/events/v22-9-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap.
* [**ClickHouse for Analytics @ Barracuda Networks**](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/288140358/) Join us for this in person meetup hosted by our friends at Barracuda in Bay Area.

View File

@ -24,6 +24,23 @@ option (ENABLE_BMI "Use BMI instructions on x86_64" 0)
option (ENABLE_AVX2_FOR_SPEC_OP "Use avx2 instructions for specific operations on x86_64" 0)
option (ENABLE_AVX512_FOR_SPEC_OP "Use avx512 instructions for specific operations on x86_64" 0)
# X86: allow building for an SSE2-only target machine. A special CI build uses this
# for embedded or very old hardware.
option (NO_SSE3_OR_HIGHER "Disable SSE3 or higher on x86_64" 0)
if (NO_SSE3_OR_HIGHER)
    # Turn off every instruction-set switch listed below so nothing beyond SSE2 is enabled.
    foreach (sse2_only_disabled_isa IN ITEMS
            ENABLE_SSSE3
            ENABLE_SSE41
            ENABLE_SSE42
            ENABLE_PCLMULQDQ
            ENABLE_POPCNT
            ENABLE_AVX
            ENABLE_AVX2
            ENABLE_AVX512
            ENABLE_AVX512_VBMI
            ENABLE_BMI
            ENABLE_AVX2_FOR_SPEC_OP
            ENABLE_AVX512_FOR_SPEC_OP)
        set (${sse2_only_disabled_isa} 0)
    endforeach ()
    # Do not leave the loop variable behind in the enclosing scope.
    unset (sse2_only_disabled_isa)
endif ()
option (ARCH_NATIVE "Add -march=native compiler flag. This makes your binaries non-portable but more performant code may be generated. This option overrides ENABLE_* options for specific instruction set. Highly not recommended to use." 0)
if (ARCH_NATIVE)

17
cmake/ld.lld.in Executable file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Wrapper around the real linker that works around a bug in llvm/clang:
# .debug_aranges is not produced with LTO.
#
# NOTE: this is a temporary solution that should be removed once [1] is
# resolved.
#
# [1]: https://discourse.llvm.org/t/clang-does-not-produce-full-debug-aranges-section-with-thinlto/64898/8
# NOTE: only -flto=thin is supported.
# NOTE: it is not possible to check whether -gdwarf-aranges was passed initially.
case "$*" in
    *-plugin-opt=thinlto*)
        # ThinLTO link: additionally ask lld to generate the aranges section.
        exec "@LLD_PATH@" -mllvm -generate-arange-section "$@"
        ;;
    *)
        # Any other link: forward the arguments unchanged.
        exec "@LLD_PATH@" "$@"
        ;;
esac

View File

@ -20,7 +20,7 @@ macro(clickhouse_split_debug_symbols)
COMMAND mkdir -p "${STRIP_DESTINATION_DIR}/bin"
COMMAND cp "${STRIP_BINARY_PATH}" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}"
# Splits debug symbols into separate file, leaves the binary untouched:
COMMAND "${OBJCOPY_PATH}" --only-keep-debug --compress-debug-sections "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
COMMAND "${OBJCOPY_PATH}" --only-keep-debug "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
# Strips binary, sections '.note' & '.comment' are removed in line with Debian's stripping policy: www.debian.org/doc/debian-policy/ch-files.html, section '.clickhouse.hash' is needed for integrity check:
COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note --keep-section=.clickhouse.hash "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}"

View File

@ -94,8 +94,13 @@ if (LINKER_NAME)
if (NOT LLD_PATH)
message (FATAL_ERROR "Using linker ${LINKER_NAME} but can't find its path.")
endif ()
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_PATH}")
set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_PATH}")
# This is a temporary quirk to emit .debug_aranges with ThinLTO
set (LLD_WRAPPER "${CMAKE_CURRENT_BINARY_DIR}/ld.lld")
configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/cmake/ld.lld.in" "${LLD_WRAPPER}" @ONLY)
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}")
set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}")
else ()
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}")
set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}")

View File

@ -1,6 +1,6 @@
# We use vectorscan, a portable and API/ABI-compatible drop-in replacement for hyperscan.
if (ARCH_AMD64)
if (ARCH_AMD64 AND NOT NO_SSE3_OR_HIGHER)
option (ENABLE_VECTORSCAN "Enable vectorscan library" ${ENABLE_LIBRARIES})
endif()

View File

@ -130,6 +130,7 @@ def parse_env_variables(
ARM_SUFFIX = "-aarch64"
FREEBSD_SUFFIX = "-freebsd"
PPC_SUFFIX = "-ppc64le"
AMD64_SSE2_SUFFIX = "-amd64sse2"
result = []
result.append("OUTPUT_DIR=/output")
@ -141,6 +142,7 @@ def parse_env_variables(
is_cross_arm = compiler.endswith(ARM_SUFFIX)
is_cross_ppc = compiler.endswith(PPC_SUFFIX)
is_cross_freebsd = compiler.endswith(FREEBSD_SUFFIX)
is_amd64_sse2 = compiler.endswith(AMD64_SSE2_SUFFIX)
if is_cross_darwin:
cc = compiler[: -len(DARWIN_SUFFIX)]
@ -186,6 +188,10 @@ def parse_env_variables(
cmake_flags.append(
"-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-ppc64le.cmake"
)
elif is_amd64_sse2:
cc = compiler[: -len(AMD64_SSE2_SUFFIX)]
result.append("DEB_ARCH=amd64")
cmake_flags.append("-DNO_SSE3_OR_HIGHER=1")
else:
cc = compiler
result.append("DEB_ARCH=amd64")
@ -339,6 +345,7 @@ if __name__ == "__main__":
"clang-14-darwin-aarch64",
"clang-14-aarch64",
"clang-14-ppc64le",
"clang-14-amd64sse2",
"clang-14-freebsd",
"gcc-11",
),

View File

@ -140,6 +140,6 @@ hash cmake
ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour.
They are built for stable, prestable and testing releases as long as for every commit to master and for every pull request.
Binaries are built for stable and LTS releases, and also for every commit to `master` and for each pull request.
To find the freshest build from `master`, go to [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green check mark or red cross near commit, and click to the “Details” link right after “ClickHouse Build Check”.

View File

@ -0,0 +1,654 @@
---
slug: /en/getting-started/example-datasets/nypd_complaint_data
sidebar_label: NYPD Complaint Data
description: "Ingest and query Tab Separated Value data in 5 steps"
title: NYPD Complaint Data
---
Tab separated value, or TSV, files are common and may include field headings as the first line of the file. ClickHouse can ingest TSVs, and also can query TSVs without ingesting the files. This guide covers both of these cases. If you need to query or ingest CSV files, the same techniques work, simply substitute `TSV` with `CSV` in your format arguments.
While working through this guide you will:
- **Investigate**: Query the structure and content of the TSV file.
- **Determine the target ClickHouse schema**: Choose proper data types and map the existing data to those types.
- **Create a ClickHouse table**.
- **Preprocess and stream** the data to ClickHouse.
- **Run some queries** against ClickHouse.
The dataset used in this guide comes from the NYC Open Data team, and contains data about "all valid felony, misdemeanor, and violation crimes reported to the New York City Police Department (NYPD)". At the time of writing, the data file is 166MB, but it is updated regularly.
**Source**: [data.cityofnewyork.us](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243)
**Terms of use**: https://www1.nyc.gov/home/terms-of-use.page
## Prerequisites
- Download the dataset by visiting the [NYPD Complaint Data Current (Year To Date)](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) page, clicking the Export button, and choosing **TSV for Excel**.
- Install [ClickHouse server and client](../../getting-started/install.md).
- [Launch](../../getting-started/install.md#launch) ClickHouse server, and connect with `clickhouse-client`
### A note about the commands described in this guide
There are two types of commands in this guide:
- Some of the commands are querying the TSV files, these are run at the command prompt.
- The rest of the commands are querying ClickHouse, and these are run in the `clickhouse-client` or Play UI.
:::note
The examples in this guide assume that you have saved the TSV file to `${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`, please adjust the commands if needed.
:::
## Familiarize yourself with the TSV file
Before starting to work with the ClickHouse database familiarize yourself with the data.
### Look at the fields in the source TSV file
This is an example of a command to query a TSV file, but don't run it yet.
```sh
clickhouse-local --query \
"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')"
```
Sample response
```response
CMPLNT_NUM Nullable(Float64)
ADDR_PCT_CD Nullable(Float64)
BORO_NM Nullable(String)
CMPLNT_FR_DT Nullable(String)
CMPLNT_FR_TM Nullable(String)
```
:::tip
Most of the time the above command will let you know which fields in the input data are numeric, and which are strings, and which are tuples. This is not always the case. Because ClickHouse is routinely used with datasets containing billions of records, there is a default number (100) of rows examined to [infer the schema](../../guides/developer/working-with-json/json-semi-structured.md/#relying-on-schema-inference) in order to avoid parsing billions of rows to infer the schema. The response below may not match what you see, as the dataset is updated several times each year. Looking at the Data Dictionary you can see that CMPLNT_NUM is specified as text, and not numeric. By overriding the default of 100 rows for inference with the setting `SETTINGS input_format_max_rows_to_read_for_schema_inference=2000`
you can get a better idea of the content.
Note: as of version 22.5 the default is now 25,000 rows for inferring the schema, so only change the setting if you are on an older version or if you need more than 25,000 rows to be sampled.
:::
Run this command at your command prompt. You will be using `clickhouse-local` to query the data in the TSV file you downloaded.
```sh
clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
--query \
"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')"
```
Result:
```response
CMPLNT_NUM Nullable(String)
ADDR_PCT_CD Nullable(Float64)
BORO_NM Nullable(String)
CMPLNT_FR_DT Nullable(String)
CMPLNT_FR_TM Nullable(String)
CMPLNT_TO_DT Nullable(String)
CMPLNT_TO_TM Nullable(String)
CRM_ATPT_CPTD_CD Nullable(String)
HADEVELOPT Nullable(String)
HOUSING_PSA Nullable(Float64)
JURISDICTION_CODE Nullable(Float64)
JURIS_DESC Nullable(String)
KY_CD Nullable(Float64)
LAW_CAT_CD Nullable(String)
LOC_OF_OCCUR_DESC Nullable(String)
OFNS_DESC Nullable(String)
PARKS_NM Nullable(String)
PATROL_BORO Nullable(String)
PD_CD Nullable(Float64)
PD_DESC Nullable(String)
PREM_TYP_DESC Nullable(String)
RPT_DT Nullable(String)
STATION_NAME Nullable(String)
SUSP_AGE_GROUP Nullable(String)
SUSP_RACE Nullable(String)
SUSP_SEX Nullable(String)
TRANSIT_DISTRICT Nullable(Float64)
VIC_AGE_GROUP Nullable(String)
VIC_RACE Nullable(String)
VIC_SEX Nullable(String)
X_COORD_CD Nullable(Float64)
Y_COORD_CD Nullable(Float64)
Latitude Nullable(Float64)
Longitude Nullable(Float64)
Lat_Lon Tuple(Nullable(Float64), Nullable(Float64))
New Georeferenced Column Nullable(String)
```
At this point you should check that the columns in the TSV file match the names and types specified in the **Columns in this Dataset** section of the [dataset web page](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243). The data types are not very specific, all numeric fields are set to `Nullable(Float64)`, and all other fields are `Nullable(String)`. When you create a ClickHouse table to store the data you can specify more appropriate and performant types.
### Determine the proper schema
In order to figure out what types should be used for the fields it is necessary to know what the data looks like. For example, the field `JURISDICTION_CODE` is a numeric: should it be a `UInt8`, or an `Enum`, or is `Float64` appropriate?
```sh
clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
--query \
"select JURISDICTION_CODE, count() FROM
file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
GROUP BY JURISDICTION_CODE
ORDER BY JURISDICTION_CODE
FORMAT PrettyCompact"
```
Result:
```response
┌─JURISDICTION_CODE─┬─count()─┐
│ 0 │ 188875 │
│ 1 │ 4799 │
│ 2 │ 13833 │
│ 3 │ 656 │
│ 4 │ 51 │
│ 6 │ 5 │
│ 7 │ 2 │
│ 9 │ 13 │
│ 11 │ 14 │
│ 12 │ 5 │
│ 13 │ 2 │
│ 14 │ 70 │
│ 15 │ 20 │
│ 72 │ 159 │
│ 87 │ 9 │
│ 88 │ 75 │
│ 97 │ 405 │
└───────────────────┴─────────┘
```
The query response shows that the `JURISDICTION_CODE` fits well in a `UInt8`.
Similarly, look at some of the `String` fields and see if they are well suited to being `DateTime` or [`LowCardinality(String)`](../../sql-reference/data-types/lowcardinality.md) fields.
For example, the field `PARKS_NM` is described as "Name of NYC park, playground or greenspace of occurrence, if applicable (state parks are not included)". The names of parks in New York City may be a good candidate for a `LowCardinality(String)`:
```sh
clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
--query \
"select count(distinct PARKS_NM) FROM
file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
FORMAT PrettyCompact"
```
Result:
```response
┌─uniqExact(PARKS_NM)─┐
│ 319 │
└─────────────────────┘
```
Have a look at some of the park names:
```sh
clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
--query \
"select distinct PARKS_NM FROM
file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
LIMIT 10
FORMAT PrettyCompact"
```
Result:
```response
┌─PARKS_NM───────────────────┐
│ (null) │
│ ASSER LEVY PARK │
│ JAMES J WALKER PARK │
│ BELT PARKWAY/SHORE PARKWAY │
│ PROSPECT PARK │
│ MONTEFIORE SQUARE │
│ SUTTON PLACE PARK │
│ JOYCE KILMER PARK │
│ ALLEY ATHLETIC PLAYGROUND │
│ ASTORIA PARK │
└────────────────────────────┘
```
The dataset in use at the time of writing has only a few hundred distinct parks and playgrounds in the `PARKS_NM` column. This is a small number based on the [LowCardinality](../../sql-reference/data-types/lowcardinality.md#lowcardinality-dscr) recommendation to stay below 10,000 distinct strings in a `LowCardinality(String)` field.
### DateTime fields
Based on the **Columns in this Dataset** section of the [dataset web page](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) there are date and time fields for the start and end of the reported event. Looking at the min and max of the `CMPLNT_FR_DT` and `CMPLNT_TO_DT` gives an idea of whether or not the fields are always populated:
```sh title="CMPLNT_FR_DT"
clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
--query \
"select min(CMPLNT_FR_DT), max(CMPLNT_FR_DT) FROM
file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
FORMAT PrettyCompact"
```
Result:
```response
┌─min(CMPLNT_FR_DT)─┬─max(CMPLNT_FR_DT)─┐
│ 01/01/1973 │ 12/31/2021 │
└───────────────────┴───────────────────┘
```
```sh title="CMPLNT_TO_DT"
clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
--query \
"select min(CMPLNT_TO_DT), max(CMPLNT_TO_DT) FROM
file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
FORMAT PrettyCompact"
```
Result:
```response
┌─min(CMPLNT_TO_DT)─┬─max(CMPLNT_TO_DT)─┐
│ │ 12/31/2021 │
└───────────────────┴───────────────────┘
```
```sh title="CMPLNT_FR_TM"
clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
--query \
"select min(CMPLNT_FR_TM), max(CMPLNT_FR_TM) FROM
file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
FORMAT PrettyCompact"
```
Result:
```response
┌─min(CMPLNT_FR_TM)─┬─max(CMPLNT_FR_TM)─┐
│ 00:00:00 │ 23:59:00 │
└───────────────────┴───────────────────┘
```
```sh title="CMPLNT_TO_TM"
clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
--query \
"select min(CMPLNT_TO_TM), max(CMPLNT_TO_TM) FROM
file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
FORMAT PrettyCompact"
```
Result:
```response
┌─min(CMPLNT_TO_TM)─┬─max(CMPLNT_TO_TM)─┐
│ (null) │ 23:59:00 │
└───────────────────┴───────────────────┘
```
## Make a plan
Based on the above investigation:
- `JURISDICTION_CODE` should be cast as `UInt8`.
- `PARKS_NM` should be cast to `LowCardinality(String)`
- `CMPLNT_FR_DT` and `CMPLNT_FR_TM` are always populated (possibly with a default time of `00:00:00`)
- `CMPLNT_TO_DT` and `CMPLNT_TO_TM` may be empty
- Dates and times are stored in separate fields in the source
- Dates are `mm/dd/yyyy` format
- Times are `hh:mm:ss` format
- Dates and times can be concatenated into DateTime types
- There are some dates before January 1st 1970, which means we need a 64 bit DateTime
:::note
There are many more changes to be made to the types, they all can be determined by following the same investigation steps. Look at the number of distinct strings in a field, the min and max of the numerics, and make your decisions. The table schema that is given later in the guide has many low cardinality strings and unsigned integer fields and very few floating point numerics.
:::
## Concatenate the date and time fields
To concatenate the date and time fields `CMPLNT_FR_DT` and `CMPLNT_FR_TM` into a single `String` that can be cast to a `DateTime`, select the two fields joined by the concatenation operator: `CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM`. The `CMPLNT_TO_DT` and `CMPLNT_TO_TM` fields are handled similarly.
```sh
clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
--query \
"select CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM AS complaint_begin FROM
file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
LIMIT 10
FORMAT PrettyCompact"
```
Result:
```response
┌─complaint_begin─────┐
│ 07/29/2010 00:01:00 │
│ 12/01/2011 12:00:00 │
│ 04/01/2017 15:00:00 │
│ 03/26/2018 17:20:00 │
│ 01/01/2019 00:00:00 │
│ 06/14/2019 00:00:00 │
│ 11/29/2021 20:00:00 │
│ 12/04/2021 00:35:00 │
│ 12/05/2021 12:50:00 │
│ 12/07/2021 20:30:00 │
└─────────────────────┘
```
## Convert the date and time String to a DateTime64 type
Earlier in the guide we discovered that there are dates in the TSV file before January 1st 1970, which means that we need a 64 bit DateTime type for the dates. The dates also need to be converted from `MM/DD/YYYY` to `YYYY/MM/DD` format. Both of these can be done with [`parseDateTime64BestEffort()`](../../sql-reference/functions/type-conversion-functions.md#parsedatetime64besteffort).
```sh
clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
--query \
"WITH (CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM) AS CMPLNT_START,
(CMPLNT_TO_DT || ' ' || CMPLNT_TO_TM) AS CMPLNT_END
select parseDateTime64BestEffort(CMPLNT_START) AS complaint_begin,
parseDateTime64BestEffortOrNull(CMPLNT_END) AS complaint_end
FROM file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
ORDER BY complaint_begin ASC
LIMIT 25
FORMAT PrettyCompact"
```
Lines 2 and 3 above contain the concatenation from the previous step, and lines 4 and 5 above parse the strings into `DateTime64`. As the complaint end time is not guaranteed to exist `parseDateTime64BestEffortOrNull` is used.
Result:
```response
┌─────────complaint_begin─┬───────────complaint_end─┐
│ 1925-01-01 10:00:00.000 │ 2021-02-12 09:30:00.000 │
│ 1925-01-01 11:37:00.000 │ 2022-01-16 11:49:00.000 │
│ 1925-01-01 15:00:00.000 │ 2021-12-31 00:00:00.000 │
│ 1925-01-01 15:00:00.000 │ 2022-02-02 22:00:00.000 │
│ 1925-01-01 19:00:00.000 │ 2022-04-14 05:00:00.000 │
│ 1955-09-01 19:55:00.000 │ 2022-08-01 00:45:00.000 │
│ 1972-03-17 11:40:00.000 │ 2022-03-17 11:43:00.000 │
│ 1972-05-23 22:00:00.000 │ 2022-05-24 09:00:00.000 │
│ 1972-05-30 23:37:00.000 │ 2022-05-30 23:50:00.000 │
│ 1972-07-04 02:17:00.000 │ ᴺᵁᴸᴸ │
│ 1973-01-01 00:00:00.000 │ ᴺᵁᴸᴸ │
│ 1975-01-01 00:00:00.000 │ ᴺᵁᴸᴸ │
│ 1976-11-05 00:01:00.000 │ 1988-10-05 23:59:00.000 │
│ 1977-01-01 00:00:00.000 │ 1977-01-01 23:59:00.000 │
│ 1977-12-20 00:01:00.000 │ ᴺᵁᴸᴸ │
│ 1981-01-01 00:01:00.000 │ ᴺᵁᴸᴸ │
│ 1981-08-14 00:00:00.000 │ 1987-08-13 23:59:00.000 │
│ 1983-01-07 00:00:00.000 │ 1990-01-06 00:00:00.000 │
│ 1984-01-01 00:01:00.000 │ 1984-12-31 23:59:00.000 │
│ 1985-01-01 12:00:00.000 │ 1987-12-31 15:00:00.000 │
│ 1985-01-11 09:00:00.000 │ 1985-12-31 12:00:00.000 │
│ 1986-03-16 00:05:00.000 │ 2022-03-16 00:45:00.000 │
│ 1987-01-07 00:00:00.000 │ 1987-01-09 00:00:00.000 │
│ 1988-04-03 18:30:00.000 │ 2022-08-03 09:45:00.000 │
│ 1988-07-29 12:00:00.000 │ 1990-07-27 22:00:00.000 │
└─────────────────────────┴─────────────────────────┘
```
:::note
The dates shown as `1925` above are from errors in the data. There are several records in the original data with dates in the years `1019` - `1022` that should be `2019` - `2022`. They are being stored as Jan 1st 1925 as that is the earliest date with a 64 bit DateTime.
:::
## Create a table
The decisions made above on the data types used for the columns are reflected in the table schema
below. We also need to decide on the `ORDER BY` and `PRIMARY KEY` used for the table. At least one
of `ORDER BY` or `PRIMARY KEY` must be specified. Here are some guidelines on deciding on the
columns to include in `ORDER BY`, and more information is in the *Next Steps* section at the end
of this document.
### Order By and Primary Key clauses
- The `ORDER BY` tuple should include fields that are used in query filters
- To maximize compression on disk the `ORDER BY` tuple should be ordered by ascending cardinality
- If it exists, the `PRIMARY KEY` tuple must be a subset of the `ORDER BY` tuple
- If only `ORDER BY` is specified, then the same tuple will be used as `PRIMARY KEY`
- The primary key index is created using the `PRIMARY KEY` tuple if specified, otherwise the `ORDER BY` tuple
- The `PRIMARY KEY` index is kept in main memory
Looking at the dataset and the questions that might be answered by querying it we might
decide that we would look at the types of crimes reported over time in the five boroughs of
New York City. These fields might be then included in the `ORDER BY`:
| Column | Description (from the data dictionary) |
| ----------- | --------------------------------------------------- |
| OFNS_DESC | Description of offense corresponding with key code |
| RPT_DT | Date event was reported to police |
| BORO_NM | The name of the borough in which the incident occurred |
Querying the TSV file for the cardinality of the three candidate columns:
```bash
clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
--query \
"select formatReadableQuantity(uniq(OFNS_DESC)) as cardinality_OFNS_DESC,
formatReadableQuantity(uniq(RPT_DT)) as cardinality_RPT_DT,
formatReadableQuantity(uniq(BORO_NM)) as cardinality_BORO_NM
FROM
file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
FORMAT PrettyCompact"
```
Result:
```response
┌─cardinality_OFNS_DESC─┬─cardinality_RPT_DT─┬─cardinality_BORO_NM─┐
│ 60.00 │ 306.00 │ 6.00 │
└───────────────────────┴────────────────────┴─────────────────────┘
```
Ordering by cardinality, the `ORDER BY` becomes:
```
ORDER BY ( BORO_NM, OFNS_DESC, RPT_DT )
```
:::note
The table below will use more easily read column names, the above names will be mapped to
```
ORDER BY ( borough, offense_description, date_reported )
```
:::
Putting together the changes to data types and the `ORDER BY` tuple gives this table structure:
```sql
CREATE TABLE NYPD_Complaint (
complaint_number String,
precinct UInt8,
borough LowCardinality(String),
complaint_begin DateTime64(0,'America/New_York'),
complaint_end DateTime64(0,'America/New_York'),
was_crime_completed String,
housing_authority String,
housing_level_code UInt32,
jurisdiction_code UInt8,
jurisdiction LowCardinality(String),
offense_code UInt8,
offense_level LowCardinality(String),
location_descriptor LowCardinality(String),
offense_description LowCardinality(String),
park_name LowCardinality(String),
patrol_borough LowCardinality(String),
PD_CD UInt16,
PD_DESC String,
location_type LowCardinality(String),
date_reported Date,
transit_station LowCardinality(String),
suspect_age_group LowCardinality(String),
suspect_race LowCardinality(String),
suspect_sex LowCardinality(String),
transit_district UInt8,
victim_age_group LowCardinality(String),
victim_race LowCardinality(String),
victim_sex LowCardinality(String),
NY_x_coordinate UInt32,
NY_y_coordinate UInt32,
Latitude Float64,
Longitude Float64
) ENGINE = MergeTree
ORDER BY ( borough, offense_description, date_reported )
```
### Finding the primary key of a table
The ClickHouse `system` database, specifically `system.tables`, has all of the information about the table you
just created. This query shows the `ORDER BY` (sorting key), and the `PRIMARY KEY`:
```sql
SELECT
partition_key,
sorting_key,
primary_key,
table
FROM system.tables
WHERE table = 'NYPD_Complaint'
FORMAT Vertical
```
Response
```response
Query id: 6a5b10bf-9333-4090-b36e-c7f08b1d9e01
Row 1:
──────
partition_key:
sorting_key: borough, offense_description, date_reported
primary_key: borough, offense_description, date_reported
table: NYPD_Complaint
1 row in set. Elapsed: 0.001 sec.
```
## Preprocess and Import Data {#preprocess-import-data}
We will use `clickhouse-local` tool for data preprocessing and `clickhouse-client` to upload it.
### `clickhouse-local` arguments used
:::tip
`table='input'` appears in the arguments to clickhouse-local below. clickhouse-local takes the provided input (`cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`) and inserts the input into a table. By default the table is named `table`. In this guide the name of the table is set to `input` to make the data flow clearer. The final argument to clickhouse-local is a query that selects from the table (`FROM input`) which is then piped to `clickhouse-client` to populate the table `NYPD_Complaint`.
:::
```sh
cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv \
| clickhouse-local --table='input' --input-format='TSVWithNames' \
--input_format_max_rows_to_read_for_schema_inference=2000 \
--query "
WITH (CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM) AS CMPLNT_START,
(CMPLNT_TO_DT || ' ' || CMPLNT_TO_TM) AS CMPLNT_END
SELECT
CMPLNT_NUM AS complaint_number,
ADDR_PCT_CD AS precinct,
BORO_NM AS borough,
parseDateTime64BestEffort(CMPLNT_START) AS complaint_begin,
parseDateTime64BestEffortOrNull(CMPLNT_END) AS complaint_end,
CRM_ATPT_CPTD_CD AS was_crime_completed,
HADEVELOPT AS housing_authority_development,
HOUSING_PSA AS housing_level_code,
JURISDICTION_CODE AS jurisdiction_code,
JURIS_DESC AS jurisdiction,
KY_CD AS offense_code,
LAW_CAT_CD AS offense_level,
LOC_OF_OCCUR_DESC AS location_descriptor,
OFNS_DESC AS offense_description,
PARKS_NM AS park_name,
PATROL_BORO AS patrol_borough,
PD_CD,
PD_DESC,
PREM_TYP_DESC AS location_type,
toDate(parseDateTimeBestEffort(RPT_DT)) AS date_reported,
STATION_NAME AS transit_station,
SUSP_AGE_GROUP AS suspect_age_group,
SUSP_RACE AS suspect_race,
SUSP_SEX AS suspect_sex,
TRANSIT_DISTRICT AS transit_district,
VIC_AGE_GROUP AS victim_age_group,
VIC_RACE AS victim_race,
VIC_SEX AS victim_sex,
X_COORD_CD AS NY_x_coordinate,
Y_COORD_CD AS NY_y_coordinate,
Latitude,
Longitude
FROM input" \
| clickhouse-client --query='INSERT INTO NYPD_Complaint FORMAT TSV'
```
## Validate the Data {#validate-data}
:::note
The dataset changes once or more per year, your counts may not match what is in this document.
:::
Query:
```sql
SELECT count()
FROM NYPD_Complaint
```
Result:
```text
┌─count()─┐
│ 208993 │
└─────────┘
1 row in set. Elapsed: 0.001 sec.
```
The size of the dataset in ClickHouse is just 12% of the original TSV file. Compare the size of the original TSV file with the size of the table:
Query:
```sql
SELECT formatReadableSize(total_bytes)
FROM system.tables
WHERE name = 'NYPD_Complaint'
```
Result:
```text
┌─formatReadableSize(total_bytes)─┐
│ 8.63 MiB │
└─────────────────────────────────┘
```
## Run Some Queries {#run-queries}
### Query 1. Compare the number of complaints by month
Query:
```sql
SELECT
dateName('month', date_reported) AS month,
count() AS complaints,
bar(complaints, 0, 50000, 80)
FROM NYPD_Complaint
GROUP BY month
ORDER BY complaints DESC
```
Result:
```response
Query id: 7fbd4244-b32a-4acf-b1f3-c3aa198e74d9
┌─month─────┬─complaints─┬─bar(count(), 0, 50000, 80)───────────────────────────────┐
│ March │ 34536 │ ███████████████████████████████████████████████████████▎ │
│ May │ 34250 │ ██████████████████████████████████████████████████████▋ │
│ April │ 32541 │ ████████████████████████████████████████████████████ │
│ January │ 30806 │ █████████████████████████████████████████████████▎ │
│ February │ 28118 │ ████████████████████████████████████████████▊ │
│ November │ 7474 │ ███████████▊ │
│ December │ 7223 │ ███████████▌ │
│ October │ 7070 │ ███████████▎ │
│ September │ 6910 │ ███████████ │
│ August │ 6801 │ ██████████▊ │
│ June │ 6779 │ ██████████▋ │
│ July │ 6485 │ ██████████▍ │
└───────────┴────────────┴──────────────────────────────────────────────────────────┘
12 rows in set. Elapsed: 0.006 sec. Processed 208.99 thousand rows, 417.99 KB (37.48 million rows/s., 74.96 MB/s.)
```
### Query 2. Compare total number of complaints by Borough
Query:
```sql
SELECT
borough,
count() AS complaints,
bar(complaints, 0, 125000, 60)
FROM NYPD_Complaint
GROUP BY borough
ORDER BY complaints DESC
```
Result:
```response
Query id: 8cdcdfd4-908f-4be0-99e3-265722a2ab8d
┌─borough───────┬─complaints─┬─bar(count(), 0, 125000, 60)──┐
│ BROOKLYN │ 57947 │ ███████████████████████████▋ │
│ MANHATTAN │ 53025 │ █████████████████████████▍ │
│ QUEENS │ 44875 │ █████████████████████▌ │
│ BRONX │ 44260 │ █████████████████████▏ │
│ STATEN ISLAND │ 8503 │ ████ │
│ (null) │ 383 │ ▏ │
└───────────────┴────────────┴──────────────────────────────┘
6 rows in set. Elapsed: 0.008 sec. Processed 208.99 thousand rows, 209.43 KB (27.14 million rows/s., 27.20 MB/s.)
```
## Next Steps
[A Practical Introduction to Sparse Primary Indexes in ClickHouse](../../guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-intro.md) discusses the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices.

View File

@ -59,7 +59,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password.
</details>
You can replace `stable` with `lts` or `testing` to use different [release trains](../faq/operations/production.md) based on your needs.
You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs.
You can also download and install packages manually from [here](https://packages.clickhouse.com/deb/pool/stable).
@ -106,7 +106,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password.
</details>
If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). `prestable` is sometimes also available.
You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs.
Then run these commands to install packages:
@ -221,7 +221,7 @@ For non-Linux operating systems and for AArch64 CPU architecture, ClickHouse bui
curl -O 'https://builds.clickhouse.com/master/aarch64/clickhouse' && chmod a+x ./clickhouse
```
Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it.
Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run the `sudo clickhouse start` command to start the clickhouse-server and `clickhouse-client` to connect to it.
Use the `clickhouse client` to connect to the server, or `clickhouse local` to process local data.

View File

@ -2,10 +2,9 @@
slug: /en/operations/backup
sidebar_position: 49
sidebar_label: Data backup and restore
title: Data backup and restore
---
# Data backup and restore
While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can't just drop tables with a MergeTree-like engine containing more than 50 Gb of data](server-configuration-parameters/settings.md#max-table-size-to-drop). However, these safeguards do not cover all possible cases and can be circumvented.
In order to effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**.

View File

@ -20,6 +20,7 @@ Additional cache types:
- [Avro format](../interfaces/formats.md#data-format-avro) schemas cache.
- [Dictionaries](../sql-reference/dictionaries/index.md) data cache.
- Schema inference cache.
- [Filesystem cache](storing-data.md) over S3, Azure, Local and other disks.
Indirectly used:

View File

@ -1452,7 +1452,7 @@ Port for communicating with clients over MySQL protocol.
**Possible values**
Positive integer.
Positive integer to specify the port number to listen to or empty value to disable.
Example
@ -1466,7 +1466,7 @@ Port for communicating with clients over PostgreSQL protocol.
**Possible values**
Positive integer.
Positive integer to specify the port number to listen to or empty value to disable.
Example

View File

@ -1176,8 +1176,9 @@ Enables the quorum writes.
- If `insert_quorum < 2`, the quorum writes are disabled.
- If `insert_quorum >= 2`, the quorum writes are enabled.
- If `insert_quorum = 'auto'`, use majority number (`number_of_replicas / 2 + 1`) as quorum number.
Default value: 0.
Default value: 0 - disabled.
Quorum writes
@ -1259,7 +1260,7 @@ Possible values:
Default value: 1.
By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)).
By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)).
For replicated tables, by default only the 100 most recent blocks for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)).
For non-replicated tables, see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window).

View File

@ -112,6 +112,119 @@ Example of disk configuration:
</clickhouse>
```
## Using local cache {#using-local-cache}
It is possible to configure local cache over disks in storage configuration starting from version 22.3. For versions 22.3 - 22.7 cache is supported only for `s3` disk type. For versions >= 22.8 cache is supported for any disk type: S3, Azure, Local, Encrypted, etc. Cache uses `LRU` cache policy.
Example of configuration for versions later or equal to 22.8:
``` xml
<clickhouse>
<storage_configuration>
<disks>
<s3>
<type>s3</type>
<endpoint>...</endpoint>
... s3 configuration ...
</s3>
<cache>
<type>cache</type>
<disk>s3</disk>
<path>/s3_cache/</path>
<max_size>10000000</max_size>
</cache>
</disks>
</storage_configuration>
```
Example of configuration for versions earlier than 22.8:
``` xml
<clickhouse>
<storage_configuration>
<disks>
<s3>
<type>s3</type>
<endpoint>...</endpoint>
... s3 configuration ...
<data_cache_enabled>1</data_cache_enabled>
<data_cache_size>10000000</data_cache_size>
</s3>
</disks>
</storage_configuration>
```
Cache **configuration settings**:
- `path` - path to the directory with cache. Default: None, this setting is obligatory.
- `max_size` - maximum size of the cache in bytes. When the limit is reached, cache files are evicted according to the cache eviction policy. Default: None, this setting is obligatory.
- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. The `write-through` cache can be disabled per query using setting `enable_filesystem_cache_on_write_operations` (data is cached only if both cache config settings and corresponding query setting are enabled).
- `enable_filesystem_query_cache_limit` - allow to limit the size of cache which is downloaded within each query (depends on user setting `max_query_cache_size`). Default: `false`.
- `enable_cache_hits_threshold` - a number, which defines how many times some data needs to be read before it will be cached. Default: `0`, i.e. the data is cached at the first attempt to read it.
- `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `true`.
- `max_file_segment_size` - a maximum size of a single cache file. Default: `104857600` (100 Mb).
- `max_elements` - a limit for a number of cache files. Default: `1048576`.
Cache **query settings**:
- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`.
- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`.
- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on.
- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. Default: `false`.
- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`.
- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written at the cost of evicting data previously downloaded within the current query, i.e. the second behaviour preserves `least recently used` behaviour while keeping the query cache limit.
**Warning**
Cache configuration settings and cache query settings correspond to the latest ClickHouse version, for earlier versions something might not be supported.
Cache **system tables**:
- `system.filesystem_cache` - system table which shows the current state of the cache.
- `system.filesystem_cache_log` - system table which shows detailed cache usage per query. Requires `enable_filesystem_cache_log` setting to be `true`.
Cache **commands**:
- `SYSTEM DROP FILESYSTEM CACHE (<path>) (ON CLUSTER)`
- `SHOW CACHES` - show list of caches which were configured on the server.
- `DESCRIBE CACHE '<cache_name>'` - show cache configuration and some general statistics for a specific cache. Cache name can be taken from `SHOW CACHES` command.
Cache current metrics:
- `FilesystemCacheSize`
- `FilesystemCacheElements`
Cache asynchronous metrics:
- `FilesystemCacheBytes`
- `FilesystemCacheFiles`
Cache profile events:
- `CachedReadBufferReadFromSourceBytes`, `CachedReadBufferReadFromCacheBytes`
- `CachedReadBufferReadFromSourceMicroseconds`, `CachedReadBufferReadFromCacheMicroseconds`
- `CachedReadBufferCacheWriteBytes`, `CachedReadBufferCacheWriteMicroseconds`
- `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds`
## Storing Data on Web Server {#storing-data-on-webserver}
There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`.

View File

@ -74,13 +74,16 @@ Make sure that [`fstrim`](https://en.wikipedia.org/wiki/Trim_(computing)) is ena
## File System {#file-system}
Ext4 is the most reliable option. Set the mount options `noatime`.
XFS should be avoided. It works mostly fine but there are some reports about lower performance.
Ext4 is the most reliable option. Set the mount options `noatime`. XFS works well too.
Most other file systems should also work fine.
FAT-32 and exFAT are not supported due to lack of hard links.
Do not use compressed filesystems, because ClickHouse does compression on its own and better.
It's not recommended to use encrypted filesystems, because you can use builtin encryption in ClickHouse, which is better.
While ClickHouse can work over NFS, it is not the best idea.
## Linux Kernel {#linux-kernel}
Don't use an outdated Linux kernel.

View File

@ -640,7 +640,8 @@ Result:
## date\_diff
Returns the difference between two dates or dates with time values.
Returns the difference between two dates or dates with time values.
The difference is calculated using relative units, e.g. the difference between `2022-01-01` and `2021-12-29` is 3 days for day unit (see [toRelativeDayNum](#torelativedaynum)), 1 month for month unit (see [toRelativeMonthNum](#torelativemonthnum)), 1 year for year unit (see [toRelativeYearNum](#torelativeyearnum)).
**Syntax**
@ -692,6 +693,25 @@ Result:
└────────────────────────────────────────────────────────────────────────────────────────┘
```
Query:
``` sql
SELECT
toDate('2022-01-01') AS e,
toDate('2021-12-29') AS s,
dateDiff('day', s, e) AS day_diff,
dateDiff('month', s, e) AS month__diff,
dateDiff('year', s, e) AS year_diff;
```
Result:
``` text
┌──────────e─┬──────────s─┬─day_diff─┬─month__diff─┬─year_diff─┐
│ 2022-01-01 │ 2021-12-29 │ 3 │ 1 │ 1 │
└────────────┴────────────┴──────────┴─────────────┴───────────┘
```
## date\_sub
Subtracts the time interval or date interval from the provided date or date with time.

View File

@ -42,6 +42,14 @@ endif ()
# See `src/Common/TargetSpecific.h`
option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON)
if (NO_SSE3_OR_HIGHER)
# Optimized x86 code in DECLARE_*_SPECIFIC_CODE blocks (see `src/Common/TargetSpecific.h`) is sometimes marked FORCE_INLINE. As a
# result, its instruction set requirements (e.g. SSE4.2) leak into generic code. This is normally not a problem for standard x86 builds
# because generic code is compiled with SSE 4.2 anyway. But it breaks SSE2-only builds. Therefore, disable the multitarget code
# machinery and always use generic code. (The cleaner alternative is removing FORCE_INLINE but that impacts performance too much.)
#
# This turns ENABLE_MULTITARGET_CODE off for the checks that follow, regardless of the value the option was given.
set(ENABLE_MULTITARGET_CODE OFF)
endif()
if (ENABLE_MULTITARGET_CODE)
add_definitions(-DENABLE_MULTITARGET_CODE=1)
else()

View File

@ -16,6 +16,7 @@ namespace ErrorCodes
extern const int ATTEMPT_TO_READ_AFTER_EOF;
extern const int NETWORK_ERROR;
extern const int SOCKET_TIMEOUT;
extern const int DNS_ERROR;
}
ConnectionEstablisher::ConnectionEstablisher(
@ -90,6 +91,7 @@ void ConnectionEstablisher::run(ConnectionEstablisher::TryResult & result, std::
catch (const Exception & e)
{
if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT
&& e.code() != ErrorCodes::DNS_ERROR
&& e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
throw;

View File

@ -13,6 +13,7 @@
#include <Poco/DOM/Text.h>
#include <Poco/DOM/Attr.h>
#include <Poco/DOM/Comment.h>
#include <Poco/XML/XMLWriter.h>
#include <Poco/Util/XMLConfiguration.h>
#include <Common/ZooKeeper/ZooKeeperNodeCache.h>
#include <Common/ZooKeeper/KeeperException.h>
@ -729,7 +730,11 @@ void ConfigProcessor::savePreprocessedConfig(const LoadedConfig & loaded_config,
if (!preprocessed_path_parent.empty())
fs::create_directories(preprocessed_path_parent);
}
DOMWriter().writeNode(preprocessed_path, loaded_config.preprocessed_xml);
DOMWriter writer;
writer.setNewLine("\n");
writer.setIndent(" ");
writer.setOptions(Poco::XML::XMLWriter::PRETTY_PRINT);
writer.writeNode(preprocessed_path, loaded_config.preprocessed_xml);
LOG_DEBUG(log, "Saved preprocessed configuration to '{}'.", preprocessed_path);
}
catch (Poco::Exception & e)

View File

@ -26,114 +26,107 @@ namespace ErrorCodes
extern const int CANNOT_PARSE_YAML;
}
/// A prefix symbol in yaml key
/// We add attributes to nodes by using a prefix symbol in the key part.
/// Currently we use @ as a prefix symbol. Note, that @ is reserved
/// by YAML standard, so we need to write a key-value pair like this: "@attribute": attr_value
const char YAML_ATTRIBUTE_PREFIX = '@';
namespace
{
/// A prefix symbol in yaml key
/// We add attributes to nodes by using a prefix symbol in the key part.
/// Currently we use @ as a prefix symbol. Note, that @ is reserved
/// by YAML standard, so we need to write a key-value pair like this: "@attribute": attr_value
const char YAML_ATTRIBUTE_PREFIX = '@';
Poco::AutoPtr<Poco::XML::Element> createCloneNode(Poco::XML::Element & original_node)
{
Poco::AutoPtr<Poco::XML::Element> clone_node = original_node.ownerDocument()->createElement(original_node.nodeName());
original_node.parentNode()->appendChild(clone_node);
return clone_node;
}
void processNode(const YAML::Node & node, Poco::XML::Element & parent_xml_element)
{
auto * xml_document = parent_xml_element.ownerDocument();
switch (node.Type())
Poco::AutoPtr<Poco::XML::Element> cloneXMLNode(const Poco::XML::Element & original_node)
{
case YAML::NodeType::Scalar:
{
std::string value = node.as<std::string>();
Poco::AutoPtr<Poco::XML::Text> xml_value = xml_document->createTextNode(value);
parent_xml_element.appendChild(xml_value);
break;
}
Poco::AutoPtr<Poco::XML::Element> clone_node = original_node.ownerDocument()->createElement(original_node.nodeName());
original_node.parentNode()->appendChild(clone_node);
return clone_node;
}
/// We process YAML Sequences as a
/// list of <key>value</key> tags with same key and different values.
/// For example, we translate this sequence
/// seq:
/// - val1
/// - val2
///
/// into this:
/// <seq>val1</seq>
/// <seq>val2</seq>
case YAML::NodeType::Sequence:
void processNode(const YAML::Node & node, Poco::XML::Element & parent_xml_node)
{
auto * xml_document = parent_xml_node.ownerDocument();
switch (node.Type())
{
for (const auto & child_node : node)
/// For sequences it depends how we want to process them.
/// Sequences of key-value pairs such as:
/// seq:
/// - k1: val1
/// - k2: val2
/// into xml like this:
/// <seq>
/// <k1>val1</k1>
/// <k2>val2</k2>
/// </seq>
///
/// But, if the sequence is just a list, the root-node needs to be repeated, such as:
/// seq:
/// - val1
/// - val2
/// into xml like this:
/// <seq>val1</seq>
/// <seq>val2</seq>
///
/// Therefore check what type the child is, for further processing.
/// Mixing types (values list or map) will lead to strange results but should not happen.
if (parent_xml_element.hasChildNodes() && !child_node.IsMap())
{
/// Create a new parent node with same tag for each child node
processNode(child_node, *createCloneNode(parent_xml_element));
}
else
{
/// Map, so don't recreate the parent node but add directly
processNode(child_node, parent_xml_element);
}
break;
}
case YAML::NodeType::Map:
{
for (const auto & key_value_pair : node)
case YAML::NodeType::Scalar:
{
const auto & key_node = key_value_pair.first;
const auto & value_node = key_value_pair.second;
std::string key = key_node.as<std::string>();
bool is_attribute = (key.starts_with(YAML_ATTRIBUTE_PREFIX) && value_node.IsScalar());
if (is_attribute)
{
/// we use substr(1) here to remove YAML_ATTRIBUTE_PREFIX from key
auto attribute_name = key.substr(1);
std::string value = value_node.as<std::string>();
parent_xml_element.setAttribute(attribute_name, value);
}
else
{
Poco::AutoPtr<Poco::XML::Element> xml_key = xml_document->createElement(key);
parent_xml_element.appendChild(xml_key);
processNode(value_node, *xml_key);
}
std::string value = node.as<std::string>();
Poco::AutoPtr<Poco::XML::Text> xml_value = xml_document->createTextNode(value);
parent_xml_node.appendChild(xml_value);
break;
}
/// For sequences we repeat the parent xml node. For example,
/// seq:
/// - val1
/// - val2
/// is converted into the following xml:
/// <seq>val1</seq>
/// <seq>val2</seq>
///
/// A sequence of mappings is converted in the same way:
/// seq:
/// - k1: val1
/// k2: val2
/// - k3: val3
/// is converted into the following xml:
/// <seq><k1>val1</k1><k2>val2</k2></seq>
/// <seq><k3>val3</k3></seq>
case YAML::NodeType::Sequence:
{
size_t i = 0;
for (auto it = node.begin(); it != node.end(); ++it, ++i)
{
const auto & child_node = *it;
bool need_clone_parent_xml_node = (i > 0);
if (need_clone_parent_xml_node)
{
/// Create a new parent node with same tag for each child node
processNode(child_node, *cloneXMLNode(parent_xml_node));
}
else
{
/// First element of the sequence: reuse the existing parent node instead of cloning it
processNode(child_node, parent_xml_node);
}
}
break;
}
case YAML::NodeType::Map:
{
for (const auto & key_value_pair : node)
{
const auto & key_node = key_value_pair.first;
const auto & value_node = key_value_pair.second;
std::string key = key_node.as<std::string>();
bool is_attribute = (key.starts_with(YAML_ATTRIBUTE_PREFIX) && value_node.IsScalar());
if (is_attribute)
{
/// we use substr(1) here to remove YAML_ATTRIBUTE_PREFIX from key
auto attribute_name = key.substr(1);
std::string value = value_node.as<std::string>();
parent_xml_node.setAttribute(attribute_name, value);
}
else
{
Poco::AutoPtr<Poco::XML::Element> xml_key = xml_document->createElement(key);
parent_xml_node.appendChild(xml_key);
processNode(value_node, *xml_key);
}
}
break;
}
case YAML::NodeType::Null: break;
case YAML::NodeType::Undefined:
{
throw Exception(ErrorCodes::CANNOT_PARSE_YAML, "YAMLParser has encountered node with undefined type and cannot continue parsing of the file");
}
break;
}
case YAML::NodeType::Null: break;
case YAML::NodeType::Undefined:
{
throw Exception(ErrorCodes::CANNOT_PARSE_YAML, "YAMLParser has encountered node with undefined type and cannot continue parsing of the file");
}
}
}
}
Poco::AutoPtr<Poco::XML::Document> YAMLParser::parse(const String& path)
{

View File

@ -22,13 +22,13 @@ Elf::Elf(const std::string & path)
/// Check if it's an elf.
elf_size = in.buffer().size();
if (elf_size < sizeof(ElfEhdr))
throw Exception("The size of supposedly ELF file is too small", ErrorCodes::CANNOT_PARSE_ELF);
throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The size of supposedly ELF file '{}' is too small", path);
mapped = in.buffer().begin();
header = reinterpret_cast<const ElfEhdr *>(mapped);
if (memcmp(header->e_ident, "\x7F""ELF", 4) != 0)
throw Exception("The file is not ELF according to magic", ErrorCodes::CANNOT_PARSE_ELF);
throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The file '{}' is not ELF according to magic", path);
/// Get section header.
ElfOff section_header_offset = header->e_shoff;
@ -37,7 +37,7 @@ Elf::Elf(const std::string & path)
if (!section_header_offset
|| !section_header_num_entries
|| section_header_offset + section_header_num_entries * sizeof(ElfShdr) > elf_size)
throw Exception("The ELF is truncated (section header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section header points after end of file)", path);
section_headers = reinterpret_cast<const ElfShdr *>(mapped + section_header_offset);
@ -48,11 +48,11 @@ Elf::Elf(const std::string & path)
});
if (!section_names_strtab)
throw Exception("The ELF doesn't have string table with section names", ErrorCodes::CANNOT_PARSE_ELF);
throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' doesn't have string table with section names", path);
ElfOff section_names_offset = section_names_strtab->header.sh_offset;
if (section_names_offset >= elf_size)
throw Exception("The ELF is truncated (section names string table points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section names string table points after end of file)", path);
section_names = reinterpret_cast<const char *>(mapped + section_names_offset);
@ -64,7 +64,7 @@ Elf::Elf(const std::string & path)
if (!program_header_offset
|| !program_header_num_entries
|| program_header_offset + program_header_num_entries * sizeof(ElfPhdr) > elf_size)
throw Exception("The ELF is truncated (program header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (program header points after end of file)", path);
program_headers = reinterpret_cast<const ElfPhdr *>(mapped + program_header_offset);
}

View File

@ -145,5 +145,11 @@ String FieldVisitorToString::operator() (const Object & x) const
}
/// Converts a Field to a String.
/// A Field holding a String is returned as the stored value itself, bypassing the visitor;
/// any other type is rendered through FieldVisitorToString.
String convertFieldToString(const Field & field)
{
/// Fast path for strings: return the raw value directly.
/// NOTE(review): presumably FieldVisitorToString would quote the string — confirm against its String overload.
if (field.getType() == Field::Types::Which::String)
return field.get<String>();
return applyVisitor(FieldVisitorToString(), field);
}
}

View File

@ -31,5 +31,8 @@ public:
String operator() (const bool & x) const;
};
}
/// Get value from field and convert it to string.
/// Also remove quotes from strings.
String convertFieldToString(const Field & field);
}

View File

@ -88,7 +88,13 @@ void Span::addAttribute(std::exception_ptr e) noexcept
SpanHolder::SpanHolder(std::string_view _operation_name)
{
if (current_thread_trace_context.isTraceEnabled())
if (!current_thread_trace_context.isTraceEnabled())
{
return;
}
/// Use try-catch to make sure the ctor is exception safe.
try
{
this->trace_id = current_thread_trace_context.trace_id;
this->parent_span_id = current_thread_trace_context.span_id;
@ -97,9 +103,19 @@ SpanHolder::SpanHolder(std::string_view _operation_name)
this->start_time_us
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
// set current span id to this
current_thread_trace_context.span_id = this->span_id;
/// Add new initialization here
}
catch (...)
{
tryLogCurrentException(__FUNCTION__);
/// Clear related fields to make sure the span won't be recorded.
this->trace_id = UUID();
return;
}
/// Set current span as parent of other spans created later on this thread.
current_thread_trace_context.span_id = this->span_id;
}
void SpanHolder::finish() noexcept
@ -216,7 +232,7 @@ const TracingContextOnThread & CurrentContext()
return current_thread_trace_context;
}
void TracingContextOnThread::reset()
void TracingContextOnThread::reset() noexcept
{
this->trace_id = UUID();
this->span_id = 0;
@ -231,63 +247,75 @@ TracingContextHolder::TracingContextHolder(
const Settings * settings_ptr,
const std::weak_ptr<OpenTelemetrySpanLog> & _span_log)
{
if (current_thread_trace_context.isTraceEnabled())
/// Use try-catch to make sure the ctor is exception safe.
/// If any exception is raised during the construction, the tracing is not enabled on current thread.
try
{
///
/// This is not the normal case,
/// it means that construction of current object is not at the start of current thread.
/// Usually this is due to:
/// 1. bad design
/// 2. right design but code changes so that original point where this object is constructing is not the new start execution of current thread
///
/// In such case, we should use current context as parent of this new constructing object,
/// So this branch ensures this class can be instantiated multiple times on one same thread safely.
///
this->is_context_owner = false;
this->root_span.trace_id = current_thread_trace_context.trace_id;
this->root_span.parent_span_id = current_thread_trace_context.span_id;
if (current_thread_trace_context.isTraceEnabled())
{
///
/// This is not the normal case,
/// it means that construction of current object is not at the start of current thread.
/// Usually this is due to:
/// 1. bad design
/// 2. right design but code changes so that original point where this object is constructing is not the new start execution of current thread
///
/// In such case, we should use current context as parent of this new constructing object,
/// So this branch ensures this class can be instantiated multiple times on one same thread safely.
///
this->is_context_owner = false;
this->root_span.trace_id = current_thread_trace_context.trace_id;
this->root_span.parent_span_id = current_thread_trace_context.span_id;
this->root_span.span_id = thread_local_rng();
this->root_span.operation_name = _operation_name;
this->root_span.start_time_us
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
/// Set the root span as parent of other spans created on current thread
current_thread_trace_context.span_id = this->root_span.span_id;
return;
}
if (!_parent_trace_context.isTraceEnabled())
{
if (settings_ptr == nullptr)
/// Skip tracing context initialization on current thread
return;
// Start the trace with some configurable probability.
std::bernoulli_distribution should_start_trace{settings_ptr->opentelemetry_start_trace_probability};
if (!should_start_trace(thread_local_rng))
/// skip tracing context initialization on current thread
return;
while (_parent_trace_context.trace_id == UUID())
{
// Make sure the random generated trace_id is not 0 which is an invalid id.
_parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng(); //-V656
_parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng(); //-V656
}
_parent_trace_context.span_id = 0;
}
this->root_span.trace_id = _parent_trace_context.trace_id;
this->root_span.parent_span_id = _parent_trace_context.span_id;
this->root_span.span_id = thread_local_rng();
this->root_span.operation_name = _operation_name;
this->root_span.start_time_us
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
current_thread_trace_context.span_id = this->root_span.span_id;
/// Add new initialization here
}
catch (...)
{
tryLogCurrentException(__FUNCTION__);
/// Clear related fields to make sure the tracing is not enabled.
this->root_span.trace_id = UUID();
return;
}
if (!_parent_trace_context.isTraceEnabled())
{
if (settings_ptr == nullptr)
/// skip tracing context initialization on current thread
return;
// start the trace ourselves, with some configurable probability.
std::bernoulli_distribution should_start_trace{settings_ptr->opentelemetry_start_trace_probability};
if (!should_start_trace(thread_local_rng))
/// skip tracing context initialization on current thread
return;
while (_parent_trace_context.trace_id == UUID())
{
// make sure the random generated trace_id is not 0 which is an invalid id
_parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng(); //-V656
_parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng(); //-V656
}
_parent_trace_context.span_id = 0;
}
this->root_span.trace_id = _parent_trace_context.trace_id;
this->root_span.parent_span_id = _parent_trace_context.span_id;
this->root_span.span_id = thread_local_rng();
this->root_span.operation_name = _operation_name;
this->root_span.start_time_us
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
/// This object is created to initialize tracing context on a new thread,
/// it's helpful to record the thread_id so that we know the thread switching from the span log
this->root_span.addAttribute("clickhouse.thread_id", getThreadId());
/// set up trace context on current thread
/// Set up trace context on current thread only when the root span is successfully initialized.
current_thread_trace_context = _parent_trace_context;
current_thread_trace_context.span_id = this->root_span.span_id;
current_thread_trace_context.trace_flags = TRACE_FLAG_SAMPLED;
@ -306,6 +334,18 @@ TracingContextHolder::~TracingContextHolder()
auto shared_span_log = current_thread_trace_context.span_log.lock();
if (shared_span_log)
{
try
{
/// This object is created to initialize tracing context on a new thread,
/// it's helpful to record the thread_id so that we know the thread switching from the span log
this->root_span.addAttribute("clickhouse.thread_id", getThreadId());
}
catch (...)
{
/// It's acceptable that the attribute is not recorded in case of any exception,
/// so the exception is ignored to try to log the span.
}
this->root_span.finish_time_us
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

View File

@ -74,7 +74,7 @@ struct TracingContextOnThread : TracingContext
return *this;
}
void reset();
void reset() noexcept;
/// Use weak_ptr instead of shared_ptr to hold a reference to the underlying system.opentelemetry_span_log table
/// Since this object is kept on threads and passed across threads, a weak_ptr is more safe to prevent potential leak

View File

@ -37,7 +37,7 @@ But because ClickHouse is linked with most of the symbols exported (-rdynamic fl
It allows to get source file names and line numbers from addresses. Only available if you use -g option for compiler.
It is also used by default for ClickHouse builds, but because of its weight (about two gigabytes)
it is split to separate binary and provided in clickhouse-common-static-dbg package.
This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse and is loaded automatically by tools like gdb, addr2line.
This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse.debug and is loaded automatically by tools like gdb, addr2line.
When you build ClickHouse by yourself, debug info is not split and present in a single huge binary.
What ClickHouse is using to provide good stack traces?
@ -391,10 +391,22 @@ void collectSymbolsFromELF(
std::filesystem::path local_debug_info_path = canonical_path.parent_path() / canonical_path.stem();
local_debug_info_path += ".debug";
std::filesystem::path debug_info_path = std::filesystem::path("/usr/lib/debug") / canonical_path.relative_path();
debug_info_path += ".debug";
if (std::filesystem::exists(local_debug_info_path))
/// NOTE: This is a workaround for the current packaging system.
///
/// nfpm cannot copy a file only if it exists, so cmake creates
/// an empty .debug file instead. Trying to load such an empty ELF file
/// would make Elf::Elf throw the CANNOT_PARSE_ELF exception,
/// so empty files are treated as missing.
auto exists_not_empty = [](const std::filesystem::path & path)
{
return std::filesystem::exists(path) && !std::filesystem::is_empty(path);
};
if (exists_not_empty(local_debug_info_path))
object_name = local_debug_info_path;
else if (std::filesystem::exists(debug_info_path))
else if (exists_not_empty(debug_info_path))
object_name = debug_info_path;
else if (build_id.size() >= 2)
{
@ -412,7 +424,7 @@ void collectSymbolsFromELF(
std::filesystem::path build_id_debug_info_path(
fmt::format("/usr/lib/debug/.build-id/{}/{}.debug", build_id_hex.substr(0, 2), build_id_hex.substr(2)));
if (std::filesystem::exists(build_id_debug_info_path))
if (exists_not_empty(build_id_debug_info_path))
object_name = build_id_debug_info_path;
else
object_name = canonical_path;

View File

@ -898,4 +898,25 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory()
registerZooKeeperRequest<OpNum::FilteredList, ZooKeeperFilteredListRequest>(*this);
}
/// Compares `path` with `match_to`, ignoring a single trailing '/' on either argument.
/// Returns EXACT when both name the same node, IS_CHILD when `path` continues after
/// `match_to` with a '/' separator, and NOT_MATCH in every other case.
PathMatchResult matchPath(std::string_view path, std::string_view match_to)
{
    const auto strip_trailing_slash = [](std::string_view & str)
    {
        if (str.ends_with('/'))
            str.remove_suffix(1);
    };

    strip_trailing_slash(path);
    strip_trailing_slash(match_to);

    /// `match_to` has to be a complete prefix of `path` for any kind of match.
    if (!path.starts_with(match_to))
        return PathMatchResult::NOT_MATCH;

    if (path.size() == match_to.size())
        return PathMatchResult::EXACT;

    /// `path` is longer: it is a child only if the prefix ends right at a path separator.
    if (path[match_to.size()] == '/')
        return PathMatchResult::IS_CHILD;

    return PathMatchResult::NOT_MATCH;
}
}

View File

@ -554,4 +554,13 @@ private:
ZooKeeperRequestFactory();
};
/// Result of comparing a path with a reference path (see matchPath).
enum class PathMatchResult
{
/// The path is neither the reference path itself nor located below it.
NOT_MATCH,
/// Both paths refer to the same node.
EXACT,
/// The path is located below the reference path.
IS_CHILD
};
/// Compares `path` with `match_to`; a single trailing '/' on either argument is ignored.
PathMatchResult matchPath(std::string_view path, std::string_view match_to);
}

View File

@ -0,0 +1,15 @@
#include <gtest/gtest.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
/// Covers all three outcomes of matchPath, including the handling of trailing slashes
/// and of paths that only share a textual prefix with the reference path.
TEST(ZooKeeperTest, TestMatchPath)
{
    using namespace Coordination;

    /// Paths located below the reference path.
    ASSERT_EQ(matchPath("/path/file", "/path"), PathMatchResult::IS_CHILD);
    ASSERT_EQ(matchPath("/path/file", "/path/"), PathMatchResult::IS_CHILD);
    ASSERT_EQ(matchPath("/path/file", "/"), PathMatchResult::IS_CHILD);
    ASSERT_EQ(matchPath("/path/dir/file", "/path"), PathMatchResult::IS_CHILD);

    /// Same node, with or without a trailing slash.
    ASSERT_EQ(matchPath("/", "/"), PathMatchResult::EXACT);
    ASSERT_EQ(matchPath("/path", "/path/"), PathMatchResult::EXACT);
    ASSERT_EQ(matchPath("/path/", "/path"), PathMatchResult::EXACT);

    /// A shared textual prefix without a '/' separator is not a match.
    ASSERT_EQ(matchPath("/pathfile", "/path"), PathMatchResult::NOT_MATCH);
    /// A parent of the reference path is not a match.
    ASSERT_EQ(matchPath("/path", "/path/file"), PathMatchResult::NOT_MATCH);
    /// Unrelated paths.
    ASSERT_EQ(matchPath("/other", "/path"), PathMatchResult::NOT_MATCH);
}

View File

@ -43,11 +43,8 @@ clickhouse:
text_log:
database: system
table: text_log
partition_by:
"@remove": "1"
engine:
- "@replace" : "1"
- "ENGINE MergeTree"
partition_by: {"@remove": "1"}
engine: "ENGINE MergeTree"
flush_interval_milliseconds: 7500
level: debug
)YAML";
@ -112,11 +109,8 @@ clickhouse:
text_log :
database: system
table: text_log
partition_by:
"@remove": "1"
engine:
- "@replace" : "1"
- "ENGINE MergeTree"
partition_by: {"@remove": "1"}
engine: "ENGINE MergeTree"
flush_interval_milliseconds: 7500
level: debug
)YAML";

View File

@ -13,40 +13,12 @@
using namespace DB;
TEST(Common, YamlParserInvalidFile)
TEST(YamlParser, InvalidFile)
{
ASSERT_THROW(YAMLParser::parse("some-non-existing-file.yaml"), Exception);
}
TEST(Common, YamlParserProcessKeysList)
{
auto yaml_file = getFileWithContents("keys-list.yaml", R"YAML(
operator:
access_management: "1"
networks:
- ip: "10.1.6.168"
- ip: "::1"
- ip: "127.0.0.1"
)YAML");
SCOPE_EXIT({ yaml_file->remove(); });
Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
auto *p_node = xml->getNodeByPath("/clickhouse");
EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
<operator>
<access_management>1</access_management>
<networks>
<ip>10.1.6.168</ip>
<ip>::1</ip>
<ip>127.0.0.1</ip>
</networks>
</operator>
</clickhouse>
)CONFIG");
}
TEST(Common, YamlParserProcessValuesList)
TEST(YamlParser, ProcessValuesList)
{
auto yaml_file = getFileWithContents("values-list.yaml", R"YAML(
rules:
@ -75,4 +47,141 @@ rules:
)CONFIG");
}
/// A YAML list given as the value of a key (`ip`) is expected to produce
/// one <ip> element per list item in the resulting XML.
TEST(YamlParser, ProcessKeysList)
{
auto yaml_file = getFileWithContents("keys-list.yaml", R"YAML(
operator:
access_management: 1
networks:
ip:
- 10.1.6.168
- ::1
- 127.0.0.1
)YAML");
/// Remove the temporary file when the test finishes.
SCOPE_EXIT({ yaml_file->remove(); });
Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
auto *p_node = xml->getNodeByPath("/clickhouse");
EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
<operator>
<access_management>1</access_management>
<networks>
<ip>10.1.6.168</ip>
<ip>::1</ip>
<ip>127.0.0.1</ip>
</networks>
</operator>
</clickhouse>
)CONFIG");
}
/// Keys starting with '@' inside the items of a sequence are expected to become
/// XML attributes of the corresponding <seq> element instead of child elements.
TEST(YamlParser, ProcessListAttributes)
{
auto yaml_file = getFileWithContents("list_attributes.yaml", R"YAML(
seq:
- "@attr1": x
- k1: val1
k2: val2
"@attr2": y
- k3: val3
"@attr3": z
)YAML");
/// Remove the temporary file when the test finishes.
SCOPE_EXIT({ yaml_file->remove(); });
Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
auto *p_node = xml->getNodeByPath("/clickhouse");
EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
<seq attr1="x"></seq>
<seq attr2="y">
<k1>val1</k1>
<k2>val2</k2>
</seq>
<seq attr3="z">
<k3>val3</k3>
</seq>
</clickhouse>
)CONFIG");
}
/// Keys starting with '@' inside a map are expected to become attributes of the <map> element,
/// wherever they appear among the ordinary keys, which become child elements.
TEST(YamlParser, ProcessMapAttributes)
{
auto yaml_file = getFileWithContents("map_attributes.yaml", R"YAML(
map:
"@attr1": x
k1: val1
k2: val2
"@attr2": y
k3: val3
"@attr3": z
)YAML");
/// Remove the temporary file when the test finishes.
SCOPE_EXIT({ yaml_file->remove(); });
Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
auto *p_node = xml->getNodeByPath("/clickhouse");
EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
<map attr1="x" attr2="y" attr3="z">
<k1>val1</k1>
<k2>val2</k2>
<k3>val3</k3>
</map>
</clickhouse>
)CONFIG");
}
/// A cluster definition with nested sequences (shards containing replicas) is expected
/// to expand into repeated <shard> elements, each with repeated <replica> elements.
TEST(YamlParser, ClusterDef)
{
auto yaml_file = getFileWithContents("cluster_def.yaml", R"YAML(
test_cluster:
shard:
- internal_replication: false
replica:
- host: 127.0.0.1
port: 9000
- host: 127.0.0.2
port: 9000
- internal_replication: true
replica:
- host: 127.0.0.3
port: 9000
- host: 127.0.0.4
port: 9000
)YAML");
/// Remove the temporary file when the test finishes.
SCOPE_EXIT({ yaml_file->remove(); });
Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
auto *p_node = xml->getNodeByPath("/clickhouse");
EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
<test_cluster>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>127.0.0.1</host>
<port>9000</port>
</replica>
<replica>
<host>127.0.0.2</host>
<port>9000</port>
</replica>
</shard>
<shard>
<internal_replication>true</internal_replication>
<replica>
<host>127.0.0.3</host>
<port>9000</port>
</replica>
<replica>
<host>127.0.0.4</host>
<port>9000</port>
</replica>
</shard>
</test_cluster>
</clickhouse>
)CONFIG");
}
#endif

View File

@ -13,8 +13,10 @@
#include <filesystem>
#include <memory>
#include <Common/logger_useful.h>
#include "Coordination/KeeperContext.h"
#include <Coordination/KeeperContext.h>
#include <Coordination/KeeperConstants.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
namespace DB
{
@ -146,33 +148,6 @@ namespace
}
}
namespace
{
enum class PathMatchResult
{
NOT_MATCH,
EXACT,
IS_CHILD
};
PathMatchResult matchPath(const std::string_view path, const std::string_view match_to)
{
using enum PathMatchResult;
auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end());
if (second_it != match_to.end())
return NOT_MATCH;
if (first_it == path.end())
return EXACT;
return *first_it == '/' ? IS_CHILD : NOT_MATCH;
}
}
void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context)
{
writeBinary(static_cast<uint8_t>(snapshot.version), out);
@ -217,7 +192,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
const auto & path = it->key;
// write only the root system path because of digest
if (matchPath(path.toView(), keeper_system_path) == PathMatchResult::IS_CHILD)
if (Coordination::matchPath(path.toView(), keeper_system_path) == Coordination::PathMatchResult::IS_CHILD)
{
++it;
continue;
@ -365,8 +340,8 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
KeeperStorage::Node node{};
readNode(node, in, current_version, storage.acl_map);
using enum PathMatchResult;
auto match_result = matchPath(path, keeper_system_path);
using enum Coordination::PathMatchResult;
auto match_result = Coordination::matchPath(path, keeper_system_path);
const std::string error_msg = fmt::format("Cannot read node on path {} from a snapshot because it is used as a system node", path);
if (match_result == IS_CHILD)

View File

@ -879,7 +879,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
path_created += seq_num_str.str();
}
if (path_created.starts_with(keeper_system_path))
if (Coordination::matchPath(path_created, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
{
auto error_msg = fmt::format("Trying to create a node inside the internal Keeper path ({}) which is not allowed. Path: {}", keeper_system_path, path_created);
@ -1049,7 +1049,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
std::vector<KeeperStorage::Delta> new_deltas;
if (request.path.starts_with(keeper_system_path))
if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
{
auto error_msg = fmt::format("Trying to delete an internal Keeper path ({}) which is not allowed", request.path);
@ -1203,7 +1203,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce
std::vector<KeeperStorage::Delta> new_deltas;
if (request.path.starts_with(keeper_system_path))
if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
{
auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path);
@ -1472,7 +1472,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr
{
Coordination::ZooKeeperSetACLRequest & request = dynamic_cast<Coordination::ZooKeeperSetACLRequest &>(*zk_request);
if (request.path.starts_with(keeper_system_path))
if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
{
auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path);

View File

@ -2141,6 +2141,38 @@ TEST_P(CoordinationTest, TestCurrentApiVersion)
EXPECT_EQ(keeper_version, static_cast<uint8_t>(current_keeper_api_version));
}
/// Checks which create requests are rejected with ZBADARGUMENTS and which succeed:
/// `/keeper` and paths below it are expected to be rejected, while paths that merely
/// share the textual prefix are expected to be created normally.
TEST_P(CoordinationTest, TestSystemNodeModify)
{
using namespace Coordination;
int64_t zxid{0};
// On INIT we abort when a system path is modified
keeper_context->server_state = KeeperContext::Phase::RUNNING;
KeeperStorage storage{500, "", keeper_context};
// Sends a create request for `path` through preprocessRequest + processRequest and
// checks the error code of the first response. zxid is incremented afterwards,
// so every request is processed with its own zxid.
const auto assert_create = [&](const std::string_view path, const auto expected_code)
{
auto request = std::make_shared<ZooKeeperCreateRequest>();
request->path = path;
storage.preprocessRequest(request, 0, 0, zxid);
auto responses = storage.processRequest(request, 0, zxid);
ASSERT_FALSE(responses.empty());
const auto & response = responses[0];
ASSERT_EQ(response.response->error, expected_code) << "Unexpected error for path " << path;
++zxid;
};
// `/keeper` itself and nodes below it must be rejected.
assert_create("/keeper", Error::ZBADARGUMENTS);
assert_create("/keeper/with_child", Error::ZBADARGUMENTS);
assert_create(DB::keeper_api_version_path, Error::ZBADARGUMENTS);
// Paths that only share a prefix with "/keeper" (or are shorter than it) are ordinary nodes.
assert_create("/keeper_map", Error::ZOK);
assert_create("/keeper1", Error::ZOK);
assert_create("/keepe", Error::ZOK);
assert_create("/keeper1/test", Error::ZOK);
}
INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite,
CoordinationTest,
::testing::ValuesIn(std::initializer_list<CompressionParam>{

View File

@ -213,7 +213,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
\
M(Bool, insert_deduplicate, true, "For INSERT queries in the replicated table, specifies that deduplication of insertings blocks should be performed", 0) \
\
M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled.", 0) \
M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled, 'auto' - use majority", 0) \
M(Milliseconds, insert_quorum_timeout, 600000, "If the quorum of replicas did not meet in specified time (in milliseconds), exception will be thrown and insertion is aborted.", 0) \
M(Bool, insert_quorum_parallel, true, "For quorum INSERT queries - enable to make parallel inserts without linearizability", 0) \
M(UInt64, select_sequential_consistency, 0, "For SELECT queries from the replicated table, throw an exception if the replica does not have a chunk written with the quorum; do not read the parts that have not yet been written with the quorum.", 0) \

View File

@ -44,15 +44,6 @@ struct AttributeConfiguration
using AttributeNameToConfiguration = std::unordered_map<std::string, AttributeConfiguration>;
/// Get value from field and convert it to string.
/// Also remove quotes from strings.
String getFieldAsString(const Field & field)
{
if (field.getType() == Field::Types::Which::String)
return field.get<String>();
return applyVisitor(FieldVisitorToString(), field);
}
String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_attr)
{
if (!dict_attr->expression)
@ -61,7 +52,7 @@ String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_att
/// EXPRESSION PROPERTY should be expression or string
String expression_str;
if (const auto * literal = dict_attr->expression->as<ASTLiteral>(); literal && literal->value.getType() == Field::Types::String)
expression_str = getFieldAsString(literal->value);
expression_str = convertFieldToString(literal->value);
else
expression_str = queryToString(dict_attr->expression);
@ -275,7 +266,7 @@ void buildSingleAttribute(
AutoPtr<Element> null_value_element(doc->createElement("null_value"));
String null_value_str;
if (dict_attr->default_value)
null_value_str = getFieldAsString(dict_attr->default_value->as<ASTLiteral>()->value);
null_value_str = convertFieldToString(dict_attr->default_value->as<ASTLiteral>()->value);
AutoPtr<Text> null_value(doc->createTextNode(null_value_str));
null_value_element->appendChild(null_value);
attribute_element->appendChild(null_value_element);
@ -452,7 +443,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments(
}
else if (const auto * literal = pair->second->as<const ASTLiteral>())
{
AutoPtr<Text> value(doc->createTextNode(getFieldAsString(literal->value)));
AutoPtr<Text> value(doc->createTextNode(convertFieldToString(literal->value)));
current_xml_element->appendChild(value);
}
else if (const auto * list = pair->second->as<const ASTExpressionList>())
@ -473,7 +464,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments(
Field value;
result->get(0, value);
AutoPtr<Text> text_value(doc->createTextNode(getFieldAsString(value)));
AutoPtr<Text> text_value(doc->createTextNode(convertFieldToString(value)));
current_xml_element->appendChild(text_value);
}
else
@ -519,7 +510,7 @@ void buildSourceConfiguration(
{
AutoPtr<Element> setting_change_element(doc->createElement(name));
settings_element->appendChild(setting_change_element);
AutoPtr<Text> setting_value(doc->createTextNode(getFieldAsString(value)));
AutoPtr<Text> setting_value(doc->createTextNode(convertFieldToString(value)));
setting_change_element->appendChild(setting_value);
}
}

View File

@ -239,7 +239,16 @@ public:
}
/// For one local path there might be multiple remote paths in case of Log family engines.
using LocalPathWithObjectStoragePaths = std::pair<String, StoredObjects>;
struct LocalPathWithObjectStoragePaths
{
    /// Local path the objects belong to.
    std::string local_path;
    /// Prefix shared by the objects listed below.
    std::string common_prefix_for_objects;
    /// Objects which store the data of `local_path`.
    StoredObjects objects;

    /// The strings are copied, the list of objects is moved in.
    LocalPathWithObjectStoragePaths(
        const std::string & local_path_,
        const std::string & common_prefix_for_objects_,
        StoredObjects && objects_)
        : local_path(local_path_)
        , common_prefix_for_objects(common_prefix_for_objects_)
        , objects(std::move(objects_))
    {
    }
};
virtual void getRemotePathsRecursive(const String &, std::vector<LocalPathWithObjectStoragePaths> &)
{

View File

@ -127,7 +127,7 @@ void DiskObjectStorage::getRemotePathsRecursive(const String & local_path, std::
{
try
{
paths_map.emplace_back(local_path, getStorageObjects(local_path));
paths_map.emplace_back(local_path, metadata_storage->getObjectStorageRootPath(), getStorageObjects(local_path));
}
catch (const Exception & e)
{
@ -282,7 +282,10 @@ String DiskObjectStorage::getUniqueId(const String & path) const
bool DiskObjectStorage::checkUniqueId(const String & id) const
{
if (!id.starts_with(object_storage_root_path))
{
LOG_DEBUG(log, "Blob with id {} doesn't start with blob storage prefix {}", id, object_storage_root_path);
return false;
}
auto object = StoredObject::create(*object_storage, id, {}, {}, true);
return object_storage->exists(object);

View File

@ -68,6 +68,14 @@ void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf)
}
}
void DiskObjectStorageMetadata::createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool read_only_)
{
storage_objects.emplace_back(relative_path, bytes_size);
total_size = bytes_size;
ref_count = ref_count_;
read_only = read_only_;
}
void DiskObjectStorageMetadata::deserializeFromString(const std::string & data)
{
ReadBufferFromString buf(data);

View File

@ -50,6 +50,7 @@ public:
void deserialize(ReadBuffer & buf);
void deserializeFromString(const std::string & data);
void createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool is_read_only_);
void serialize(WriteBuffer & buf, bool sync) const;
std::string serializeToString() const;

View File

@ -56,7 +56,7 @@ void throwIfError(const Aws::Utils::Outcome<Result, Error> & response)
if (!response.IsSuccess())
{
const auto & err = response.GetError();
throw Exception(ErrorCodes::S3_ERROR, "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
throw S3Exception(fmt::format("{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType())), err.GetErrorType());
}
}
@ -70,7 +70,7 @@ void throwIfUnexpectedError(const Aws::Utils::Outcome<Result, Error> & response,
if (!response.IsSuccess() && (!if_exists || !isNotFoundError(response.GetError().GetErrorType())))
{
const auto & err = response.GetError();
throw Exception(ErrorCodes::S3_ERROR, "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
throw S3Exception(err.GetErrorType(), "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
}
}

View File

@ -34,6 +34,7 @@ namespace ErrorCodes
extern const int CANNOT_SEEK_THROUGH_FILE;
extern const int SEEK_POSITION_OUT_OF_BOUND;
extern const int LOGICAL_ERROR;
extern const int CANNOT_ALLOCATE_MEMORY;
}
@ -136,6 +137,23 @@ bool ReadBufferFromS3::nextImpl()
ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Microseconds, watch.elapsedMicroseconds());
ProfileEvents::increment(ProfileEvents::ReadBufferFromS3RequestsErrors, 1);
if (const auto * s3_exception = dynamic_cast<const S3Exception *>(&e))
{
/// It doesn't make sense to retry Access Denied or No Such Key
if (!s3_exception->isRetryableError())
{
tryLogCurrentException(log, fmt::format("while reading key: {}, from bucket: {}", key, bucket));
throw;
}
}
/// It doesn't make sense to retry allocator errors
if (e.code() == ErrorCodes::CANNOT_ALLOCATE_MEMORY)
{
tryLogCurrentException(log);
throw;
}
LOG_DEBUG(
log,
"Caught exception while reading S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}, Attempt: {}, Message: {}",
@ -306,7 +324,10 @@ std::unique_ptr<ReadBuffer> ReadBufferFromS3::initialize()
return std::make_unique<ReadBufferFromIStream>(read_result.GetBody(), buffer_size);
}
else
throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
{
const auto & error = outcome.GetError();
throw S3Exception(error.GetMessage(), error.GetErrorType());
}
}
SeekableReadBufferPtr ReadBufferS3Factory::getReader()

View File

@ -35,6 +35,26 @@
# include <fstream>
namespace DB
{
bool S3Exception::isRetryableError() const
{
/// Looks like these list is quite conservative, add more codes if you wish
static const std::unordered_set<Aws::S3::S3Errors> unretryable_errors = {
Aws::S3::S3Errors::NO_SUCH_KEY,
Aws::S3::S3Errors::ACCESS_DENIED,
Aws::S3::S3Errors::INVALID_ACCESS_KEY_ID,
Aws::S3::S3Errors::INVALID_SIGNATURE,
Aws::S3::S3Errors::NO_SUCH_UPLOAD,
Aws::S3::S3Errors::NO_SUCH_BUCKET,
};
return !unretryable_errors.contains(code);
}
}
namespace
{

View File

@ -7,23 +7,62 @@
#include <base/types.h>
#include <aws/core/Aws.h>
#include <aws/core/client/ClientConfiguration.h>
#include <aws/s3/S3Errors.h>
#include <IO/S3/PocoHTTPClient.h>
#include <Poco/URI.h>
#include <Common/Exception.h>
namespace Aws::S3
{
class S3Client;
}
namespace DB
{
class RemoteHostFilter;
struct HttpHeader;
using HeaderCollection = std::vector<HttpHeader>;
namespace ErrorCodes
{
extern const int S3_ERROR;
}
class RemoteHostFilter;
struct HttpHeader;
using HeaderCollection = std::vector<HttpHeader>;
/// Exception for failed S3 requests. It always carries ErrorCodes::S3_ERROR
/// and additionally keeps the S3 error code it was constructed with.
class S3Exception : public Exception
{
public:
    /// Takes an already formatted message.
    S3Exception(const std::string & msg, Aws::S3::S3Errors code_)
        : Exception(msg, ErrorCodes::S3_ERROR)
        , code(code_)
    {
    }

    /// Formats the message with fmt::format, like the logging functions.
    template <typename... Args>
    S3Exception(Aws::S3::S3Errors code_, fmt::format_string<Args...> format_str, Args &&... args)
        : Exception(fmt::format(format_str, std::forward<Args>(args)...), ErrorCodes::S3_ERROR)
        , code(code_)
    {
    }

    /// S3 error code passed to the constructor.
    Aws::S3::S3Errors getS3ErrorCode() const { return code; }

    /// Whether it makes sense to repeat the failed request.
    bool isRetryableError() const;

private:
    const Aws::S3::S3Errors code;
};
}
namespace DB::S3
{
class ClientFactory
{
public:

View File

@ -8,6 +8,7 @@
#include <IO/WriteBufferFromS3.h>
#include <IO/WriteHelpers.h>
#include <IO/S3Common.h>
#include <Interpreters/Context.h>
#include <aws/s3/S3Client.h>
@ -173,7 +174,9 @@ void WriteBufferFromS3::finalizeImpl()
auto response = client_ptr->HeadObject(request);
if (!response.IsSuccess())
throw Exception(ErrorCodes::S3_ERROR, "Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket);
throw S3Exception(fmt::format("Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket), response.GetError().GetErrorType());
else
LOG_TRACE(log, "Object {} exists after upload", key);
}
}
@ -197,7 +200,7 @@ void WriteBufferFromS3::createMultipartUpload()
LOG_TRACE(log, "Multipart upload has created. Bucket: {}, Key: {}, Upload id: {}", bucket, key, multipart_upload_id);
}
else
throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
}
void WriteBufferFromS3::writePart()
@ -309,7 +312,7 @@ void WriteBufferFromS3::processUploadRequest(UploadPartTask & task)
LOG_TRACE(log, "Writing part finished. Bucket: {}, Key: {}, Upload_id: {}, Etag: {}, Parts: {}", bucket, key, multipart_upload_id, task.tag, part_tags.size());
}
else
throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
total_parts_uploaded++;
}
@ -343,9 +346,10 @@ void WriteBufferFromS3::completeMultipartUpload()
LOG_TRACE(log, "Multipart upload has completed. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, tags.size());
else
{
throw Exception(ErrorCodes::S3_ERROR, "{} Tags:{}",
outcome.GetError().GetMessage(),
fmt::join(tags.begin(), tags.end(), " "));
throw S3Exception(
outcome.GetError().GetErrorType(),
"Message: {}, Key: {}, Bucket: {}, Tags: {}",
outcome.GetError().GetMessage(), key, bucket, fmt::join(tags.begin(), tags.end(), " "));
}
}
@ -430,7 +434,10 @@ void WriteBufferFromS3::processPutRequest(const PutObjectTask & task)
if (outcome.IsSuccess())
LOG_TRACE(log, "Single part upload has completed. Bucket: {}, Key: {}, Object size: {}, WithPool: {}", bucket, key, task.req.GetContentLength(), with_pool);
else
throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
throw S3Exception(
outcome.GetError().GetErrorType(),
"Message: {}, Key: {}, Bucket: {}, Object size: {}, WithPool: {}",
outcome.GetError().GetMessage(), key, bucket, task.req.GetContentLength(), with_pool);
}
void WriteBufferFromS3::waitForReadyBackGroundTasks()

View File

@ -21,7 +21,6 @@ namespace DB
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int TABLE_IS_READ_ONLY;
extern const int SUPPORT_IS_DISABLED;
}
@ -34,11 +33,6 @@ InterpreterDeleteQuery::InterpreterDeleteQuery(const ASTPtr & query_ptr_, Contex
BlockIO InterpreterDeleteQuery::execute()
{
if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete)
{
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. Set `allow_experimental_lightweight_delete` setting to enable it");
}
FunctionNameNormalizer().visit(query_ptr.get());
const ASTDeleteQuery & delete_query = query_ptr->as<ASTDeleteQuery &>();
auto table_id = getContext()->resolveStorageID(delete_query, Context::ResolveOrdinary);
@ -49,10 +43,6 @@ BlockIO InterpreterDeleteQuery::execute()
/// First check table storage for validations.
StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext());
auto merge_tree = std::dynamic_pointer_cast<MergeTreeData>(table);
if (!merge_tree)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only MergeTree family tables are supported");
checkStorageSupportsTransactionsIfNeeded(table, getContext());
if (table->isStaticStorage())
throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is read-only");
@ -69,6 +59,27 @@ BlockIO InterpreterDeleteQuery::execute()
auto table_lock = table->lockForShare(getContext()->getCurrentQueryId(), getContext()->getSettingsRef().lock_acquire_timeout);
auto metadata_snapshot = table->getInMemoryMetadataPtr();
auto merge_tree = std::dynamic_pointer_cast<MergeTreeData>(table);
if (!merge_tree)
{
/// Convert to MutationCommand
MutationCommands mutation_commands;
MutationCommand mut_command;
mut_command.type = MutationCommand::Type::DELETE;
mut_command.predicate = delete_query.predicate;
mutation_commands.emplace_back(mut_command);
table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef());
MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate();
table->mutate(mutation_commands, getContext());
return {};
}
if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete)
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. Set `allow_experimental_lightweight_delete` setting to enable it");
/// Convert to MutationCommand
MutationCommands mutation_commands;
MutationCommand mut_command;

View File

@ -226,7 +226,7 @@ bool isStorageTouchedByMutations(
ASTPtr select_query = prepareQueryAffectedAST(commands, storage, context_copy);
/// Interpreter must be alive, when we use result of execute() method.
/// For some reason it may copy context and and give it into ExpressionTransform
/// For some reason it may copy context and give it into ExpressionTransform
/// after that we will use context from destroyed stack frame in our stream.
InterpreterSelectQuery interpreter(
select_query, context_copy, storage, metadata_snapshot, SelectQueryOptions().ignoreLimits().ignoreProjections());
@ -288,13 +288,17 @@ MutationsInterpreter::MutationsInterpreter(
const StorageMetadataPtr & metadata_snapshot_,
MutationCommands commands_,
ContextPtr context_,
bool can_execute_)
bool can_execute_,
bool return_all_columns_,
bool return_deleted_rows_)
: storage(std::move(storage_))
, metadata_snapshot(metadata_snapshot_)
, commands(std::move(commands_))
, context(Context::createCopy(context_))
, can_execute(can_execute_)
, select_limits(SelectQueryOptions().analyze(!can_execute).ignoreLimits().ignoreProjections())
, return_all_columns(return_all_columns_)
, return_deleted_rows(return_deleted_rows_)
{
mutation_ast = prepare(!can_execute);
}
@ -472,14 +476,21 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run)
/// First, break a sequence of commands into stages.
for (auto & command : commands)
{
// we can return deleted rows only if it's the only present command
assert(command.type == MutationCommand::DELETE || !return_deleted_rows);
if (command.type == MutationCommand::DELETE)
{
mutation_kind.set(MutationKind::MUTATE_OTHER);
if (stages.empty() || !stages.back().column_to_updated.empty())
stages.emplace_back(context);
auto negated_predicate = makeASTFunction("isZeroOrNull", getPartitionAndPredicateExpressionForMutationCommand(command));
stages.back().filters.push_back(negated_predicate);
auto predicate = getPartitionAndPredicateExpressionForMutationCommand(command);
if (!return_deleted_rows)
predicate = makeASTFunction("isZeroOrNull", predicate);
stages.back().filters.push_back(predicate);
}
else if (command.type == MutationCommand::UPDATE)
{
@ -789,7 +800,7 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector<Stage> &
/// Next, for each stage calculate columns changed by this and previous stages.
for (size_t i = 0; i < prepared_stages.size(); ++i)
{
if (!prepared_stages[i].filters.empty())
if (return_all_columns || !prepared_stages[i].filters.empty())
{
for (const auto & column : all_columns)
prepared_stages[i].output_columns.insert(column.name);

View File

@ -43,7 +43,9 @@ public:
const StorageMetadataPtr & metadata_snapshot_,
MutationCommands commands_,
ContextPtr context_,
bool can_execute_);
bool can_execute_,
bool return_all_columns_ = false,
bool return_deleted_rows_ = false);
void validate();
@ -156,6 +158,12 @@ private:
/// Columns, that we need to read for calculation of skip indices, projections or TTL expressions.
ColumnDependencies dependencies;
// whether all columns should be returned, not just updated
bool return_all_columns;
// whether we should return deleted or nondeleted rows on DELETE mutation
bool return_deleted_rows;
};
}

View File

@ -224,8 +224,6 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
select_query->group_by_with_rollup = true;
else if (s_cube.ignore(pos, expected))
select_query->group_by_with_cube = true;
else if (s_grouping_sets.ignore(pos, expected))
select_query->group_by_with_grouping_sets = true;
else if (s_totals.ignore(pos, expected))
select_query->group_by_with_totals = true;
else

View File

@ -251,14 +251,17 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
outputs.push_back(grouping_node);
const auto & missing_columns = grouping_sets_params[set_counter].missing_keys;
const auto & used_keys = grouping_sets_params[set_counter].used_keys;
auto to_nullable_function = FunctionFactory::instance().get("toNullable", nullptr);
for (size_t i = 0; i < output_header.columns(); ++i)
{
auto & col = output_header.getByPosition(i);
const auto it = std::find_if(
const auto missing_it = std::find_if(
missing_columns.begin(), missing_columns.end(), [&](const auto & missing_col) { return missing_col == col.name; });
if (it != missing_columns.end())
const auto used_it = std::find_if(
used_keys.begin(), used_keys.end(), [&](const auto & used_col) { return used_col == col.name; });
if (missing_it != missing_columns.end())
{
auto column_with_default = col.column->cloneEmpty();
col.type->insertDefaultInto(*column_with_default);
@ -270,7 +273,7 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
else
{
const auto * column_node = dag->getOutputs()[header.getPositionByName(col.name)];
if (group_by_use_nulls && column_node->result_type->canBeInsideNullable())
if (used_it != used_keys.end() && group_by_use_nulls && column_node->result_type->canBeInsideNullable())
outputs.push_back(&dag->addFunction(to_nullable_function, { column_node }, col.name));
else
outputs.push_back(column_node);

View File

@ -179,7 +179,6 @@ Pipe ReadFromMergeTree::readFromPool(
sum_marks,
min_marks_for_concurrent_read,
std::move(parts_with_range),
data,
storage_snapshot,
prewhere_info,
required_columns,

View File

@ -0,0 +1,24 @@
#pragma once
#include <string>
#include <unordered_map>
namespace DB
{
/// On-the-fly alter conversions that have to be applied to a part while it is read.
/// They are built from the most recent mutation commands for the part.
/// For now only rename_map (coming from ALTER_RENAME commands) is stored here,
/// because for every other kind of alter the conversions can be deduced from the
/// difference between part->getColumns() and storage->getColumns().
struct AlterConversions
{
    /// Rename map: new_name -> old_name.
    std::unordered_map<std::string, std::string> rename_map;

    /// Returns true when `new_name` is present as a key in rename_map.
    bool isColumnRenamed(const std::string & new_name) const
    {
        return rename_map.find(new_name) != rename_map.end();
    }

    /// Returns the old name recorded for `new_name`.
    /// Throws std::out_of_range if rename_map has no entry for `new_name`.
    std::string getColumnOldName(const std::string & new_name) const
    {
        return rename_map.at(new_name);
    }
};
}

View File

@ -399,7 +399,7 @@ MergeTreeData::DataPartPtr Service::findPart(const String & name)
throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "No part {} in table", name);
}
MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
MergeTreeData::MutableDataPartPtr Fetcher::fetchSelectedPart(
const StorageMetadataPtr & metadata_snapshot,
ContextPtr context,
const String & part_name,
@ -420,6 +420,11 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
if (blocker.isCancelled())
throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED);
const auto data_settings = data.getSettings();
if (data_settings->allow_remote_fs_zero_copy_replication && !try_zero_copy)
LOG_WARNING(log, "Zero copy replication enabled, but trying to fetch part {} without zero copy", part_name);
/// It should be "tmp-fetch_" and not "tmp_fetch_", because we can fetch part to detached/,
/// but detached part name prefix should not contain underscore.
static const String TMP_PREFIX = "tmp-fetch_";
@ -429,7 +434,6 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
/// Validation of the input that may come from malicious replica.
auto part_info = MergeTreePartInfo::fromPartName(part_name, data.format_version);
const auto data_settings = data.getSettings();
Poco::URI uri;
uri.setScheme(interserver_scheme);
@ -465,6 +469,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
capability.push_back(toString(disk->getDataSourceDescription().type));
}
}
if (!capability.empty())
{
::sort(capability.begin(), capability.end());
@ -474,6 +479,9 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
}
else
{
if (data_settings->allow_remote_fs_zero_copy_replication)
LOG_WARNING(log, "Cannot select any zero-copy disk for {}", part_name);
try_zero_copy = false;
}
@ -585,7 +593,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
temporary_directory_lock = {};
/// Try again but without zero-copy
return fetchPart(metadata_snapshot, context, part_name, replica_path, host, port, timeouts,
return fetchSelectedPart(metadata_snapshot, context, part_name, replica_path, host, port, timeouts,
user, password, interserver_scheme, throttler, to_detached, tmp_prefix, nullptr, false, disk);
}
}

View File

@ -66,7 +66,7 @@ public:
explicit Fetcher(StorageReplicatedMergeTree & data_) : data(data_), log(&Poco::Logger::get("Fetcher")) {}
/// Downloads a part to tmp_directory. If to_detached - downloads to the `detached` directory.
MergeTreeData::MutableDataPartPtr fetchPart(
MergeTreeData::MutableDataPartPtr fetchSelectedPart(
const StorageMetadataPtr & metadata_snapshot,
ContextPtr context,
const String & part_name,

View File

@ -532,25 +532,6 @@ void IMergeTreeDataPart::removeIfNeeded()
LOG_TRACE(storage.log, "Removed part from old location {}", path);
}
}
catch (const Exception & ex)
{
tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while removing part {} with path {}", name, path));
/// In this case we want to avoid assertions, because such errors are unavoidable in setup
/// with zero-copy replication.
if (const auto * keeper_exception = dynamic_cast<const Coordination::Exception *>(&ex))
{
if (Coordination::isHardwareError(keeper_exception->code))
return;
}
/// FIXME If part it temporary, then directory will not be removed for 1 day (temporary_directories_lifetime).
/// If it's tmp_merge_<part_name> or tmp_fetch_<part_name>,
/// then all future attempts to execute part producing operation will fail with "directory already exists".
assert(!is_temp);
assert(state != MergeTreeDataPartState::DeleteOnDestroy);
assert(state != MergeTreeDataPartState::Temporary);
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while removing part {} with path {}", name, path));
@ -558,11 +539,6 @@ void IMergeTreeDataPart::removeIfNeeded()
/// FIXME If part is temporary, then directory will not be removed for 1 day (temporary_directories_lifetime).
/// If it's tmp_merge_<part_name> or tmp_fetch_<part_name>,
/// then all future attempts to execute part producing operation will fail with "directory already exists".
///
/// For remote disks this issue is really frequent, so we don't about server here
assert(!is_temp);
assert(state != MergeTreeDataPartState::DeleteOnDestroy);
assert(state != MergeTreeDataPartState::Temporary);
}
}
@ -1433,7 +1409,10 @@ std::pair<bool, NameSet> IMergeTreeDataPart::canRemovePart() const
{
/// NOTE: It's needed for zero-copy replication
if (force_keep_shared_data)
{
LOG_DEBUG(storage.log, "Blobs for part {} cannot be removed because it's forced to be keeped", name);
return std::make_pair(false, NameSet{});
}
return storage.unlockSharedData(*this);
}
@ -1457,6 +1436,12 @@ void IMergeTreeDataPart::remove() const
auto [can_remove, files_not_to_remove] = canRemovePart();
if (!can_remove)
LOG_TRACE(storage.log, "Blobs of part {} cannot be removed", name);
if (!files_not_to_remove.empty())
LOG_TRACE(storage.log, "Some blobs ({}) of part {} cannot be removed", fmt::join(files_not_to_remove, ", "), name);
if (!isStoredOnDisk())
return;

View File

@ -0,0 +1,68 @@
#pragma once
#include <Interpreters/Context.h>
#include <Storages/MergeTree/AlterConversions.h>
#include <Core/NamesAndTypes.h>
namespace DB
{
class IDataPartStorage;
using DataPartStoragePtr = std::shared_ptr<IDataPartStorage>;
class MergeTreeIndexGranularity;
struct MergeTreeDataPartChecksums;
struct MergeTreeIndexGranularityInfo;
class ISerialization;
using SerializationPtr = std::shared_ptr<const ISerialization>;
/**
* A class which contains all information about a data part that is required
* in order to use MergeTreeDataPartReader's.
* It is a separate interface and not a simple struct because
* otherwise it will need to copy all the information which might not
* be even used (for example, a MergeTreeIndexGranularity object is quite heavy).
*
* Every accessor is pure virtual; the context passed to the constructor is
* stored in the WithContext base.
*/
class IMergeTreeDataPartInfoForReader : public WithContext
{
public:
/// Forwards `context_` to the WithContext base.
explicit IMergeTreeDataPartInfoForReader(ContextPtr context_) : WithContext(context_) {}
/// Virtual destructor: the class is meant to be used through base pointers
/// (see MergeTreeDataPartInfoForReaderPtr below).
virtual ~IMergeTreeDataPartInfoForReader() = default;
/// Predicates describing the kind of the part; exact semantics are defined by implementations.
virtual bool isCompactPart() const = 0;
virtual bool isWidePart() const = 0;
virtual bool isInMemoryPart() const = 0;
virtual bool isProjectionPart() const = 0;
/// Storage object of the part, returned by const reference (no copy).
virtual const DataPartStoragePtr & getDataPartStorage() const = 0;
/// Columns of the part, returned by const reference.
virtual const NamesAndTypesList & getColumns() const = 0;
/// Position of `column_name`; the optional return type allows "no position"
/// (presumably when the column is absent from the part - confirm in implementations).
virtual std::optional<size_t> getColumnPosition(const String & column_name) const = 0;
virtual String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const = 0;
virtual const MergeTreeDataPartChecksums & getChecksums() const = 0;
/// Alter conversions for the part. Note: returned by value, unlike most other accessors.
virtual AlterConversions getAlterConversions() const = 0;
virtual size_t getMarksCount() const = 0;
/// NOTE(review): judging by the name, yields 0 when the size is unavailable - confirm in implementations.
virtual size_t getFileSizeOrZero(const std::string & file_name) const = 0;
virtual const MergeTreeIndexGranularityInfo & getIndexGranularityInfo() const = 0;
virtual const MergeTreeIndexGranularity & getIndexGranularity() const = 0;
virtual SerializationPtr getSerialization(const NameAndTypePair & column) const = 0;
virtual const SerializationInfoByName & getSerializationInfos() const = 0;
/// The only non-const member function of the interface.
virtual void reportBroken() = 0;
};
using MergeTreeDataPartInfoForReaderPtr = std::shared_ptr<IMergeTreeDataPartInfoForReader>;
}

View File

@ -23,7 +23,7 @@ namespace ErrorCodes
IMergeTreeReader::IMergeTreeReader(
const MergeTreeData::DataPartPtr & data_part_,
MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
const NamesAndTypesList & columns_,
const StorageMetadataPtr & metadata_snapshot_,
UncompressedCache * uncompressed_cache_,
@ -31,19 +31,18 @@ IMergeTreeReader::IMergeTreeReader(
const MarkRanges & all_mark_ranges_,
const MergeTreeReaderSettings & settings_,
const ValueSizeMap & avg_value_size_hints_)
: data_part(data_part_)
: data_part_info_for_read(data_part_info_for_read_)
, avg_value_size_hints(avg_value_size_hints_)
, uncompressed_cache(uncompressed_cache_)
, mark_cache(mark_cache_)
, settings(settings_)
, storage(data_part_->storage)
, metadata_snapshot(metadata_snapshot_)
, all_mark_ranges(all_mark_ranges_)
, alter_conversions(storage.getAlterConversionsForPart(data_part))
, alter_conversions(data_part_info_for_read->getAlterConversions())
/// For wide parts convert plain arrays of Nested to subcolumns
/// to allow to use shared offset column from cache.
, requested_columns(isWidePart(data_part) ? Nested::convertToSubcolumns(columns_) : columns_)
, part_columns(isWidePart(data_part) ? Nested::collect(data_part->getColumns()) : data_part->getColumns())
, requested_columns(data_part_info_for_read->isWidePart() ? Nested::convertToSubcolumns(columns_) : columns_)
, part_columns(data_part_info_for_read->isWidePart() ? Nested::collect(data_part_info_for_read->getColumns()) : data_part_info_for_read->getColumns())
{
columns_to_read.reserve(requested_columns.size());
serializations.reserve(requested_columns.size());
@ -71,7 +70,7 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e
catch (Exception & e)
{
/// Better diagnostics.
e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")");
e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
throw;
}
}
@ -99,13 +98,13 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
}
auto dag = DB::evaluateMissingDefaults(
additional_columns, requested_columns, metadata_snapshot->getColumns(), storage.getContext());
additional_columns, requested_columns, metadata_snapshot->getColumns(), data_part_info_for_read->getContext());
if (dag)
{
dag->addMaterializingOutputActions();
auto actions = std::make_shared<
ExpressionActions>(std::move(dag),
ExpressionActionsSettings::fromSettings(storage.getContext()->getSettingsRef()));
ExpressionActionsSettings::fromSettings(data_part_info_for_read->getContext()->getSettingsRef()));
actions->execute(additional_columns);
}
@ -117,7 +116,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
catch (Exception & e)
{
/// Better diagnostics.
e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")");
e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
throw;
}
}
@ -151,7 +150,7 @@ SerializationPtr IMergeTreeReader::getSerializationInPart(const NameAndTypePair
if (!column_in_part)
return IDataType::getSerialization(required_column);
const auto & infos = data_part->getSerializationInfos();
const auto & infos = data_part_info_for_read->getSerializationInfos();
if (auto it = infos.find(column_in_part->getNameInStorage()); it != infos.end())
return IDataType::getSerialization(*column_in_part, *it->second);
@ -187,7 +186,7 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const
copy_block.insert({res_columns[pos], getColumnInPart(*name_and_type).type, name_and_type->name});
}
DB::performRequiredConversions(copy_block, requested_columns, storage.getContext());
DB::performRequiredConversions(copy_block, requested_columns, data_part_info_for_read->getContext());
/// Move columns from block.
name_and_type = requested_columns.begin();
@ -197,7 +196,7 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const
catch (Exception & e)
{
/// Better diagnostics.
e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")");
e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
throw;
}
}
@ -205,11 +204,11 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const
IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const String & column_name) const
{
String table_name = Nested::extractTableName(column_name);
for (const auto & part_column : data_part->getColumns())
for (const auto & part_column : data_part_info_for_read->getColumns())
{
if (typeid_cast<const DataTypeArray *>(part_column.type.get()))
{
auto position = data_part->getColumnPosition(part_column.getNameInStorage());
auto position = data_part_info_for_read->getColumnPosition(part_column.getNameInStorage());
if (position && Nested::extractTableName(part_column.name) == table_name)
return position;
}

View File

@ -4,6 +4,8 @@
#include <Common/HashTable/HashMap.h>
#include <Storages/MergeTree/MergeTreeReaderStream.h>
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <Storages/MergeTree/IMergeTreeDataPart.h>
#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>
namespace DB
{
@ -20,7 +22,7 @@ public:
using DeserializeBinaryBulkStateMap = std::map<std::string, ISerialization::DeserializeBinaryBulkStatePtr>;
IMergeTreeReader(
const MergeTreeData::DataPartPtr & data_part_,
MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
const NamesAndTypesList & columns_,
const StorageMetadataPtr & metadata_snapshot_,
UncompressedCache * uncompressed_cache_,
@ -57,7 +59,7 @@ public:
size_t getFirstMarkToRead() const { return all_mark_ranges.front().begin; }
MergeTreeData::DataPartPtr data_part;
MergeTreeDataPartInfoForReaderPtr data_part_info_for_read;
protected:
/// Returns actual column name in part, which can differ from table metadata.
@ -86,7 +88,6 @@ protected:
MergeTreeReaderSettings settings;
const MergeTreeData & storage;
StorageMetadataPtr metadata_snapshot;
MarkRanges all_mark_ranges;
@ -95,7 +96,7 @@ protected:
private:
/// Alter conversions, which must be applied on fly if required
MergeTreeData::AlterConversions alter_conversions;
AlterConversions alter_conversions;
/// Columns that are requested to read.
NamesAndTypesList requested_columns;

View File

@ -0,0 +1,55 @@
#pragma once
#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>
#include <Storages/MergeTree/MergeTreeData.h>
namespace DB
{
/// IMergeTreeDataPartInfoForReader implementation backed by a MergeTreeData::DataPartPtr.
/// Every accessor simply forwards to the wrapped part (or to its storage);
/// the context handed to the base class is taken from the storage of the part.
class LoadedMergeTreeDataPartInfoForReader final : public IMergeTreeDataPartInfoForReader
{
public:
    explicit LoadedMergeTreeDataPartInfoForReader(MergeTreeData::DataPartPtr data_part_)
        : IMergeTreeDataPartInfoForReader(data_part_->storage.getContext())
        , data_part(data_part_)
    {
    }

    /// Kind of the part.
    bool isCompactPart() const override
    {
        return DB::isCompactPart(data_part);
    }

    bool isWidePart() const override
    {
        return DB::isWidePart(data_part);
    }

    bool isInMemoryPart() const override
    {
        return DB::isInMemoryPart(data_part);
    }

    bool isProjectionPart() const override
    {
        return data_part->isProjectionPart();
    }

    /// Storage and column description of the part.
    const DataPartStoragePtr & getDataPartStorage() const override
    {
        return data_part->data_part_storage;
    }

    const NamesAndTypesList & getColumns() const override
    {
        return data_part->getColumns();
    }

    std::optional<size_t> getColumnPosition(const String & column_name) const override
    {
        return data_part->getColumnPosition(column_name);
    }

    String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const override
    {
        return data_part->getColumnNameWithMinimumCompressedSize(with_subcolumns);
    }

    const MergeTreeDataPartChecksums & getChecksums() const override
    {
        return data_part->checksums;
    }

    /// Alter conversions are obtained from the storage that owns the part.
    AlterConversions getAlterConversions() const override
    {
        return data_part->storage.getAlterConversionsForPart(data_part);
    }

    /// Marks, file sizes and index granularity.
    size_t getMarksCount() const override
    {
        return data_part->getMarksCount();
    }

    size_t getFileSizeOrZero(const std::string & file_name) const override
    {
        return data_part->getFileSizeOrZero(file_name);
    }

    const MergeTreeIndexGranularityInfo & getIndexGranularityInfo() const override
    {
        return data_part->index_granularity_info;
    }

    const MergeTreeIndexGranularity & getIndexGranularity() const override
    {
        return data_part->index_granularity;
    }

    /// Serializations; only the name of `column` is passed on to the part.
    SerializationPtr getSerialization(const NameAndTypePair & column) const override
    {
        return data_part->getSerialization(column.name);
    }

    const SerializationInfoByName & getSerializationInfos() const override
    {
        return data_part->getSerializationInfos();
    }

    /// Reports the wrapped part as broken to its storage.
    void reportBroken() override
    {
        data_part->storage.reportBrokenPart(data_part);
    }

private:
    MergeTreeData::DataPartPtr data_part;
};
}

View File

@ -36,4 +36,16 @@ size_t getLastMark(const MarkRanges & ranges)
return current_task_last_mark;
}
/// Renders mark ranges as a comma-separated list of "(begin, end)" pairs,
/// e.g. "(0, 3), (5, 8)". An empty list of ranges gives an empty string.
std::string toString(const MarkRanges & ranges)
{
    std::string text;
    bool is_first = true;

    for (const auto & range : ranges)
    {
        /// The separator goes before every element except the first one.
        if (is_first)
            is_first = false;
        else
            text.append(", ");

        text.append("(");
        text.append(std::to_string(range.begin));
        text.append(", ");
        text.append(std::to_string(range.end));
        text.append(")");
    }

    return text;
}
}

View File

@ -32,4 +32,6 @@ using MarkRanges = std::deque<MarkRange>;
*/
size_t getLastMark(const MarkRanges & ranges);
std::string toString(const MarkRanges & ranges);
}

View File

@ -43,6 +43,7 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor(
, storage(storage_)
, storage_snapshot(storage_snapshot_)
, prewhere_info(prewhere_info_)
, prewhere_actions(getPrewhereActions(prewhere_info, actions_settings))
, max_block_size_rows(max_block_size_rows_)
, preferred_block_size_bytes(preferred_block_size_bytes_)
, preferred_max_column_in_block_size_bytes(preferred_max_column_in_block_size_bytes_)
@ -72,7 +73,12 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor(
header_without_virtual_columns.erase(*it);
}
}
}
std::unique_ptr<PrewhereExprInfo> MergeTreeBaseSelectProcessor::getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings)
{
std::unique_ptr<PrewhereExprInfo> prewhere_actions;
if (prewhere_info)
{
prewhere_actions = std::make_unique<PrewhereExprInfo>();
@ -100,6 +106,8 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor(
prewhere_actions->steps.emplace_back(std::move(prewhere_step));
}
return prewhere_actions;
}
@ -262,45 +270,62 @@ void MergeTreeBaseSelectProcessor::initializeMergeTreeReadersForPart(
void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & current_task)
{
MergeTreeRangeReader* prev_reader = nullptr;
return initializeRangeReadersImpl(
current_task.range_reader, current_task.pre_range_readers, prewhere_info, prewhere_actions.get(),
reader.get(), current_task.data_part->hasLightweightDelete(), reader_settings,
pre_reader_for_step, lightweight_delete_filter_step, non_const_virtual_column_names);
}
void MergeTreeBaseSelectProcessor::initializeRangeReadersImpl(
MergeTreeRangeReader & range_reader, std::deque<MergeTreeRangeReader> & pre_range_readers,
PrewhereInfoPtr prewhere_info, const PrewhereExprInfo * prewhere_actions,
IMergeTreeReader * reader, bool has_lightweight_delete, const MergeTreeReaderSettings & reader_settings,
const std::vector<std::unique_ptr<IMergeTreeReader>> & pre_reader_for_step,
const PrewhereExprStep & lightweight_delete_filter_step, const Names & non_const_virtual_column_names)
{
MergeTreeRangeReader * prev_reader = nullptr;
bool last_reader = false;
size_t pre_readers_shift = 0;
/// Add filtering step with lightweight delete mask
if (reader_settings.apply_deleted_mask && current_task.data_part->hasLightweightDelete())
if (reader_settings.apply_deleted_mask && has_lightweight_delete)
{
current_task.pre_range_readers.push_back(
MergeTreeRangeReader(pre_reader_for_step[0].get(), prev_reader, &lightweight_delete_filter_step, last_reader, non_const_virtual_column_names));
prev_reader = &current_task.pre_range_readers.back();
MergeTreeRangeReader pre_range_reader(pre_reader_for_step[0].get(), prev_reader, &lightweight_delete_filter_step, last_reader, non_const_virtual_column_names);
pre_range_readers.push_back(std::move(pre_range_reader));
prev_reader = &pre_range_readers.back();
pre_readers_shift++;
}
if (prewhere_info)
{
if (prewhere_actions->steps.size() + pre_readers_shift != pre_reader_for_step.size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"PREWHERE steps count mismatch, actions: {}, readers: {}",
prewhere_actions->steps.size(), pre_reader_for_step.size());
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"PREWHERE steps count mismatch, actions: {}, readers: {}",
prewhere_actions->steps.size(), pre_reader_for_step.size());
}
for (size_t i = 0; i < prewhere_actions->steps.size(); ++i)
{
last_reader = reader->getColumns().empty() && (i + 1 == prewhere_actions->steps.size());
current_task.pre_range_readers.push_back(
MergeTreeRangeReader(pre_reader_for_step[i + pre_readers_shift].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names));
prev_reader = &current_task.pre_range_readers.back();
MergeTreeRangeReader current_reader(pre_reader_for_step[i + pre_readers_shift].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names);
pre_range_readers.push_back(std::move(current_reader));
prev_reader = &pre_range_readers.back();
}
}
if (!last_reader)
{
current_task.range_reader = MergeTreeRangeReader(reader.get(), prev_reader, nullptr, true, non_const_virtual_column_names);
range_reader = MergeTreeRangeReader(reader, prev_reader, nullptr, true, non_const_virtual_column_names);
}
else
{
/// If all columns are read by pre_range_readers, then move the last pre_range_reader into range_reader
current_task.range_reader = std::move(current_task.pre_range_readers.back());
current_task.pre_range_readers.pop_back();
range_reader = std::move(pre_range_readers.back());
pre_range_readers.pop_back();
}
}

View File

@ -89,6 +89,20 @@ protected:
static void
injectVirtualColumns(Block & block, size_t row_count, MergeTreeReadTask * task, const DataTypePtr & partition_value_type, const Names & virtual_columns);
static std::unique_ptr<PrewhereExprInfo> getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings);
static void initializeRangeReadersImpl(
MergeTreeRangeReader & range_reader,
std::deque<MergeTreeRangeReader> & pre_range_readers,
PrewhereInfoPtr prewhere_info,
const PrewhereExprInfo * prewhere_actions,
IMergeTreeReader * reader,
bool has_lightweight_delete,
const MergeTreeReaderSettings & reader_settings,
const std::vector<std::unique_ptr<IMergeTreeReader>> & pre_reader_for_step,
const PrewhereExprStep & lightweight_delete_filter_step,
const Names & non_const_virtual_column_names);
/// Sets up data readers for each step of prewhere and where
void initializeMergeTreeReadersForPart(
MergeTreeData::DataPartPtr & data_part,

View File

@ -1,5 +1,6 @@
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>
#include <DataTypes/NestedUtils.h>
#include <Core/NamesAndTypes.h>
#include <Common/checkStackSize.h>
@ -28,8 +29,8 @@ namespace
bool injectRequiredColumnsRecursively(
const String & column_name,
const StorageSnapshotPtr & storage_snapshot,
const MergeTreeData::AlterConversions & alter_conversions,
const MergeTreeData::DataPartPtr & part,
const AlterConversions & alter_conversions,
const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
const GetColumnsOptions & options,
Names & columns,
NameSet & required_columns,
@ -47,7 +48,7 @@ bool injectRequiredColumnsRecursively(
if (alter_conversions.isColumnRenamed(column_name_in_part))
column_name_in_part = alter_conversions.getColumnOldName(column_name_in_part);
auto column_in_part = part->getColumns().tryGetByName(column_name_in_part);
auto column_in_part = data_part_info_for_reader.getColumns().tryGetByName(column_name_in_part);
if (column_in_part
&& (!column_in_storage->isSubcolumn()
@ -78,7 +79,7 @@ bool injectRequiredColumnsRecursively(
bool result = false;
for (const auto & identifier : identifiers)
result |= injectRequiredColumnsRecursively(
identifier, storage_snapshot, alter_conversions, part,
identifier, storage_snapshot, alter_conversions, data_part_info_for_reader,
options, columns, required_columns, injected_columns);
return result;
@ -87,9 +88,8 @@ bool injectRequiredColumnsRecursively(
}
NameSet injectRequiredColumns(
const MergeTreeData & storage,
const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
const StorageSnapshotPtr & storage_snapshot,
const MergeTreeData::DataPartPtr & part,
bool with_subcolumns,
Names & columns)
{
@ -97,9 +97,9 @@ NameSet injectRequiredColumns(
NameSet injected_columns;
bool have_at_least_one_physical_column = false;
MergeTreeData::AlterConversions alter_conversions;
if (!part->isProjectionPart())
alter_conversions = storage.getAlterConversionsForPart(part);
AlterConversions alter_conversions;
if (!data_part_info_for_reader.isProjectionPart())
alter_conversions = data_part_info_for_reader.getAlterConversions();
auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical)
.withExtendedObjects()
@ -115,7 +115,7 @@ NameSet injectRequiredColumns(
have_at_least_one_physical_column |= injectRequiredColumnsRecursively(
columns[i], storage_snapshot, alter_conversions,
part, options, columns, required_columns, injected_columns);
data_part_info_for_reader, options, columns, required_columns, injected_columns);
}
/** Add a column of the minimum size.
@ -124,7 +124,7 @@ NameSet injectRequiredColumns(
*/
if (!have_at_least_one_physical_column)
{
const auto minimum_size_column_name = part->getColumnNameWithMinimumCompressedSize(with_subcolumns);
const auto minimum_size_column_name = data_part_info_for_reader.getColumnNameWithMinimumCompressedSize(with_subcolumns);
columns.push_back(minimum_size_column_name);
/// correctly report added column
injected_columns.insert(columns.back());
@ -135,13 +135,22 @@ NameSet injectRequiredColumns(
MergeTreeReadTask::MergeTreeReadTask(
const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, size_t part_index_in_query_,
const Names & ordered_names_, const NameSet & column_name_set_, const MergeTreeReadTaskColumns & task_columns_,
const MergeTreeData::DataPartPtr & data_part_,
const MarkRanges & mark_ranges_,
size_t part_index_in_query_,
const Names & ordered_names_,
const NameSet & column_name_set_,
const MergeTreeReadTaskColumns & task_columns_,
bool remove_prewhere_column_,
MergeTreeBlockSizePredictorPtr && size_predictor_)
: data_part{data_part_}, mark_ranges{mark_ranges_}, part_index_in_query{part_index_in_query_},
ordered_names{ordered_names_}, column_name_set{column_name_set_}, task_columns{task_columns_},
remove_prewhere_column{remove_prewhere_column_}, size_predictor{std::move(size_predictor_)}
: data_part{data_part_}
, mark_ranges{mark_ranges_}
, part_index_in_query{part_index_in_query_}
, ordered_names{ordered_names_}
, column_name_set{column_name_set_}
, task_columns{task_columns_}
, remove_prewhere_column{remove_prewhere_column_}
, size_predictor{std::move(size_predictor_)}
{
}
@ -270,9 +279,8 @@ void MergeTreeBlockSizePredictor::update(const Block & sample_block, const Colum
MergeTreeReadTaskColumns getReadTaskColumns(
const MergeTreeData & storage,
const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
const StorageSnapshotPtr & storage_snapshot,
const MergeTreeData::DataPartPtr & data_part,
const Names & required_columns,
const Names & system_columns,
const PrewhereInfoPtr & prewhere_info,
@ -284,13 +292,13 @@ MergeTreeReadTaskColumns getReadTaskColumns(
/// Read system columns such as lightweight delete mask "_row_exists" if it is persisted in the part
for (const auto & name : system_columns)
{
if (data_part->getColumns().contains(name))
if (data_part_info_for_reader.getColumns().contains(name))
column_names.push_back(name);
}
/// inject columns required for defaults evaluation
injectRequiredColumns(
storage, storage_snapshot, data_part, with_subcolumns, column_names);
data_part_info_for_reader, storage_snapshot, with_subcolumns, column_names);
MergeTreeReadTaskColumns result;
auto options = GetColumnsOptions(GetColumnsOptions::All)
@ -316,7 +324,7 @@ MergeTreeReadTaskColumns getReadTaskColumns(
Names all_pre_column_names = prewhere_info->prewhere_actions->getRequiredColumnsNames();
const auto injected_pre_columns = injectRequiredColumns(
storage, storage_snapshot, data_part, with_subcolumns, all_pre_column_names);
data_part_info_for_reader, storage_snapshot, with_subcolumns, all_pre_column_names);
for (const auto & name : all_pre_column_names)
{

View File

@ -12,6 +12,7 @@ namespace DB
class MergeTreeData;
struct MergeTreeReadTask;
struct MergeTreeBlockSizePredictor;
class IMergeTreeDataPartInfoForReader;
using MergeTreeReadTaskPtr = std::unique_ptr<MergeTreeReadTask>;
using MergeTreeBlockSizePredictorPtr = std::shared_ptr<MergeTreeBlockSizePredictor>;
@ -23,9 +24,8 @@ using MergeTreeBlockSizePredictorPtr = std::shared_ptr<MergeTreeBlockSizePredict
* Adds them to the `columns`.
*/
NameSet injectRequiredColumns(
const MergeTreeData & storage,
const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
const StorageSnapshotPtr & storage_snapshot,
const MergeTreeData::DataPartPtr & part,
bool with_subcolumns,
Names & columns);
@ -68,16 +68,19 @@ struct MergeTreeReadTask
bool isFinished() const { return mark_ranges.empty() && range_reader.isCurrentRangeFinished(); }
MergeTreeReadTask(
const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, size_t part_index_in_query_,
const Names & ordered_names_, const NameSet & column_name_set_, const MergeTreeReadTaskColumns & task_columns_,
const MergeTreeData::DataPartPtr & data_part_,
const MarkRanges & mark_ranges_,
size_t part_index_in_query_,
const Names & ordered_names_,
const NameSet & column_name_set_,
const MergeTreeReadTaskColumns & task_columns_,
bool remove_prewhere_column_,
MergeTreeBlockSizePredictorPtr && size_predictor_);
};
MergeTreeReadTaskColumns getReadTaskColumns(
const MergeTreeData & storage,
const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
const StorageSnapshotPtr & storage_snapshot,
const MergeTreeData::DataPartPtr & data_part,
const Names & required_columns,
const Names & system_columns,
const PrewhereInfoPtr & prewhere_info,

View File

@ -1047,12 +1047,12 @@ void MergeTreeData::loadDataPartsFromDisk(
throw;
broken = true;
tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while loading part {} on path {}", part->name, part_path));
tryLogCurrentException(log, fmt::format("while loading part {} on path {}", part->name, part_path));
}
catch (...)
{
broken = true;
tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while loading part {} on path {}", part->name, part_path));
tryLogCurrentException(log, fmt::format("while loading part {} on path {}", part->name, part_path));
}
/// Ignore broken parts that can appear as a result of hard server restart.
@ -1066,7 +1066,7 @@ void MergeTreeData::loadDataPartsFromDisk(
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while calculating part size {} on path {}", part->name, part_path));
tryLogCurrentException(log, fmt::format("while calculating part size {} on path {}", part->name, part_path));
}
std::string part_size_str = "failed to calculate size";
@ -1902,7 +1902,9 @@ void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts, bool
void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_to_remove, NameSet * part_names_succeed)
{
const auto settings = getSettings();
if (parts_to_remove.size() > 1 && settings->max_part_removal_threads > 1 && parts_to_remove.size() > settings->concurrent_part_removal_threshold)
if (parts_to_remove.size() > 1
&& settings->max_part_removal_threads > 1
&& parts_to_remove.size() > settings->concurrent_part_removal_threshold)
{
/// Parallel parts removal.
size_t num_threads = std::min<size_t>(settings->max_part_removal_threads, parts_to_remove.size());
@ -1917,7 +1919,7 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t
if (thread_group)
CurrentThread::attachToIfDetached(thread_group);
LOG_DEBUG(log, "Removing part from filesystem {}", part->name);
LOG_DEBUG(log, "Removing part from filesystem {} (concurrently)", part->name);
part->remove();
if (part_names_succeed)
{
@ -5069,6 +5071,8 @@ void MergeTreeData::Transaction::rollbackPartsToTemporaryState()
/// Registers a part, together with the storage builder passed alongside it, in this transaction.
void MergeTreeData::Transaction::addPart(MutableDataPartPtr & part, DataPartStorageBuilderPtr builder)
{
    precommitted_parts.insert(part);

    /// Sticky flag: once an in-memory part has been added it stays set
    /// (commit() consults it when deciding whether to write to the WAL).
    const bool added_in_memory_part = static_cast<bool>(asInMemoryPart(part));
    has_in_memory_parts = has_in_memory_parts || added_in_memory_part;

    part_builders.push_back(builder);
}
@ -5091,6 +5095,12 @@ void MergeTreeData::Transaction::rollback()
clear();
}
/// Resets the transaction: forgets all precommitted parts and drops the in-memory-parts flag.
/// NOTE(review): part_builders and locked_parts are not touched here - confirm that is intentional.
void MergeTreeData::Transaction::clear()
{
    has_in_memory_parts = false;
    precommitted_parts.clear();
}
MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData::DataPartsLock * acquired_parts_lock)
{
DataPartsVector total_covered_parts;
@ -5098,20 +5108,30 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData:
if (!isEmpty())
{
auto settings = data.getSettings();
MergeTreeData::WriteAheadLogPtr wal;
auto parts_lock = acquired_parts_lock ? MergeTreeData::DataPartsLock() : data.lockParts();
auto * owing_parts_lock = acquired_parts_lock ? acquired_parts_lock : &parts_lock;
for (auto & builder : part_builders)
builder->commit();
if (txn)
bool commit_to_wal = has_in_memory_parts && settings->in_memory_parts_enable_wal;
if (txn || commit_to_wal)
{
MergeTreeData::WriteAheadLogPtr wal;
if (commit_to_wal)
wal = data.getWriteAheadLog();
for (const DataPartPtr & part : precommitted_parts)
{
DataPartPtr covering_part;
DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock);
MergeTreeTransaction::addNewPartAndRemoveCovered(data.shared_from_this(), part, covered_parts, txn);
if (txn)
{
DataPartPtr covering_part;
DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock);
MergeTreeTransaction::addNewPartAndRemoveCovered(data.shared_from_this(), part, covered_parts, txn);
}
/// `wal` is obtained only when commit_to_wal is true, but this loop also runs when only `txn` is set.
/// Without the guard, a transactional commit of an in-memory part with the WAL disabled
/// would dereference a null pointer.
if (wal)
{
    if (auto part_in_memory = asInMemoryPart(part))
        wal->addPart(part_in_memory);
}
}
}
@ -5128,15 +5148,6 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData:
for (const DataPartPtr & part : precommitted_parts)
{
auto part_in_memory = asInMemoryPart(part);
if (part_in_memory && settings->in_memory_parts_enable_wal)
{
if (!wal)
wal = data.getWriteAheadLog();
wal->addPart(part_in_memory);
}
DataPartPtr covering_part;
DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock);
if (covering_part)
@ -6717,7 +6728,7 @@ bool MergeTreeData::canUsePolymorphicParts(const MergeTreeSettings & settings, S
return true;
}
MergeTreeData::AlterConversions MergeTreeData::getAlterConversionsForPart(const MergeTreeDataPartPtr part) const
AlterConversions MergeTreeData::getAlterConversionsForPart(const MergeTreeDataPartPtr part) const
{
MutationCommands commands = getFirstAlterMutationCommandsForPart(part);

View File

@ -24,6 +24,7 @@
#include <Storages/MergeTree/ZeroCopyLock.h>
#include <Storages/MergeTree/TemporaryParts.h>
#include <Storages/IndicesDescription.h>
#include <Storages/MergeTree/AlterConversions.h>
#include <Storages/DataDestinationType.h>
#include <Storages/extractKeyExpressionList.h>
#include <Storages/PartitionCommands.h>
@ -167,20 +168,6 @@ public:
STRONG_TYPEDEF(String, PartitionID)
/// Alter conversions which should be applied on-fly for part. Build from of
/// the most recent mutation commands for part. Now we have only rename_map
/// here (from ALTER_RENAME) command, because for all other type of alters
/// we can deduce conversions for part from difference between
/// part->getColumns() and storage->getColumns().
struct AlterConversions
{
/// Rename map new_name -> old_name
std::unordered_map<String, String> rename_map;
bool isColumnRenamed(const String & new_name) const { return rename_map.contains(new_name); }
String getColumnOldName(const String & new_name) const { return rename_map.at(new_name); }
};
struct LessDataPart
{
using is_transparent = void;
@ -290,8 +277,9 @@ public:
DataParts precommitted_parts;
std::vector<DataPartStorageBuilderPtr> part_builders;
DataParts locked_parts;
bool has_in_memory_parts = false;
void clear() { precommitted_parts.clear(); }
void clear();
};
using TransactionUniquePtr = std::unique_ptr<Transaction>;

View File

@ -2,6 +2,7 @@
#include <DataTypes/NestedUtils.h>
#include <Storages/MergeTree/MergeTreeReaderCompact.h>
#include <Storages/MergeTree/MergeTreeDataPartWriterCompact.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
namespace DB
@ -45,9 +46,9 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartCompact::getReader(
const ValueSizeMap & avg_value_size_hints,
const ReadBufferFromFileBase::ProfileCallback & profile_callback) const
{
auto ptr = std::static_pointer_cast<const MergeTreeDataPartCompact>(shared_from_this());
auto read_info = std::make_shared<LoadedMergeTreeDataPartInfoForReader>(shared_from_this());
return std::make_unique<MergeTreeReaderCompact>(
ptr, columns_to_read, metadata_snapshot, uncompressed_cache,
read_info, columns_to_read, metadata_snapshot, uncompressed_cache,
mark_cache, mark_ranges, reader_settings,
avg_value_size_hints, profile_callback);
}
@ -90,39 +91,44 @@ void MergeTreeDataPartCompact::calculateEachColumnSizes(ColumnSizeByName & /*eac
total_size.marks += mrk_checksum->second.file_size;
}
void MergeTreeDataPartCompact::loadIndexGranularity()
void MergeTreeDataPartCompact::loadIndexGranularityImpl(
MergeTreeIndexGranularity & index_granularity_, const MergeTreeIndexGranularityInfo & index_granularity_info_,
const NamesAndTypesList & columns_, const DataPartStoragePtr & data_part_storage_)
{
//String full_path = getRelativePath();
if (columns.empty())
throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART);
if (!index_granularity_info.is_adaptive)
if (!index_granularity_info_.is_adaptive)
throw Exception("MergeTreeDataPartCompact cannot be created with non-adaptive granulary.", ErrorCodes::NOT_IMPLEMENTED);
auto marks_file_path = index_granularity_info.getMarksFilePath("data");
if (!data_part_storage->exists(marks_file_path))
auto marks_file_path = index_granularity_info_.getMarksFilePath("data");
if (!data_part_storage_->exists(marks_file_path))
throw Exception(
ErrorCodes::NO_FILE_IN_DATA_PART,
"Marks file '{}' doesn't exist",
std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path));
std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path));
size_t marks_file_size = data_part_storage->getFileSize(marks_file_path);
size_t marks_file_size = data_part_storage_->getFileSize(marks_file_path);
auto buffer = data_part_storage->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt);
auto buffer = data_part_storage_->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt);
while (!buffer->eof())
{
/// Skip offsets for columns
buffer->seek(columns.size() * sizeof(MarkInCompressedFile), SEEK_CUR);
buffer->seek(columns_.size() * sizeof(MarkInCompressedFile), SEEK_CUR);
size_t granularity;
readIntBinary(granularity, *buffer);
index_granularity.appendMark(granularity);
index_granularity_.appendMark(granularity);
}
if (index_granularity.getMarksCount() * index_granularity_info.getMarkSizeInBytes(columns.size()) != marks_file_size)
if (index_granularity_.getMarksCount() * index_granularity_info_.getMarkSizeInBytes(columns_.size()) != marks_file_size)
throw Exception("Cannot read all marks from file " + marks_file_path, ErrorCodes::CANNOT_READ_ALL_DATA);
index_granularity.setInitialized();
index_granularity_.setInitialized();
}
void MergeTreeDataPartCompact::loadIndexGranularity()
{
if (columns.empty())
throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART);
loadIndexGranularityImpl(index_granularity, index_granularity_info, columns, data_part_storage);
}
bool MergeTreeDataPartCompact::hasColumnFiles(const NameAndTypePair & column) const

View File

@ -65,6 +65,11 @@ public:
~MergeTreeDataPartCompact() override;
protected:
/// Fills index_granularity_ from the "data" marks file found in data_part_storage_.
/// All inputs are passed explicitly instead of being read from the members of a part.
/// columns_ is used only for its size: the number of per-column offsets to skip in each mark.
/// Throws NOT_IMPLEMENTED if index_granularity_info_ is not adaptive, NO_FILE_IN_DATA_PART if the
/// marks file does not exist, and CANNOT_READ_ALL_DATA if the marks read do not match the file size.
static void loadIndexGranularityImpl(
MergeTreeIndexGranularity & index_granularity_, const MergeTreeIndexGranularityInfo & index_granularity_info_,
const NamesAndTypesList & columns_, const DataPartStoragePtr & data_part_storage_);
private:
void checkConsistency(bool require_part_metadata) const override;

View File

@ -3,6 +3,7 @@
#include <Storages/MergeTree/MergedBlockOutputStream.h>
#include <Storages/MergeTree/MergeTreeDataPartWriterInMemory.h>
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <DataTypes/NestedUtils.h>
#include <Interpreters/Context.h>
#include <Poco/Logger.h>
@ -48,9 +49,10 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartInMemory::getReader(
const ValueSizeMap & /* avg_value_size_hints */,
const ReadBufferFromFileBase::ProfileCallback & /* profile_callback */) const
{
auto read_info = std::make_shared<LoadedMergeTreeDataPartInfoForReader>(shared_from_this());
auto ptr = std::static_pointer_cast<const MergeTreeDataPartInMemory>(shared_from_this());
return std::make_unique<MergeTreeReaderInMemory>(
ptr, columns_to_read, metadata_snapshot, mark_ranges, reader_settings);
read_info, ptr, columns_to_read, metadata_snapshot, mark_ranges, reader_settings);
}
IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartInMemory::getWriter(

View File

@ -2,6 +2,7 @@
#include <Storages/MergeTree/MergeTreeReaderWide.h>
#include <Storages/MergeTree/MergeTreeDataPartWriterWide.h>
#include <Storages/MergeTree/IMergeTreeDataPartWriter.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <DataTypes/NestedUtils.h>
#include <Core/NamesAndTypes.h>
@ -47,9 +48,9 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartWide::getReader(
const ValueSizeMap & avg_value_size_hints,
const ReadBufferFromFileBase::ProfileCallback & profile_callback) const
{
auto ptr = std::static_pointer_cast<const MergeTreeDataPartWide>(shared_from_this());
auto read_info = std::make_shared<LoadedMergeTreeDataPartInfoForReader>(shared_from_this());
return std::make_unique<MergeTreeReaderWide>(
ptr, columns_to_read,
read_info, columns_to_read,
metadata_snapshot, uncompressed_cache,
mark_cache, mark_ranges, reader_settings,
avg_value_size_hints, profile_callback);
@ -103,46 +104,52 @@ ColumnSize MergeTreeDataPartWide::getColumnSizeImpl(
return size;
}
void MergeTreeDataPartWide::loadIndexGranularity()
void MergeTreeDataPartWide::loadIndexGranularityImpl(
MergeTreeIndexGranularity & index_granularity_, MergeTreeIndexGranularityInfo & index_granularity_info_,
const DataPartStoragePtr & data_part_storage_, const std::string & any_column_file_name)
{
index_granularity_info.changeGranularityIfRequired(data_part_storage);
if (columns.empty())
throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART);
index_granularity_info_.changeGranularityIfRequired(data_part_storage_);
/// We can use any column, it doesn't matter
std::string marks_file_path = index_granularity_info.getMarksFilePath(getFileNameForColumn(columns.front()));
if (!data_part_storage->exists(marks_file_path))
std::string marks_file_path = index_granularity_info_.getMarksFilePath(any_column_file_name);
if (!data_part_storage_->exists(marks_file_path))
throw Exception(
ErrorCodes::NO_FILE_IN_DATA_PART, "Marks file '{}' doesn't exist",
std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path));
std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path));
size_t marks_file_size = data_part_storage->getFileSize(marks_file_path);
size_t marks_file_size = data_part_storage_->getFileSize(marks_file_path);
if (!index_granularity_info.is_adaptive)
if (!index_granularity_info_.is_adaptive)
{
size_t marks_count = marks_file_size / index_granularity_info.getMarkSizeInBytes();
index_granularity.resizeWithFixedGranularity(marks_count, index_granularity_info.fixed_index_granularity); /// all the same
size_t marks_count = marks_file_size / index_granularity_info_.getMarkSizeInBytes();
index_granularity_.resizeWithFixedGranularity(marks_count, index_granularity_info_.fixed_index_granularity); /// all the same
}
else
{
auto buffer = data_part_storage->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt);
auto buffer = data_part_storage_->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt);
while (!buffer->eof())
{
buffer->seek(sizeof(size_t) * 2, SEEK_CUR); /// skip offset_in_compressed file and offset_in_decompressed_block
size_t granularity;
readIntBinary(granularity, *buffer);
index_granularity.appendMark(granularity);
index_granularity_.appendMark(granularity);
}
if (index_granularity.getMarksCount() * index_granularity_info.getMarkSizeInBytes() != marks_file_size)
if (index_granularity_.getMarksCount() * index_granularity_info_.getMarkSizeInBytes() != marks_file_size)
throw Exception(
ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all marks from file {}",
std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path));
std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path));
}
index_granularity.setInitialized();
index_granularity_.setInitialized();
}
/// Loads the index granularity of this part.
/// Throws NO_FILE_IN_DATA_PART when the part has no columns.
void MergeTreeDataPartWide::loadIndexGranularity()
{
    if (columns.empty())
    {
        throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART);
    }

    /// The first column is used to locate the marks file.
    const std::string any_column_file_name = getFileNameForColumn(columns.front());
    loadIndexGranularityImpl(index_granularity, index_granularity_info, data_part_storage, any_column_file_name);
}
bool MergeTreeDataPartWide::isStoredOnRemoteDisk() const

View File

@ -61,6 +61,11 @@ public:
bool hasColumnFiles(const NameAndTypePair & column) const override;
protected:
/// Fills index_granularity_ from the marks file of the column file any_column_file_name
/// located in data_part_storage_.
/// index_granularity_info_ is non-const because changeGranularityIfRequired() is called on it first.
/// With non-adaptive granularity the number of marks is derived from the marks file size;
/// otherwise the granularity of every mark is read from the file.
/// Throws NO_FILE_IN_DATA_PART if the marks file does not exist and CANNOT_READ_ALL_DATA
/// if the marks read do not match the file size.
static void loadIndexGranularityImpl(
MergeTreeIndexGranularity & index_granularity_, MergeTreeIndexGranularityInfo & index_granularity_info_,
const DataPartStoragePtr & data_part_storage_, const std::string & any_column_file_name);
private:
void checkConsistency(bool require_part_metadata) const override;

View File

@ -29,6 +29,8 @@ public:
MergeTreeIndexGranularityInfo(const MergeTreeData & storage, MergeTreeDataPartType type_);
MergeTreeIndexGranularityInfo(MergeTreeDataPartType type_, bool is_adaptive_, size_t index_granularity_, size_t index_granularity_bytes_);
void changeGranularityIfRequired(const DataPartStoragePtr & data_part_storage);
String getMarksFilePath(const String & path_prefix) const

View File

@ -83,7 +83,7 @@ MergeTreeRangeReader::DelayedStream::DelayedStream(
: current_mark(from_mark), current_offset(0), num_delayed_rows(0)
, current_task_last_mark(current_task_last_mark_)
, merge_tree_reader(merge_tree_reader_)
, index_granularity(&(merge_tree_reader->data_part->index_granularity))
, index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity()))
, continue_reading(false), is_finished(false)
{
}
@ -181,7 +181,7 @@ MergeTreeRangeReader::Stream::Stream(
: current_mark(from_mark), offset_after_current_mark(0)
, last_mark(to_mark)
, merge_tree_reader(merge_tree_reader_)
, index_granularity(&(merge_tree_reader->data_part->index_granularity))
, index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity()))
, current_mark_index_granularity(index_granularity->getMarkRows(from_mark))
, stream(from_mark, current_task_last_mark, merge_tree_reader)
{
@ -652,7 +652,7 @@ MergeTreeRangeReader::MergeTreeRangeReader(
bool last_reader_in_chain_,
const Names & non_const_virtual_column_names_)
: merge_tree_reader(merge_tree_reader_)
, index_granularity(&(merge_tree_reader->data_part->index_granularity))
, index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity()))
, prev_reader(prev_reader_)
, prewhere_info(prewhere_info_)
, last_reader_in_chain(last_reader_in_chain_)
@ -946,7 +946,8 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::startReadingChain(size_t
result.addRows(stream.finalize(result.columns));
/// Last granule may be incomplete.
result.adjustLastGranule();
if (!result.rowsPerGranule().empty())
result.adjustLastGranule();
for (const auto & column_name : non_const_virtual_column_names)
{

View File

@ -1,5 +1,6 @@
#include <Storages/MergeTree/MergeTreeReadPool.h>
#include <Storages/MergeTree/MergeTreeBaseSelectProcessor.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <Common/formatReadable.h>
#include <base/range.h>
@ -22,7 +23,6 @@ MergeTreeReadPool::MergeTreeReadPool(
size_t sum_marks_,
size_t min_marks_for_concurrent_read_,
RangesInDataParts && parts_,
const MergeTreeData & data_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
const Names & column_names_,
@ -32,7 +32,6 @@ MergeTreeReadPool::MergeTreeReadPool(
bool do_not_steal_tasks_)
: backoff_settings{backoff_settings_}
, backoff_state{threads_}
, data{data_}
, storage_snapshot{storage_snapshot_}
, column_names{column_names_}
, virtual_column_names{virtual_column_names_}
@ -214,7 +213,7 @@ std::vector<size_t> MergeTreeReadPool::fillPerPartInfo(const RangesInDataParts &
per_part_sum_marks.push_back(sum_marks);
auto task_columns = getReadTaskColumns(
data, storage_snapshot, part.data_part,
LoadedMergeTreeDataPartInfoForReader(part.data_part), storage_snapshot,
column_names, virtual_column_names, prewhere_info, /*with_subcolumns=*/ true);
auto size_predictor = !predict_block_size_bytes ? nullptr

View File

@ -70,11 +70,16 @@ private:
public:
MergeTreeReadPool(
size_t threads_, size_t sum_marks_, size_t min_marks_for_concurrent_read_,
RangesInDataParts && parts_, const MergeTreeData & data_, const StorageSnapshotPtr & storage_snapshot_,
size_t threads_,
size_t sum_marks_,
size_t min_marks_for_concurrent_read_,
RangesInDataParts && parts_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
const Names & column_names_, const Names & virtual_column_names_,
const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_,
const Names & column_names_,
const Names & virtual_column_names_,
const BackoffSettings & backoff_settings_,
size_t preferred_block_size_bytes_,
bool do_not_steal_tasks_ = false);
MergeTreeReadTaskPtr getTask(size_t min_marks_to_read, size_t thread, const Names & ordered_names);
@ -94,7 +99,6 @@ private:
size_t threads, size_t sum_marks, std::vector<size_t> per_part_sum_marks,
const RangesInDataParts & parts, size_t min_marks_for_concurrent_read);
const MergeTreeData & data;
StorageSnapshotPtr storage_snapshot;
const Names column_names;
const Names virtual_column_names;

View File

@ -15,7 +15,7 @@ namespace ErrorCodes
MergeTreeReaderCompact::MergeTreeReaderCompact(
DataPartCompactPtr data_part_,
MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
NamesAndTypesList columns_,
const StorageMetadataPtr & metadata_snapshot_,
UncompressedCache * uncompressed_cache_,
@ -26,7 +26,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
const ReadBufferFromFileBase::ProfileCallback & profile_callback_,
clockid_t clock_type_)
: IMergeTreeReader(
data_part_,
data_part_info_for_read_,
columns_,
metadata_snapshot_,
uncompressed_cache_,
@ -35,14 +35,14 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
settings_,
avg_value_size_hints_)
, marks_loader(
data_part->data_part_storage,
data_part_info_for_read_->getDataPartStorage(),
mark_cache,
data_part->index_granularity_info.getMarksFilePath(MergeTreeDataPartCompact::DATA_FILE_NAME),
data_part->getMarksCount(),
data_part->index_granularity_info,
data_part_info_for_read_->getIndexGranularityInfo().getMarksFilePath(MergeTreeDataPartCompact::DATA_FILE_NAME),
data_part_info_for_read_->getMarksCount(),
data_part_info_for_read_->getIndexGranularityInfo(),
settings.save_marks_in_cache,
settings.read_settings,
data_part->getColumns().size())
data_part_info_for_read_->getColumns().size())
{
try
{
@ -64,7 +64,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
continue;
}
auto position = data_part->getColumnPosition(column_to_read.getNameInStorage());
auto position = data_part_info_for_read->getColumnPosition(column_to_read.getNameInStorage());
if (!position && typeid_cast<const DataTypeArray *>(column_to_read.type.get()))
{
/// If array of Nested column is missing in part,
@ -77,7 +77,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
}
/// Do not use max_read_buffer_size, but try to lower buffer size with maximal size of granule to avoid reading much data.
auto buffer_size = getReadBufferSize(data_part, marks_loader, column_positions, all_mark_ranges);
auto buffer_size = getReadBufferSize(*data_part_info_for_read, marks_loader, column_positions, all_mark_ranges);
if (buffer_size)
settings.read_settings = settings.read_settings.adjustBufferSize(buffer_size);
@ -88,10 +88,10 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
if (uncompressed_cache)
{
auto buffer = std::make_unique<CachedCompressedReadBuffer>(
std::string(fs::path(data_part->data_part_storage->getFullPath()) / path),
std::string(fs::path(data_part_info_for_read->getDataPartStorage()->getFullPath()) / path),
[this, path]()
{
return data_part->data_part_storage->readFile(
return data_part_info_for_read->getDataPartStorage()->readFile(
path,
settings.read_settings,
std::nullopt, std::nullopt);
@ -113,7 +113,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
{
auto buffer =
std::make_unique<CompressedReadBufferFromFile>(
data_part->data_part_storage->readFile(
data_part_info_for_read->getDataPartStorage()->readFile(
path,
settings.read_settings,
std::nullopt, std::nullopt),
@ -132,7 +132,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
}
catch (...)
{
storage.reportBrokenPart(data_part);
data_part_info_for_read->reportBroken();
throw;
}
}
@ -156,7 +156,7 @@ size_t MergeTreeReaderCompact::readRows(
while (read_rows < max_rows_to_read)
{
size_t rows_to_read = data_part->index_granularity.getMarkRows(from_mark);
size_t rows_to_read = data_part_info_for_read->getIndexGranularity().getMarkRows(from_mark);
for (size_t pos = 0; pos < num_columns; ++pos)
{
@ -179,7 +179,7 @@ size_t MergeTreeReaderCompact::readRows(
catch (Exception & e)
{
if (e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED)
storage.reportBrokenPart(data_part);
data_part_info_for_read->reportBroken();
/// Better diagnostics.
e.addMessage("(while reading column " + columns_to_read[pos].name + ")");
@ -187,7 +187,7 @@ size_t MergeTreeReaderCompact::readRows(
}
catch (...)
{
storage.reportBrokenPart(data_part);
data_part_info_for_read->reportBroken();
throw;
}
}
@ -279,7 +279,7 @@ void MergeTreeReaderCompact::seekToMark(size_t row_index, size_t column_index)
void MergeTreeReaderCompact::adjustUpperBound(size_t last_mark)
{
size_t right_offset = 0;
if (last_mark < data_part->getMarksCount()) /// Otherwise read until the end of file
if (last_mark < data_part_info_for_read->getMarksCount()) /// Otherwise read until the end of file
right_offset = marks_loader.getMark(last_mark).offset_in_compressed_file;
if (right_offset == 0)
@ -307,7 +307,7 @@ bool MergeTreeReaderCompact::isContinuousReading(size_t mark, size_t column_posi
return false;
const auto & [last_mark, last_column] = *last_read_granule;
return (mark == last_mark && column_position == last_column + 1)
|| (mark == last_mark + 1 && column_position == 0 && last_column == data_part->getColumns().size() - 1);
|| (mark == last_mark + 1 && column_position == 0 && last_column == data_part_info_for_read->getColumns().size() - 1);
}
namespace
@ -359,16 +359,16 @@ private:
}
size_t MergeTreeReaderCompact::getReadBufferSize(
const DataPartPtr & part,
const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
MergeTreeMarksLoader & marks_loader,
const ColumnPositions & column_positions,
const MarkRanges & mark_ranges)
{
size_t buffer_size = 0;
size_t columns_num = column_positions.size();
size_t file_size = part->getFileSizeOrZero(MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION);
size_t file_size = data_part_info_for_reader.getFileSizeOrZero(MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION);
MarksCounter counter(part->getMarksCount(), part->getColumns().size());
MarksCounter counter(data_part_info_for_reader.getMarksCount(), data_part_info_for_reader.getColumns().size());
for (const auto & mark_range : mark_ranges)
{

View File

@ -19,7 +19,7 @@ class MergeTreeReaderCompact : public IMergeTreeReader
{
public:
MergeTreeReaderCompact(
DataPartCompactPtr data_part_,
MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
NamesAndTypesList columns_,
const StorageMetadataPtr & metadata_snapshot_,
UncompressedCache * uncompressed_cache_,
@ -67,7 +67,7 @@ private:
/// Returns maximal value of granule size in compressed file from @mark_ranges.
/// This value is used as size of read buffer.
static size_t getReadBufferSize(
const DataPartPtr & part,
const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
MergeTreeMarksLoader & marks_loader,
const ColumnPositions & column_positions,
const MarkRanges & mark_ranges);

View File

@ -16,13 +16,14 @@ namespace ErrorCodes
MergeTreeReaderInMemory::MergeTreeReaderInMemory(
MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
DataPartInMemoryPtr data_part_,
NamesAndTypesList columns_,
const StorageMetadataPtr & metadata_snapshot_,
MarkRanges mark_ranges_,
MergeTreeReaderSettings settings_)
: IMergeTreeReader(
data_part_,
data_part_info_for_read_,
columns_,
metadata_snapshot_,
nullptr,
@ -48,7 +49,7 @@ size_t MergeTreeReaderInMemory::readRows(
if (!continue_reading)
total_rows_read = 0;
size_t total_marks = data_part->index_granularity.getMarksCount();
size_t total_marks = data_part_info_for_read->getIndexGranularity().getMarksCount();
if (from_mark >= total_marks)
throw Exception("Mark " + toString(from_mark) + " is out of bound. Max mark: "
+ toString(total_marks), ErrorCodes::ARGUMENT_OUT_OF_BOUND);

View File

@ -15,6 +15,7 @@ class MergeTreeReaderInMemory : public IMergeTreeReader
{
public:
MergeTreeReaderInMemory(
MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
DataPartInMemoryPtr data_part_,
NamesAndTypesList columns_,
const StorageMetadataPtr & metadata_snapshot_,

View File

@ -26,7 +26,7 @@ namespace ErrorCodes
}
MergeTreeReaderWide::MergeTreeReaderWide(
DataPartWidePtr data_part_,
MergeTreeDataPartInfoForReaderPtr data_part_info_,
NamesAndTypesList columns_,
const StorageMetadataPtr & metadata_snapshot_,
UncompressedCache * uncompressed_cache_,
@ -37,7 +37,7 @@ MergeTreeReaderWide::MergeTreeReaderWide(
const ReadBufferFromFileBase::ProfileCallback & profile_callback_,
clockid_t clock_type_)
: IMergeTreeReader(
data_part_,
data_part_info_,
columns_,
metadata_snapshot_,
uncompressed_cache_,
@ -53,7 +53,7 @@ MergeTreeReaderWide::MergeTreeReaderWide(
}
catch (...)
{
storage.reportBrokenPart(data_part);
data_part_info_for_read->reportBroken();
throw;
}
}
@ -73,7 +73,7 @@ size_t MergeTreeReaderWide::readRows(
std::unordered_map<String, ISerialization::SubstreamsCache> caches;
std::unordered_set<std::string> prefetched_streams;
if (data_part->data_part_storage->isStoredOnRemoteDisk() ? settings.read_settings.remote_fs_prefetch : settings.read_settings.local_fs_prefetch)
if (data_part_info_for_read->getDataPartStorage()->isStoredOnRemoteDisk() ? settings.read_settings.remote_fs_prefetch : settings.read_settings.local_fs_prefetch)
{
/// Request reading of data in advance,
/// so if reading can be asynchronous, it will also be performed in parallel for all columns.
@ -136,17 +136,17 @@ size_t MergeTreeReaderWide::readRows(
catch (Exception & e)
{
if (e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED)
storage.reportBrokenPart(data_part);
data_part_info_for_read->reportBroken();
/// Better diagnostics.
e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + " "
e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + " "
"from mark " + toString(from_mark) + " "
"with max_rows_to_read = " + toString(max_rows_to_read) + ")");
throw;
}
catch (...)
{
storage.reportBrokenPart(data_part);
data_part_info_for_read->reportBroken();
throw;
}
@ -167,7 +167,7 @@ void MergeTreeReaderWide::addStreams(
if (streams.contains(stream_name))
return;
bool data_file_exists = data_part->checksums.files.contains(stream_name + DATA_FILE_EXTENSION);
bool data_file_exists = data_part_info_for_read->getChecksums().files.contains(stream_name + DATA_FILE_EXTENSION);
/** If data file is missing then we will not try to open it.
* It is necessary since it allows to add new column to structure of the table without creating new files for old parts.
@ -178,10 +178,10 @@ void MergeTreeReaderWide::addStreams(
bool is_lc_dict = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys;
streams.emplace(stream_name, std::make_unique<MergeTreeReaderStream>(
data_part->data_part_storage, stream_name, DATA_FILE_EXTENSION,
data_part->getMarksCount(), all_mark_ranges, settings, mark_cache,
uncompressed_cache, data_part->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION),
&data_part->index_granularity_info,
data_part_info_for_read->getDataPartStorage(), stream_name, DATA_FILE_EXTENSION,
data_part_info_for_read->getMarksCount(), all_mark_ranges, settings, mark_cache,
uncompressed_cache, data_part_info_for_read->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION),
&data_part_info_for_read->getIndexGranularityInfo(),
profile_callback, clock_type, is_lc_dict));
};

View File

@ -15,7 +15,7 @@ class MergeTreeReaderWide : public IMergeTreeReader
{
public:
MergeTreeReaderWide(
DataPartWidePtr data_part_,
MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
NamesAndTypesList columns_,
const StorageMetadataPtr & metadata_snapshot_,
UncompressedCache * uncompressed_cache_,

View File

@ -1,6 +1,7 @@
#include <Storages/MergeTree/MergeTreeSelectProcessor.h>
#include <Storages/MergeTree/MergeTreeBaseSelectProcessor.h>
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <Interpreters/Context.h>
@ -51,7 +52,7 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor(
void MergeTreeSelectProcessor::initializeReaders()
{
task_columns = getReadTaskColumns(
storage, storage_snapshot, data_part,
LoadedMergeTreeDataPartInfoForReader(data_part), storage_snapshot,
required_columns, virt_column_names, prewhere_info, /*with_subcolumns=*/ true);
/// Will be used to distinguish between PREWHERE and WHERE columns when applying filter

View File

@ -1,5 +1,6 @@
#include <Storages/MergeTree/MergeTreeSequentialSource.h>
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <Processors/Transforms/FilterTransform.h>
#include <QueryPipeline/Pipe.h>
#include <Interpreters/Context.h>
@ -102,7 +103,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource(
addTotalRowsApprox(data_part->rows_count);
/// Add columns because we don't want to read empty blocks
injectRequiredColumns(storage, storage_snapshot, data_part, /*with_subcolumns=*/ false, columns_to_read);
injectRequiredColumns(LoadedMergeTreeDataPartInfoForReader(data_part), storage_snapshot, /*with_subcolumns=*/ false, columns_to_read);
NamesAndTypesList columns_for_reader;
if (take_column_types_from_storage)

View File

@ -23,6 +23,7 @@ MergeTreeSink::MergeTreeSink(
, metadata_snapshot(metadata_snapshot_)
, max_parts_per_block(max_parts_per_block_)
, context(context_)
, storage_snapshot(storage.getStorageSnapshot(metadata_snapshot, context))
{
}
@ -54,7 +55,6 @@ struct MergeTreeSink::DelayedChunk
void MergeTreeSink::consume(Chunk chunk)
{
auto block = getHeader().cloneWithColumns(chunk.detachColumns());
auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot, context);
storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block);
auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context);

View File

@ -9,6 +9,8 @@ namespace DB
class Block;
class StorageMergeTree;
struct StorageSnapshot;
using StorageSnapshotPtr = std::shared_ptr<StorageSnapshot>;
class MergeTreeSink : public SinkToStorage
@ -32,6 +34,7 @@ private:
StorageMetadataPtr metadata_snapshot;
size_t max_parts_per_block;
ContextPtr context;
StorageSnapshotPtr storage_snapshot;
uint64_t chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token
/// We can delay processing for previous chunk and start writing a new one.

View File

@ -41,23 +41,29 @@ struct ReplicatedMergeTreeSink::DelayedChunk
String block_id;
};
DelayedChunk() = default;
explicit DelayedChunk(size_t replicas_num_) : replicas_num(replicas_num_) {}
size_t replicas_num = 0;
std::vector<Partition> partitions;
};
ReplicatedMergeTreeSink::ReplicatedMergeTreeSink(
StorageReplicatedMergeTree & storage_,
const StorageMetadataPtr & metadata_snapshot_,
size_t quorum_,
size_t quorum_size,
size_t quorum_timeout_ms_,
size_t max_parts_per_block_,
bool quorum_parallel_,
bool deduplicate_,
bool majority_quorum,
ContextPtr context_,
bool is_attach_)
: SinkToStorage(metadata_snapshot_->getSampleBlock())
, storage(storage_)
, metadata_snapshot(metadata_snapshot_)
, quorum(quorum_)
, required_quorum_size(majority_quorum ? std::nullopt : std::make_optional<size_t>(quorum_size))
, quorum_timeout_ms(quorum_timeout_ms_)
, max_parts_per_block(max_parts_per_block_)
, is_attach(is_attach_)
@ -65,15 +71,15 @@ ReplicatedMergeTreeSink::ReplicatedMergeTreeSink(
, deduplicate(deduplicate_)
, log(&Poco::Logger::get(storage.getLogName() + " (Replicated OutputStream)"))
, context(context_)
, storage_snapshot(storage.getStorageSnapshot(metadata_snapshot, context))
{
/// The quorum value `1` has the same meaning as if it is disabled.
if (quorum == 1)
quorum = 0;
if (required_quorum_size == 1)
required_quorum_size = 0;
}
ReplicatedMergeTreeSink::~ReplicatedMergeTreeSink() = default;
/// Allow to verify that the session in ZooKeeper is still alive.
static void assertSessionIsNotExpired(zkutil::ZooKeeperPtr & zookeeper)
{
@ -84,9 +90,11 @@ static void assertSessionIsNotExpired(zkutil::ZooKeeperPtr & zookeeper)
throw Exception("ZooKeeper session has been expired.", ErrorCodes::NO_ZOOKEEPER);
}
void ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper)
size_t ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper)
{
if (!isQuorumEnabled())
return 0;
quorum_info.status_path = storage.zookeeper_path + "/quorum/status";
Strings replicas = zookeeper->getChildren(fs::path(storage.zookeeper_path) / "replicas");
@ -104,9 +112,12 @@ void ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zoo
if (status.get().error == Coordination::Error::ZOK)
++active_replicas;
if (active_replicas < quorum)
throw Exception(ErrorCodes::TOO_FEW_LIVE_REPLICAS, "Number of alive replicas ({}) is less than requested quorum ({}).",
active_replicas, quorum);
size_t replicas_number = replicas.size();
size_t quorum_size = getQuorumSize(replicas_number);
if (active_replicas < quorum_size)
throw Exception(ErrorCodes::TOO_FEW_LIVE_REPLICAS, "Number of alive replicas ({}) is less than requested quorum ({}/{}).",
active_replicas, quorum_size, replicas_number);
/** Is there a quorum for the last part for which a quorum is needed?
* Write of all the parts with the included quorum is linearly ordered.
@ -132,8 +143,9 @@ void ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zoo
quorum_info.is_active_node_value = is_active.data;
quorum_info.is_active_node_version = is_active.stat.version;
quorum_info.host_node_version = host.stat.version;
}
return replicas_number;
}
void ReplicatedMergeTreeSink::consume(Chunk chunk)
{
@ -147,10 +159,8 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk)
* And also check that during the insertion, the replica was not reinitialized or disabled (by the value of `is_active` node).
* TODO Too complex logic, you can do better.
*/
if (quorum)
checkQuorumPrecondition(zookeeper);
size_t replicas_num = checkQuorumPrecondition(zookeeper);
auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot, context);
storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block);
auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context);
@ -193,11 +203,11 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk)
}
block_id = temp_part.part->getZeroLevelPartBlockID(block_dedup_token);
LOG_DEBUG(log, "Wrote block with ID '{}', {} rows", block_id, current_block.block.rows());
LOG_DEBUG(log, "Wrote block with ID '{}', {} rows on {} replicas", block_id, current_block.block.rows(), replicas_num);
}
else
{
LOG_DEBUG(log, "Wrote block with {} rows", current_block.block.rows());
LOG_DEBUG(log, "Wrote block with {} rows on {} replicas", current_block.block.rows(), replicas_num);
}
UInt64 elapsed_ns = watch.elapsed();
@ -211,7 +221,7 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk)
if (streams > max_insert_delayed_streams_for_parallel_write)
{
finishDelayedChunk(zookeeper);
delayed_chunk = std::make_unique<ReplicatedMergeTreeSink::DelayedChunk>();
delayed_chunk = std::make_unique<ReplicatedMergeTreeSink::DelayedChunk>(replicas_num);
delayed_chunk->partitions = std::move(partitions);
finishDelayedChunk(zookeeper);
@ -254,7 +264,7 @@ void ReplicatedMergeTreeSink::finishDelayedChunk(zkutil::ZooKeeperPtr & zookeepe
try
{
commitPart(zookeeper, part, partition.block_id, partition.temp_part.builder);
commitPart(zookeeper, part, partition.block_id, partition.temp_part.builder, delayed_chunk->replicas_num);
last_block_is_duplicate = last_block_is_duplicate || part->is_duplicate;
@ -273,7 +283,6 @@ void ReplicatedMergeTreeSink::finishDelayedChunk(zkutil::ZooKeeperPtr & zookeepe
delayed_chunk.reset();
}
void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPtr & part)
{
/// NOTE: No delay in this case. That's Ok.
@ -281,15 +290,14 @@ void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPt
auto zookeeper = storage.getZooKeeper();
assertSessionIsNotExpired(zookeeper);
if (quorum)
checkQuorumPrecondition(zookeeper);
size_t replicas_num = checkQuorumPrecondition(zookeeper);
Stopwatch watch;
try
{
part->version.setCreationTID(Tx::PrehistoricTID, nullptr);
commitPart(zookeeper, part, "", part->data_part_storage->getBuilder());
commitPart(zookeeper, part, "", part->data_part_storage->getBuilder(), replicas_num);
PartLog::addNewPart(storage.getContext(), part, watch.elapsed());
}
catch (...)
@ -299,12 +307,12 @@ void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPt
}
}
void ReplicatedMergeTreeSink::commitPart(
zkutil::ZooKeeperPtr & zookeeper,
MergeTreeData::MutableDataPartPtr & part,
const String & block_id,
DataPartStorageBuilderPtr builder)
DataPartStorageBuilderPtr builder,
size_t replicas_num)
{
metadata_snapshot->check(part->getColumns());
assertSessionIsNotExpired(zookeeper);
@ -367,7 +375,7 @@ void ReplicatedMergeTreeSink::commitPart(
log_entry.source_replica = storage.replica_name;
log_entry.new_part_name = part->name;
/// TODO maybe add UUID here as well?
log_entry.quorum = quorum;
log_entry.quorum = getQuorumSize(replicas_num);
log_entry.block_id = block_id;
log_entry.new_part_type = part->getType();
@ -384,11 +392,11 @@ void ReplicatedMergeTreeSink::commitPart(
* but for it the quorum has not yet been reached.
* You can not do the next quorum record at this time.)
*/
if (quorum)
if (isQuorumEnabled())
{
ReplicatedMergeTreeQuorumEntry quorum_entry;
quorum_entry.part_name = part->name;
quorum_entry.required_number_of_replicas = quorum;
quorum_entry.required_number_of_replicas = getQuorumSize(replicas_num);
quorum_entry.replicas.insert(storage.replica_name);
/** At this point, this node will contain information that the current replica received a part.
@ -436,7 +444,7 @@ void ReplicatedMergeTreeSink::commitPart(
{
part->is_duplicate = true;
ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);
if (quorum)
if (isQuorumEnabled())
{
LOG_INFO(log, "Block with ID {} already exists locally as part {}; ignoring it, but checking quorum.", block_id, existing_part_name);
@ -446,7 +454,7 @@ void ReplicatedMergeTreeSink::commitPart(
else
quorum_path = storage.zookeeper_path + "/quorum/status";
waitForQuorum(zookeeper, existing_part_name, quorum_path, quorum_info.is_active_node_value);
waitForQuorum(zookeeper, existing_part_name, quorum_path, quorum_info.is_active_node_value, replicas_num);
}
else
{
@ -593,7 +601,7 @@ void ReplicatedMergeTreeSink::commitPart(
break;
}
if (quorum)
if (isQuorumEnabled())
{
if (is_already_existing_part)
{
@ -605,7 +613,7 @@ void ReplicatedMergeTreeSink::commitPart(
storage.updateQuorum(part->name, false);
}
waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value);
waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value, replicas_num);
}
}
@ -627,10 +635,11 @@ void ReplicatedMergeTreeSink::waitForQuorum(
zkutil::ZooKeeperPtr & zookeeper,
const std::string & part_name,
const std::string & quorum_path,
const std::string & is_active_node_value) const
const std::string & is_active_node_value,
size_t replicas_num) const
{
/// We are waiting for quorum to be satisfied.
LOG_TRACE(log, "Waiting for quorum");
LOG_TRACE(log, "Waiting for quorum '{}' for part {} on {} replicas", quorum_path, part_name, replicas_num);
try
{
@ -654,7 +663,7 @@ void ReplicatedMergeTreeSink::waitForQuorum(
if (!event->tryWait(quorum_timeout_ms))
throw Exception("Timeout while waiting for quorum", ErrorCodes::TIMEOUT_EXCEEDED);
LOG_TRACE(log, "Quorum {} updated, will check quorum node still exists", quorum_path);
LOG_TRACE(log, "Quorum {} for part {} updated, will check quorum node still exists", quorum_path, part_name);
}
/// And what if it is possible that the current replica at this time has ceased to be active
@ -672,8 +681,23 @@ void ReplicatedMergeTreeSink::waitForQuorum(
ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
}
LOG_TRACE(log, "Quorum satisfied");
LOG_TRACE(log, "Quorum '{}' for part {} satisfied", quorum_path, part_name);
}
/// Returns how many replicas must confirm a write for the quorum to be satisfied.
///
/// - 0 when quorum is disabled (see isQuorumEnabled()).
/// - The explicitly requested size when `required_quorum_size` holds a value.
/// - Otherwise a strict majority of `replicas_num` (more than half of the replicas).
///
/// `replicas_num` is only consulted in the majority case.
size_t ReplicatedMergeTreeSink::getQuorumSize(size_t replicas_num) const
{
    if (!isQuorumEnabled())
        return 0;

    /// An empty optional means "use majority quorum"; integer division followed by +1
    /// yields the smallest count that is strictly greater than half.
    const size_t majority = replicas_num / 2 + 1;
    return required_quorum_size.value_or(majority);
}
/// Tells whether inserts through this sink have to wait for a quorum.
///
/// Quorum is enabled either when no explicit size is stored (majority quorum is requested)
/// or when the explicit size is greater than 1; sizes 0 and 1 both mean "disabled".
bool ReplicatedMergeTreeSink::isQuorumEnabled() const
{
    /// No explicit size: majority quorum, which is always treated as enabled.
    if (!required_quorum_size)
        return true;

    return *required_quorum_size > 1;
}
}

View File

@ -17,6 +17,8 @@ namespace DB
{
class StorageReplicatedMergeTree;
struct StorageSnapshot;
using StorageSnapshotPtr = std::shared_ptr<StorageSnapshot>;
class ReplicatedMergeTreeSink : public SinkToStorage
@ -30,6 +32,7 @@ public:
size_t max_parts_per_block_,
bool quorum_parallel_,
bool deduplicate_,
bool majority_quorum_,
ContextPtr context_,
// special flag to determine the ALTER TABLE ATTACH PART without the query context,
// needed to set the special LogEntryType::ATTACH_PART
@ -66,24 +69,34 @@ private:
};
QuorumInfo quorum_info;
void checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper);
/// Checks active replicas.
/// Returns total number of replicas.
size_t checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper);
/// Rename temporary part and commit to ZooKeeper.
void commitPart(
zkutil::ZooKeeperPtr & zookeeper,
MergeTreeData::MutableDataPartPtr & part,
const String & block_id,
DataPartStorageBuilderPtr part_builder);
DataPartStorageBuilderPtr part_builder,
size_t replicas_num);
/// Wait for quorum to be satisfied on path (quorum_path) for part (part_name).
/// Also checks that the replica is still alive.
void waitForQuorum(
zkutil::ZooKeeperPtr & zookeeper, const std::string & part_name,
const std::string & quorum_path, const std::string & is_active_node_value) const;
const std::string & quorum_path, const std::string & is_active_node_value, size_t replicas_num) const;
StorageReplicatedMergeTree & storage;
StorageMetadataPtr metadata_snapshot;
size_t quorum;
/// Empty means use majority quorum.
std::optional<size_t> required_quorum_size;
size_t getQuorumSize(size_t replicas_num) const;
bool isQuorumEnabled() const;
size_t quorum_timeout_ms;
size_t max_parts_per_block;
@ -96,6 +109,8 @@ private:
Poco::Logger * log;
ContextPtr context;
StorageSnapshotPtr storage_snapshot;
UInt64 chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token
/// We can delay processing for previous chunk and start writing a new one.

View File

@ -1,6 +1,7 @@
#include <Storages/checkAndGetLiteralArgument.h>
#include <Storages/RocksDB/StorageEmbeddedRocksDB.h>
#include <Storages/RocksDB/EmbeddedRocksDBSink.h>
#include <Storages/MutationCommands.h>
#include <DataTypes/DataTypesNumber.h>
@ -10,11 +11,15 @@
#include <Parsers/ASTCreateQuery.h>
#include <QueryPipeline/Pipe.h>
#include <QueryPipeline/QueryPipelineBuilder.h>
#include <Processors/ISource.h>
#include <Interpreters/castColumn.h>
#include <Interpreters/Context.h>
#include <Interpreters/TreeRewriter.h>
#include <Interpreters/MutationsInterpreter.h>
#include <Processors/Executors/PullingPipelineExecutor.h>
#include <Poco/Logger.h>
#include <Poco/Util/AbstractConfiguration.h>
@ -200,6 +205,92 @@ void StorageEmbeddedRocksDB::truncate(const ASTPtr &, const StorageMetadataPtr &
initDB();
}
/// Validates that a set of mutation commands can be executed by this storage.
///
/// An empty command list is accepted as a no-op. Otherwise exactly one command is allowed,
/// and it must be either UPDATE or DELETE.
///
/// Throws Exception with ErrorCodes::BAD_ARGUMENTS when several commands are combined
/// or when the single command has an unsupported type. The settings argument is unused.
void StorageEmbeddedRocksDB::checkMutationIsPossible(const MutationCommands & commands, const Settings & /* settings */) const
{
    /// Nothing to validate.
    if (commands.empty())
        return;

    if (commands.size() > 1)
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mutations cannot be combined for EmbeddedRocksDB");

    const auto type = commands.front().type;
    const bool is_supported = type == MutationCommand::Type::UPDATE || type == MutationCommand::Type::DELETE;
    if (!is_supported)
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only DELETE and UPDATE mutation supported for EmbeddedRocksDB");
}
void StorageEmbeddedRocksDB::mutate(const MutationCommands & commands, ContextPtr context_)
{
if (commands.empty())
return;
assert(commands.size() == 1);
auto metadata_snapshot = getInMemoryMetadataPtr();
auto storage = getStorageID();
auto storage_ptr = DatabaseCatalog::instance().getTable(storage, context_);
if (commands.front().type == MutationCommand::Type::DELETE)
{
auto interpreter = std::make_unique<MutationsInterpreter>(
storage_ptr,
metadata_snapshot,
commands,
context_,
/*can_execute_*/ true,
/*return_all_columns_*/ true,
/*return_deleted_rows_*/ true);
auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute());
PullingPipelineExecutor executor(pipeline);
auto sink = std::make_shared<EmbeddedRocksDBSink>(*this, metadata_snapshot);
Block block;
while (executor.pull(block))
{
auto column_it = std::find_if(block.begin(), block.end(), [&](const auto & column) { return column.name == primary_key; });
assert(column_it != block.end());
auto column = column_it->column;
auto size = column->size();
rocksdb::WriteBatch batch;
WriteBufferFromOwnString wb_key;
for (size_t i = 0; i < size; ++i)
{
wb_key.restart();
column_it->type->getDefaultSerialization()->serializeBinary(*column, i, wb_key);
auto status = batch.Delete(wb_key.str());
if (!status.ok())
throw Exception("RocksDB write error: " + status.ToString(), ErrorCodes::ROCKSDB_ERROR);
}
auto status = rocksdb_ptr->Write(rocksdb::WriteOptions(), &batch);
if (!status.ok())
throw Exception("RocksDB write error: " + status.ToString(), ErrorCodes::ROCKSDB_ERROR);
}
return;
}
assert(commands.front().type == MutationCommand::Type::UPDATE);
if (commands.front().column_to_update_expression.contains(primary_key))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key cannot be updated");
auto interpreter = std::make_unique<MutationsInterpreter>(
storage_ptr, metadata_snapshot, commands, context_, /*can_execute_*/ true, /*return_all_columns*/ true);
auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute());
PullingPipelineExecutor executor(pipeline);
auto sink = std::make_shared<EmbeddedRocksDBSink>(*this, metadata_snapshot);
Block block;
while (executor.pull(block))
{
sink->consume(Chunk{block.getColumns(), block.rows()});
}
}
void StorageEmbeddedRocksDB::initDB()
{
rocksdb::Status status;

View File

@ -51,6 +51,9 @@ public:
SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override;
void truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr, TableExclusiveLockHolder &) override;
void checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const override;
void mutate(const MutationCommands &, ContextPtr) override;
bool supportsParallelInsert() const override { return true; }
bool supportsIndexForIn() const override { return true; }
bool mayBenefitFromIndexForIn(

View File

@ -2180,7 +2180,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
if (interserver_scheme != address.scheme)
throw Exception("Interserver schemas are different '" + interserver_scheme + "' != '" + address.scheme + "', can't fetch part from " + address.host, ErrorCodes::LOGICAL_ERROR);
part_desc->res_part = fetcher.fetchPart(
part_desc->res_part = fetcher.fetchSelectedPart(
metadata_snapshot, getContext(), part_desc->found_new_part_name, source_replica_path,
address.host, address.replication_port, timeouts, credentials->getUser(), credentials->getPassword(),
interserver_scheme, replicated_fetches_throttler, false, TMP_PREFIX + "fetch_");
@ -2299,7 +2299,7 @@ void StorageReplicatedMergeTree::executeClonePartFromShard(const LogEntry & entr
+ "' != '" + address.scheme + "', can't fetch part from " + address.host,
ErrorCodes::LOGICAL_ERROR);
return fetcher.fetchPart(
return fetcher.fetchSelectedPart(
metadata_snapshot, getContext(), entry.new_part_name, source_replica_path,
address.host, address.replication_port,
timeouts, credentials->getUser(), credentials->getPassword(), interserver_scheme,
@ -3641,8 +3641,8 @@ void StorageReplicatedMergeTree::updateQuorum(const String & part_name, bool is_
if (quorum_entry.replicas.size() >= quorum_entry.required_number_of_replicas)
{
/// The quorum is reached. Delete the node, and update information about the last part that was successfully written with quorum.
LOG_TRACE(log, "Got {} replicas confirmed quorum {}, going to remove node",
quorum_entry.replicas.size(), quorum_status_path);
LOG_TRACE(log, "Got {} (of {}) replicas confirmed quorum {}, going to remove node",
quorum_entry.replicas.size(), quorum_entry.required_number_of_replicas, quorum_status_path);
Coordination::Requests ops;
Coordination::Responses responses;
@ -3690,8 +3690,8 @@ void StorageReplicatedMergeTree::updateQuorum(const String & part_name, bool is_
}
else
{
LOG_TRACE(log, "Quorum {} still not satisfied (have only {} replicas), updating node",
quorum_status_path, quorum_entry.replicas.size());
LOG_TRACE(log, "Quorum {} still not satisfied (have only {} of {} replicas), updating node",
quorum_status_path, quorum_entry.replicas.size(), quorum_entry.required_number_of_replicas);
/// We update the node, registering there one more replica.
auto code = zookeeper->trySet(quorum_status_path, quorum_entry.toString(), stat.version);
@ -3831,9 +3831,10 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
LOG_DEBUG(log, "Fetching part {} from {}", part_name, source_replica_path);
auto settings_ptr = getSettings();
TableLockHolder table_lock_holder;
if (!to_detached)
table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations);
table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, settings_ptr->lock_acquire_timeout_for_background_operations);
/// Logging
Stopwatch stopwatch;
@ -3857,7 +3858,8 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
covered_part_info.mutation = 0;
auto source_part = getActiveContainingPart(covered_part_info);
if (source_part)
/// Fetch for zero-copy replication is cheap and straightforward, so we don't use local clone here
if (source_part && (!settings_ptr->allow_remote_fs_zero_copy_replication || !source_part->data_part_storage->supportZeroCopyReplication()))
{
auto source_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
source_part->getColumns(), source_part->checksums);
@ -3897,7 +3899,6 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
part_to_clone = source_part;
}
}
}
ReplicatedMergeTreeAddress address;
@ -3933,7 +3934,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
+ "' != '" + address.scheme + "', can't fetch part from " + address.host,
ErrorCodes::INTERSERVER_SCHEME_DOESNT_MATCH);
return fetcher.fetchPart(
return fetcher.fetchSelectedPart(
metadata_snapshot,
getContext(),
part_name,
@ -4070,7 +4071,7 @@ DataPartStoragePtr StorageReplicatedMergeTree::fetchExistsPart(
currently_fetching_parts.erase(part_name);
});
LOG_DEBUG(log, "Fetching part {} from {}", part_name, source_replica_path);
LOG_DEBUG(log, "Fetching already known part {} from {}", part_name, source_replica_path);
TableLockHolder table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations);
@ -4100,7 +4101,7 @@ DataPartStoragePtr StorageReplicatedMergeTree::fetchExistsPart(
+ "' != '" + address.scheme + "', can't fetch part from " + address.host,
ErrorCodes::INTERSERVER_SCHEME_DOESNT_MATCH);
return fetcher.fetchPart(
return fetcher.fetchSelectedPart(
metadata_snapshot, getContext(), part_name, source_replica_path,
address.host, address.replication_port,
timeouts, credentials->getUser(), credentials->getPassword(),
@ -4304,12 +4305,12 @@ ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock StorageReplicatedMerg
auto added_parts = part_with_quorum.added_parts;
for (const auto & added_part : added_parts)
{
if (!getActiveContainingPart(added_part.second))
throw Exception(
"Replica doesn't have part " + added_part.second
+ " which was successfully written to quorum of other replicas."
" Send query to another replica or disable 'select_sequential_consistency' setting.",
ErrorCodes::REPLICA_IS_NOT_IN_QUORUM);
throw Exception(ErrorCodes::REPLICA_IS_NOT_IN_QUORUM,
"Replica doesn't have part '{}' which was successfully written to quorum of other replicas. "
"Send query to another replica or disable 'select_sequential_consistency' setting", added_part.second);
}
for (const auto & max_block : part_with_quorum.getMaxInsertedBlocks())
max_added_blocks[max_block.first] = max_block.second;
@ -4430,13 +4431,13 @@ SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, con
bool deduplicate = storage_settings_ptr->replicated_deduplication_window != 0 && query_settings.insert_deduplicate;
// TODO: should we also somehow pass list of columns to deduplicate on to the ReplicatedMergeTreeSink?
// TODO: insert_quorum = 'auto' would be supported in https://github.com/ClickHouse/ClickHouse/pull/39970, now it's same as 0.
return std::make_shared<ReplicatedMergeTreeSink>(
*this, metadata_snapshot, query_settings.insert_quorum.valueOr(0),
query_settings.insert_quorum_timeout.totalMilliseconds(),
query_settings.max_partitions_per_insert_block,
query_settings.insert_quorum_parallel,
deduplicate,
query_settings.insert_quorum.is_auto,
local_context);
}
@ -5125,7 +5126,7 @@ PartitionCommandsResultInfo StorageReplicatedMergeTree::attachPartition(
MutableDataPartsVector loaded_parts = tryLoadPartsToAttach(partition, attach_part, query_context, renamed_parts);
/// TODO Allow to use quorum here.
ReplicatedMergeTreeSink output(*this, metadata_snapshot, 0, 0, 0, false, false, query_context,
ReplicatedMergeTreeSink output(*this, metadata_snapshot, 0, 0, 0, false, false, false, query_context,
/*is_attach*/true);
for (size_t i = 0; i < loaded_parts.size(); ++i)
@ -7538,21 +7539,42 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const
{
if (!part.data_part_storage || !part.isStoredOnDisk())
auto settings = getSettings();
if (!settings->allow_remote_fs_zero_copy_replication)
return std::make_pair(true, NameSet{});
if (!part.data_part_storage || !part.data_part_storage->supportZeroCopyReplication())
if (!part.data_part_storage)
LOG_WARNING(log, "Datapart storage for part {} (temp: {}) is not initialzied", part.name, part.is_temp);
if (!part.data_part_storage || !part.isStoredOnDisk())
{
LOG_TRACE(log, "Part {} is not stored on disk, blobs can be removed", part.name);
return std::make_pair(true, NameSet{});
}
if (!part.data_part_storage || !part.data_part_storage->supportZeroCopyReplication())
{
LOG_TRACE(log, "Part {} is not stored on zero-copy replicaed disk, blobs can be removed", part.name);
return std::make_pair(true, NameSet{});
}
/// If part is temporary refcount file may be absent
if (part.data_part_storage->exists(IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK))
{
auto ref_count = part.data_part_storage->getRefCount(IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK);
if (ref_count > 0) /// Keep part shard info for frozen backups
{
LOG_TRACE(log, "Part {} has more than zero local references ({}), blobs cannot be removed", part.name, ref_count);
return std::make_pair(false, NameSet{});
}
else
{
LOG_TRACE(log, "Part {} local references is zero, will check blobs can be removed in zookeeper", part.name);
}
}
else
{
LOG_TRACE(log, "Part {} looks temporary, because checksums file doesn't exists, blobs can be removed", part.name);
/// Temporary part with some absent file cannot be locked in shared mode
return std::make_pair(true, NameSet{});
}
@ -7600,10 +7622,14 @@ std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedDataByID(
if (!children.empty())
{
LOG_TRACE(logger, "Found {} ({}) zookeeper locks for {}", zookeeper_part_uniq_node, children.size(), fmt::join(children, ", "));
LOG_TRACE(logger, "Found {} ({}) zookeper locks for {}", children.size(), fmt::join(children, ", "), zookeeper_part_uniq_node);
part_has_no_more_locks = false;
continue;
}
else
{
LOG_TRACE(logger, "No more children left for for {}, will try to remove the whole node", zookeeper_part_uniq_node);
}
auto error_code = zookeeper_ptr->tryRemove(zookeeper_part_uniq_node);
@ -7654,7 +7680,7 @@ std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedDataByID(
}
else
{
LOG_TRACE(logger, "Can't remove parent zookeeper lock {} for part {}, because children {} ({}) were concurrently created",
LOG_TRACE(logger, "Can't remove parent zookeeper lock {} for part {}, because children {} ({}) exists",
zookeeper_part_node, part_name, children.size(), fmt::join(children, ", "));
}
}
@ -8394,7 +8420,7 @@ void StorageReplicatedMergeTree::restoreDataFromBackup(RestorerFromBackup & rest
void StorageReplicatedMergeTree::attachRestoredParts(MutableDataPartsVector && parts)
{
auto metadata_snapshot = getInMemoryMetadataPtr();
auto sink = std::make_shared<ReplicatedMergeTreeSink>(*this, metadata_snapshot, 0, 0, 0, false, false, getContext(), /*is_attach*/true);
auto sink = std::make_shared<ReplicatedMergeTreeSink>(*this, metadata_snapshot, 0, 0, 0, false, false, false, getContext(), /*is_attach*/true);
for (auto part : parts)
sink->writeExistingPart(part);
}

View File

@ -1,6 +1,7 @@
#include "StorageSystemRemoteDataPaths.h"
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesNumber.h>
#include <Interpreters/Cache/FileCache.h>
#include <Interpreters/Cache/FileCacheFactory.h>
#include <Columns/ColumnString.h>
@ -23,6 +24,8 @@ StorageSystemRemoteDataPaths::StorageSystemRemoteDataPaths(const StorageID & tab
{"cache_base_path", std::make_shared<DataTypeString>()},
{"local_path", std::make_shared<DataTypeString>()},
{"remote_path", std::make_shared<DataTypeString>()},
{"size", std::make_shared<DataTypeUInt64>()},
{"common_prefix_for_blobs", std::make_shared<DataTypeString>()},
{"cache_paths", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
}));
setInMemoryMetadata(storage_metadata);
@ -44,6 +47,8 @@ Pipe StorageSystemRemoteDataPaths::read(
MutableColumnPtr col_cache_base_path = ColumnString::create();
MutableColumnPtr col_local_path = ColumnString::create();
MutableColumnPtr col_remote_path = ColumnString::create();
MutableColumnPtr col_size = ColumnUInt64::create();
MutableColumnPtr col_namespace = ColumnString::create();
MutableColumnPtr col_cache_paths = ColumnArray::create(ColumnString::create());
auto disks = context->getDisksMap();
@ -61,7 +66,7 @@ Pipe StorageSystemRemoteDataPaths::read(
if (!cache_base_path.empty())
cache = FileCacheFactory::instance().get(cache_base_path);
for (const auto & [local_path, storage_objects] : remote_paths_by_local_path)
for (const auto & [local_path, common_prefox_for_objects, storage_objects] : remote_paths_by_local_path)
{
for (const auto & object : storage_objects)
{
@ -70,6 +75,8 @@ Pipe StorageSystemRemoteDataPaths::read(
col_cache_base_path->insert(cache_base_path);
col_local_path->insert(local_path);
col_remote_path->insert(object.absolute_path);
col_size->insert(object.bytes_size);
col_namespace->insert(common_prefox_for_objects);
if (cache)
{
@ -91,6 +98,8 @@ Pipe StorageSystemRemoteDataPaths::read(
res_columns.emplace_back(std::move(col_cache_base_path));
res_columns.emplace_back(std::move(col_local_path));
res_columns.emplace_back(std::move(col_remote_path));
res_columns.emplace_back(std::move(col_size));
res_columns.emplace_back(std::move(col_namespace));
res_columns.emplace_back(std::move(col_cache_paths));
UInt64 num_rows = res_columns.at(0)->size();

View File

@ -161,6 +161,16 @@ CI_CONFIG = {
"tidy": "disable",
"with_coverage": False,
},
"binary_amd64sse2": {
"compiler": "clang-14-amd64sse2",
"build_type": "",
"sanitizer": "",
"package_type": "binary",
"static_binary_name": "amd64sse2",
"libraries": "static",
"tidy": "disable",
"with_coverage": False,
},
},
"builds_report_config": {
"ClickHouse build check": [
@ -182,6 +192,7 @@ CI_CONFIG = {
"binary_freebsd",
"binary_darwin_aarch64",
"binary_ppc64le",
"binary_amd64sse2",
],
},
"tests_config": {

View File

@ -87,14 +87,19 @@ def should_run_checks_for_pr(pr_info: PRInfo) -> Tuple[bool, str, str]:
# Consider the labels and whether the user is trusted.
print("Got labels", pr_info.labels)
if FORCE_TESTS_LABEL in pr_info.labels:
print(f"Label '{FORCE_TESTS_LABEL}' set, forcing remaining checks")
return True, f"Labeled '{FORCE_TESTS_LABEL}'", "pending"
if DO_NOT_TEST_LABEL in pr_info.labels:
print(f"Label '{DO_NOT_TEST_LABEL}' set, skipping remaining checks")
return False, f"Labeled '{DO_NOT_TEST_LABEL}'", "success"
if CAN_BE_TESTED_LABEL not in pr_info.labels and not pr_is_by_trusted_user(
pr_info.user_login, pr_info.user_orgs
):
print(
f"PRs by untrusted users need the '{CAN_BE_TESTED_LABEL}' label - please contact a member of the core team"
)
return False, "Needs 'can be tested' label", "failure"
if OK_SKIP_LABELS.intersection(pr_info.labels):
@ -219,7 +224,7 @@ if __name__ == "__main__":
elif SUBMODULE_CHANGED_LABEL in pr_info.labels:
pr_labels_to_remove.append(SUBMODULE_CHANGED_LABEL)
print(f"change labels: add {pr_labels_to_add}, remove {pr_labels_to_remove}")
print(f"Change labels: add {pr_labels_to_add}, remove {pr_labels_to_remove}")
if pr_labels_to_add:
post_labels(gh, pr_info, pr_labels_to_add)

View File

@ -1,18 +1,6 @@
path:
- /var/lib/clickhouse
- "@replace": replace
tmp_path:
- /var/lib/clickhouse/tmp/
- "@replace": replace
user_files_path:
- /var/lib/clickhouse/user_files/
- "@replace": replace
format_schema_path:
- /var/lib/clickhouse/format_schemas/
- "@replace": replace
access_control_path:
- /var/lib/clickhouse/access/
- "@replace": replace
top_level_domains_path:
- /var/lib/clickhouse/top_level_domains/
- "@replace": replace
path: /var/lib/clickhouse
tmp_path: /var/lib/clickhouse/tmp/
user_files_path: /var/lib/clickhouse/user_files/
format_schema_path: /var/lib/clickhouse/format_schemas/
access_control_path: /var/lib/clickhouse/access/
top_level_domains_path: /var/lib/clickhouse/top_level_domains/

View File

@ -6,7 +6,6 @@ users:
default:
password: ''
networks:
"@replace": replace
ip: '::/0'
profile: default

Some files were not shown because too many files have changed in this diff Show More