Merge branch 'master' into revert-61200-revert-60430-max-parallel-replicas-validate

This commit is contained in:
Kruglov Pavel 2024-03-20 13:59:38 +01:00 committed by GitHub
commit e7e11d738b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
821 changed files with 16393 additions and 11982 deletions

View File

@ -5,9 +5,7 @@
# a) the new check is not controversial (this includes many checks in readability-* and google-*) or
# b) too noisy (checks with > 100 new warnings are considered noisy, this includes e.g. cppcoreguidelines-*).
# TODO Let clang-tidy check headers in further directories
# --> HeaderFilterRegex: '^.*/(src|base|programs|utils)/.*(h|hpp)$'
HeaderFilterRegex: '^.*/(base|programs|utils)/.*(h|hpp)$'
HeaderFilterRegex: '^.*/(base|src|programs|utils)/.*(h|hpp)$'
Checks: [
'*',
@ -22,6 +20,7 @@ Checks: [
'-bugprone-branch-clone',
'-bugprone-easily-swappable-parameters',
'-bugprone-exception-escape',
'-bugprone-forward-declaration-namespace',
'-bugprone-implicit-widening-of-multiplication-result',
'-bugprone-narrowing-conversions',
'-bugprone-not-null-terminated-result',
@ -37,6 +36,8 @@ Checks: [
'-cert-oop54-cpp',
'-cert-oop57-cpp',
'-clang-analyzer-optin.performance.Padding',
'-clang-analyzer-unix.Malloc',
'-cppcoreguidelines-*', # impractical in a codebase as large as ClickHouse, also slow

View File

@ -45,62 +45,3 @@ jobs:
with:
data: "${{ needs.RunConfig.outputs.data }}"
set_latest: true
SonarCloud:
runs-on: [self-hosted, builder]
env:
SONAR_SCANNER_VERSION: 4.8.0.2856
SONAR_SERVER_URL: "https://sonarcloud.io"
BUILD_WRAPPER_OUT_DIR: build_wrapper_output_directory # Directory where build-wrapper output will be placed
CC: clang-17
CXX: clang++-17
steps:
- name: Check out repository code
uses: ClickHouse/checkout@v1
with:
clear-repository: true
fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
filter: tree:0
submodules: true
- name: Set up JDK 11
uses: actions/setup-java@v1
with:
java-version: 11
- name: Download and set up sonar-scanner
env:
SONAR_SCANNER_DOWNLOAD_URL: https://binaries.sonarsource.com/Distribution/sonar-scanner-cli/sonar-scanner-cli-${{ env.SONAR_SCANNER_VERSION }}-linux.zip
run: |
mkdir -p "$HOME/.sonar"
curl -sSLo "$HOME/.sonar/sonar-scanner.zip" "${{ env.SONAR_SCANNER_DOWNLOAD_URL }}"
unzip -o "$HOME/.sonar/sonar-scanner.zip" -d "$HOME/.sonar/"
echo "$HOME/.sonar/sonar-scanner-${{ env.SONAR_SCANNER_VERSION }}-linux/bin" >> "$GITHUB_PATH"
- name: Download and set up build-wrapper
env:
BUILD_WRAPPER_DOWNLOAD_URL: ${{ env.SONAR_SERVER_URL }}/static/cpp/build-wrapper-linux-x86.zip
run: |
curl -sSLo "$HOME/.sonar/build-wrapper-linux-x86.zip" "${{ env.BUILD_WRAPPER_DOWNLOAD_URL }}"
unzip -o "$HOME/.sonar/build-wrapper-linux-x86.zip" -d "$HOME/.sonar/"
echo "$HOME/.sonar/build-wrapper-linux-x86" >> "$GITHUB_PATH"
- name: Set Up Build Tools
run: |
sudo apt-get update
sudo apt-get install -yq git cmake ccache ninja-build python3 yasm nasm
sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)"
- name: Run build-wrapper
run: |
mkdir build
cd build
cmake ..
cd ..
build-wrapper-linux-x86-64 --out-dir ${{ env.BUILD_WRAPPER_OUT_DIR }} cmake --build build/
- name: Run sonar-scanner
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
run: |
sonar-scanner \
--define sonar.host.url="${{ env.SONAR_SERVER_URL }}" \
--define sonar.cfamily.build-wrapper-output="${{ env.BUILD_WRAPPER_OUT_DIR }}" \
--define sonar.projectKey="ClickHouse_ClickHouse" \
--define sonar.organization="clickhouse-java" \
--define sonar.cfamily.cpp23.enabled=true \
--define sonar.exclusions="**/*.java,**/*.ts,**/*.js,**/*.css,**/*.sql"

View File

@ -172,6 +172,7 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 finish_check.py
python3 merge_pr.py --check-approved
#############################################################################################

View File

@ -43,8 +43,7 @@ jobs:
runs-on: [self-hosted, '${{inputs.runner_type}}']
steps:
- name: Check out repository code
# WIP: temporary try commit with limited perallelization of checkout
uses: ClickHouse/checkout@0be3f7b3098bae494d3ef5d29d2e0676fb606232
uses: ClickHouse/checkout@v1
with:
clear-repository: true
ref: ${{ fromJson(inputs.data).git_ref }}

View File

@ -56,13 +56,13 @@ option(ENABLE_CHECK_HEAVY_BUILDS "Don't allow C++ translation units to compile t
if (ENABLE_CHECK_HEAVY_BUILDS)
# set DATA (since RSS does not work since 2.6.x+) to 5G
set (RLIMIT_DATA 5000000000)
# set VIRT (RLIMIT_AS) to 10G (DATA*10)
# set VIRT (RLIMIT_AS) to 10G (DATA*2)
set (RLIMIT_AS 10000000000)
# set CPU time limit to 1000 seconds
set (RLIMIT_CPU 1000)
# -fsanitize=memory and address are too heavy
if (SANITIZE)
if (SANITIZE OR SANITIZE_COVERAGE OR WITH_COVERAGE)
set (RLIMIT_DATA 10000000000) # 10G
endif()
@ -110,11 +110,6 @@ endif()
# - sanitize.cmake
add_library(global-libs INTERFACE)
# We don't want to instrument everything with fuzzer, but only specific targets (see below),
# also, since we build our own llvm, we specifically don't want to instrument
# libFuzzer library itself - it would result in infinite recursion
#include (cmake/fuzzer.cmake)
include (cmake/sanitize.cmake)
option(ENABLE_COLORED_BUILD "Enable colors in compiler output" ON)
@ -554,7 +549,9 @@ if (ENABLE_RUST)
endif()
endif()
if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" AND NOT SANITIZE AND NOT SANITIZE_COVERAGE AND OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64))
if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO"
AND NOT SANITIZE AND NOT SANITIZE_COVERAGE AND NOT ENABLE_FUZZING
AND OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64))
set(CHECK_LARGE_OBJECT_SIZES_DEFAULT ON)
else ()
set(CHECK_LARGE_OBJECT_SIZES_DEFAULT OFF)
@ -577,10 +574,7 @@ if (FUZZER)
if (NOT(target_type STREQUAL "INTERFACE_LIBRARY" OR target_type STREQUAL "UTILITY"))
target_compile_options(${target} PRIVATE "-fsanitize=fuzzer-no-link")
endif()
# clickhouse fuzzer isn't working correctly
# initial PR https://github.com/ClickHouse/ClickHouse/pull/27526
#if (target MATCHES ".+_fuzzer" OR target STREQUAL "clickhouse")
if (target_type STREQUAL "EXECUTABLE" AND target MATCHES ".+_fuzzer")
if (target_type STREQUAL "EXECUTABLE" AND (target MATCHES ".+_fuzzer" OR target STREQUAL "clickhouse"))
message(STATUS "${target} instrumented with fuzzer")
target_link_libraries(${target} PUBLIC ch_contrib::fuzzer)
# Add to fuzzers bundle

View File

@ -20,6 +20,7 @@ set (SRCS
getPageSize.cpp
getThreadId.cpp
int8_to_string.cpp
itoa.cpp
JSON.cpp
mremap.cpp
phdr_cache.cpp

View File

@ -50,9 +50,6 @@ std::optional<uint64_t> getCgroupsV2MemoryLimit()
}
/** Returns the size of physical memory (RAM) in bytes.
* Returns 0 on unsupported platform
*/
uint64_t getMemoryAmountOrZero()
{
int64_t num_pages = sysconf(_SC_PHYS_PAGES);

View File

@ -2,11 +2,10 @@
#include <cstdint>
/** Returns the size of physical memory (RAM) in bytes.
* Returns 0 on unsupported platform or if it cannot determine the size of physical memory.
*/
/// Returns the size in bytes of physical memory (RAM) available to the process. The value can
/// be smaller than the total available RAM available to the system due to cgroups settings.
/// Returns 0 on unsupported platform or if it cannot determine the size of physical memory.
uint64_t getMemoryAmountOrZero();
/** Throws exception if it cannot determine the size of physical memory.
*/
/// Throws exception if it cannot determine the size of physical memory.
uint64_t getMemoryAmount();

503
base/base/itoa.cpp Normal file
View File

@ -0,0 +1,503 @@
// Based on https://github.com/amdn/itoa and combined with our optimizations
//
//=== itoa.cpp - Fast integer to ascii conversion --*- C++ -*-//
//
// The MIT License (MIT)
// Copyright (c) 2016 Arturo Martin-de-Nicolas
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//===----------------------------------------------------------------------===//
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <type_traits>
#include <base/defines.h>
#include <base/extended_types.h>
#include <base/itoa.h>
namespace
{
template <typename T>
ALWAYS_INLINE inline constexpr T pow10(size_t x)
{
return x ? 10 * pow10<T>(x - 1) : 1;
}
// Division by a power of 10 is implemented using a multiplicative inverse.
// This strength reduction is also done by optimizing compilers, but
// presently the fastest results are produced by using the values
// for the multiplication and the shift as given by the algorithm
// described by Agner Fog in "Optimizing Subroutines in Assembly Language"
//
// http://www.agner.org/optimize/optimizing_assembly.pdf
//
// "Integer division by a constant (all processors)
// A floating point number can be divided by a constant by multiplying
// with the reciprocal. If we want to do the same with integers, we have
// to scale the reciprocal by 2n and then shift the product to the right
// by n. There are various algorithms for finding a suitable value of n
// and compensating for rounding errors. The algorithm described below
// was invented by Terje Mathisen, Norway, and not published elsewhere."
/// Division by constant is performed by:
/// 1. Adding 1 if needed;
/// 2. Multiplying by another constant;
/// 3. Shifting right by another constant.
template <typename UInt, bool add_, UInt multiplier_, unsigned shift_>
struct Division
{
static constexpr bool add{add_};
static constexpr UInt multiplier{multiplier_};
static constexpr unsigned shift{shift_};
};
/// Select a type with appropriate number of bytes from the list of types.
/// First parameter is the number of bytes requested. Then goes a list of types with 1, 2, 4, ... number of bytes.
/// Example: SelectType<4, uint8_t, uint16_t, uint32_t, uint64_t> will select uint32_t.
template <size_t N, typename T, typename... Ts>
struct SelectType
{
using Result = typename SelectType<N / 2, Ts...>::Result;
};
template <typename T, typename... Ts>
struct SelectType<1, T, Ts...>
{
using Result = T;
};
/// Division by 10^N where N is the size of the type.
template <size_t N>
using DivisionBy10PowN = typename SelectType<
N,
Division<uint8_t, false, 205U, 11>, /// divide by 10
Division<uint16_t, true, 41943U, 22>, /// divide by 100
Division<uint32_t, false, 3518437209U, 45>, /// divide by 10000
Division<uint64_t, false, 12379400392853802749ULL, 90> /// divide by 100000000
>::Result;
template <size_t N>
using UnsignedOfSize = typename SelectType<N, uint8_t, uint16_t, uint32_t, uint64_t, __uint128_t>::Result;
/// Holds the result of dividing an unsigned N-byte variable by 10^N resulting in
template <size_t N>
struct QuotientAndRemainder
{
UnsignedOfSize<N> quotient; // quotient with fewer than 2*N decimal digits
UnsignedOfSize<N / 2> remainder; // remainder with at most N decimal digits
};
template <size_t N>
QuotientAndRemainder<N> inline split(UnsignedOfSize<N> value)
{
constexpr DivisionBy10PowN<N> division;
UnsignedOfSize<N> quotient = (division.multiplier * (UnsignedOfSize<2 * N>(value) + division.add)) >> division.shift;
UnsignedOfSize<N / 2> remainder = static_cast<UnsignedOfSize<N / 2>>(value - quotient * pow10<UnsignedOfSize<N / 2>>(N));
return {quotient, remainder};
}
ALWAYS_INLINE inline char * outDigit(char * p, uint8_t value)
{
*p = '0' + value;
++p;
return p;
}
// Using a lookup table to convert binary numbers from 0 to 99
// into ascii characters as described by Andrei Alexandrescu in
// https://www.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920/
const char digits[201] = "00010203040506070809"
"10111213141516171819"
"20212223242526272829"
"30313233343536373839"
"40414243444546474849"
"50515253545556575859"
"60616263646566676869"
"70717273747576777879"
"80818283848586878889"
"90919293949596979899";
ALWAYS_INLINE inline char * outTwoDigits(char * p, uint8_t value)
{
memcpy(p, &digits[value * 2], 2);
p += 2;
return p;
}
namespace convert
{
template <typename UInt, size_t N = sizeof(UInt)>
char * head(char * p, UInt u);
template <typename UInt, size_t N = sizeof(UInt)>
char * tail(char * p, UInt u);
//===----------------------------------------------------------===//
// head: find most significant digit, skip leading zeros
//===----------------------------------------------------------===//
// "x" contains quotient and remainder after division by 10^N
// quotient is less than 10^N
template <size_t N>
ALWAYS_INLINE inline char * head(char * p, QuotientAndRemainder<N> x)
{
p = head(p, UnsignedOfSize<N / 2>(x.quotient));
p = tail(p, x.remainder);
return p;
}
// "u" is less than 10^2*N
template <typename UInt, size_t N>
ALWAYS_INLINE inline char * head(char * p, UInt u)
{
return u < pow10<UnsignedOfSize<N>>(N) ? head(p, UnsignedOfSize<N / 2>(u)) : head<N>(p, split<N>(u));
}
// recursion base case, selected when "u" is one byte
template <>
ALWAYS_INLINE inline char * head<UnsignedOfSize<1>, 1>(char * p, UnsignedOfSize<1> u)
{
return u < 10 ? outDigit(p, u) : outTwoDigits(p, u);
}
//===----------------------------------------------------------===//
// tail: produce all digits including leading zeros
//===----------------------------------------------------------===//
// recursive step, "u" is less than 10^2*N
template <typename UInt, size_t N>
ALWAYS_INLINE inline char * tail(char * p, UInt u)
{
QuotientAndRemainder<N> x = split<N>(u);
p = tail(p, UnsignedOfSize<N / 2>(x.quotient));
p = tail(p, x.remainder);
return p;
}
// recursion base case, selected when "u" is one byte
template <>
ALWAYS_INLINE inline char * tail<UnsignedOfSize<1>, 1>(char * p, UnsignedOfSize<1> u)
{
return outTwoDigits(p, u);
}
//===----------------------------------------------------------===//
// large values are >= 10^2*N
// where x contains quotient and remainder after division by 10^N
//===----------------------------------------------------------===//
template <size_t N>
ALWAYS_INLINE inline char * large(char * p, QuotientAndRemainder<N> x)
{
QuotientAndRemainder<N> y = split<N>(x.quotient);
p = head(p, UnsignedOfSize<N / 2>(y.quotient));
p = tail(p, y.remainder);
p = tail(p, x.remainder);
return p;
}
//===----------------------------------------------------------===//
// handle values of "u" that might be >= 10^2*N
// where N is the size of "u" in bytes
//===----------------------------------------------------------===//
template <typename UInt, size_t N = sizeof(UInt)>
ALWAYS_INLINE inline char * uitoa(char * p, UInt u)
{
if (u < pow10<UnsignedOfSize<N>>(N))
return head(p, UnsignedOfSize<N / 2>(u));
QuotientAndRemainder<N> x = split<N>(u);
return u < pow10<UnsignedOfSize<N>>(2 * N) ? head<N>(p, x) : large<N>(p, x);
}
// selected when "u" is one byte
template <>
ALWAYS_INLINE inline char * uitoa<UnsignedOfSize<1>, 1>(char * p, UnsignedOfSize<1> u)
{
if (u < 10)
return outDigit(p, u);
else if (u < 100)
return outTwoDigits(p, u);
else
{
p = outDigit(p, u / 100);
p = outTwoDigits(p, u % 100);
return p;
}
}
//===----------------------------------------------------------===//
// handle unsigned and signed integral operands
//===----------------------------------------------------------===//
// itoa: handle unsigned integral operands (selected by SFINAE)
template <typename U, std::enable_if_t<!std::is_signed_v<U> && std::is_integral_v<U>> * = nullptr>
ALWAYS_INLINE inline char * itoa(U u, char * p)
{
return convert::uitoa(p, u);
}
// itoa: handle signed integral operands (selected by SFINAE)
template <typename I, size_t N = sizeof(I), std::enable_if_t<std::is_signed_v<I> && std::is_integral_v<I>> * = nullptr>
ALWAYS_INLINE inline char * itoa(I i, char * p)
{
// Need "mask" to be filled with a copy of the sign bit.
// If "i" is a negative value, then the result of "operator >>"
// is implementation-defined, though usually it is an arithmetic
// right shift that replicates the sign bit.
// Use a conditional expression to be portable,
// a good optimizing compiler generates an arithmetic right shift
// and avoids the conditional branch.
UnsignedOfSize<N> mask = i < 0 ? ~UnsignedOfSize<N>(0) : 0;
// Now get the absolute value of "i" and cast to unsigned type UnsignedOfSize<N>.
// Cannot use std::abs() because the result is undefined
// in 2's complement systems for the most-negative value.
// Want to avoid conditional branch for performance reasons since
// CPU branch prediction will be ineffective when negative values
// occur randomly.
// Let "u" be "i" cast to unsigned type UnsignedOfSize<N>.
// Subtract "u" from 2*u if "i" is positive or 0 if "i" is negative.
// This yields the absolute value with the desired type without
// using a conditional branch and without invoking undefined or
// implementation defined behavior:
UnsignedOfSize<N> u = ((2 * UnsignedOfSize<N>(i)) & ~mask) - UnsignedOfSize<N>(i);
// Unconditionally store a minus sign when producing digits
// in a forward direction and increment the pointer only if
// the value is in fact negative.
// This avoids a conditional branch and is safe because we will
// always produce at least one digit and it will overwrite the
// minus sign when the value is not negative.
*p = '-';
p += (mask & 1);
p = convert::uitoa(p, u);
return p;
}
}
const uint64_t max_multiple_of_hundred_that_fits_in_64_bits = 1'00'00'00'00'00'00'00'00'00ull;
const int max_multiple_of_hundred_blocks = 9;
static_assert(max_multiple_of_hundred_that_fits_in_64_bits % 100 == 0);
ALWAYS_INLINE inline char * writeUIntText(UInt128 _x, char * p)
{
/// If we the highest 64bit item is empty, we can print just the lowest item as u64
if (_x.items[UInt128::_impl::little(1)] == 0)
return convert::itoa(_x.items[UInt128::_impl::little(0)], p);
/// Doing operations using __int128 is faster and we already rely on this feature
using T = unsigned __int128;
T x = (T(_x.items[UInt128::_impl::little(1)]) << 64) + T(_x.items[UInt128::_impl::little(0)]);
/// We are going to accumulate blocks of 2 digits to print until the number is small enough to be printed as u64
/// To do this we could do: x / 100, x % 100
/// But these would mean doing many iterations with long integers, so instead we divide by a much longer integer
/// multiple of 100 (100^9) and then get the blocks out of it (as u64)
/// Once we reach u64::max we can stop and use the fast method to print that in the front
static const T large_divisor = max_multiple_of_hundred_that_fits_in_64_bits;
static const T largest_uint64 = std::numeric_limits<uint64_t>::max();
uint8_t two_values[20] = {0}; // 39 Max characters / 2
int current_block = 0;
while (x > largest_uint64)
{
uint64_t u64_remainder = uint64_t(x % large_divisor);
x /= large_divisor;
int pos = current_block;
while (u64_remainder)
{
two_values[pos] = uint8_t(u64_remainder % 100);
pos++;
u64_remainder /= 100;
}
current_block += max_multiple_of_hundred_blocks;
}
char * highest_part_print = convert::itoa(uint64_t(x), p);
for (int i = 0; i < current_block; i++)
{
outTwoDigits(highest_part_print, two_values[current_block - 1 - i]);
highest_part_print += 2;
}
return highest_part_print;
}
ALWAYS_INLINE inline char * writeUIntText(UInt256 _x, char * p)
{
/// If possible, treat it as a smaller integer as they are much faster to print
if (_x.items[UInt256::_impl::little(3)] == 0 && _x.items[UInt256::_impl::little(2)] == 0)
return writeUIntText(UInt128{_x.items[UInt256::_impl::little(0)], _x.items[UInt256::_impl::little(1)]}, p);
/// If available (x86) we transform from our custom class to _BitInt(256) which has better support in the compiler
/// and produces better code
using T =
#if defined(__x86_64__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wbit-int-extension"
unsigned _BitInt(256)
# pragma clang diagnostic pop
#else
UInt256
#endif
;
#if defined(__x86_64__)
T x = (T(_x.items[UInt256::_impl::little(3)]) << 192) + (T(_x.items[UInt256::_impl::little(2)]) << 128)
+ (T(_x.items[UInt256::_impl::little(1)]) << 64) + T(_x.items[UInt256::_impl::little(0)]);
#else
T x = _x;
#endif
/// Similar to writeUIntText(UInt128) only that in this case we will stop as soon as we reach the largest u128
/// and switch to that function
uint8_t two_values[39] = {0}; // 78 Max characters / 2
int current_pos = 0;
static const T large_divisor = max_multiple_of_hundred_that_fits_in_64_bits;
static const T largest_uint128 = T(std::numeric_limits<uint64_t>::max()) << 64 | T(std::numeric_limits<uint64_t>::max());
while (x > largest_uint128)
{
uint64_t u64_remainder = uint64_t(x % large_divisor);
x /= large_divisor;
int pos = current_pos;
while (u64_remainder)
{
two_values[pos] = uint8_t(u64_remainder % 100);
pos++;
u64_remainder /= 100;
}
current_pos += max_multiple_of_hundred_blocks;
}
#if defined(__x86_64__)
UInt128 pending{uint64_t(x), uint64_t(x >> 64)};
#else
UInt128 pending{x.items[UInt256::_impl::little(0)], x.items[UInt256::_impl::little(1)]};
#endif
char * highest_part_print = writeUIntText(pending, p);
for (int i = 0; i < current_pos; i++)
{
outTwoDigits(highest_part_print, two_values[current_pos - 1 - i]);
highest_part_print += 2;
}
return highest_part_print;
}
ALWAYS_INLINE inline char * writeLeadingMinus(char * pos)
{
*pos = '-';
return pos + 1;
}
template <typename T>
ALWAYS_INLINE inline char * writeSIntText(T x, char * pos)
{
static_assert(std::is_same_v<T, Int128> || std::is_same_v<T, Int256>);
using UnsignedT = make_unsigned_t<T>;
static constexpr T min_int = UnsignedT(1) << (sizeof(T) * 8 - 1);
if (unlikely(x == min_int))
{
if constexpr (std::is_same_v<T, Int128>)
{
const char * res = "-170141183460469231731687303715884105728";
memcpy(pos, res, strlen(res));
return pos + strlen(res);
}
else if constexpr (std::is_same_v<T, Int256>)
{
const char * res = "-57896044618658097711785492504343953926634992332820282019728792003956564819968";
memcpy(pos, res, strlen(res));
return pos + strlen(res);
}
}
if (x < 0)
{
x = -x;
pos = writeLeadingMinus(pos);
}
return writeUIntText(UnsignedT(x), pos);
}
}
char * itoa(UInt8 i, char * p)
{
return convert::itoa(uint8_t(i), p);
}
char * itoa(Int8 i, char * p)
{
return convert::itoa(int8_t(i), p);
}
char * itoa(UInt128 i, char * p)
{
return writeUIntText(i, p);
}
char * itoa(Int128 i, char * p)
{
return writeSIntText(i, p);
}
char * itoa(UInt256 i, char * p)
{
return writeUIntText(i, p);
}
char * itoa(Int256 i, char * p)
{
return writeSIntText(i, p);
}
#define DEFAULT_ITOA(T) \
char * itoa(T i, char * p) \
{ \
return convert::itoa(i, p); \
}
#define FOR_MISSING_INTEGER_TYPES(M) \
M(uint8_t) \
M(UInt16) \
M(UInt32) \
M(UInt64) \
M(int8_t) \
M(Int16) \
M(Int32) \
M(Int64)
FOR_MISSING_INTEGER_TYPES(DEFAULT_ITOA)
#if defined(OS_DARWIN)
DEFAULT_ITOA(unsigned long)
DEFAULT_ITOA(long)
#endif
#undef FOR_MISSING_INTEGER_TYPES
#undef DEFAULT_ITOA

View File

@ -1,446 +1,30 @@
#pragma once
// Based on https://github.com/amdn/itoa and combined with our optimizations
//
//=== itoa.h - Fast integer to ascii conversion --*- C++ -*-//
//
// The MIT License (MIT)
// Copyright (c) 2016 Arturo Martin-de-Nicolas
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//===----------------------------------------------------------------------===//
#include <cstdint>
#include <cstddef>
#include <cstring>
#include <type_traits>
#include <base/extended_types.h>
#define FOR_INTEGER_TYPES(M) \
M(uint8_t) \
M(UInt8) \
M(UInt16) \
M(UInt32) \
M(UInt64) \
M(UInt128) \
M(UInt256) \
M(int8_t) \
M(Int8) \
M(Int16) \
M(Int32) \
M(Int64) \
M(Int128) \
M(Int256)
template <typename T>
inline int digits10(T x)
{
if (x < 10ULL)
return 1;
if (x < 100ULL)
return 2;
if (x < 1000ULL)
return 3;
#define INSTANTIATION(T) char * itoa(T i, char * p);
FOR_INTEGER_TYPES(INSTANTIATION)
if (x < 1000000000000ULL)
{
if (x < 100000000ULL)
{
if (x < 1000000ULL)
{
if (x < 10000ULL)
return 4;
else
return 5 + (x >= 100000ULL);
}
#if defined(OS_DARWIN)
INSTANTIATION(unsigned long)
INSTANTIATION(long)
#endif
return 7 + (x >= 10000000ULL);
}
if (x < 10000000000ULL)
return 9 + (x >= 1000000000ULL);
return 11 + (x >= 100000000000ULL);
}
return 12 + digits10(x / 1000000000000ULL);
}
namespace impl
{
template <typename T>
static constexpr T pow10(size_t x)
{
return x ? 10 * pow10<T>(x - 1) : 1;
}
// Division by a power of 10 is implemented using a multiplicative inverse.
// This strength reduction is also done by optimizing compilers, but
// presently the fastest results are produced by using the values
// for the multiplication and the shift as given by the algorithm
// described by Agner Fog in "Optimizing Subroutines in Assembly Language"
//
// http://www.agner.org/optimize/optimizing_assembly.pdf
//
// "Integer division by a constant (all processors)
// A floating point number can be divided by a constant by multiplying
// with the reciprocal. If we want to do the same with integers, we have
// to scale the reciprocal by 2n and then shift the product to the right
// by n. There are various algorithms for finding a suitable value of n
// and compensating for rounding errors. The algorithm described below
// was invented by Terje Mathisen, Norway, and not published elsewhere."
/// Division by constant is performed by:
/// 1. Adding 1 if needed;
/// 2. Multiplying by another constant;
/// 3. Shifting right by another constant.
template <typename UInt, bool add_, UInt multiplier_, unsigned shift_>
struct Division
{
static constexpr bool add{add_};
static constexpr UInt multiplier{multiplier_};
static constexpr unsigned shift{shift_};
};
/// Select a type with appropriate number of bytes from the list of types.
/// First parameter is the number of bytes requested. Then goes a list of types with 1, 2, 4, ... number of bytes.
/// Example: SelectType<4, uint8_t, uint16_t, uint32_t, uint64_t> will select uint32_t.
template <size_t N, typename T, typename... Ts>
struct SelectType
{
using Result = typename SelectType<N / 2, Ts...>::Result;
};
template <typename T, typename... Ts>
struct SelectType<1, T, Ts...>
{
using Result = T;
};
/// Division by 10^N where N is the size of the type.
template <size_t N>
using DivisionBy10PowN = typename SelectType
<
N,
Division<uint8_t, false, 205U, 11>, /// divide by 10
Division<uint16_t, true, 41943U, 22>, /// divide by 100
Division<uint32_t, false, 3518437209U, 45>, /// divide by 10000
Division<uint64_t, false, 12379400392853802749ULL, 90> /// divide by 100000000
>::Result;
template <size_t N>
using UnsignedOfSize = typename SelectType
<
N,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
__uint128_t
>::Result;
/// Holds the result of dividing an unsigned N-byte variable by 10^N resulting in
template <size_t N>
struct QuotientAndRemainder
{
UnsignedOfSize<N> quotient; // quotient with fewer than 2*N decimal digits
UnsignedOfSize<N / 2> remainder; // remainder with at most N decimal digits
};
template <size_t N>
QuotientAndRemainder<N> static inline split(UnsignedOfSize<N> value)
{
constexpr DivisionBy10PowN<N> division;
UnsignedOfSize<N> quotient = (division.multiplier * (UnsignedOfSize<2 * N>(value) + division.add)) >> division.shift;
UnsignedOfSize<N / 2> remainder = static_cast<UnsignedOfSize<N / 2>>(value - quotient * pow10<UnsignedOfSize<N / 2>>(N));
return {quotient, remainder};
}
static inline char * outDigit(char * p, uint8_t value)
{
*p = '0' + value;
++p;
return p;
}
// Using a lookup table to convert binary numbers from 0 to 99
// into ascii characters as described by Andrei Alexandrescu in
// https://www.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920/
static const char digits[201] = "00010203040506070809"
"10111213141516171819"
"20212223242526272829"
"30313233343536373839"
"40414243444546474849"
"50515253545556575859"
"60616263646566676869"
"70717273747576777879"
"80818283848586878889"
"90919293949596979899";
static inline char * outTwoDigits(char * p, uint8_t value)
{
memcpy(p, &digits[value * 2], 2);
p += 2;
return p;
}
namespace convert
{
template <typename UInt, size_t N = sizeof(UInt)> static char * head(char * p, UInt u);
template <typename UInt, size_t N = sizeof(UInt)> static char * tail(char * p, UInt u);
//===----------------------------------------------------------===//
// head: find most significant digit, skip leading zeros
//===----------------------------------------------------------===//
// "x" contains quotient and remainder after division by 10^N
// quotient is less than 10^N
template <size_t N>
static inline char * head(char * p, QuotientAndRemainder<N> x)
{
p = head(p, UnsignedOfSize<N / 2>(x.quotient));
p = tail(p, x.remainder);
return p;
}
// "u" is less than 10^2*N
template <typename UInt, size_t N>
static inline char * head(char * p, UInt u)
{
return u < pow10<UnsignedOfSize<N>>(N)
? head(p, UnsignedOfSize<N / 2>(u))
: head<N>(p, split<N>(u));
}
// recursion base case, selected when "u" is one byte
template <>
inline char * head<UnsignedOfSize<1>, 1>(char * p, UnsignedOfSize<1> u)
{
return u < 10
? outDigit(p, u)
: outTwoDigits(p, u);
}
//===----------------------------------------------------------===//
// tail: produce all digits including leading zeros
//===----------------------------------------------------------===//
// recursive step, "u" is less than 10^2*N
template <typename UInt, size_t N>
static inline char * tail(char * p, UInt u)
{
QuotientAndRemainder<N> x = split<N>(u);
p = tail(p, UnsignedOfSize<N / 2>(x.quotient));
p = tail(p, x.remainder);
return p;
}
// recursion base case, selected when "u" is one byte
template <>
inline char * tail<UnsignedOfSize<1>, 1>(char * p, UnsignedOfSize<1> u)
{
return outTwoDigits(p, u);
}
//===----------------------------------------------------------===//
// large values are >= 10^2*N
// where x contains quotient and remainder after division by 10^N
//===----------------------------------------------------------===//
template <size_t N>
static inline char * large(char * p, QuotientAndRemainder<N> x)
{
QuotientAndRemainder<N> y = split<N>(x.quotient);
p = head(p, UnsignedOfSize<N / 2>(y.quotient));
p = tail(p, y.remainder);
p = tail(p, x.remainder);
return p;
}
//===----------------------------------------------------------===//
// handle values of "u" that might be >= 10^2*N
// where N is the size of "u" in bytes
//===----------------------------------------------------------===//
template <typename UInt, size_t N = sizeof(UInt)>
static inline char * uitoa(char * p, UInt u)
{
if (u < pow10<UnsignedOfSize<N>>(N))
return head(p, UnsignedOfSize<N / 2>(u));
QuotientAndRemainder<N> x = split<N>(u);
return u < pow10<UnsignedOfSize<N>>(2 * N)
? head<N>(p, x)
: large<N>(p, x);
}
// selected when "u" is one byte
template <>
inline char * uitoa<UnsignedOfSize<1>, 1>(char * p, UnsignedOfSize<1> u)
{
if (u < 10)
return outDigit(p, u);
else if (u < 100)
return outTwoDigits(p, u);
else
{
p = outDigit(p, u / 100);
p = outTwoDigits(p, u % 100);
return p;
}
}
//===----------------------------------------------------------===//
// handle unsigned and signed integral operands
//===----------------------------------------------------------===//
// itoa: handle unsigned integral operands (selected by SFINAE)
template <typename U, std::enable_if_t<!std::is_signed_v<U> && std::is_integral_v<U>> * = nullptr>
static inline char * itoa(U u, char * p)
{
return convert::uitoa(p, u);
}
// itoa: handle signed integral operands (selected by SFINAE)
template <typename I, size_t N = sizeof(I), std::enable_if_t<std::is_signed_v<I> && std::is_integral_v<I>> * = nullptr>
static inline char * itoa(I i, char * p)
{
// Need "mask" to be filled with a copy of the sign bit.
// If "i" is a negative value, then the result of "operator >>"
// is implementation-defined, though usually it is an arithmetic
// right shift that replicates the sign bit.
// Use a conditional expression to be portable,
// a good optimizing compiler generates an arithmetic right shift
// and avoids the conditional branch.
UnsignedOfSize<N> mask = i < 0 ? ~UnsignedOfSize<N>(0) : 0;
// Now get the absolute value of "i" and cast to unsigned type UnsignedOfSize<N>.
// Cannot use std::abs() because the result is undefined
// in 2's complement systems for the most-negative value.
// Want to avoid conditional branch for performance reasons since
// CPU branch prediction will be ineffective when negative values
// occur randomly.
// Let "u" be "i" cast to unsigned type UnsignedOfSize<N>.
// Subtract "u" from 2*u if "i" is positive or 0 if "i" is negative.
// This yields the absolute value with the desired type without
// using a conditional branch and without invoking undefined or
// implementation defined behavior:
UnsignedOfSize<N> u = ((2 * UnsignedOfSize<N>(i)) & ~mask) - UnsignedOfSize<N>(i);
// Unconditionally store a minus sign when producing digits
// in a forward direction and increment the pointer only if
// the value is in fact negative.
// This avoids a conditional branch and is safe because we will
// always produce at least one digit and it will overwrite the
// minus sign when the value is not negative.
*p = '-';
p += (mask & 1);
p = convert::uitoa(p, u);
return p;
}
}
template <typename T>
static inline char * writeUIntText(T x, char * p)
{
static_assert(is_unsigned_v<T>);
int len = digits10(x);
auto * pp = p + len;
while (x >= 100)
{
const auto i = x % 100;
x /= 100;
pp -= 2;
outTwoDigits(pp, i);
}
if (x < 10)
*p = '0' + x;
else
outTwoDigits(p, x);
return p + len;
}
static inline char * writeLeadingMinus(char * pos)
{
*pos = '-';
return pos + 1;
}
template <typename T>
static inline char * writeSIntText(T x, char * pos)
{
static_assert(std::is_same_v<T, Int128> || std::is_same_v<T, Int256>);
using UnsignedT = make_unsigned_t<T>;
static constexpr T min_int = UnsignedT(1) << (sizeof(T) * 8 - 1);
if (unlikely(x == min_int))
{
if constexpr (std::is_same_v<T, Int128>)
{
const char * res = "-170141183460469231731687303715884105728";
memcpy(pos, res, strlen(res));
return pos + strlen(res);
}
else if constexpr (std::is_same_v<T, Int256>)
{
const char * res = "-57896044618658097711785492504343953926634992332820282019728792003956564819968";
memcpy(pos, res, strlen(res));
return pos + strlen(res);
}
}
if (x < 0)
{
x = -x;
pos = writeLeadingMinus(pos);
}
return writeUIntText(UnsignedT(x), pos);
}
}
template <typename I>
char * itoa(I i, char * p)
{
return impl::convert::itoa(i, p);
}
template <>
inline char * itoa(char8_t i, char * p)
{
return impl::convert::itoa(uint8_t(i), p);
}
template <>
inline char * itoa(UInt128 i, char * p)
{
return impl::writeUIntText(i, p);
}
template <>
inline char * itoa(Int128 i, char * p)
{
return impl::writeSIntText(i, p);
}
template <>
inline char * itoa(UInt256 i, char * p)
{
return impl::writeUIntText(i, p);
}
template <>
inline char * itoa(Int256 i, char * p)
{
return impl::writeSIntText(i, p);
}
#undef FOR_INTEGER_TYPES
#undef INSTANTIATION

View File

@ -30,7 +30,6 @@ namespace Net
class HTTPServerRequest;
class HTTPServerResponse;
class HTTPRequestHandler;

View File

@ -1,17 +0,0 @@
# see ./CMakeLists.txt for variable declaration
if (FUZZER)
if (FUZZER STREQUAL "libfuzzer")
# NOTE: Eldar Zaitov decided to name it "libfuzzer" instead of "fuzzer" to keep in mind another possible fuzzer backends.
# NOTE: no-link means that all the targets are built with instrumentation for fuzzer, but only some of them
# (tests) have entry point for fuzzer and it's not checked.
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link -DFUZZER=1")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link -DFUZZER=1")
# NOTE: oss-fuzz can change LIB_FUZZING_ENGINE variable
if (NOT LIB_FUZZING_ENGINE)
set (LIB_FUZZING_ENGINE "-fsanitize=fuzzer")
endif ()
else ()
message (FATAL_ERROR "Unknown fuzzer type: ${FUZZER}")
endif ()
endif()

View File

@ -26,13 +26,13 @@ const uint8_t MetroHash64::test_seed_1[8] = { 0x3B, 0x0D, 0x48, 0x1C, 0xF4, 0x
MetroHash64::MetroHash64(const uint64_t seed)
MetroHash64::MetroHash64(uint64_t seed)
{
Initialize(seed);
}
void MetroHash64::Initialize(const uint64_t seed)
void MetroHash64::Initialize(uint64_t seed)
{
vseed = (static_cast<uint64_t>(seed) + k2) * k0;
@ -47,7 +47,7 @@ void MetroHash64::Initialize(const uint64_t seed)
}
void MetroHash64::Update(const uint8_t * const buffer, const uint64_t length)
void MetroHash64::Update(const uint8_t * const buffer, uint64_t length)
{
const uint8_t * ptr = reinterpret_cast<const uint8_t*>(buffer);
const uint8_t * const end = ptr + length;
@ -62,7 +62,7 @@ void MetroHash64::Update(const uint8_t * const buffer, const uint64_t length)
memcpy(input.b + (bytes % 32), ptr, static_cast<size_t>(fill));
ptr += fill;
bytes += fill;
// input buffer is still partially filled
if ((bytes % 32) != 0) return;
@ -72,7 +72,7 @@ void MetroHash64::Update(const uint8_t * const buffer, const uint64_t length)
state.v[2] += read_u64(&input.b[16]) * k2; state.v[2] = rotate_right(state.v[2],29) + state.v[0];
state.v[3] += read_u64(&input.b[24]) * k3; state.v[3] = rotate_right(state.v[3],29) + state.v[1];
}
// bulk update
bytes += static_cast<uint64_t>(end - ptr);
while (ptr <= (end - 32))
@ -83,14 +83,14 @@ void MetroHash64::Update(const uint8_t * const buffer, const uint64_t length)
state.v[2] += read_u64(ptr) * k2; ptr += 8; state.v[2] = rotate_right(state.v[2],29) + state.v[0];
state.v[3] += read_u64(ptr) * k3; ptr += 8; state.v[3] = rotate_right(state.v[3],29) + state.v[1];
}
// store remaining bytes in input buffer
if (ptr < end)
memcpy(input.b, ptr, static_cast<size_t>(end - ptr));
}
void MetroHash64::Finalize(uint8_t * const hash)
void MetroHash64::Finalize(uint8_t * hash)
{
// finalize bulk loop, if used
if (bytes >= 32)
@ -102,11 +102,11 @@ void MetroHash64::Finalize(uint8_t * const hash)
state.v[0] = vseed + (state.v[0] ^ state.v[1]);
}
// process any bytes remaining in the input buffer
const uint8_t * ptr = reinterpret_cast<const uint8_t*>(input.b);
const uint8_t * const end = ptr + (bytes % 32);
if ((end - ptr) >= 16)
{
state.v[1] = state.v[0] + (read_u64(ptr) * k2); ptr += 8; state.v[1] = rotate_right(state.v[1],29) * k3;
@ -139,7 +139,7 @@ void MetroHash64::Finalize(uint8_t * const hash)
state.v[0] += read_u8 (ptr) * k3;
state.v[0] ^= rotate_right(state.v[0], 37) * k1;
}
state.v[0] ^= rotate_right(state.v[0], 28);
state.v[0] *= k0;
state.v[0] ^= rotate_right(state.v[0], 29);
@ -152,7 +152,7 @@ void MetroHash64::Finalize(uint8_t * const hash)
}
void MetroHash64::Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed)
void MetroHash64::Hash(const uint8_t * buffer, uint64_t length, uint8_t * const hash, uint64_t seed)
{
const uint8_t * ptr = reinterpret_cast<const uint8_t*>(buffer);
const uint8_t * const end = ptr + length;
@ -238,7 +238,7 @@ bool MetroHash64::ImplementationVerified()
// verify incremental implementation
MetroHash64 metro;
metro.Initialize(0);
metro.Update(reinterpret_cast<const uint8_t *>(MetroHash64::test_string), strlen(MetroHash64::test_string));
metro.Finalize(hash);
@ -262,9 +262,9 @@ void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * o
const uint8_t * ptr = reinterpret_cast<const uint8_t*>(key);
const uint8_t * const end = ptr + len;
uint64_t hash = ((static_cast<uint64_t>(seed) + k2) * k0) + len;
if (len >= 32)
{
uint64_t v[4];
@ -272,7 +272,7 @@ void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * o
v[1] = hash;
v[2] = hash;
v[3] = hash;
do
{
v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2];
@ -288,7 +288,7 @@ void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * o
v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0;
hash += v[0] ^ v[1];
}
if ((end - ptr) >= 16)
{
uint64_t v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0,33) * k1;
@ -297,32 +297,32 @@ void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * o
v1 ^= rotate_right(v1 * k3, 35) + v0;
hash += v1;
}
if ((end - ptr) >= 8)
{
hash += read_u64(ptr) * k3; ptr += 8;
hash ^= rotate_right(hash, 33) * k1;
}
if ((end - ptr) >= 4)
{
hash += read_u32(ptr) * k3; ptr += 4;
hash ^= rotate_right(hash, 15) * k1;
}
if ((end - ptr) >= 2)
{
hash += read_u16(ptr) * k3; ptr += 2;
hash ^= rotate_right(hash, 13) * k1;
}
if ((end - ptr) >= 1)
{
hash += read_u8 (ptr) * k3;
hash ^= rotate_right(hash, 25) * k1;
}
hash ^= rotate_right(hash, 33);
hash *= k0;
hash ^= rotate_right(hash, 33);
@ -336,13 +336,13 @@ void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * o
static const uint64_t k0 = 0xD6D018F5;
static const uint64_t k1 = 0xA2AA033B;
static const uint64_t k2 = 0x62992FC1;
static const uint64_t k3 = 0x30BC5B29;
static const uint64_t k3 = 0x30BC5B29;
const uint8_t * ptr = reinterpret_cast<const uint8_t*>(key);
const uint8_t * const end = ptr + len;
uint64_t hash = ((static_cast<uint64_t>(seed) + k2) * k0) + len;
if (len >= 32)
{
uint64_t v[4];
@ -350,7 +350,7 @@ void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * o
v[1] = hash;
v[2] = hash;
v[3] = hash;
do
{
v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2];
@ -366,7 +366,7 @@ void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * o
v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 30) * k0;
hash += v[0] ^ v[1];
}
if ((end - ptr) >= 16)
{
uint64_t v0 = hash + (read_u64(ptr) * k2); ptr += 8; v0 = rotate_right(v0,29) * k3;
@ -375,31 +375,31 @@ void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * o
v1 ^= rotate_right(v1 * k3, 34) + v0;
hash += v1;
}
if ((end - ptr) >= 8)
{
hash += read_u64(ptr) * k3; ptr += 8;
hash ^= rotate_right(hash, 36) * k1;
}
if ((end - ptr) >= 4)
{
hash += read_u32(ptr) * k3; ptr += 4;
hash ^= rotate_right(hash, 15) * k1;
}
if ((end - ptr) >= 2)
{
hash += read_u16(ptr) * k3; ptr += 2;
hash ^= rotate_right(hash, 15) * k1;
}
if ((end - ptr) >= 1)
{
hash += read_u8 (ptr) * k3;
hash ^= rotate_right(hash, 23) * k1;
}
hash ^= rotate_right(hash, 28);
hash *= k0;
hash ^= rotate_right(hash, 29);

View File

@ -25,24 +25,24 @@ public:
static const uint32_t bits = 64;
// Constructor initializes the same as Initialize()
explicit MetroHash64(const uint64_t seed=0);
explicit MetroHash64(uint64_t seed=0);
// Initializes internal state for new hash with optional seed
void Initialize(const uint64_t seed=0);
void Initialize(uint64_t seed=0);
// Update the hash state with a string of bytes. If the length
// is sufficiently long, the implementation switches to a bulk
// hashing algorithm directly on the argument buffer for speed.
void Update(const uint8_t * buffer, const uint64_t length);
void Update(const uint8_t * buffer, uint64_t length);
// Constructs the final hash and writes it to the argument buffer.
// After a hash is finalized, this instance must be Initialized()-ed
// again or the behavior of Update() and Finalize() is undefined.
void Finalize(uint8_t * const hash);
void Finalize(uint8_t * hash);
// A non-incremental function implementation. This can be significantly
// faster than the incremental implementation for some usage patterns.
static void Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed=0);
static void Hash(const uint8_t * buffer, uint64_t length, uint8_t * hash, uint64_t seed=0);
// Does implementation correctly execute test vectors?
static bool ImplementationVerified();

View File

@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \
# lts / testing / prestable / etc
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
ARG VERSION="24.2.1.2248"
ARG VERSION="24.2.2.71"
ARG PACKAGES="clickhouse-keeper"
ARG DIRECT_DOWNLOAD_URLS=""

View File

@ -4,6 +4,9 @@ FROM clickhouse/fasttest:$FROM_TAG
ENV CC=clang-${LLVM_VERSION}
ENV CXX=clang++-${LLVM_VERSION}
# If the cctools is updated, then first build it in the CI, then update here in a different commit
COPY --from=clickhouse/cctools:d9e3596e706b /cctools /cctools
# Rust toolchain and libraries
ENV RUSTUP_HOME=/rust/rustup
ENV CARGO_HOME=/rust/cargo
@ -73,9 +76,6 @@ RUN curl -Lo /usr/bin/clang-tidy-cache \
"https://raw.githubusercontent.com/matus-chochlik/ctcache/$CLANG_TIDY_SHA1/clang-tidy-cache" \
&& chmod +x /usr/bin/clang-tidy-cache
# If the cctools is updated, then first build it in the CI, then update here in a different commit
COPY --from=clickhouse/cctools:5a908f73878a /cctools /cctools
RUN mkdir /workdir && chmod 777 /workdir
WORKDIR /workdir

View File

@ -2,7 +2,7 @@
# It's based on the assumption that we don't care of the cctools version so much
# It event does not depend on the clickhouse/fasttest in the `docker/images.json`
ARG FROM_TAG=latest
FROM clickhouse/fasttest:$FROM_TAG
FROM clickhouse/fasttest:$FROM_TAG as builder
ENV CC=clang-${LLVM_VERSION}
ENV CXX=clang++-${LLVM_VERSION}
@ -29,3 +29,6 @@ RUN git clone https://github.com/tpoechtrager/cctools-port.git \
&& make install -j$(nproc) \
&& cd ../.. \
&& rm -rf cctools-port
FROM scratch
COPY --from=builder /cctools /cctools

View File

@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \
# lts / testing / prestable / etc
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
ARG VERSION="24.2.1.2248"
ARG VERSION="24.2.2.71"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
ARG DIRECT_DOWNLOAD_URLS=""

View File

@ -27,7 +27,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
ARG VERSION="24.2.1.2248"
ARG VERSION="24.2.2.71"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
# set non-empty deb_location_url url to create a docker image

View File

@ -174,7 +174,14 @@ function fuzz
mkdir -p /var/run/clickhouse-server
# NOTE: we use process substitution here to preserve keep $! as a pid of clickhouse-server
clickhouse-server --config-file db/config.xml --pid-file /var/run/clickhouse-server/clickhouse-server.pid -- --path db > server.log 2>&1 &
# server.log -> CH logs
# stderr.log -> Process logs (sanitizer)
clickhouse-server \
--config-file db/config.xml \
--pid-file /var/run/clickhouse-server/clickhouse-server.pid \
-- --path db \
--logger.console=0 \
--logger.log=server.log > stderr.log 2>&1 &
server_pid=$!
kill -0 $server_pid
@ -303,7 +310,7 @@ quit
if [ "$server_died" == 1 ]
then
# The server has died.
if ! rg --text -o 'Received signal.*|Logical error.*|Assertion.*failed|Failed assertion.*|.*runtime error: .*|.*is located.*|(SUMMARY|ERROR): [a-zA-Z]+Sanitizer:.*|.*_LIBCPP_ASSERT.*|.*Child process was terminated by signal 9.*' server.log > description.txt
if ! rg --text -o 'Received signal.*|Logical error.*|Assertion.*failed|Failed assertion.*|.*runtime error: .*|.*is located.*|(SUMMARY|ERROR): [a-zA-Z]+Sanitizer:.*|.*_LIBCPP_ASSERT.*|.*Child process was terminated by signal 9.*' server.log stderr.log > description.txt
then
echo "Lost connection to server. See the logs." > description.txt
fi
@ -427,6 +434,7 @@ p.links a { padding: 5px; margin: 3px; background: #FFF; line-height: 2; white-s
<a href="run.log">run.log</a>
<a href="fuzzer.log.zst">fuzzer.log.zst</a>
<a href="server.log.zst">server.log.zst</a>
<a href="stderr.log">stderr.log</a>
<a href="main.log">main.log</a>
<a href="dmesg.log">dmesg.log</a>
${CORE_LINK}

View File

@ -51,22 +51,22 @@ fi
config_logs_export_cluster /etc/clickhouse-server/config.d/system_logs_export.yaml
if [[ -n "$BUGFIX_VALIDATE_CHECK" ]] && [[ "$BUGFIX_VALIDATE_CHECK" -eq 1 ]]; then
sudo cat /etc/clickhouse-server/config.d/zookeeper.xml \
| sed "/<use_compression>1<\/use_compression>/d" \
> /etc/clickhouse-server/config.d/zookeeper.xml.tmp
sudo mv /etc/clickhouse-server/config.d/zookeeper.xml.tmp /etc/clickhouse-server/config.d/zookeeper.xml
sudo sed -i "/<use_compression>1<\/use_compression>/d" /etc/clickhouse-server/config.d/zookeeper.xml
# it contains some new settings, but we can safely remove it
rm /etc/clickhouse-server/config.d/handlers.yaml
rm /etc/clickhouse-server/users.d/s3_cache_new.xml
rm /etc/clickhouse-server/config.d/zero_copy_destructive_operations.xml
#todo: remove these after 24.3 released.
sudo sed -i "s|<object_storage_type>azure<|<object_storage_type>azure_blob_storage<|" /etc/clickhouse-server/config.d/azure_storage_conf.xml
#todo: remove these after 24.3 released.
sudo sed -i "s|<object_storage_type>local<|<object_storage_type>local_blob_storage<|" /etc/clickhouse-server/config.d/storage_conf.xml
function remove_keeper_config()
{
sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
| sed "/<$1>$2<\/$1>/d" \
> /etc/clickhouse-server/config.d/keeper_port.xml.tmp
sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
sudo sed -i "/<$1>$2<\/$1>/d" /etc/clickhouse-server/config.d/keeper_port.xml
}
# commit_logs_cache_size_threshold setting doesn't exist on some older versions
remove_keeper_config "commit_logs_cache_size_threshold" "[[:digit:]]\+"
@ -77,7 +77,7 @@ fi
if [ "$NUM_TRIES" -gt "1" ]; then
export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
export THREAD_FUZZER_SLEEP_TIME_US=100000
export THREAD_FUZZER_SLEEP_TIME_US_MAX=100000
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
@ -88,10 +88,10 @@ if [ "$NUM_TRIES" -gt "1" ]; then
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US_MAX=10000
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US_MAX=10000
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US_MAX=10000
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US_MAX=10000
mkdir -p /var/run/clickhouse-server
# simplest way to forward env variables to server
@ -101,25 +101,13 @@ else
fi
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
sudo cat /etc/clickhouse-server1/config.d/filesystem_caches_path.xml \
| sed "s|<filesystem_caches_path>/var/lib/clickhouse/filesystem_caches/</filesystem_caches_path>|<filesystem_caches_path>/var/lib/clickhouse/filesystem_caches_1/</filesystem_caches_path>|" \
> /etc/clickhouse-server1/config.d/filesystem_caches_path.xml.tmp
mv /etc/clickhouse-server1/config.d/filesystem_caches_path.xml.tmp /etc/clickhouse-server1/config.d/filesystem_caches_path.xml
sudo sed -i "s|<filesystem_caches_path>/var/lib/clickhouse/filesystem_caches/</filesystem_caches_path>|<filesystem_caches_path>/var/lib/clickhouse/filesystem_caches_1/</filesystem_caches_path>|" /etc/clickhouse-server1/config.d/filesystem_caches_path.xml
sudo cat /etc/clickhouse-server2/config.d/filesystem_caches_path.xml \
| sed "s|<filesystem_caches_path>/var/lib/clickhouse/filesystem_caches/</filesystem_caches_path>|<filesystem_caches_path>/var/lib/clickhouse/filesystem_caches_2/</filesystem_caches_path>|" \
> /etc/clickhouse-server2/config.d/filesystem_caches_path.xml.tmp
mv /etc/clickhouse-server2/config.d/filesystem_caches_path.xml.tmp /etc/clickhouse-server2/config.d/filesystem_caches_path.xml
sudo sed -i "s|<filesystem_caches_path>/var/lib/clickhouse/filesystem_caches/</filesystem_caches_path>|<filesystem_caches_path>/var/lib/clickhouse/filesystem_caches_2/</filesystem_caches_path>|" /etc/clickhouse-server2/config.d/filesystem_caches_path.xml
sudo cat /etc/clickhouse-server1/config.d/filesystem_caches_path.xml \
| sed "s|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches/</custom_cached_disks_base_directory>|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches_1/</custom_cached_disks_base_directory>|" \
> /etc/clickhouse-server1/config.d/filesystem_caches_path.xml.tmp
mv /etc/clickhouse-server1/config.d/filesystem_caches_path.xml.tmp /etc/clickhouse-server1/config.d/filesystem_caches_path.xml
sudo sed -i "s|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches/</custom_cached_disks_base_directory>|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches_1/</custom_cached_disks_base_directory>|" /etc/clickhouse-server1/config.d/filesystem_caches_path.xml
sudo cat /etc/clickhouse-server2/config.d/filesystem_caches_path.xml \
| sed "s|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches/</custom_cached_disks_base_directory>|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches_2/</custom_cached_disks_base_directory>|" \
> /etc/clickhouse-server2/config.d/filesystem_caches_path.xml.tmp
mv /etc/clickhouse-server2/config.d/filesystem_caches_path.xml.tmp /etc/clickhouse-server2/config.d/filesystem_caches_path.xml
sudo sed -i "s|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches/</custom_cached_disks_base_directory>|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches_2/</custom_cached_disks_base_directory>|" /etc/clickhouse-server2/config.d/filesystem_caches_path.xml
mkdir -p /var/run/clickhouse-server1
sudo chown clickhouse:clickhouse /var/run/clickhouse-server1

View File

@ -215,7 +215,7 @@ function check_server_start()
function check_logs_for_critical_errors()
{
# Sanitizer asserts
sed -n '/WARNING:.*anitizer/,/^$/p' >> /test_output/tmp
sed -n '/WARNING:.*anitizer/,/^$/p' /var/log/clickhouse-server/stderr.log >> /test_output/tmp
rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \
&& echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv \
|| echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv

View File

@ -27,7 +27,7 @@ install_packages package_folder
# and find more potential issues.
export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
export THREAD_FUZZER_SLEEP_TIME_US=100000
export THREAD_FUZZER_SLEEP_TIME_US_MAX=100000
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
@ -38,11 +38,11 @@ export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US_MAX=10000
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US_MAX=10000
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US_MAX=10000
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US_MAX=10000
export THREAD_FUZZER_EXPLICIT_SLEEP_PROBABILITY=0.01
export THREAD_FUZZER_EXPLICIT_MEMORY_EXCEPTION_PROBABILITY=0.01

View File

@ -67,10 +67,7 @@ configure
function remove_keeper_config()
{
sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
| sed "/<$1>$2<\/$1>/d" \
> /etc/clickhouse-server/config.d/keeper_port.xml.tmp
sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
sudo sed -i "/<$1>$2<\/$1>/d" /etc/clickhouse-server/config.d/keeper_port.xml
}
# async_replication setting doesn't exist on some older versions
@ -80,16 +77,10 @@ remove_keeper_config "async_replication" "1"
remove_keeper_config "create_if_not_exists" "[01]"
#todo: remove these after 24.3 released.
sudo cat /etc/clickhouse-server/config.d/azure_storage_conf.xml \
| sed "s|<object_storage_type>azure|<object_storage_type>azure_blob_storage|" \
> /etc/clickhouse-server/config.d/azure_storage_conf.xml.tmp
sudo mv /etc/clickhouse-server/config.d/azure_storage_conf.xml.tmp /etc/clickhouse-server/config.d/azure_storage_conf.xml
sudo sed -i "s|<object_storage_type>azure<|<object_storage_type>azure_blob_storage<|" /etc/clickhouse-server/config.d/azure_storage_conf.xml
#todo: remove these after 24.3 released.
sudo cat /etc/clickhouse-server/config.d/storage_conf.xml \
| sed "s|<object_storage_type>local|<object_storage_type>local_blob_storage|" \
> /etc/clickhouse-server/config.d/storage_conf.xml.tmp
sudo mv /etc/clickhouse-server/config.d/storage_conf.xml.tmp /etc/clickhouse-server/config.d/storage_conf.xml
sudo sed -i "s|<object_storage_type>local<|<object_storage_type>local_blob_storage<|" /etc/clickhouse-server/config.d/storage_conf.xml
# latest_logs_cache_size_threshold setting doesn't exist on some older versions
remove_keeper_config "latest_logs_cache_size_threshold" "[[:digit:]]\+"
@ -120,22 +111,13 @@ export ZOOKEEPER_FAULT_INJECTION=0
configure
# force_sync=false doesn't work correctly on some older versions
sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
| sed "s|<force_sync>false</force_sync>|<force_sync>true</force_sync>|" \
> /etc/clickhouse-server/config.d/keeper_port.xml.tmp
sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
sudo sed -i "s|<force_sync>false</force_sync>|<force_sync>true</force_sync>|" /etc/clickhouse-server/config.d/keeper_port.xml
#todo: remove these after 24.3 released.
sudo cat /etc/clickhouse-server/config.d/azure_storage_conf.xml \
| sed "s|<object_storage_type>azure|<object_storage_type>azure_blob_storage|" \
> /etc/clickhouse-server/config.d/azure_storage_conf.xml.tmp
sudo mv /etc/clickhouse-server/config.d/azure_storage_conf.xml.tmp /etc/clickhouse-server/config.d/azure_storage_conf.xml
sudo sed -i "s|<object_storage_type>azure<|<object_storage_type>azure_blob_storage<|" /etc/clickhouse-server/config.d/azure_storage_conf.xml
#todo: remove these after 24.3 released.
sudo cat /etc/clickhouse-server/config.d/storage_conf.xml \
| sed "s|<object_storage_type>local|<object_storage_type>local_blob_storage|" \
> /etc/clickhouse-server/config.d/storage_conf.xml.tmp
sudo mv /etc/clickhouse-server/config.d/storage_conf.xml.tmp /etc/clickhouse-server/config.d/storage_conf.xml
sudo sed -i "s|<object_storage_type>local<|<object_storage_type>local_blob_storage<|" /etc/clickhouse-server/config.d/storage_conf.xml
# async_replication setting doesn't exist on some older versions
remove_keeper_config "async_replication" "1"
@ -150,10 +132,7 @@ remove_keeper_config "latest_logs_cache_size_threshold" "[[:digit:]]\+"
remove_keeper_config "commit_logs_cache_size_threshold" "[[:digit:]]\+"
# But we still need default disk because some tables loaded only into it
sudo cat /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml \
| sed "s|<main><disk>s3</disk></main>|<main><disk>s3</disk></main><default><disk>default</disk></default>|" \
> /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp
mv /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
sudo sed -i "s|<main><disk>s3</disk></main>|<main><disk>s3</disk></main><default><disk>default</disk></default>|" /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
sudo chown clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
sudo chgrp clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
@ -256,10 +235,7 @@ then
fi
# Just in case previous version left some garbage in zk
sudo cat /etc/clickhouse-server/config.d/lost_forever_check.xml \
| sed "s|>1<|>0<|g" \
> /etc/clickhouse-server/config.d/lost_forever_check.xml.tmp
sudo mv /etc/clickhouse-server/config.d/lost_forever_check.xml.tmp /etc/clickhouse-server/config.d/lost_forever_check.xml
sudo sed -i "s|>1<|>0<|g" /etc/clickhouse-server/config.d/lost_forever_check.xml \
rm /etc/clickhouse-server/config.d/filesystem_caches_path.xml
start 500

View File

@ -0,0 +1,64 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v23.12.5.81-stable (a0fbe3ae813) FIXME as compared to v23.12.4.15-stable (4233d111d20)
#### Improvement
* Backported in [#60290](https://github.com/ClickHouse/ClickHouse/issues/60290): Copy S3 file GCP fallback to buffer copy in case GCP returned `Internal Error` with `GATEWAY_TIMEOUT` HTTP error code. [#60164](https://github.com/ClickHouse/ClickHouse/pull/60164) ([Maksim Kita](https://github.com/kitaisreal)).
* Backported in [#60830](https://github.com/ClickHouse/ClickHouse/issues/60830): Update tzdata to 2024a. [#60768](https://github.com/ClickHouse/ClickHouse/pull/60768) ([Raúl Marín](https://github.com/Algunenano)).
#### Build/Testing/Packaging Improvement
* Backported in [#59883](https://github.com/ClickHouse/ClickHouse/issues/59883): If you want to run initdb scripts every time when ClickHouse container is starting you shoud initialize environment varible CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS. [#59808](https://github.com/ClickHouse/ClickHouse/pull/59808) ([Alexander Nikolaev](https://github.com/AlexNik)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix_kql_issue_found_by_wingfuzz [#59626](https://github.com/ClickHouse/ClickHouse/pull/59626) ([Yong Wang](https://github.com/kashwy)).
* Fix error "Read beyond last offset" for AsynchronousBoundedReadBuffer [#59630](https://github.com/ClickHouse/ClickHouse/pull/59630) ([Vitaly Baranov](https://github.com/vitlibar)).
* Fix query start time on non initial queries [#59662](https://github.com/ClickHouse/ClickHouse/pull/59662) ([Raúl Marín](https://github.com/Algunenano)).
* rabbitmq: fix having neither acked nor nacked messages [#59775](https://github.com/ClickHouse/ClickHouse/pull/59775) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix parsing of partition expressions surrounded by parens [#59901](https://github.com/ClickHouse/ClickHouse/pull/59901) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)).
* Fix optimize_uniq_to_count removing the column alias [#60026](https://github.com/ClickHouse/ClickHouse/pull/60026) ([Raúl Marín](https://github.com/Algunenano)).
* Fix cosineDistance crash with Nullable [#60150](https://github.com/ClickHouse/ClickHouse/pull/60150) ([Raúl Marín](https://github.com/Algunenano)).
* Hide sensitive info for s3queue [#60233](https://github.com/ClickHouse/ClickHouse/pull/60233) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix deadlock in parallel parsing when lots of rows are skipped due to errors [#60516](https://github.com/ClickHouse/ClickHouse/pull/60516) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix_max_query_size_for_kql_compound_operator: [#60534](https://github.com/ClickHouse/ClickHouse/pull/60534) ([Yong Wang](https://github.com/kashwy)).
* Reduce the number of read rows from `system.numbers` [#60546](https://github.com/ClickHouse/ClickHouse/pull/60546) ([JackyWoo](https://github.com/JackyWoo)).
* Fix buffer overflow in CompressionCodecMultiple [#60731](https://github.com/ClickHouse/ClickHouse/pull/60731) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Remove nonsense from SQL/JSON [#60738](https://github.com/ClickHouse/ClickHouse/pull/60738) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Prevent setting custom metadata headers on unsupported multipart upload operations [#60748](https://github.com/ClickHouse/ClickHouse/pull/60748) ([Francisco J. Jurado Moreno](https://github.com/Beetelbrox)).
* Fix crash in arrayEnumerateRanked [#60764](https://github.com/ClickHouse/ClickHouse/pull/60764) ([Raúl Marín](https://github.com/Algunenano)).
* Fix crash when using input() in INSERT SELECT JOIN [#60765](https://github.com/ClickHouse/ClickHouse/pull/60765) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix crash with different allow_experimental_analyzer value in subqueries [#60770](https://github.com/ClickHouse/ClickHouse/pull/60770) ([Dmitry Novik](https://github.com/novikd)).
* Remove recursion when reading from S3 [#60849](https://github.com/ClickHouse/ClickHouse/pull/60849) ([Antonio Andelic](https://github.com/antonio2368)).
* Fix Keeper reconfig for standalone binary [#61233](https://github.com/ClickHouse/ClickHouse/pull/61233) ([Antonio Andelic](https://github.com/antonio2368)).
#### CI Fix or Improvement (changelog entry is not required)
* Backported in [#60767](https://github.com/ClickHouse/ClickHouse/issues/60767): Decoupled changes from [#60408](https://github.com/ClickHouse/ClickHouse/issues/60408). [#60553](https://github.com/ClickHouse/ClickHouse/pull/60553) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Backported in [#60582](https://github.com/ClickHouse/ClickHouse/issues/60582): Arm and amd docker build jobs use similar job names and thus overwrite job reports - aarch64 and amd64 suffixes added to fix this. [#60554](https://github.com/ClickHouse/ClickHouse/pull/60554) ([Max K.](https://github.com/maxknv)).
* Backported in [#61041](https://github.com/ClickHouse/ClickHouse/issues/61041): Debug and fix markreleaseready. [#60611](https://github.com/ClickHouse/ClickHouse/pull/60611) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Backported in [#61030](https://github.com/ClickHouse/ClickHouse/issues/61030): ... [#61022](https://github.com/ClickHouse/ClickHouse/pull/61022) ([Max K.](https://github.com/maxknv)).
* Backported in [#61224](https://github.com/ClickHouse/ClickHouse/issues/61224): ... [#61183](https://github.com/ClickHouse/ClickHouse/pull/61183) ([Han Fei](https://github.com/hanfei1991)).
* Backported in [#61190](https://github.com/ClickHouse/ClickHouse/issues/61190): ... [#61185](https://github.com/ClickHouse/ClickHouse/pull/61185) ([Max K.](https://github.com/maxknv)).
#### NO CL ENTRY
* NO CL ENTRY: 'Revert "Backport [#59798](https://github.com/ClickHouse/ClickHouse/issues/59798) to 23.12: CI: do not reuse builds on release branches"'. [#59979](https://github.com/ClickHouse/ClickHouse/pull/59979) ([Max K.](https://github.com/maxknv)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* CI: move ci-specifics from job scripts to ci.py [#58516](https://github.com/ClickHouse/ClickHouse/pull/58516) ([Max K.](https://github.com/maxknv)).
* Make ZooKeeper actually sequentialy consistent [#59735](https://github.com/ClickHouse/ClickHouse/pull/59735) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix special build reports in release branches [#59797](https://github.com/ClickHouse/ClickHouse/pull/59797) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* CI: do not reuse builds on release branches [#59798](https://github.com/ClickHouse/ClickHouse/pull/59798) ([Max K.](https://github.com/maxknv)).
* Fix mark release ready [#59994](https://github.com/ClickHouse/ClickHouse/pull/59994) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Ability to detect undead ZooKeeper sessions [#60044](https://github.com/ClickHouse/ClickHouse/pull/60044) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Detect io_uring in tests [#60373](https://github.com/ClickHouse/ClickHouse/pull/60373) ([Azat Khuzhin](https://github.com/azat)).
* Cancel PipelineExecutor properly in case of exception in spawnThreads [#60499](https://github.com/ClickHouse/ClickHouse/pull/60499) ([Kruglov Pavel](https://github.com/Avogar)).
* Remove broken test while we fix it [#60547](https://github.com/ClickHouse/ClickHouse/pull/60547) ([Raúl Marín](https://github.com/Algunenano)).
* Speed up cctools building [#61011](https://github.com/ClickHouse/ClickHouse/pull/61011) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).

View File

@ -0,0 +1,24 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v23.3.21.26-lts (d9672a3731f) FIXME as compared to v23.3.20.27-lts (cc974ba4f81)
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix reading from sparse columns after restart [#49660](https://github.com/ClickHouse/ClickHouse/pull/49660) ([Anton Popov](https://github.com/CurtizJ)).
* Fix buffer overflow in CompressionCodecMultiple [#60731](https://github.com/ClickHouse/ClickHouse/pull/60731) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Remove nonsense from SQL/JSON [#60738](https://github.com/ClickHouse/ClickHouse/pull/60738) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix crash in arrayEnumerateRanked [#60764](https://github.com/ClickHouse/ClickHouse/pull/60764) ([Raúl Marín](https://github.com/Algunenano)).
* Fix crash when using input() in INSERT SELECT JOIN [#60765](https://github.com/ClickHouse/ClickHouse/pull/60765) ([Kruglov Pavel](https://github.com/Avogar)).
* Remove recursion when reading from S3 [#60849](https://github.com/ClickHouse/ClickHouse/pull/60849) ([Antonio Andelic](https://github.com/antonio2368)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Cancel PipelineExecutor properly in case of exception in spawnThreads [#57104](https://github.com/ClickHouse/ClickHouse/pull/57104) ([Kruglov Pavel](https://github.com/Avogar)).
* Detect io_uring in tests [#60373](https://github.com/ClickHouse/ClickHouse/pull/60373) ([Azat Khuzhin](https://github.com/azat)).
* Cancel PipelineExecutor properly in case of exception in spawnThreads [#60499](https://github.com/ClickHouse/ClickHouse/pull/60499) ([Kruglov Pavel](https://github.com/Avogar)).

View File

@ -0,0 +1,30 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v23.8.11.28-lts (31879d2ab4c) FIXME as compared to v23.8.10.43-lts (a278225bba9)
#### Improvement
* Backported in [#60828](https://github.com/ClickHouse/ClickHouse/issues/60828): Update tzdata to 2024a. [#60768](https://github.com/ClickHouse/ClickHouse/pull/60768) ([Raúl Marín](https://github.com/Algunenano)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix buffer overflow in CompressionCodecMultiple [#60731](https://github.com/ClickHouse/ClickHouse/pull/60731) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Remove nonsense from SQL/JSON [#60738](https://github.com/ClickHouse/ClickHouse/pull/60738) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix crash in arrayEnumerateRanked [#60764](https://github.com/ClickHouse/ClickHouse/pull/60764) ([Raúl Marín](https://github.com/Algunenano)).
* Fix crash when using input() in INSERT SELECT JOIN [#60765](https://github.com/ClickHouse/ClickHouse/pull/60765) ([Kruglov Pavel](https://github.com/Avogar)).
* Remove recursion when reading from S3 [#60849](https://github.com/ClickHouse/ClickHouse/pull/60849) ([Antonio Andelic](https://github.com/antonio2368)).
#### NO CL ENTRY
* NO CL ENTRY: 'Use the current branch test-utils to build cctools'. [#61276](https://github.com/ClickHouse/ClickHouse/pull/61276) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Cancel PipelineExecutor properly in case of exception in spawnThreads [#57104](https://github.com/ClickHouse/ClickHouse/pull/57104) ([Kruglov Pavel](https://github.com/Avogar)).
* Detect io_uring in tests [#60373](https://github.com/ClickHouse/ClickHouse/pull/60373) ([Azat Khuzhin](https://github.com/azat)).
* Cancel PipelineExecutor properly in case of exception in spawnThreads [#60499](https://github.com/ClickHouse/ClickHouse/pull/60499) ([Kruglov Pavel](https://github.com/Avogar)).

View File

@ -0,0 +1,26 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v24.1.7.18-stable (90925babd78) FIXME as compared to v24.1.6.52-stable (fa09f677bc9)
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix deadlock in parallel parsing when lots of rows are skipped due to errors [#60516](https://github.com/ClickHouse/ClickHouse/pull/60516) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix_max_query_size_for_kql_compound_operator: [#60534](https://github.com/ClickHouse/ClickHouse/pull/60534) ([Yong Wang](https://github.com/kashwy)).
* Fix crash with different allow_experimental_analyzer value in subqueries [#60770](https://github.com/ClickHouse/ClickHouse/pull/60770) ([Dmitry Novik](https://github.com/novikd)).
* Fix Keeper reconfig for standalone binary [#61233](https://github.com/ClickHouse/ClickHouse/pull/61233) ([Antonio Andelic](https://github.com/antonio2368)).
#### CI Fix or Improvement (changelog entry is not required)
* Backported in [#61043](https://github.com/ClickHouse/ClickHouse/issues/61043): Debug and fix markreleaseready. [#60611](https://github.com/ClickHouse/ClickHouse/pull/60611) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Backported in [#61168](https://github.com/ClickHouse/ClickHouse/issues/61168): Just a preparation for the merge queue support. [#61099](https://github.com/ClickHouse/ClickHouse/pull/61099) ([Max K.](https://github.com/maxknv)).
* Backported in [#61192](https://github.com/ClickHouse/ClickHouse/issues/61192): ... [#61185](https://github.com/ClickHouse/ClickHouse/pull/61185) ([Max K.](https://github.com/maxknv)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Cancel PipelineExecutor properly in case of exception in spawnThreads [#60499](https://github.com/ClickHouse/ClickHouse/pull/60499) ([Kruglov Pavel](https://github.com/Avogar)).

View File

@ -0,0 +1,55 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v24.2.2.71-stable (9293d361e72) FIXME as compared to v24.2.1.2248-stable (891689a4150)
#### Improvement
* Backported in [#60834](https://github.com/ClickHouse/ClickHouse/issues/60834): Update tzdata to 2024a. [#60768](https://github.com/ClickHouse/ClickHouse/pull/60768) ([Raúl Marín](https://github.com/Algunenano)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* PartsSplitter invalid ranges for the same part [#60041](https://github.com/ClickHouse/ClickHouse/pull/60041) ([Maksim Kita](https://github.com/kitaisreal)).
* Try to avoid calculation of scalar subqueries for CREATE TABLE. [#60464](https://github.com/ClickHouse/ClickHouse/pull/60464) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix deadlock in parallel parsing when lots of rows are skipped due to errors [#60516](https://github.com/ClickHouse/ClickHouse/pull/60516) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix_max_query_size_for_kql_compound_operator: [#60534](https://github.com/ClickHouse/ClickHouse/pull/60534) ([Yong Wang](https://github.com/kashwy)).
* Reduce the number of read rows from `system.numbers` [#60546](https://github.com/ClickHouse/ClickHouse/pull/60546) ([JackyWoo](https://github.com/JackyWoo)).
* Don't output number tips for date types [#60577](https://github.com/ClickHouse/ClickHouse/pull/60577) ([Raúl Marín](https://github.com/Algunenano)).
* Fix buffer overflow in CompressionCodecMultiple [#60731](https://github.com/ClickHouse/ClickHouse/pull/60731) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Remove nonsense from SQL/JSON [#60738](https://github.com/ClickHouse/ClickHouse/pull/60738) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Prevent setting custom metadata headers on unsupported multipart upload operations [#60748](https://github.com/ClickHouse/ClickHouse/pull/60748) ([Francisco J. Jurado Moreno](https://github.com/Beetelbrox)).
* Fix crash in arrayEnumerateRanked [#60764](https://github.com/ClickHouse/ClickHouse/pull/60764) ([Raúl Marín](https://github.com/Algunenano)).
* Fix crash when using input() in INSERT SELECT JOIN [#60765](https://github.com/ClickHouse/ClickHouse/pull/60765) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix crash with different allow_experimental_analyzer value in subqueries [#60770](https://github.com/ClickHouse/ClickHouse/pull/60770) ([Dmitry Novik](https://github.com/novikd)).
* Remove recursion when reading from S3 [#60849](https://github.com/ClickHouse/ClickHouse/pull/60849) ([Antonio Andelic](https://github.com/antonio2368)).
* Fix multiple bugs in groupArraySorted [#61203](https://github.com/ClickHouse/ClickHouse/pull/61203) ([Raúl Marín](https://github.com/Algunenano)).
* Fix Keeper reconfig for standalone binary [#61233](https://github.com/ClickHouse/ClickHouse/pull/61233) ([Antonio Andelic](https://github.com/antonio2368)).
#### CI Fix or Improvement (changelog entry is not required)
* Backported in [#60758](https://github.com/ClickHouse/ClickHouse/issues/60758): Decoupled changes from [#60408](https://github.com/ClickHouse/ClickHouse/issues/60408). [#60553](https://github.com/ClickHouse/ClickHouse/pull/60553) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Backported in [#60706](https://github.com/ClickHouse/ClickHouse/issues/60706): Eliminates the need to provide input args to docker server jobs to clean yml files. [#60602](https://github.com/ClickHouse/ClickHouse/pull/60602) ([Max K.](https://github.com/maxknv)).
* Backported in [#61045](https://github.com/ClickHouse/ClickHouse/issues/61045): Debug and fix markreleaseready. [#60611](https://github.com/ClickHouse/ClickHouse/pull/60611) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Backported in [#60721](https://github.com/ClickHouse/ClickHouse/issues/60721): Fix build_report job so that it's defined by ci_config only (not yml file). [#60613](https://github.com/ClickHouse/ClickHouse/pull/60613) ([Max K.](https://github.com/maxknv)).
* Backported in [#60668](https://github.com/ClickHouse/ClickHouse/issues/60668): Do not await ci pending jobs on release branches decrease wait timeout to fit into gh job timeout. [#60652](https://github.com/ClickHouse/ClickHouse/pull/60652) ([Max K.](https://github.com/maxknv)).
* Backported in [#60863](https://github.com/ClickHouse/ClickHouse/issues/60863): Set limited number of builds for "special build check" report in backports. [#60850](https://github.com/ClickHouse/ClickHouse/pull/60850) ([Max K.](https://github.com/maxknv)).
* Backported in [#60946](https://github.com/ClickHouse/ClickHouse/issues/60946): ... [#60935](https://github.com/ClickHouse/ClickHouse/pull/60935) ([Max K.](https://github.com/maxknv)).
* Backported in [#60972](https://github.com/ClickHouse/ClickHouse/issues/60972): ... [#60952](https://github.com/ClickHouse/ClickHouse/pull/60952) ([Max K.](https://github.com/maxknv)).
* Backported in [#60980](https://github.com/ClickHouse/ClickHouse/issues/60980): ... [#60958](https://github.com/ClickHouse/ClickHouse/pull/60958) ([Max K.](https://github.com/maxknv)).
* Backported in [#61170](https://github.com/ClickHouse/ClickHouse/issues/61170): Just a preparation for the merge queue support. [#61099](https://github.com/ClickHouse/ClickHouse/pull/61099) ([Max K.](https://github.com/maxknv)).
* Backported in [#61181](https://github.com/ClickHouse/ClickHouse/issues/61181): ... [#61172](https://github.com/ClickHouse/ClickHouse/pull/61172) ([Max K.](https://github.com/maxknv)).
* Backported in [#61228](https://github.com/ClickHouse/ClickHouse/issues/61228): ... [#61183](https://github.com/ClickHouse/ClickHouse/pull/61183) ([Han Fei](https://github.com/hanfei1991)).
* Backported in [#61194](https://github.com/ClickHouse/ClickHouse/issues/61194): ... [#61185](https://github.com/ClickHouse/ClickHouse/pull/61185) ([Max K.](https://github.com/maxknv)).
* Backported in [#61244](https://github.com/ClickHouse/ClickHouse/issues/61244): ... [#61214](https://github.com/ClickHouse/ClickHouse/pull/61214) ([Max K.](https://github.com/maxknv)).
* Backported in [#61388](https://github.com/ClickHouse/ClickHouse/issues/61388):. [#61373](https://github.com/ClickHouse/ClickHouse/pull/61373) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* CI: make workflow yml abstract [#60421](https://github.com/ClickHouse/ClickHouse/pull/60421) ([Max K.](https://github.com/maxknv)).
* Cancel PipelineExecutor properly in case of exception in spawnThreads [#60499](https://github.com/ClickHouse/ClickHouse/pull/60499) ([Kruglov Pavel](https://github.com/Avogar)).
* General sanity in function `seriesOutliersDetectTukey` [#60535](https://github.com/ClickHouse/ClickHouse/pull/60535) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Speed up cctools building [#61011](https://github.com/ClickHouse/ClickHouse/pull/61011) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).

View File

@ -6,6 +6,11 @@ sidebar_label: JDBC
# JDBC
:::note
clickhouse-jdbc-bridge contains experimental codes and is no longer supported. It may contain reliability issues and security vulnerabilities. Use it at your own risk.
ClickHouse recommend using built-in table functions in ClickHouse which provide a better alternative for ad-hoc querying scenarios (Postgres, MySQL, MongoDB, etc).
:::
Allows ClickHouse to connect to external databases via [JDBC](https://en.wikipedia.org/wiki/Java_Database_Connectivity).
To implement the JDBC connection, ClickHouse uses the separate program [clickhouse-jdbc-bridge](https://github.com/ClickHouse/clickhouse-jdbc-bridge) that should run as a daemon.

View File

@ -946,96 +946,6 @@ You could change storage policy after table creation with [ALTER TABLE ... MODIF
The number of threads performing background moves of data parts can be changed by [background_move_pool_size](/docs/en/operations/server-configuration-parameters/settings.md/#background_move_pool_size) setting.
### Dynamic Storage
This example query shows how to attach a table stored at a URL and configure the
remote storage within the query. The web storage is not configured in the ClickHouse
configuration files; all the settings are in the CREATE/ATTACH query.
:::note
The example uses `type=web`, but any disk type can be configured as dynamic, even Local disk. Local disks require a path argument to be inside the server config parameter `custom_local_disks_base_directory`, which has no default, so set that also when using local disk.
:::
#### Example dynamic web storage
:::tip
A [demo dataset](https://github.com/ClickHouse/web-tables-demo) is hosted in GitHub. To prepare your own tables for web storage see the tool [clickhouse-static-files-uploader](/docs/en/operations/storing-data.md/#storing-data-on-webserver)
:::
In this `ATTACH TABLE` query the `UUID` provided matches the directory name of the data, and the endpoint is the URL for the raw GitHub content.
```sql
# highlight-next-line
ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
(
price UInt32,
date Date,
postcode1 LowCardinality(String),
postcode2 LowCardinality(String),
type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
is_new UInt8,
duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
addr1 String,
addr2 String,
street LowCardinality(String),
locality LowCardinality(String),
town LowCardinality(String),
district LowCardinality(String),
county LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY (postcode1, postcode2, addr1, addr2)
# highlight-start
SETTINGS disk = disk(
type=web,
endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
);
# highlight-end
```
### Nested Dynamic Storage
This example query builds on the above dynamic disk configuration and shows how to
use a local disk to cache data from a table stored at a URL. Neither the cache disk
nor the web storage is configured in the ClickHouse configuration files; both are
configured in the CREATE/ATTACH query settings.
In the settings highlighted below notice that the disk of `type=web` is nested within
the disk of `type=cache`.
```sql
ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
(
price UInt32,
date Date,
postcode1 LowCardinality(String),
postcode2 LowCardinality(String),
type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
is_new UInt8,
duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
addr1 String,
addr2 String,
street LowCardinality(String),
locality LowCardinality(String),
town LowCardinality(String),
district LowCardinality(String),
county LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY (postcode1, postcode2, addr1, addr2)
# highlight-start
SETTINGS disk = disk(
type=cache,
max_size='1Gi',
path='/var/lib/clickhouse/custom_disk_cache/',
disk=disk(
type=web,
endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
)
);
# highlight-end
```
### Details {#details}
In the case of `MergeTree` tables, data is getting to disk in different ways:
@ -1064,13 +974,11 @@ During this time, they are not moved to other volumes or disks. Therefore, until
User can assign new big parts to different disks of a [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures) volume in a balanced way using the [min_bytes_to_rebalance_partition_over_jbod](/docs/en/operations/settings/merge-tree-settings.md/#min-bytes-to-rebalance-partition-over-jbod) setting.
## Using S3 for Data Storage {#table_engine-mergetree-s3}
## Using External Storage for Data Storage {#table_engine-mergetree-s3}
:::note
Google Cloud Storage (GCS) is also supported using the type `s3`. See [GCS backed MergeTree](/docs/en/integrations/gcs).
:::
[MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) family table engines can store data to `S3`, `AzureBlobStorage`, `HDFS` using a disk with types `s3`, `azure_blob_storage`, `hdfs` accordingly. See [configuring external storage options](/docs/en/operations/storing-data.md/#configuring-external-storage) for more details.
`MergeTree` family table engines can store data to [S3](https://aws.amazon.com/s3/) using a disk with type `s3`.
Example for [S3](https://aws.amazon.com/s3/) as external storage using a disk with type `s3`.
Configuration markup:
``` xml
@ -1112,253 +1020,12 @@ Configuration markup:
</storage_configuration>
```
Also see [configuring external storage options](/docs/en/operations/storing-data.md/#configuring-external-storage).
:::note cache configuration
ClickHouse versions 22.3 through 22.7 use a different cache configuration, see [using local cache](/docs/en/operations/storing-data.md/#using-local-cache) if you are using one of those versions.
:::
### Configuring the S3 disk
Required parameters:
- `endpoint` — S3 endpoint URL in `path` or `virtual hosted` [styles](https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html). Endpoint URL should contain a bucket and root path to store data.
- `access_key_id` — S3 access key id.
- `secret_access_key` — S3 secret access key.
Optional parameters:
- `region` — S3 region name.
- `support_batch_delete` — This controls the check to see if batch deletes are supported. Set this to `false` when using Google Cloud Storage (GCS) as GCS does not support batch deletes and preventing the checks will prevent error messages in the logs.
- `use_environment_credentials` — Reads AWS credentials from the Environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN if they exist. Default value is `false`.
- `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Default value is `false`.
- `expiration_window_seconds` — Grace period for checking if expiration-based credentials have expired. Optional, default value is `120`.
- `proxy` — Proxy configuration for S3 endpoint. Each `uri` element inside `proxy` block should contain a proxy URL.
- `connect_timeout_ms` — Socket connect timeout in milliseconds. Default value is `10 seconds`.
- `request_timeout_ms` — Request timeout in milliseconds. Default value is `5 seconds`.
- `retry_attempts` — Number of retry attempts in case of failed request. Default value is `10`.
- `single_read_retries` — Number of retry attempts in case of connection drop during read. Default value is `4`.
- `min_bytes_for_seek` — Minimal number of bytes to use seek operation instead of sequential read. Default value is `1 Mb`.
- `metadata_path` — Path on local FS to store metadata files for S3. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.
- `skip_access_check` — If true, disk access checks will not be performed on disk start-up. Default value is `false`.
- `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be specified multiple times.
- `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set.
- `server_side_encryption_kms_key_id` - If specified, required headers for accessing S3 objects with [SSE-KMS encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) will be set. If an empty string is specified, the AWS managed S3 key will be used. Optional.
- `server_side_encryption_kms_encryption_context` - If specified alongside `server_side_encryption_kms_key_id`, the given encryption context header for SSE-KMS will be set. Optional.
- `server_side_encryption_kms_bucket_key_enabled` - If specified alongside `server_side_encryption_kms_key_id`, the header to enable S3 bucket keys for SSE-KMS will be set. Optional, can be `true` or `false`, defaults to nothing (matches the bucket-level setting).
- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited).
- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_put_rps`.
- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited).
- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`.
- `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
- `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
- `key_template` — Define the format with which the object keys are generated. By default, Clickhouse takes `root path` from `endpoint` option and adds random generated suffix. That suffix is a dir with 3 random symbols and a file name with 29 random symbols. With that option you have a full control how to the object keys are generated. Some usage scenarios require having random symbols in the prefix or in the middle of object key. For example: `[a-z]{3}-prefix-random/constant-part/random-middle-[a-z]{3}/random-suffix-[a-z]{29}`. The value is parsed with [`re2`](https://github.com/google/re2/wiki/Syntax). Only some subset of the syntax is supported. Check if your preferred format is supported before using that option. Disk isn't initialized if clickhouse is unable to generate a key by the value of `key_template`. It requires enabled feature flag [storage_metadata_write_full_object_key](/docs/en/operations/settings/settings#storage_metadata_write_full_object_key). It forbids declaring the `root path` in `endpoint` option. It requires definition of the option `key_compatibility_prefix`.
- `key_compatibility_prefix` — That option is required when option `key_template` is in use. In order to be able to read the objects keys which were stored in the metadata files with the metadata version lower that `VERSION_FULL_OBJECT_KEY`, the previous `root path` from the `endpoint` option should be set here.
### Configuring the cache
This is the cache configuration from above:
```xml
<s3_cache>
<type>cache</type>
<disk>s3</disk>
<path>/var/lib/clickhouse/disks/s3_cache/</path>
<max_size>10Gi</max_size>
</s3_cache>
```
These parameters define the cache layer:
- `type` — If a disk is of type `cache` it caches mark and index files in memory.
- `disk` — The name of the disk that will be cached.
Cache parameters:
- `path` — The path where metadata for the cache is stored.
- `max_size` — The size (amount of disk space) that the cache can grow to.
:::tip
There are several other cache parameters that you can use to tune your storage, see [using local cache](/docs/en/operations/storing-data.md/#using-local-cache) for the details.
:::
S3 disk can be configured as `main` or `cold` storage:
``` xml
<storage_configuration>
...
<disks>
<s3>
<type>s3</type>
<endpoint>https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/root-path/</endpoint>
<access_key_id>your_access_key_id</access_key_id>
<secret_access_key>your_secret_access_key</secret_access_key>
</s3>
</disks>
<policies>
<s3_main>
<volumes>
<main>
<disk>s3</disk>
</main>
</volumes>
</s3_main>
<s3_cold>
<volumes>
<main>
<disk>default</disk>
</main>
<external>
<disk>s3</disk>
</external>
</volumes>
<move_factor>0.2</move_factor>
</s3_cold>
</policies>
...
</storage_configuration>
```
In case of `cold` option a data can be moved to S3 if local disk free size will be smaller than `move_factor * disk_size` or by TTL move rule.
## Using Azure Blob Storage for Data Storage {#table_engine-mergetree-azure-blob-storage}
`MergeTree` family table engines can store data to [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/) using a disk with type `azure_blob_storage`.
As of February 2022, this feature is still a fresh addition, so expect that some Azure Blob Storage functionalities might be unimplemented.
Configuration markup:
``` xml
<storage_configuration>
...
<disks>
<blob_storage_disk>
<type>azure_blob_storage</type>
<storage_account_url>http://account.blob.core.windows.net</storage_account_url>
<container_name>container</container_name>
<account_name>account</account_name>
<account_key>pass123</account_key>
<metadata_path>/var/lib/clickhouse/disks/blob_storage_disk/</metadata_path>
<cache_path>/var/lib/clickhouse/disks/blob_storage_disk/cache/</cache_path>
<skip_access_check>false</skip_access_check>
</blob_storage_disk>
</disks>
...
</storage_configuration>
```
Connection parameters:
* `endpoint` — AzureBlobStorage endpoint URL with container & prefix. Optionally can contain account_name if the authentication method used needs it. (`http://account.blob.core.windows.net:{port}/[account_name]{container_name}/{data_prefix}`) or these parameters can be provided separately using storage_account_url, account_name & container. For specifying prefix, endpoint should be used.
* `endpoint_contains_account_name` - This flag is used to specify if endpoint contains account_name as it is only needed for certain authentication methods. (Default : true)
* `storage_account_url` - Required if endpoint is not specified, Azure Blob Storage account URL, like `http://account.blob.core.windows.net` or `http://azurite1:10000/devstoreaccount1`.
* `container_name` - Target container name, defaults to `default-container`.
* `container_already_exists` - If set to `false`, a new container `container_name` is created in the storage account, if set to `true`, disk connects to the container directly, and if left unset, disk connects to the account, checks if the container `container_name` exists, and creates it if it doesn't exist yet.
Authentication parameters (the disk will try all available methods **and** Managed Identity Credential):
* `connection_string` - For authentication using a connection string.
* `account_name` and `account_key` - For authentication using Shared Key.
Limit parameters (mainly for internal usage):
* `s3_max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage.
* `min_bytes_for_seek` - Limits the size of a seekable region.
* `max_single_read_retries` - Limits the number of attempts to read a chunk of data from Blob Storage.
* `max_single_download_retries` - Limits the number of attempts to download a readable buffer from Blob Storage.
* `thread_pool_size` - Limits the number of threads with which `IDiskRemote` is instantiated.
* `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object.
Other parameters:
* `metadata_path` - Path on local FS to store metadata files for Blob Storage. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.
* `skip_access_check` - If true, disk access checks will not be performed on disk start-up. Default value is `false`.
* `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
* `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
Examples of working configurations can be found in integration tests directory (see e.g. [test_merge_tree_azure_blob_storage](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml) or [test_azure_blob_storage_zero_copy_replication](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml)).
:::note Zero-copy replication is not ready for production
Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.
:::
## HDFS storage {#hdfs-storage}
In this sample configuration:
- the disk is of type `hdfs`
- the data is hosted at `hdfs://hdfs1:9000/clickhouse/`
```xml
<clickhouse>
<storage_configuration>
<disks>
<hdfs>
<type>hdfs</type>
<endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
<skip_access_check>true</skip_access_check>
</hdfs>
<hdd>
<type>local</type>
<path>/</path>
</hdd>
</disks>
<policies>
<hdfs>
<volumes>
<main>
<disk>hdfs</disk>
</main>
<external>
<disk>hdd</disk>
</external>
</volumes>
</hdfs>
</policies>
</storage_configuration>
</clickhouse>
```
## Web storage (read-only) {#web-storage}
Web storage can be used for read-only purposes. An example use is for hosting sample
data, or for migrating data.
:::tip
Storage can also be configured temporarily within a query, if a web dataset is not expected
to be used routinely, see [dynamic storage](#dynamic-storage) and skip editing the
configuration file.
:::
In this sample configuration:
- the disk is of type `web`
- the data is hosted at `http://nginx:80/test1/`
- a cache on local storage is used
```xml
<clickhouse>
<storage_configuration>
<disks>
<web>
<type>web</type>
<endpoint>http://nginx:80/test1/</endpoint>
</web>
<cached_web>
<type>cache</type>
<disk>web</disk>
<path>cached_web_cache/</path>
<max_size>100000000</max_size>
</cached_web>
</disks>
<policies>
<web>
<volumes>
<main>
<disk>web</disk>
</main>
</volumes>
</web>
<cached_web>
<volumes>
<main>
<disk>cached_web</disk>
</main>
</volumes>
</cached_web>
</policies>
</storage_configuration>
</clickhouse>
```
## Virtual Columns {#virtual-columns}
- `_part` — Name of a part.

View File

@ -21,3 +21,79 @@ When restarting a server, data disappears from the table and the table becomes e
Normally, using this table engine is not justified. However, it can be used for tests, and for tasks where maximum speed is required on a relatively small number of rows (up to approximately 100,000,000).
The Memory engine is used by the system for temporary tables with external query data (see the section “External data for processing a query”), and for implementing `GLOBAL IN` (see the section “IN operators”).
Upper and lower bounds can be specified to limit Memory engine table size, effectively allowing it to act as a circular buffer (see [Engine Parameters](#engine-parameters)).
## Engine Parameters {#engine-parameters}
- `min_bytes_to_keep` — Minimum bytes to keep when memory table is size-capped.
- Default value: `0`
- Requires `max_bytes_to_keep`
- `max_bytes_to_keep` — Maximum bytes to keep within memory table where oldest rows are deleted on each insertion (i.e circular buffer). Max bytes can exceed the stated limit if the oldest batch of rows to remove falls under the `min_bytes_to_keep` limit when adding a large block.
- Default value: `0`
- `min_rows_to_keep` — Minimum rows to keep when memory table is size-capped.
- Default value: `0`
- Requires `max_rows_to_keep`
- `max_rows_to_keep` — Maximum rows to keep within memory table where oldest rows are deleted on each insertion (i.e circular buffer). Max rows can exceed the stated limit if the oldest batch of rows to remove falls under the `min_rows_to_keep` limit when adding a large block.
- Default value: `0`
## Usage {#usage}
**Initialize settings**
``` sql
CREATE TABLE memory (i UInt32) ENGINE = Memory SETTINGS min_rows_to_keep = 100, max_rows_to_keep = 1000;
```
**Note:** Both `bytes` and `rows` capping parameters can be set at the same time, however, the lower bounds of `max` and `min` will be adhered to.
## Examples {#examples}
``` sql
CREATE TABLE memory (i UInt32) ENGINE = Memory SETTINGS min_bytes_to_keep = 4096, max_bytes_to_keep = 16384;
/* 1. testing oldest block doesn't get deleted due to min-threshold - 3000 rows */
INSERT INTO memory SELECT * FROM numbers(0, 1600); -- 8'192 bytes
/* 2. adding block that doesn't get deleted */
INSERT INTO memory SELECT * FROM numbers(1000, 100); -- 1'024 bytes
/* 3. testing oldest block gets deleted - 9216 bytes - 1100 */
INSERT INTO memory SELECT * FROM numbers(9000, 1000); -- 8'192 bytes
/* 4. checking a very large block overrides all */
INSERT INTO memory SELECT * FROM numbers(9000, 10000); -- 65'536 bytes
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'memory' and database = currentDatabase();
```
``` text
┌─total_bytes─┬─total_rows─┐
│ 65536 │ 10000 │
└─────────────┴────────────┘
```
also, for rows:
``` sql
CREATE TABLE memory (i UInt32) ENGINE = Memory SETTINGS min_rows_to_keep = 4000, max_rows_to_keep = 10000;
/* 1. testing oldest block doesn't get deleted due to min-threshold - 3000 rows */
INSERT INTO memory SELECT * FROM numbers(0, 1600); -- 1'600 rows
/* 2. adding block that doesn't get deleted */
INSERT INTO memory SELECT * FROM numbers(1000, 100); -- 100 rows
/* 3. testing oldest block gets deleted - 9216 bytes - 1100 */
INSERT INTO memory SELECT * FROM numbers(9000, 1000); -- 1'000 rows
/* 4. checking a very large block overrides all */
INSERT INTO memory SELECT * FROM numbers(9000, 10000); -- 10'000 rows
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'memory' and database = currentDatabase();
```
``` text
┌─total_bytes─┬─total_rows─┐
│ 65536 │ 10000 │
└─────────────┴────────────┘
```

View File

@ -55,7 +55,7 @@ CREATE TABLE criteo_log (
) ENGINE = Log;
```
Download the data:
Insert the data:
``` bash
$ for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done

View File

@ -248,6 +248,9 @@ Some of the files might not download fully. Check the file sizes and re-download
``` bash
$ curl -O https://datasets.clickhouse.com/trips_mergetree/partitions/trips_mergetree.tar
# Validate the checksum
$ md5sum trips_mergetree.tar
# Checksum should be equal to: f3b8d469b41d9a82da064ded7245d12c
$ tar xvf trips_mergetree.tar -C /var/lib/clickhouse # path to ClickHouse data directory
$ # check permissions of unpacked data, fix if required
$ sudo service clickhouse-server restart

View File

@ -0,0 +1,293 @@
---
slug: /en/getting-started/example-datasets/tw-weather
sidebar_label: Taiwan Historical Weather Datasets
sidebar_position: 1
description: 131 million rows of weather observation data for the last 128 yrs
---
# Taiwan Historical Weather Datasets
This dataset contains historical meteorological observations measurements for the last 128 years. Each row is a measurement for a point in date time and weather station.
The origin of this dataset is available [here](https://github.com/Raingel/historical_weather) and the list of weather station numbers can be found [here](https://github.com/Raingel/weather_station_list).
> The sources of meteorological datasets include the meteorological stations that are established by the Central Weather Administration (station code is beginning with C0, C1, and 4) and the agricultural meteorological stations belonging to the Council of Agriculture (station code other than those mentioned above):
- StationId
- MeasuredDate, the observation time
- StnPres, the station air pressure
- SeaPres, the sea level pressure
- Td, the dew point temperature
- RH, the relative humidity
- Other elements where available
## Downloading the data
- A [pre-processed version](#pre-processed-data) of the data for the ClickHouse, which has been cleaned, re-structured, and enriched. This dataset covers the years from 1896 to 2023.
- [Download the original raw data](#original-raw-data) and convert to the format required by ClickHouse. Users wanting to add their own columns may wish to explore or complete their approaches.
### Pre-processed data
The dataset has also been re-structured from a measurement per line to a row per weather station id and measured date, i.e.
```csv
StationId,MeasuredDate,StnPres,Tx,RH,WS,WD,WSGust,WDGust,Precp,GloblRad,TxSoil0cm,TxSoil5cm,TxSoil20cm,TxSoil50cm,TxSoil100cm,SeaPres,Td,PrecpHour,SunShine,TxSoil10cm,EvapA,Visb,UVI,Cloud Amount,TxSoil30cm,TxSoil200cm,TxSoil300cm,TxSoil500cm,VaporPressure
C0X100,2016-01-01 01:00:00,1022.1,16.1,72,1.1,8.0,,,,,,,,,,,,,,,,,,,,,,,
C0X100,2016-01-01 02:00:00,1021.6,16.0,73,1.2,358.0,,,,,,,,,,,,,,,,,,,,,,,
C0X100,2016-01-01 03:00:00,1021.3,15.8,74,1.5,353.0,,,,,,,,,,,,,,,,,,,,,,,
C0X100,2016-01-01 04:00:00,1021.2,15.8,74,1.7,8.0,,,,,,,,,,,,,,,,,,,,,,,
```
It is easy to query and ensure that the resulting table has less sparse and some elements are null because they're not available to be measured in this weather station.
This dataset is available in the following Google CloudStorage location. Either download the dataset to your local filesystem (and insert them with the ClickHouse client) or insert them directly into the ClickHouse (see [Inserting from URL](#inserting-from-url)).
To download:
```bash
wget https://storage.googleapis.com/taiwan-weather-observaiton-datasets/preprocessed_weather_daily_1896_2023.tar.gz
# Option: Validate the checksum
md5sum preprocessed_weather_daily_1896_2023.tar.gz
# Checksum should be equal to: 11b484f5bd9ddafec5cfb131eb2dd008
tar -xzvf preprocessed_weather_daily_1896_2023.tar.gz
daily_weather_preprocessed_1896_2023.csv
# Option: Validate the checksum
md5sum daily_weather_preprocessed_1896_2023.csv
# Checksum should be equal to: 1132248c78195c43d93f843753881754
```
### Original raw data
The following details are about the steps to download the original raw data to transform and convert as you want.
#### Download
To download the original raw data:
```bash
mkdir tw_raw_weather_data && cd tw_raw_weather_data
wget https://storage.googleapis.com/taiwan-weather-observaiton-datasets/raw_data_weather_daily_1896_2023.tar.gz
# Option: Validate the checksum
md5sum raw_data_weather_daily_1896_2023.tar.gz
# Checksum should be equal to: b66b9f137217454d655e3004d7d1b51a
tar -xzvf raw_data_weather_daily_1896_2023.tar.gz
466920_1928.csv
466920_1929.csv
466920_1930.csv
466920_1931.csv
...
# Option: Validate the checksum
cat *.csv | md5sum
# Checksum should be equal to: b26db404bf84d4063fac42e576464ce1
```
#### Retrieve the Taiwan weather stations
```bash
wget -O weather_sta_list.csv https://github.com/Raingel/weather_station_list/raw/main/data/weather_sta_list.csv
# Option: Convert the UTF-8-BOM to UTF-8 encoding
sed -i '1s/^\xEF\xBB\xBF//' weather_sta_list.csv
```
## Create table schema
Create the MergeTree table in ClickHouse (from the ClickHouse client).
```bash
CREATE TABLE tw_weather_data (
StationId String null,
MeasuredDate DateTime64,
StnPres Float64 null,
SeaPres Float64 null,
Tx Float64 null,
Td Float64 null,
RH Float64 null,
WS Float64 null,
WD Float64 null,
WSGust Float64 null,
WDGust Float64 null,
Precp Float64 null,
PrecpHour Float64 null,
SunShine Float64 null,
GloblRad Float64 null,
TxSoil0cm Float64 null,
TxSoil5cm Float64 null,
TxSoil10cm Float64 null,
TxSoil20cm Float64 null,
TxSoil50cm Float64 null,
TxSoil100cm Float64 null,
TxSoil30cm Float64 null,
TxSoil200cm Float64 null,
TxSoil300cm Float64 null,
TxSoil500cm Float64 null,
VaporPressure Float64 null,
UVI Float64 null,
"Cloud Amount" Float64 null,
EvapA Float64 null,
Visb Float64 null
)
ENGINE = MergeTree
ORDER BY (MeasuredDate);
```
## Inserting into ClickHouse
### Inserting from local file
Data can be inserted from a local file as follows (from the ClickHouse client):
```sql
INSERT INTO tw_weather_data FROM INFILE '/path/to/daily_weather_preprocessed_1896_2023.csv'
```
where `/path/to` represents the specific user path to the local file on the disk.
And the sample response output is as follows after inserting data into the ClickHouse:
```response
Query id: 90e4b524-6e14-4855-817c-7e6f98fbeabb
Ok.
131985329 rows in set. Elapsed: 71.770 sec. Processed 131.99 million rows, 10.06 GB (1.84 million rows/s., 140.14 MB/s.)
Peak memory usage: 583.23 MiB.
```
### Inserting from URL
```sql
INSERT INTO tw_weather_data SELECT *
FROM url('https://storage.googleapis.com/taiwan-weather-observaiton-datasets/daily_weather_preprocessed_1896_2023.csv', 'CSVWithNames')
```
To know how to speed this up, please see our blog post on [tuning large data loads](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part2).
## Check data rows and sizes
1. Let's see how many rows are inserted:
```sql
SELECT formatReadableQuantity(count())
FROM tw_weather_data;
```
```response
┌─formatReadableQuantity(count())─┐
│ 131.99 million │
└─────────────────────────────────┘
```
2. Let's see how much disk space are used for this table:
```sql
SELECT
formatReadableSize(sum(bytes)) AS disk_size,
formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed_size
FROM system.parts
WHERE (`table` = 'tw_weather_data') AND active
```
```response
┌─disk_size─┬─uncompressed_size─┐
│ 2.13 GiB │ 32.94 GiB │
└───────────┴───────────────────┘
```
## Sample queries
### Q1: Retrieve the highest dew point temperature for each weather station in the specific year
```sql
SELECT
StationId,
max(Td) AS max_td
FROM tw_weather_data
WHERE (year(MeasuredDate) = 2023) AND (Td IS NOT NULL)
GROUP BY StationId
┌─StationId─┬─max_td─┐
│ 466940 │ 1 │
│ 467300 │ 1 │
│ 467540 │ 1 │
│ 467490 │ 1 │
│ 467080 │ 1 │
│ 466910 │ 1 │
│ 467660 │ 1 │
│ 467270 │ 1 │
│ 467350 │ 1 │
│ 467571 │ 1 │
│ 466920 │ 1 │
│ 467650 │ 1 │
│ 467550 │ 1 │
│ 467480 │ 1 │
│ 467610 │ 1 │
│ 467050 │ 1 │
│ 467590 │ 1 │
│ 466990 │ 1 │
│ 467060 │ 1 │
│ 466950 │ 1 │
│ 467620 │ 1 │
│ 467990 │ 1 │
│ 466930 │ 1 │
│ 467110 │ 1 │
│ 466881 │ 1 │
│ 467410 │ 1 │
│ 467441 │ 1 │
│ 467420 │ 1 │
│ 467530 │ 1 │
│ 466900 │ 1 │
└───────────┴────────┘
30 rows in set. Elapsed: 0.045 sec. Processed 6.41 million rows, 187.33 MB (143.92 million rows/s., 4.21 GB/s.)
```
### Q2: Raw data fetching with the specific duration time range, fields and weather station
```sql
SELECT
StnPres,
SeaPres,
Tx,
Td,
RH,
WS,
WD,
WSGust,
WDGust,
Precp,
PrecpHour
FROM tw_weather_data
WHERE (StationId = 'C0UB10') AND (MeasuredDate >= '2023-12-23') AND (MeasuredDate < '2023-12-24')
ORDER BY MeasuredDate ASC
LIMIT 10
```
```response
┌─StnPres─┬─SeaPres─┬───Tx─┬───Td─┬─RH─┬──WS─┬──WD─┬─WSGust─┬─WDGust─┬─Precp─┬─PrecpHour─┐
│ 1029.5 │ ᴺᵁᴸᴸ │ 11.8 │ ᴺᵁᴸᴸ │ 78 │ 2.7 │ 271 │ 5.5 │ 275 │ -99.8 │ -99.8 │
│ 1029.8 │ ᴺᵁᴸᴸ │ 12.3 │ ᴺᵁᴸᴸ │ 78 │ 2.7 │ 289 │ 5.5 │ 308 │ -99.8 │ -99.8 │
│ 1028.6 │ ᴺᵁᴸᴸ │ 12.3 │ ᴺᵁᴸᴸ │ 79 │ 2.3 │ 251 │ 6.1 │ 289 │ -99.8 │ -99.8 │
│ 1028.2 │ ᴺᵁᴸᴸ │ 13 │ ᴺᵁᴸᴸ │ 75 │ 4.3 │ 312 │ 7.5 │ 316 │ -99.8 │ -99.8 │
│ 1027.8 │ ᴺᵁᴸᴸ │ 11.1 │ ᴺᵁᴸᴸ │ 89 │ 7.1 │ 310 │ 11.6 │ 322 │ -99.8 │ -99.8 │
│ 1027.8 │ ᴺᵁᴸᴸ │ 11.6 │ ᴺᵁᴸᴸ │ 90 │ 3.1 │ 269 │ 10.7 │ 295 │ -99.8 │ -99.8 │
│ 1027.9 │ ᴺᵁᴸᴸ │ 12.3 │ ᴺᵁᴸᴸ │ 89 │ 4.7 │ 296 │ 8.1 │ 310 │ -99.8 │ -99.8 │
│ 1028.2 │ ᴺᵁᴸᴸ │ 12.2 │ ᴺᵁᴸᴸ │ 94 │ 2.5 │ 246 │ 7.1 │ 283 │ -99.8 │ -99.8 │
│ 1028.4 │ ᴺᵁᴸᴸ │ 12.5 │ ᴺᵁᴸᴸ │ 94 │ 3.1 │ 265 │ 4.8 │ 297 │ -99.8 │ -99.8 │
│ 1028.3 │ ᴺᵁᴸᴸ │ 13.6 │ ᴺᵁᴸᴸ │ 91 │ 1.2 │ 273 │ 4.4 │ 256 │ -99.8 │ -99.8 │
└─────────┴─────────┴──────┴──────┴────┴─────┴─────┴────────┴────────┴───────┴───────────┘
10 rows in set. Elapsed: 0.009 sec. Processed 91.70 thousand rows, 2.33 MB (9.67 million rows/s., 245.31 MB/s.)
```
## Credits
We would like to acknowledge the efforts of the Central Weather Administration and Agricultural Meteorological Observation Network (Station) of the Council of Agriculture for preparing, cleaning, and distributing this dataset. We appreciate your efforts.
Ou, J.-H., Kuo, C.-H., Wu, Y.-F., Lin, G.-C., Lee, M.-H., Chen, R.-K., Chou, H.-P., Wu, H.-Y., Chu, S.-C., Lai, Q.-J., Tsai, Y.-C., Lin, C.-C., Kuo, C.-C., Liao, C.-T., Chen, Y.-N., Chu, Y.-W., Chen, C.-Y., 2023. Application-oriented deep learning model for early warning of rice blast in Taiwan. Ecological Informatics 73, 101950. https://doi.org/10.1016/j.ecoinf.2022.101950 [13/12/2022]

View File

@ -178,7 +178,7 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va
- `--password` The password. Default value: empty string.
- `--ask-password` - Prompt the user to enter a password.
- `--query, -q` The query to process when using non-interactive mode. `--query` can be specified multiple times, e.g. `--query "SELECT 1" --query "SELECT 2"`. Cannot be used simultaneously with `--queries-file`.
- `--queries-file` file path with queries to execute. `--queries-file` can be specified multiple times, e.g. `--query queries1.sql --query queries2.sql`. Cannot be used simultaneously with `--query`.
- `--queries-file` file path with queries to execute. `--queries-file` can be specified multiple times, e.g. `--queries-file queries1.sql --queries-file queries2.sql`. Cannot be used simultaneously with `--query`.
- `--multiquery, -n` If specified, multiple queries separated by semicolons can be listed after the `--query` option. For convenience, it is also possible to omit `--query` and pass the queries directly after `--multiquery`.
- `--multiline, -m` If specified, allow multiline queries (do not send the query on Enter).
- `--database, -d` Select the current default database. Default value: the current database from the server settings (default by default).

View File

@ -95,9 +95,11 @@ which is equal to
## Substituting Configuration {#substitution}
The config can also define “substitutions”. If an element has the `incl` attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is `/etc/metrika.xml`. This can be changed in the [include_from](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-include_from) element in the server config. The substitution values are specified in `/clickhouse/substitution_name` elements in this file. If a substitution specified in `incl` does not exist, it is recorded in the log. To prevent ClickHouse from logging missing substitutions, specify the `optional="true"` attribute (for example, settings for [macros](../operations/server-configuration-parameters/settings.md#macros)).
The config can define substitutions. There are two types of substitutions:
If you want to replace an entire element with a substitution use `include` as the element name.
- If an element has the `incl` attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is `/etc/metrika.xml`. This can be changed in the [include_from](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-include_from) element in the server config. The substitution values are specified in `/clickhouse/substitution_name` elements in this file. If a substitution specified in `incl` does not exist, it is recorded in the log. To prevent ClickHouse from logging missing substitutions, specify the `optional="true"` attribute (for example, settings for [macros](../operations/server-configuration-parameters/settings.md#macros)).
- If you want to replace an entire element with a substitution, use `include` as the element name. Substitutions can also be performed from ZooKeeper by specifying attribute `from_zk = "/path/to/node"`. In this case, the element value is replaced with the contents of the Zookeeper node at `/path/to/node`. This also works with you store an entire XML subtree as a Zookeeper node, it will be fully inserted into the source element.
XML substitution example:
@ -114,7 +116,7 @@ XML substitution example:
</clickhouse>
```
Substitutions can also be performed from ZooKeeper. To do this, specify the attribute `from_zk = "/path/to/node"`. The element value is replaced with the contents of the node at `/path/to/node` in ZooKeeper. You can also put an entire XML subtree on the ZooKeeper node, and it will be fully inserted into the source element.
If you want to merge the substituting content with the existing configuration instead of appending you can use attribute `merge="true"`, for example: `<include from_zk="/some_path" merge="true">`. In this case, the existing configuration will be merged with the content from the substitution and the existing configuration settings will be replaced with values from substitution.
## Encrypting and Hiding Configuration {#encryption}

View File

@ -933,9 +933,9 @@ Hard limit is configured via system tools
## database_atomic_delay_before_drop_table_sec {#database_atomic_delay_before_drop_table_sec}
Sets the delay before remove table data in seconds. If the query has `SYNC` modifier, this setting is ignored.
The delay before a table data is dropped in seconds. If the `DROP TABLE` query has a `SYNC` modifier, this setting is ignored.
Default value: `480` (8 minute).
Default value: `480` (8 minutes).
## database_catalog_unused_dir_hide_timeout_sec {#database_catalog_unused_dir_hide_timeout_sec}

View File

@ -4337,6 +4337,18 @@ Possible values:
Default value: `0`.
## function_locate_has_mysql_compatible_argument_order {#function-locate-has-mysql-compatible-argument-order}
Controls the order of arguments in function [locate](../../sql-reference/functions/string-search-functions.md#locate).
Possible values:
- 0 — Function `locate` accepts arguments `(haystack, needle[, start_pos])`.
- 1 — Function `locate` accepts arguments `(needle, haystack, [, start_pos])` (MySQL-compatible behavior)
Default value: `1`.
## date_time_overflow_behavior {#date_time_overflow_behavior}
Defines the behavior when [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md) or integers are converted into Date, Date32, DateTime or DateTime64 but the value cannot be represented in the result type.

View File

@ -5,26 +5,416 @@ sidebar_label: "External Disks for Storing Data"
title: "External Disks for Storing Data"
---
Data, processed in ClickHouse, is usually stored in the local file system — on the same machine with the ClickHouse server. That requires large-capacity disks, which can be expensive enough. To avoid that you can store the data remotely — on [Amazon S3](https://aws.amazon.com/s3/) disks or in the Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)).
Data, processed in ClickHouse, is usually stored in the local file system — on the same machine with the ClickHouse server. That requires large-capacity disks, which can be expensive enough. To avoid that you can store the data remotely. Various storages are supported:
1. [Amazon S3](https://aws.amazon.com/s3/) object storage.
2. The Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html))
3. [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs).
To work with data stored on `Amazon S3` disks use [S3](/docs/en/engines/table-engines/integrations/s3.md) table engine, and to work with data in the Hadoop Distributed File System — [HDFS](/docs/en/engines/table-engines/integrations/hdfs.md) table engine.
:::note ClickHouse also has support for external table engines, which are different from external storage option described on this page as they allow to read data stored in some general file format (like Parquet), while on this page we are describing storage configuration for ClickHouse `MergeTree` family or `Log` family tables.
1. to work with data stored on `Amazon S3` disks, use [S3](/docs/en/engines/table-engines/integrations/s3.md) table engine.
2. to work with data in the Hadoop Distributed File System — [HDFS](/docs/en/engines/table-engines/integrations/hdfs.md) table engine.
3. to work with data stored in Azure Blob Storage use [AzureBlobStorage](/docs/en/engines/table-engines/integrations/azureBlobStorage.md) table engine.
:::
To load data from a web server with static files use a disk with type [web](#storing-data-on-webserver).
## Configuring external storage {#configuring-external-storage}
## Configuring HDFS {#configuring-hdfs}
[MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) and [Log](/docs/en/engines/table-engines/log-family/log.md) family table engines can store data to `S3`, `AzureBlobStorage`, `HDFS` using a disk with types `s3`, `azure_blob_storage`, `hdfs` accordingly.
[MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) and [Log](/docs/en/engines/table-engines/log-family/log.md) family table engines can store data to HDFS using a disk with type `HDFS`.
Disk configuration requires:
1. `type` section, equal to one of `s3`, `azure_blob_storage`, `hdfs`, `local_blob_storage`, `web`.
2. Configuration of a specific external storage type.
Configuration markup:
Starting from 24.1 clickhouse version, it is possible to use a new configuration option.
It requires to specify:
1. `type` equal to `object_storage`
2. `object_storage_type`, equal to one of `s3`, `azure_blob_storage` (or just `azure` from `24.3`), `hdfs`, `local_blob_storage` (or just `local` from `24.3`), `web`.
Optionally, `metadata_type` can be specified (it is equal to `local` by default), but it can also be set to `plain`, `web`.
Usage of `plain` metadata type is described in [plain storage section](/docs/en/operations/storing-data.md/#storing-data-on-webserver), `web` metadata type can be used only with `web` object storage type, `local` metadata type stores metadata files locally (each metadata files contains mapping to files in object storage and some additional meta information about them).
E.g. configuration option
``` xml
<s3>
<type>s3</type>
<endpoint>https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/</endpoint>
<use_invironment_credentials>1</use_invironment_credentials>
</s3>
```
is equal to configuration (from `24.1`):
``` xml
<s3>
<type>object_storage</type>
<object_storage_type>s3</object_storage_type>
<metadata_type>local</metadata_type>
<endpoint>https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/</endpoint>
<use_invironment_credentials>1</use_invironment_credentials>
</s3>
```
Configuration
``` xml
<s3_plain>
<type>s3_plain</type>
<endpoint>https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/</endpoint>
<use_invironment_credentials>1</use_invironment_credentials>
</s3_plain>
```
is equal to
``` xml
<s3_plain>
<type>object_storage</type>
<object_storage_type>s3</object_storage_type>
<metadata_type>plain</metadata_type>
<endpoint>https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/</endpoint>
<use_invironment_credentials>1</use_invironment_credentials>
</s3_plain>
```
Example of full storage configuration will look like:
``` xml
<clickhouse>
<storage_configuration>
<disks>
<s3>
<type>s3</type>
<endpoint>https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/</endpoint>
<use_invironment_credentials>1</use_invironment_credentials>
</s3>
</disks>
<policies>
<s3>
<volumes>
<main>
<disk>s3</disk>
</main>
</volumes>
</s3>
</policies>
</storage_configuration>
</clickhouse>
```
Starting with 24.1 clickhouse version, it can also look like:
``` xml
<clickhouse>
<storage_configuration>
<disks>
<s3>
<type>object_storage</type>
<object_storage_type>s3</object_storage_type>
<metadata_type>local</metadata_type>
<endpoint>https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/</endpoint>
<use_invironment_credentials>1</use_invironment_credentials>
</s3>
</disks>
<policies>
<s3>
<volumes>
<main>
<disk>s3</disk>
</main>
</volumes>
</s3>
</policies>
</storage_configuration>
</clickhouse>
```
In order to make a specific kind of storage a default option for all `MergeTree` tables add the following section to configuration file:
``` xml
<clickhouse>
<merge_tree>
<storage_policy>s3</storage_policy>
</merge_tree>
</clickhouse>
```
If you want to configure a specific storage policy only to specific table, you can define it in settings while creating the table:
``` sql
CREATE TABLE test (a Int32, b String)
ENGINE = MergeTree() ORDER BY a
SETTINGS storage_policy = 's3';
```
You can also use `disk` instead of `storage_policy`. In this case it is not requires to have `storage_policy` section in configuration file, only `disk` section would be enough.
``` sql
CREATE TABLE test (a Int32, b String)
ENGINE = MergeTree() ORDER BY a
SETTINGS disk = 's3';
```
## Dynamic Configuration {#dynamic-configuration}
There is also a possibility to specify storage configuration without a predefined disk in configuration in a configuration file, but can be configured in the `CREATE`/`ATTACH` query settings.
The following example query builds on the above dynamic disk configuration and shows how to use a local disk to cache data from a table stored at a URL.
```sql
ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
(
price UInt32,
date Date,
postcode1 LowCardinality(String),
postcode2 LowCardinality(String),
type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
is_new UInt8,
duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
addr1 String,
addr2 String,
street LowCardinality(String),
locality LowCardinality(String),
town LowCardinality(String),
district LowCardinality(String),
county LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY (postcode1, postcode2, addr1, addr2)
# highlight-start
SETTINGS disk = disk(
type=web,
endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
);
# highlight-end
```
The example below adds cache to external storage.
```sql
ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
(
price UInt32,
date Date,
postcode1 LowCardinality(String),
postcode2 LowCardinality(String),
type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
is_new UInt8,
duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
addr1 String,
addr2 String,
street LowCardinality(String),
locality LowCardinality(String),
town LowCardinality(String),
district LowCardinality(String),
county LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY (postcode1, postcode2, addr1, addr2)
# highlight-start
SETTINGS disk = disk(
type=cache,
max_size='1Gi',
path='/var/lib/clickhouse/custom_disk_cache/',
disk=disk(
type=web,
endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
)
);
# highlight-end
```
In the settings highlighted below notice that the disk of `type=web` is nested within
the disk of `type=cache`.
:::note
The example uses `type=web`, but any disk type can be configured as dynamic, even Local disk. Local disks require a path argument to be inside the server config parameter `custom_local_disks_base_directory`, which has no default, so set that also when using local disk.
:::
A combination of config-based configuration and sql-defined configuration is also possible:
```sql
ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
(
price UInt32,
date Date,
postcode1 LowCardinality(String),
postcode2 LowCardinality(String),
type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
is_new UInt8,
duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
addr1 String,
addr2 String,
street LowCardinality(String),
locality LowCardinality(String),
town LowCardinality(String),
district LowCardinality(String),
county LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY (postcode1, postcode2, addr1, addr2)
# highlight-start
SETTINGS disk = disk(
type=cache,
max_size='1Gi',
path='/var/lib/clickhouse/custom_disk_cache/',
disk=disk(
type=web,
endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
)
);
# highlight-end
```
where `web` is a from a server configuration file:
``` xml
<storage_configuration>
<disks>
<web>
<type>web</type>
<endpoint>'https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'</endpoint>
</web>
</disks>
</storage_configuration>
```
### Using S3 Storage {#s3-storage}
Required parameters:
- `endpoint` — S3 endpoint URL in `path` or `virtual hosted` [styles](https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html). Endpoint URL should contain a bucket and root path to store data.
- `access_key_id` — S3 access key id.
- `secret_access_key` — S3 secret access key.
Optional parameters:
- `region` — S3 region name.
- `support_batch_delete` — This controls the check to see if batch deletes are supported. Set this to `false` when using Google Cloud Storage (GCS) as GCS does not support batch deletes and preventing the checks will prevent error messages in the logs.
- `use_environment_credentials` — Reads AWS credentials from the Environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN if they exist. Default value is `false`.
- `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Default value is `false`.
- `expiration_window_seconds` — Grace period for checking if expiration-based credentials have expired. Optional, default value is `120`.
- `proxy` — Proxy configuration for S3 endpoint. Each `uri` element inside `proxy` block should contain a proxy URL.
- `connect_timeout_ms` — Socket connect timeout in milliseconds. Default value is `10 seconds`.
- `request_timeout_ms` — Request timeout in milliseconds. Default value is `5 seconds`.
- `retry_attempts` — Number of retry attempts in case of failed request. Default value is `10`.
- `single_read_retries` — Number of retry attempts in case of connection drop during read. Default value is `4`.
- `min_bytes_for_seek` — Minimal number of bytes to use seek operation instead of sequential read. Default value is `1 Mb`.
- `metadata_path` — Path on local FS to store metadata files for S3. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.
- `skip_access_check` — If true, disk access checks will not be performed on disk start-up. Default value is `false`.
- `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be specified multiple times.
- `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set.
- `server_side_encryption_kms_key_id` - If specified, required headers for accessing S3 objects with [SSE-KMS encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) will be set. If an empty string is specified, the AWS managed S3 key will be used. Optional.
- `server_side_encryption_kms_encryption_context` - If specified alongside `server_side_encryption_kms_key_id`, the given encryption context header for SSE-KMS will be set. Optional.
- `server_side_encryption_kms_bucket_key_enabled` - If specified alongside `server_side_encryption_kms_key_id`, the header to enable S3 bucket keys for SSE-KMS will be set. Optional, can be `true` or `false`, defaults to nothing (matches the bucket-level setting).
- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited).
- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_put_rps`.
- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited).
- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`.
- `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
- `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
- `key_template` — Define the format with which the object keys are generated. By default, Clickhouse takes `root path` from `endpoint` option and adds random generated suffix. That suffix is a dir with 3 random symbols and a file name with 29 random symbols. With that option you have a full control how to the object keys are generated. Some usage scenarios require having random symbols in the prefix or in the middle of object key. For example: `[a-z]{3}-prefix-random/constant-part/random-middle-[a-z]{3}/random-suffix-[a-z]{29}`. The value is parsed with [`re2`](https://github.com/google/re2/wiki/Syntax). Only some subset of the syntax is supported. Check if your preferred format is supported before using that option. Disk isn't initialized if clickhouse is unable to generate a key by the value of `key_template`. It requires enabled feature flag [storage_metadata_write_full_object_key](/docs/en/operations/settings/settings#storage_metadata_write_full_object_key). It forbids declaring the `root path` in `endpoint` option. It requires definition of the option `key_compatibility_prefix`.
- `key_compatibility_prefix` — That option is required when option `key_template` is in use. In order to be able to read the objects keys which were stored in the metadata files with the metadata version lower that `VERSION_FULL_OBJECT_KEY`, the previous `root path` from the `endpoint` option should be set here.
:::note
Google Cloud Storage (GCS) is also supported using the type `s3`. See [GCS backed MergeTree](/docs/en/integrations/gcs).
:::
### Using Plain Storage {#plain-storage}
In `22.10` a new disk type `s3_plain` was introduced, which provides a write-once storage. Configuration parameters are the same as for `s3` disk type.
Unlike `s3` disk type, it stores data as is, e.g. instead of randomly-generated blob names, it uses normal file names (the same way as clickhouse stores files on local disk) and does not store any metadata locally, e.g. it is derived from data on `s3`.
This disk type allows to keep a static version of the table, as it does not allow executing merges on the existing data and does not allow inserting of new data.
A use case for this disk type is to create backups on it, which can be done via `BACKUP TABLE data TO Disk('plain_disk_name', 'backup_name')`. Afterwards you can do `RESTORE TABLE data AS data_restored FROM Disk('plain_disk_name', 'backup_name')` or using `ATTACH TABLE data (...) ENGINE = MergeTree() SETTINGS disk = 'plain_disk_name'`.
Configuration:
``` xml
<s3_plain>
<type>s3_plain</type>
<endpoint>https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/</endpoint>
<use_invironment_credentials>1</use_invironment_credentials>
</s3_plain>
```
Starting from `24.1` it is possible configure any object storage disk (`s3`, `azure`, `hdfs`, `local`) using `plain` metadata type.
Configuration:
``` xml
<s3_plain>
<type>object_storage</type>
<object_storage_type>azure</object_storage_type>
<metadata_type>plain</metadata_type>
<endpoint>https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/</endpoint>
<use_invironment_credentials>1</use_invironment_credentials>
</s3_plain>
```
### Using Azure Blob Storage {#azure-blob-storage}
`MergeTree` family table engines can store data to [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/) using a disk with type `azure_blob_storage`.
As of February 2022, this feature is still a fresh addition, so expect that some Azure Blob Storage functionalities might be unimplemented.
Configuration markup:
``` xml
<storage_configuration>
...
<disks>
<blob_storage_disk>
<type>azure_blob_storage</type>
<storage_account_url>http://account.blob.core.windows.net</storage_account_url>
<container_name>container</container_name>
<account_name>account</account_name>
<account_key>pass123</account_key>
<metadata_path>/var/lib/clickhouse/disks/blob_storage_disk/</metadata_path>
<cache_path>/var/lib/clickhouse/disks/blob_storage_disk/cache/</cache_path>
<skip_access_check>false</skip_access_check>
</blob_storage_disk>
</disks>
...
</storage_configuration>
```
Connection parameters:
* `storage_account_url` - **Required**, Azure Blob Storage account URL, like `http://account.blob.core.windows.net` or `http://azurite1:10000/devstoreaccount1`.
* `container_name` - Target container name, defaults to `default-container`.
* `container_already_exists` - If set to `false`, a new container `container_name` is created in the storage account, if set to `true`, disk connects to the container directly, and if left unset, disk connects to the account, checks if the container `container_name` exists, and creates it if it doesn't exist yet.
Authentication parameters (the disk will try all available methods **and** Managed Identity Credential):
* `connection_string` - For authentication using a connection string.
* `account_name` and `account_key` - For authentication using Shared Key.
Limit parameters (mainly for internal usage):
* `s3_max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage.
* `min_bytes_for_seek` - Limits the size of a seekable region.
* `max_single_read_retries` - Limits the number of attempts to read a chunk of data from Blob Storage.
* `max_single_download_retries` - Limits the number of attempts to download a readable buffer from Blob Storage.
* `thread_pool_size` - Limits the number of threads with which `IDiskRemote` is instantiated.
* `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object.
Other parameters:
* `metadata_path` - Path on local FS to store metadata files for Blob Storage. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.
* `skip_access_check` - If true, disk access checks will not be performed on disk start-up. Default value is `false`.
* `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
* `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
Examples of working configurations can be found in integration tests directory (see e.g. [test_merge_tree_azure_blob_storage](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml) or [test_azure_blob_storage_zero_copy_replication](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml)).
:::note Zero-copy replication is not ready for production
Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.
:::
## Using HDFS storage {#hdfs-storage}
In this sample configuration:
- the disk is of type `hdfs`
- the data is hosted at `hdfs://hdfs1:9000/clickhouse/`
```xml
<clickhouse>
<storage_configuration>
<disks>
<hdfs>
<type>hdfs</type>
<endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
<skip_access_check>true</skip_access_check>
</hdfs>
<hdd>
<type>local</type>
<path>/</path>
</hdd>
</disks>
<policies>
<hdfs>
@ -32,26 +422,17 @@ Configuration markup:
<main>
<disk>hdfs</disk>
</main>
<external>
<disk>hdd</disk>
</external>
</volumes>
</hdfs>
</policies>
</storage_configuration>
<merge_tree>
<min_bytes_for_wide_part>0</min_bytes_for_wide_part>
</merge_tree>
</clickhouse>
```
Required parameters:
- `endpoint` — HDFS endpoint URL in `path` format. Endpoint URL should contain a root path to store data.
Optional parameters:
- `min_bytes_for_seek` — The minimal number of bytes to use seek operation instead of sequential read. Default value: `1 Mb`.
## Using Virtual File System for Data Encryption {#encrypted-virtual-file-system}
### Using Data Encryption {#encrypted-virtual-file-system}
You can encrypt the data stored on [S3](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-s3), or [HDFS](#configuring-hdfs) external disks, or on a local disk. To turn on the encryption mode, in the configuration file you must define a disk with the type `encrypted` and choose a disk on which the data will be saved. An `encrypted` disk ciphers all written files on the fly, and when you read files from an `encrypted` disk it deciphers them automatically. So you can work with an `encrypted` disk like with a normal one.
@ -112,7 +493,7 @@ Example of disk configuration:
</clickhouse>
```
## Using local cache {#using-local-cache}
### Using local cache {#using-local-cache}
It is possible to configure local cache over disks in storage configuration starting from version 22.3.
For versions 22.3 - 22.7 cache is supported only for `s3` disk type. For versions >= 22.8 cache is supported for any disk type: S3, Azure, Local, Encrypted, etc.
@ -275,23 +656,92 @@ Cache profile events:
- `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds`
## Using in-memory cache (userspace page cache) {#userspace-page-cache}
The File Cache described above stores cached data in local files. Alternatively, object-store-based disks can be configured to use "Userspace Page Cache", which is RAM-only. Userspace page cache is recommended only if file cache can't be used for some reason, e.g. if the machine doesn't have a local disk at all. Note that file cache effectively uses RAM for caching too, since the OS caches contents of local files.
To enable userspace page cache for disks that don't use file cache, use setting `use_page_cache_for_disks_without_file_cache`.
By default, on Linux, the userspace page cache will use all available memory, similar to the OS page cache. In tools like `top` and `ps`, the clickhouse server process will typically show resident set size near 100% of the machine's RAM - this is normal, and most of this memory is actually reclaimable by the OS on memory pressure (`MADV_FREE`). This behavior can be disabled with server setting `page_cache_use_madv_free = 0`, making the userspace page cache just use a fixed amount of memory `page_cache_size` with no special interaction with the OS. On Mac OS, `page_cache_use_madv_free` is always disabled as it doesn't have lazy `MADV_FREE`.
Unfortunately, `page_cache_use_madv_free` makes it difficult to tell if the server is close to running out of memory, since the RSS metric becomes useless. Async metric `UnreclaimableRSS` shows the amount of physical memory used by the server, excluding the memory reclaimable by the OS: `select value from system.asynchronous_metrics where metric = 'UnreclaimableRSS'`. Use it for monitoring instead of RSS. This metric is only available if `page_cache_use_madv_free` is enabled.
## Storing Data on Web Server {#storing-data-on-webserver}
There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`.
### Using static Web storage (read-only) {#web-storage}
This is a read-only disk. Its data is only read and never modified. A new table is loaded to this disk via `ATTACH TABLE` query (see example below). Local disk is not actually used, each `SELECT` query will result in a `http` request to fetch required data. All modification of the table data will result in an exception, i.e. the following types of queries are not allowed: [CREATE TABLE](/docs/en/sql-reference/statements/create/table.md), [ALTER TABLE](/docs/en/sql-reference/statements/alter/index.md), [RENAME TABLE](/docs/en/sql-reference/statements/rename.md/#misc_operations-rename_table), [DETACH TABLE](/docs/en/sql-reference/statements/detach.md) and [TRUNCATE TABLE](/docs/en/sql-reference/statements/truncate.md).
Web storage can be used for read-only purposes. An example use is for hosting sample data, or for migrating data.
There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`.
Web server storage is supported only for the [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) and [Log](/docs/en/engines/table-engines/log-family/log.md) engine families. To access the data stored on a `web` disk, use the [storage_policy](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#terms) setting when executing the query. For example, `ATTACH TABLE table_web UUID '{}' (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'web'`.
In this sample configuration:
- the disk is of type `web`
- the data is hosted at `http://nginx:80/test1/`
- a cache on local storage is used
```xml
<clickhouse>
<storage_configuration>
<disks>
<web>
<type>web</type>
<endpoint>http://nginx:80/test1/</endpoint>
</web>
<cached_web>
<type>cache</type>
<disk>web</disk>
<path>cached_web_cache/</path>
<max_size>100000000</max_size>
</cached_web>
</disks>
<policies>
<web>
<volumes>
<main>
<disk>web</disk>
</main>
</volumes>
</web>
<cached_web>
<volumes>
<main>
<disk>cached_web</disk>
</main>
</volumes>
</cached_web>
</policies>
</storage_configuration>
</clickhouse>
```
:::tip
Storage can also be configured temporarily within a query, if a web dataset is not expected
to be used routinely, see [dynamic configuration](#dynamic-configuration) and skip editing the
configuration file.
:::
:::tip
A [demo dataset](https://github.com/ClickHouse/web-tables-demo) is hosted in GitHub. To prepare your own tables for web storage see the tool [clickhouse-static-files-uploader](/docs/en/operations/storing-data.md/#storing-data-on-webserver)
:::
In this `ATTACH TABLE` query the `UUID` provided matches the directory name of the data, and the endpoint is the URL for the raw GitHub content.
```sql
# highlight-next-line
ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
(
price UInt32,
date Date,
postcode1 LowCardinality(String),
postcode2 LowCardinality(String),
type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
is_new UInt8,
duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
addr1 String,
addr2 String,
street LowCardinality(String),
locality LowCardinality(String),
town LowCardinality(String),
district LowCardinality(String),
county LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY (postcode1, postcode2, addr1, addr2)
# highlight-start
SETTINGS disk = disk(
type=web,
endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
);
# highlight-end
```
A ready test case. You need to add this configuration to config:
@ -487,7 +937,7 @@ If URL is not reachable on disk load when the server is starting up tables, then
Use [http_max_single_read_retries](/docs/en/operations/settings/settings.md/#http-max-single-read-retries) setting to limit the maximum number of retries during a single HTTP read.
## Zero-copy Replication (not ready for production) {#zero-copy}
### Zero-copy Replication (not ready for production) {#zero-copy}
Zero-copy replication is possible, but not recommended, with `S3` and `HDFS` disks. Zero-copy replication means that if the data is stored remotely on several machines and needs to be synchronized, then only the metadata is replicated (paths to the data parts), but not the data itself.

View File

@ -26,7 +26,9 @@ priority: 0
is_active: 0
active_children: 0
dequeued_requests: 67
canceled_requests: 0
dequeued_cost: 4692272
canceled_cost: 0
busy_periods: 63
vruntime: 938454.1999999989
system_vruntime: ᴺᵁᴸᴸ
@ -54,7 +56,9 @@ Columns:
- `is_active` (`UInt8`) - Whether this node is currently active - has resource requests to be dequeued and constraints satisfied.
- `active_children` (`UInt64`) - The number of children in active state.
- `dequeued_requests` (`UInt64`) - The total number of resource requests dequeued from this node.
- `canceled_requests` (`UInt64`) - The total number of resource requests canceled from this node.
- `dequeued_cost` (`UInt64`) - The sum of costs (e.g. size in bytes) of all requests dequeued from this node.
- `canceled_cost` (`UInt64`) - The sum of costs (e.g. size in bytes) of all requests canceled from this node.
- `busy_periods` (`UInt64`) - The total number of deactivations of this node.
- `vruntime` (`Nullable(Float64)`) - For children of `fair` nodes only. Virtual runtime of a node used by SFQ algorithm to select the next child to process in a max-min fair manner.
- `system_vruntime` (`Nullable(Float64)`) - For `fair` nodes only. Virtual runtime showing `vruntime` of the last processed resource request. Used during child activation as the new value of `vruntime`.

View File

@ -36,9 +36,9 @@ You can explicitly set a time zone for `DateTime`-type columns when creating a t
The [clickhouse-client](../../interfaces/cli.md) applies the server time zone by default if a time zone isnt explicitly set when initializing the data type. To use the client time zone, run `clickhouse-client` with the `--use_client_time_zone` parameter.
ClickHouse outputs values depending on the value of the [date_time_output_format](../../operations/settings/settings-formats.md#date_time_output_format) setting. `YYYY-MM-DD hh:mm:ss` text format by default. Additionally, you can change the output with the [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime) function.
ClickHouse outputs values depending on the value of the [date_time_output_format](../../operations/settings/settings.md#settings-date_time_output_format) setting. `YYYY-MM-DD hh:mm:ss` text format by default. Additionally, you can change the output with the [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime) function.
When inserting data into ClickHouse, you can use different formats of date and time strings, depending on the value of the [date_time_input_format](../../operations/settings/settings-formats.md#date_time_input_format) setting.
When inserting data into ClickHouse, you can use different formats of date and time strings, depending on the value of the [date_time_input_format](../../operations/settings/settings.md#settings-date_time_input_format) setting.
## Examples
@ -147,8 +147,8 @@ Time shifts for multiple days. Some pacific islands changed their timezone offse
- [Type conversion functions](../../sql-reference/functions/type-conversion-functions.md)
- [Functions for working with dates and times](../../sql-reference/functions/date-time-functions.md)
- [Functions for working with arrays](../../sql-reference/functions/array-functions.md)
- [The `date_time_input_format` setting](../../operations/settings/settings-formats.md#date_time_input_format)
- [The `date_time_output_format` setting](../../operations/settings/settings-formats.md#date_time_output_format)
- [The `date_time_input_format` setting](../../operations/settings/settings-formats.md#settings-date_time_input_format)
- [The `date_time_output_format` setting](../../operations/settings/settings-formats.md#settings-date_time_output_format)
- [The `timezone` server configuration parameter](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone)
- [The `session_timezone` setting](../../operations/settings/settings.md#session_timezone)
- [Operators for working with dates and times](../../sql-reference/operators/index.md#operators-datetime)

View File

@ -4,6 +4,67 @@ sidebar_label: Polygons
title: "Functions for Working with Polygons"
---
## WKT
Returns a WKT (Well Known Text) geometric object from various [Geo Data Types](../../data-types/geo.md). Supported WKT objects are:
- POINT
- POLYGON
- MULTIPOLYGON
**Syntax**
```sql
WKT(geo_data)
```
**Parameters**
`geo_data` can be one of the following [Geo Data Types](../../data-types/geo.md) or their underlying primitive types:
- [Point](../../data-types/geo.md#point)
- [Ring](../../data-types/geo.md#ring)
- [Polygon](../../data-types/geo.md#polygon)
- [MultiPolygon](../../data-types/geo.md#multipolygon)
**Returned value**
- WKT geometric object `POINT` is returned for a Point.
- WKT geometric object `POLYGON` is returned for a Polygon
- WKT geometric object `MULTIPOLYGON` is returned for a MultiPolygon.
**Examples**
POINT from tuple:
```sql
SELECT wkt((0., 0.));
```
```response
POINT(0 0)
```
POLYGON from an array of tuples or an array of tuple arrays:
```sql
SELECT wkt([(0., 0.), (10., 0.), (10., 10.), (0., 10.)]);
```
```response
POLYGON((0 0,10 0,10 10,0 10))
```
MULTIPOLYGON from an array of multi-dimensional tuple arrays:
```sql
SELECT wkt([[[(0., 0.), (10., 0.), (10., 10.), (0., 10.)], [(4., 4.), (5., 4.), (5., 5.), (4., 5.)]], [[(-10., -10.), (-10., -9.), (-9., 10.)]]]);
```
```response
MULTIPOLYGON(((0 0,10 0,10 10,0 10,0 0),(4 4,5 4,5 5,4 5,4 4)),((-10 -10,-10 -9,-9 10,-10 -10)))
```
## readWKTMultiPolygon
Converts a WKT (Well Known Text) MultiPolygon into a MultiPolygon type.

View File

@ -30,7 +30,6 @@ position(haystack, needle[, start_pos])
Alias:
- `position(needle IN haystack)`
- `locate(haystack, needle[, start_pos])`.
**Arguments**
@ -49,7 +48,7 @@ If substring `needle` is empty, these rules apply:
- if `start_pos >= 1` and `start_pos <= length(haystack) + 1`: return `start_pos`
- otherwise: return `0`
The same rules also apply to functions `positionCaseInsensitive`, `positionUTF8` and `positionCaseInsensitiveUTF8`
The same rules also apply to functions `locate`, `positionCaseInsensitive`, `positionUTF8` and `positionCaseInsensitiveUTF8`.
Type: `Integer`.
@ -114,6 +113,21 @@ SELECT
└─────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┘
```
## locate
Like [position](#position) but with arguments `haystack` and `locate` switched.
The behavior of this function depends on the ClickHouse version:
- in versions < v24.3, `locate` was an alias of function `position` and accepted arguments `(haystack, needle[, start_pos])`.
- in versions >= 24.3,, `locate` is an individual function (for better compatibility with MySQL) and accepts arguments `(needle, haystack[, start_pos])`. The previous behavior
can be restored using setting [function_locate_has_mysql_compatible_argument_order = false](../../operations/settings/settings.md#function-locate-has-mysql-compatible-argument-order);
**Syntax**
``` sql
locate(needle, haystack[, start_pos])
```
## positionCaseInsensitive
Like [position](#position) but searches case-insensitively.

View File

@ -350,6 +350,7 @@ ALTER TABLE mt DELETE IN PARTITION ID '2' WHERE p = 2;
You can specify the partition expression in `ALTER ... PARTITION` queries in different ways:
- As a value from the `partition` column of the `system.parts` table. For example, `ALTER TABLE visits DETACH PARTITION 201901`.
- Using the keyword `ALL`. It can be used only with DROP/DETACH/ATTACH. For example, `ALTER TABLE visits ATTACH PARTITION ALL`.
- As a tuple of expressions or constants that matches (in types) the table partitioning keys tuple. In the case of a single element partitioning key, the expression should be wrapped in the `tuple (...)` function. For example, `ALTER TABLE visits DETACH PARTITION tuple(toYYYYMM(toDate('2019-01-25')))`.
- Using the partition ID. Partition ID is a string identifier of the partition (human-readable, if possible) that is used as the names of partitions in the file system and in ZooKeeper. The partition ID must be specified in the `PARTITION ID` clause, in a single quotes. For example, `ALTER TABLE visits DETACH PARTITION ID '201901'`.
- In the [ALTER ATTACH PART](#alter_attach-partition) and [DROP DETACHED PART](#alter_drop-detached) query, to specify the name of a part, use string literal with a value from the `name` column of the [system.detached_parts](/docs/en/operations/system-tables/detached_parts.md/#system_tables-detached_parts) table. For example, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`.

View File

@ -13,13 +13,6 @@ a system table called `system.dropped_tables`.
If you have a materialized view without a `TO` clause associated with the dropped table, then you will also have to UNDROP the inner table of that view.
:::note
UNDROP TABLE is experimental. To use it add this setting:
```sql
set allow_experimental_undrop_table_query = 1;
```
:::
:::tip
Also see [DROP TABLE](/docs/en/sql-reference/statements/drop.md)
:::
@ -32,60 +25,53 @@ UNDROP TABLE [db.]name [UUID '<uuid>'] [ON CLUSTER cluster]
**Example**
``` sql
set allow_experimental_undrop_table_query = 1;
```
```sql
CREATE TABLE undropMe
CREATE TABLE tab
(
`id` UInt8
)
ENGINE = MergeTree
ORDER BY id
```
ORDER BY id;
DROP TABLE tab;
```sql
DROP TABLE undropMe
```
```sql
SELECT *
FROM system.dropped_tables
FORMAT Vertical
FORMAT Vertical;
```
```response
Row 1:
──────
index: 0
database: default
table: undropMe
table: tab
uuid: aa696a1a-1d70-4e60-a841-4c80827706cc
engine: MergeTree
metadata_dropped_path: /var/lib/clickhouse/metadata_dropped/default.undropMe.aa696a1a-1d70-4e60-a841-4c80827706cc.sql
metadata_dropped_path: /var/lib/clickhouse/metadata_dropped/default.tab.aa696a1a-1d70-4e60-a841-4c80827706cc.sql
table_dropped_time: 2023-04-05 14:12:12
1 row in set. Elapsed: 0.001 sec.
```
```sql
UNDROP TABLE undropMe
```
```response
Ok.
```
```sql
UNDROP TABLE tab;
SELECT *
FROM system.dropped_tables
FORMAT Vertical
```
FORMAT Vertical;
```response
Ok.
0 rows in set. Elapsed: 0.001 sec.
```
```sql
DESCRIBE TABLE undropMe
FORMAT Vertical
DESCRIBE TABLE tab
FORMAT Vertical;
```
```response
Row 1:
──────

View File

@ -6,6 +6,11 @@ sidebar_label: jdbc
# jdbc
:::note
clickhouse-jdbc-bridge contains experimental codes and is no longer supported. It may contain reliability issues and security vulnerabilities. Use it at your own risk.
ClickHouse recommend using built-in table functions in ClickHouse which provide a better alternative for ad-hoc querying scenarios (Postgres, MySQL, MongoDB, etc).
:::
`jdbc(datasource, schema, table)` - returns table that is connected via JDBC driver.
This table function requires separate [clickhouse-jdbc-bridge](https://github.com/ClickHouse/clickhouse-jdbc-bridge) program to be running.

View File

@ -27,9 +27,9 @@ DateTime([timezone])
Консольный клиент ClickHouse по умолчанию использует часовой пояс сервера, если для значения `DateTime` часовой пояс не был задан в явном виде при инициализации типа данных. Чтобы использовать часовой пояс клиента, запустите [clickhouse-client](../../interfaces/cli.md) с параметром `--use_client_time_zone`.
ClickHouse отображает значения в зависимости от значения параметра [date\_time\_output\_format](../../operations/settings/settings-formats.md#date_time_output_format). Текстовый формат по умолчанию `YYYY-MM-DD hh:mm:ss`. Кроме того, вы можете поменять отображение с помощью функции [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime).
ClickHouse отображает значения в зависимости от значения параметра [date\_time\_output\_format](../../operations/settings/index.md#settings-date_time_output_format). Текстовый формат по умолчанию `YYYY-MM-DD hh:mm:ss`. Кроме того, вы можете поменять отображение с помощью функции [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime).
При вставке данных в ClickHouse, можно использовать различные форматы даты и времени в зависимости от значения настройки [date_time_input_format](../../operations/settings/settings-formats.md#date_time_input_format).
При вставке данных в ClickHouse, можно использовать различные форматы даты и времени в зависимости от значения настройки [date_time_input_format](../../operations/settings/index.md#settings-date_time_input_format).
## Примеры {#primery}
@ -119,8 +119,8 @@ FROM dt
- [Функции преобразования типов](../../sql-reference/functions/type-conversion-functions.md)
- [Функции для работы с датой и временем](../../sql-reference/functions/date-time-functions.md)
- [Функции для работы с массивами](../../sql-reference/functions/array-functions.md)
- [Настройка `date_time_input_format`](../../operations/settings/settings-formats.md#date_time_input_format)
- [Настройка `date_time_output_format`](../../operations/settings/settings-formats.md#date_time_output_format)
- [Настройка `date_time_input_format`](../../operations/settings/index.md#settings-date_time_input_format)
- [Настройка `date_time_output_format`](../../operations/settings/index.md)
- [Конфигурационный параметр сервера `timezone`](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone)
- [Параметр `session_timezone`](../../operations/settings/settings.md#session_timezone)
- [Операторы для работы с датой и временем](../../sql-reference/operators/index.md#operators-datetime)

View File

@ -143,7 +143,7 @@ int mainEntryClickHouseCompressor(int argc, char ** argv)
ParserCodec codec_parser;
std::string codecs_line = boost::algorithm::join(codecs, ",");
auto ast = parseQuery(codec_parser, "(" + codecs_line + ")", 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
auto ast = parseQuery(codec_parser, "(" + codecs_line + ")", 0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS);
codec = CompressionCodecFactory::instance().get(ast, nullptr);
}
else

View File

@ -234,7 +234,7 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
size_t approx_query_length = multiple ? find_first_symbols<';'>(pos, end) - pos : end - pos;
ASTPtr res = parseQueryAndMovePosition(
parser, pos, end, "query", multiple, cmd_settings.max_query_size, cmd_settings.max_parser_depth);
parser, pos, end, "query", multiple, cmd_settings.max_query_size, cmd_settings.max_parser_depth, cmd_settings.max_parser_backtracks);
std::unique_ptr<ReadBuffer> insert_query_payload = nullptr;
/// If the query is INSERT ... VALUES, then we will try to parse the data.

View File

@ -2,6 +2,7 @@
#include "Commands.h"
#include <queue>
#include "KeeperClient.h"
#include "Parsers/CommonParsers.h"
namespace DB
@ -106,13 +107,13 @@ bool CreateCommand::parse(IParser::Pos & pos, std::shared_ptr<ASTKeeperQuery> &
int mode = zkutil::CreateMode::Persistent;
if (ParserKeyword{"PERSISTENT"}.ignore(pos, expected))
if (ParserKeyword(Keyword::PERSISTENT).ignore(pos, expected))
mode = zkutil::CreateMode::Persistent;
else if (ParserKeyword{"EPHEMERAL"}.ignore(pos, expected))
else if (ParserKeyword(Keyword::EPHEMERAL).ignore(pos, expected))
mode = zkutil::CreateMode::Ephemeral;
else if (ParserKeyword{"EPHEMERAL SEQUENTIAL"}.ignore(pos, expected))
else if (ParserKeyword(Keyword::EPHEMERAL_SEQUENTIAL).ignore(pos, expected))
mode = zkutil::CreateMode::EphemeralSequential;
else if (ParserKeyword{"PERSISTENT SEQUENTIAL"}.ignore(pos, expected))
else if (ParserKeyword(Keyword::PERSISTENT_SEQUENTIAL).ignore(pos, expected))
mode = zkutil::CreateMode::PersistentSequential;
node->args.push_back(std::move(mode));
@ -382,12 +383,16 @@ void RMRCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) co
bool ReconfigCommand::parse(IParser::Pos & pos, std::shared_ptr<ASTKeeperQuery> & node, DB::Expected & expected) const
{
ParserKeyword s_add(Keyword::ADD);
ParserKeyword s_remove(Keyword::REMOVE);
ParserKeyword s_set(Keyword::SET);
ReconfigCommand::Operation operation;
if (ParserKeyword{"ADD"}.ignore(pos, expected))
if (s_add.ignore(pos, expected))
operation = ReconfigCommand::Operation::ADD;
else if (ParserKeyword{"REMOVE"}.ignore(pos, expected))
else if (s_remove.ignore(pos, expected))
operation = ReconfigCommand::Operation::REMOVE;
else if (ParserKeyword{"SET"}.ignore(pos, expected))
else if (s_set.ignore(pos, expected))
operation = ReconfigCommand::Operation::SET;
else
return false;

View File

@ -44,7 +44,7 @@ String KeeperClient::executeFourLetterCommand(const String & command)
std::vector<String> KeeperClient::getCompletions(const String & prefix) const
{
Tokens tokens(prefix.data(), prefix.data() + prefix.size(), 0, false);
IParser::Pos pos(tokens, 0);
IParser::Pos pos(tokens, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS);
if (pos->type != TokenType::BareWord)
return registered_commands_and_four_letter_words;
@ -278,6 +278,7 @@ bool KeeperClient::processQueryText(const String & text)
/* allow_multi_statements = */ true,
/* max_query_size = */ 0,
/* max_parser_depth = */ 0,
/* max_parser_backtracks = */ 0,
/* skip_insignificant = */ false);
if (!res)

View File

@ -10,6 +10,7 @@
#include <IO/UseSSL.h>
#include <Core/ServerUUID.h>
#include <Common/logger_useful.h>
#include <Common/CgroupsMemoryUsageObserver.h>
#include <Common/ErrorHandlers.h>
#include <Common/assertProcessUserMatchesDataOwner.h>
#include <Common/makeSocketAddress.h>
@ -623,6 +624,25 @@ try
buildLoggers(config(), logger());
main_config_reloader->start();
std::optional<CgroupsMemoryUsageObserver> cgroups_memory_usage_observer;
try
{
auto wait_time = config().getUInt64("keeper_server.cgroups_memory_observer_wait_time", 15);
if (wait_time != 0)
{
cgroups_memory_usage_observer.emplace(std::chrono::seconds(wait_time));
/// Not calling cgroups_memory_usage_observer->setLimits() here (as for the normal ClickHouse server) because Keeper controls
/// its memory usage by other means (via setting 'max_memory_usage_soft_limit').
cgroups_memory_usage_observer->setOnMemoryAmountAvailableChangedFn([&]() { main_config_reloader->reload(); });
cgroups_memory_usage_observer->startThread();
}
}
catch (Exception &)
{
tryLogCurrentException(log, "Disabling cgroup memory observer because of an error during initialization");
}
LOG_INFO(log, "Ready for connections.");
waitForTerminationRequest();

View File

@ -11,6 +11,7 @@ set (CLICKHOUSE_LIBRARY_BRIDGE_SOURCES
LibraryBridgeHandlers.cpp
SharedLibrary.cpp
library-bridge.cpp
createFunctionBaseCast.cpp
)
clickhouse_add_executable(clickhouse-library-bridge ${CLICKHOUSE_LIBRARY_BRIDGE_SOURCES})

View File

@ -1,6 +1,6 @@
#pragma once
#include <Interpreters/Context.h>
#include <Interpreters/Context_fwd.h>
#include <Bridge/IBridge.h>
#include "LibraryBridgeHandlerFactory.h"

View File

@ -0,0 +1,23 @@
#include <memory>
#include <Functions/CastOverloadResolver.h>
#include <Core/ColumnsWithTypeAndName.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
class IFunctionBase;
using FunctionBasePtr = std::shared_ptr<const IFunctionBase>;
FunctionBasePtr createFunctionBaseCast(
ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional<CastDiagnostic>, CastType)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge");
}
}

View File

@ -1000,12 +1000,6 @@ extern "C" int LLVMFuzzerInitialize(int * pargc, char *** pargv)
{
std::vector<char *> argv(*pargv, *pargv + (*pargc + 1));
if (!isClickhouseApp("local", argv))
{
std::cerr << "\033[31m" << "ClickHouse compiled in fuzzing mode, only clickhouse local is available." << "\033[0m" << std::endl;
exit(1);
}
/// As a user you can add flags to clickhouse binary in fuzzing mode as follows
/// clickhouse local <set of clickhouse-local specific flag> -- <set of libfuzzer flags>
@ -1013,13 +1007,16 @@ extern "C" int LLVMFuzzerInitialize(int * pargc, char *** pargv)
auto it = argv.begin() + 1;
for (; *it; ++it)
{
if (strcmp(*it, "--") == 0)
{
++it;
break;
}
}
while (*it)
{
if (strncmp(*it, "--", 2) != 0)
{
*(p++) = *it;
@ -1027,6 +1024,7 @@ extern "C" int LLVMFuzzerInitialize(int * pargc, char *** pargv)
}
else
++it;
}
*pargc = static_cast<int>(p - &(*pargv)[0]);
*p = nullptr;

View File

@ -68,7 +68,6 @@ namespace
using MainFunc = int (*)(int, char**);
#if !defined(FUZZING_MODE)
/// Add an item here to register new application
std::pair<std::string_view, MainFunc> clickhouse_applications[] =
{
@ -105,13 +104,6 @@ std::pair<std::string_view, MainFunc> clickhouse_applications[] =
{"restart", mainEntryClickHouseRestart},
};
/// Add an item here to register a new short name
std::pair<std::string_view, std::string_view> clickhouse_short_names[] =
{
{"chl", "local"},
{"chc", "client"},
};
int printHelp(int, char **)
{
std::cerr << "Use one of the following commands:" << std::endl;
@ -121,6 +113,13 @@ int printHelp(int, char **)
}
#endif
/// Add an item here to register a new short name
std::pair<std::string_view, std::string_view> clickhouse_short_names[] =
{
{"chl", "local"},
{"chc", "client"},
};
enum class InstructionFail
{

View File

@ -13,6 +13,7 @@ set (CLICKHOUSE_ODBC_BRIDGE_SOURCES
getIdentifierQuote.cpp
odbc-bridge.cpp
validateODBCConnectionString.cpp
createFunctionBaseCast.cpp
)
clickhouse_add_executable(clickhouse-odbc-bridge ${CLICKHOUSE_ODBC_BRIDGE_SOURCES})

View File

@ -0,0 +1,23 @@
#include <memory>
#include <Functions/CastOverloadResolver.h>
#include <Core/ColumnsWithTypeAndName.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
class IFunctionBase;
using FunctionBasePtr = std::shared_ptr<const IFunctionBase>;
FunctionBasePtr createFunctionBaseCast(
ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional<CastDiagnostic>, CastType)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge");
}
}

View File

@ -733,8 +733,6 @@ try
LOG_INFO(log, "Available CPU instruction sets: {}", cpu_info);
#endif
sanityChecks(*this);
// Initialize global thread pool. Do it before we fetch configs from zookeeper
// nodes (`from_zk`), because ZooKeeper interface uses the pool. We will
// ignore `max_thread_pool_size` in configs we fetch from ZK, but oh well.
@ -904,6 +902,7 @@ try
config_processor.savePreprocessedConfig(loaded_config, config().getString("path", DBMS_DEFAULT_PATH));
config().removeConfiguration(old_configuration.get());
config().add(loaded_config.configuration.duplicate(), PRIO_DEFAULT, false);
global_context->setConfig(loaded_config.configuration);
}
Settings::checkNoSettingNamesAtTopLevel(config(), config_path);
@ -911,6 +910,9 @@ try
/// We need to reload server settings because config could be updated via zookeeper.
server_settings.loadSettingsFromConfig(config());
/// NOTE: Do sanity checks after we loaded all possible substitutions (for the configuration) from ZK
sanityChecks(*this);
#if defined(OS_LINUX)
std::string executable_path = getExecutablePath();
@ -1294,7 +1296,7 @@ try
std::optional<CgroupsMemoryUsageObserver> cgroups_memory_usage_observer;
try
{
UInt64 wait_time = server_settings.cgroups_memory_usage_observer_wait_time;
auto wait_time = server_settings.cgroups_memory_usage_observer_wait_time;
if (wait_time != 0)
cgroups_memory_usage_observer.emplace(std::chrono::seconds(wait_time));
}
@ -1360,7 +1362,7 @@ try
{
double hard_limit_ratio = new_server_settings.cgroup_memory_watcher_hard_limit_ratio;
double soft_limit_ratio = new_server_settings.cgroup_memory_watcher_soft_limit_ratio;
cgroups_memory_usage_observer->setLimits(
cgroups_memory_usage_observer->setMemoryUsageLimits(
static_cast<uint64_t>(max_server_memory_usage * hard_limit_ratio),
static_cast<uint64_t>(max_server_memory_usage * soft_limit_ratio));
}
@ -1718,6 +1720,12 @@ try
throw;
}
if (cgroups_memory_usage_observer)
{
cgroups_memory_usage_observer->setOnMemoryAmountAvailableChangedFn([&]() { main_config_reloader->reload(); });
cgroups_memory_usage_observer->startThread();
}
/// Reload config in SYSTEM RELOAD CONFIG query.
global_context->setConfigReloadCallback([&]()
{

View File

@ -297,7 +297,7 @@ namespace
std::pair<String, BackupEntryPtr> makeBackupEntryForAccess(
const std::vector<std::pair<UUID, AccessEntityPtr>> access_entities,
const std::vector<std::pair<UUID, AccessEntityPtr>> & access_entities,
const String & data_path_in_backup,
size_t counter,
const AccessControl & access_control)
@ -326,7 +326,7 @@ void AccessRestorerFromBackup::addDataPath(const String & data_path)
return;
fs::path data_path_in_backup_fs = data_path;
Strings filenames = backup->listFiles(data_path);
Strings filenames = backup->listFiles(data_path, /*recursive*/ false);
if (filenames.empty())
return;

View File

@ -21,7 +21,7 @@ struct RestoreSettings;
/// Makes a backup of access entities of a specified type.
std::pair<String, BackupEntryPtr> makeBackupEntryForAccess(
const std::vector<std::pair<UUID, AccessEntityPtr>> access_entities,
const std::vector<std::pair<UUID, AccessEntityPtr>> & access_entities,
const String & data_path_in_backup,
size_t counter,
const AccessControl & access_control);

View File

@ -62,7 +62,7 @@ AccessEntityPtr deserializeAccessEntityImpl(const String & definition)
const char * end = begin + definition.size();
while (pos < end)
{
queries.emplace_back(parseQueryAndMovePosition(parser, pos, end, "", true, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH));
queries.emplace_back(parseQueryAndMovePosition(parser, pos, end, "", true, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS));
while (isWhitespaceASCII(*pos) || *pos == ';')
++pos;
}

View File

@ -13,63 +13,63 @@ namespace ErrorCodes
const AuthenticationTypeInfo & AuthenticationTypeInfo::get(AuthenticationType type_)
{
static constexpr auto make_info = [](const char * raw_name_, bool is_password_ = false)
static constexpr auto make_info = [](Keyword keyword_, bool is_password_ = false)
{
String init_name = raw_name_;
String init_name = String(toStringView(keyword_));
boost::to_lower(init_name);
return AuthenticationTypeInfo{raw_name_, std::move(init_name), is_password_};
return AuthenticationTypeInfo{keyword_, std::move(init_name), is_password_};
};
switch (type_)
{
case AuthenticationType::NO_PASSWORD:
{
static const auto info = make_info("NO_PASSWORD");
static const auto info = make_info(Keyword::NO_PASSWORD);
return info;
}
case AuthenticationType::PLAINTEXT_PASSWORD:
{
static const auto info = make_info("PLAINTEXT_PASSWORD", true);
static const auto info = make_info(Keyword::PLAINTEXT_PASSWORD, true);
return info;
}
case AuthenticationType::SHA256_PASSWORD:
{
static const auto info = make_info("SHA256_PASSWORD", true);
static const auto info = make_info(Keyword::SHA256_PASSWORD, true);
return info;
}
case AuthenticationType::DOUBLE_SHA1_PASSWORD:
{
static const auto info = make_info("DOUBLE_SHA1_PASSWORD", true);
static const auto info = make_info(Keyword::DOUBLE_SHA1_PASSWORD, true);
return info;
}
case AuthenticationType::LDAP:
{
static const auto info = make_info("LDAP");
static const auto info = make_info(Keyword::LDAP);
return info;
}
case AuthenticationType::KERBEROS:
{
static const auto info = make_info("KERBEROS");
static const auto info = make_info(Keyword::KERBEROS);
return info;
}
case AuthenticationType::SSL_CERTIFICATE:
{
static const auto info = make_info("SSL_CERTIFICATE");
static const auto info = make_info(Keyword::SSL_CERTIFICATE);
return info;
}
case AuthenticationType::BCRYPT_PASSWORD:
{
static const auto info = make_info("BCRYPT_PASSWORD", true);
static const auto info = make_info(Keyword::BCRYPT_PASSWORD, true);
return info;
}
case AuthenticationType::SSH_KEY:
{
static const auto info = make_info("SSH_KEY");
static const auto info = make_info(Keyword::SSH_KEY);
return info;
}
case AuthenticationType::HTTP:
{
static const auto info = make_info("HTTP");
static const auto info = make_info(Keyword::HTTP);
return info;
}
case AuthenticationType::MAX:

View File

@ -1,6 +1,7 @@
#pragma once
#include <base/types.h>
#include <Parsers/CommonParsers.h>
namespace DB
{
@ -45,7 +46,7 @@ enum class AuthenticationType
struct AuthenticationTypeInfo
{
const char * const raw_name;
Keyword keyword; // Keyword used in parser
const String name; /// Lowercased with underscores, e.g. "sha256_password".
bool is_password;
static const AuthenticationTypeInfo & get(AuthenticationType type_);
@ -53,7 +54,7 @@ struct AuthenticationTypeInfo
inline String toString(AuthenticationType type_)
{
return AuthenticationTypeInfo::get(type_).raw_name;
return String(toStringView(AuthenticationTypeInfo::get(type_).keyword));
}
}

View File

@ -7,7 +7,7 @@ namespace DB
{
ContextAccessParams::ContextAccessParams(
const std::optional<UUID> user_id_,
std::optional<UUID> user_id_,
bool full_access_,
bool use_default_roles_,
const std::shared_ptr<const std::vector<UUID>> & current_roles_,

View File

@ -15,7 +15,7 @@ class ContextAccessParams
{
public:
ContextAccessParams(
const std::optional<UUID> user_id_,
std::optional<UUID> user_id_,
bool full_access_,
bool use_default_roles_,
const std::shared_ptr<const std::vector<UUID>> & current_roles_,

View File

@ -34,7 +34,7 @@ public:
};
explicit GSSAcceptorContext(const Params & params_);
virtual ~GSSAcceptorContext() override;
~GSSAcceptorContext() override;
GSSAcceptorContext(const GSSAcceptorContext &) = delete;
GSSAcceptorContext(GSSAcceptorContext &&) = delete;

View File

@ -204,7 +204,7 @@ void LDAPAccessStorage::assignRolesNoLock(User & user, const LDAPClient::SearchR
}
void LDAPAccessStorage::assignRolesNoLock(User & user, const LDAPClient::SearchResultsList & external_roles, const std::size_t external_roles_hash) const
void LDAPAccessStorage::assignRolesNoLock(User & user, const LDAPClient::SearchResultsList & external_roles, std::size_t external_roles_hash) const
{
const auto & user_name = user.getName();
auto & granted_roles = user.granted_roles;

View File

@ -33,29 +33,29 @@ public:
static constexpr char STORAGE_TYPE[] = "ldap";
explicit LDAPAccessStorage(const String & storage_name_, AccessControl & access_control_, const Poco::Util::AbstractConfiguration & config, const String & prefix);
virtual ~LDAPAccessStorage() override = default;
~LDAPAccessStorage() override = default;
String getLDAPServerName() const;
// IAccessStorage implementations.
virtual const char * getStorageType() const override;
virtual String getStorageParamsJSON() const override;
virtual bool isReadOnly() const override { return true; }
virtual bool exists(const UUID & id) const override;
const char * getStorageType() const override;
String getStorageParamsJSON() const override;
bool isReadOnly() const override { return true; }
bool exists(const UUID & id) const override;
private: // IAccessStorage implementations.
virtual std::optional<UUID> findImpl(AccessEntityType type, const String & name) const override;
virtual std::vector<UUID> findAllImpl(AccessEntityType type) const override;
virtual AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override;
virtual std::optional<std::pair<String, AccessEntityType>> readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const override;
virtual std::optional<AuthResult> authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const override;
std::optional<UUID> findImpl(AccessEntityType type, const String & name) const override;
std::vector<UUID> findAllImpl(AccessEntityType type) const override;
AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override;
std::optional<std::pair<String, AccessEntityType>> readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const override;
std::optional<AuthResult> authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const override;
void setConfiguration(const Poco::Util::AbstractConfiguration & config, const String & prefix);
void processRoleChange(const UUID & id, const AccessEntityPtr & entity);
void applyRoleChangeNoLock(bool grant, const UUID & role_id, const String & role_name);
void assignRolesNoLock(User & user, const LDAPClient::SearchResultsList & external_roles) const;
void assignRolesNoLock(User & user, const LDAPClient::SearchResultsList & external_roles, const std::size_t external_roles_hash) const;
void assignRolesNoLock(User & user, const LDAPClient::SearchResultsList & external_roles, std::size_t external_roles_hash) const;
void updateAssignedRolesNoLock(const UUID & id, const String & user_name, const LDAPClient::SearchResultsList & external_roles) const;
std::set<String> mapExternalRolesNoLock(const LDAPClient::SearchResultsList & external_roles) const;
bool areLDAPCredentialsValidNoLock(const User & user, const Credentials & credentials,

View File

@ -86,7 +86,7 @@ void RowPolicyCache::PolicyInfo::setPolicy(const RowPolicyPtr & policy_)
try
{
ParserExpression parser;
parsed_filters[filter_type_i] = parseQuery(parser, filter, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
parsed_filters[filter_type_i] = parseQuery(parser, filter, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS);
}
catch (...)
{

View File

@ -66,7 +66,7 @@ namespace
String error_message;
const char * pos = string_query.data();
auto ast = tryParseQuery(parser, pos, pos + string_query.size(), error_message, false, "", false, 0, 0);
auto ast = tryParseQuery(parser, pos, pos + string_query.size(), error_message, false, "", false, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS, true);
if (!ast)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Failed to parse grant query. Error: {}", error_message);

View File

@ -252,7 +252,6 @@ void dumpFlameGraph(
fillColumn(chars, offsets, out.str());
}
// NOLINTBEGIN(clang-analyzer-optin.performance.Padding)
struct AggregateFunctionFlameGraphData
{
struct Entry
@ -469,7 +468,6 @@ struct AggregateFunctionFlameGraphData
DB::dumpFlameGraph(tree.dump(max_depth, min_bytes), chars, offsets);
}
};
// NOLINTEND(clang-analyzer-optin.performance.Padding)
/// Aggregate function which builds a flamegraph using the list of stacktraces.
/// The output is an array of strings which can be used by flamegraph.pl util.

View File

@ -157,7 +157,7 @@ public:
void update(UInt64 batch_size, std::vector<Float64> & weights, Float64 & bias, Float64 learning_rate, const std::vector<Float64> & batch_gradient) override;
virtual void merge(const IWeightsUpdater & rhs, Float64 frac, Float64 rhs_frac) override;
void merge(const IWeightsUpdater & rhs, Float64 frac, Float64 rhs_frac) override;
void write(WriteBuffer & buf) const override;
@ -189,7 +189,7 @@ public:
void update(UInt64 batch_size, std::vector<Float64> & weights, Float64 & bias, Float64 learning_rate, const std::vector<Float64> & batch_gradient) override;
virtual void merge(const IWeightsUpdater & rhs, Float64 frac, Float64 rhs_frac) override;
void merge(const IWeightsUpdater & rhs, Float64 frac, Float64 rhs_frac) override;
void write(WriteBuffer & buf) const override;
@ -226,7 +226,7 @@ public:
void update(UInt64 batch_size, std::vector<Float64> & weights, Float64 & bias, Float64 learning_rate, const std::vector<Float64> & batch_gradient) override;
virtual void merge(const IWeightsUpdater & rhs, Float64 frac, Float64 rhs_frac) override;
void merge(const IWeightsUpdater & rhs, Float64 frac, Float64 rhs_frac) override;
void write(WriteBuffer & buf) const override;

View File

@ -234,9 +234,6 @@ namespace ErrorCodes
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
namespace
{
template <template <typename> typename FunctionTemplate, StatisticsFunctionKind kind>
AggregateFunctionPtr createAggregateFunctionStatisticsUnary(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
@ -273,5 +270,3 @@ AggregateFunctionPtr createAggregateFunctionStatisticsBinary(
}
}
}

View File

@ -483,6 +483,7 @@ public:
}
bool isAbleToParallelizeMerge() const override { return is_able_to_parallelize_merge; }
bool canOptimizeEqualKeysRanges() const override { return !is_able_to_parallelize_merge; }
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena *) const override
{
@ -576,6 +577,7 @@ public:
}
bool isAbleToParallelizeMerge() const override { return is_able_to_parallelize_merge; }
bool canOptimizeEqualKeysRanges() const override { return !is_able_to_parallelize_merge; }
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena *) const override
{

View File

@ -42,9 +42,6 @@ struct UniqCombinedHashTableGrower : public HashTableGrowerWithPrecalculation<>
void increaseSize() { increaseSizeDegree(1); }
};
namespace
{
template <typename T, UInt8 K, typename HashValueType>
struct AggregateFunctionUniqCombinedData
{
@ -268,8 +265,6 @@ AggregateFunctionPtr createAggregateFunctionWithK(const DataTypes & argument_typ
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunctionVariadic<false, false>>(argument_types, params);
}
}
template <UInt8 K>
AggregateFunctionPtr createAggregateFunctionWithHashType(bool use_64_bit_hash, const DataTypes & argument_types, const Array & params)
{

View File

@ -142,6 +142,7 @@ public:
}
bool isAbleToParallelizeMerge() const override { return nested_func->isAbleToParallelizeMerge(); }
bool canOptimizeEqualKeysRanges() const override { return nested_func->canOptimizeEqualKeysRanges(); }
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override
{

View File

@ -165,6 +165,7 @@ public:
}
bool isAbleToParallelizeMerge() const override { return nested_func->isAbleToParallelizeMerge(); }
bool canOptimizeEqualKeysRanges() const override { return nested_func->canOptimizeEqualKeysRanges(); }
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override
{

View File

@ -111,6 +111,7 @@ public:
}
bool isAbleToParallelizeMerge() const override { return nested_func->isAbleToParallelizeMerge(); }
bool canOptimizeEqualKeysRanges() const override { return nested_func->canOptimizeEqualKeysRanges(); }
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override
{

View File

@ -152,6 +152,7 @@ public:
}
bool isAbleToParallelizeMerge() const override { return nested_function->isAbleToParallelizeMerge(); }
bool canOptimizeEqualKeysRanges() const override { return nested_function->canOptimizeEqualKeysRanges(); }
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override
{
@ -520,7 +521,7 @@ public:
}
}
chassert(nullable_filters.size() > 0);
chassert(!nullable_filters.empty());
bool found_one = false;
if (nullable_filters.size() == 1)
{

View File

@ -92,6 +92,7 @@ public:
}
bool isAbleToParallelizeMerge() const override { return nested_func->isAbleToParallelizeMerge(); }
bool canOptimizeEqualKeysRanges() const override { return nested_func->canOptimizeEqualKeysRanges(); }
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override
{

View File

@ -38,7 +38,7 @@ public:
offset = other->offset;
}
int length()
int length() const
{
return static_cast<int>(bins.size());
}

View File

@ -162,6 +162,10 @@ public:
/// Tells if merge() with thread pool parameter could be used.
virtual bool isAbleToParallelizeMerge() const { return false; }
/// Return true if it is allowed to replace call of `addBatch`
/// to `addBatchSinglePlace` for ranges of consecutive equal keys.
virtual bool canOptimizeEqualKeysRanges() const { return true; }
/// Should be used only if isAbleToParallelizeMerge() returned true.
virtual void
merge(AggregateDataPtr __restrict /*place*/, ConstAggregateDataPtr /*rhs*/, ThreadPool & /*thread_pool*/, Arena * /*arena*/) const

View File

@ -150,7 +150,7 @@ struct QuantileExactExclusive : public QuantileExact<Value>
return static_cast<Float64>(*std::min_element(array.begin(), array.end()));
::nth_element(array.begin(), array.begin() + n - 1, array.end());
auto nth_elem = std::min_element(array.begin() + n, array.end());
auto * nth_elem = std::min_element(array.begin() + n, array.end());
return static_cast<Float64>(array[n - 1]) + (h - n) * static_cast<Float64>(*nth_elem - array[n - 1]);
}
@ -179,7 +179,7 @@ struct QuantileExactExclusive : public QuantileExact<Value>
else
{
::nth_element(array.begin() + prev_n, array.begin() + n - 1, array.end());
auto nth_elem = std::min_element(array.begin() + n, array.end());
auto * nth_elem = std::min_element(array.begin() + n, array.end());
result[indices[i]] = static_cast<Float64>(array[n - 1]) + (h - n) * static_cast<Float64>(*nth_elem - array[n - 1]);
prev_n = n - 1;
@ -214,7 +214,7 @@ struct QuantileExactInclusive : public QuantileExact<Value>
else if (n < 1)
return static_cast<Float64>(*std::min_element(array.begin(), array.end()));
::nth_element(array.begin(), array.begin() + n - 1, array.end());
auto nth_elem = std::min_element(array.begin() + n, array.end());
auto * nth_elem = std::min_element(array.begin() + n, array.end());
return static_cast<Float64>(array[n - 1]) + (h - n) * static_cast<Float64>(*nth_elem - array[n - 1]);
}
@ -241,7 +241,7 @@ struct QuantileExactInclusive : public QuantileExact<Value>
else
{
::nth_element(array.begin() + prev_n, array.begin() + n - 1, array.end());
auto nth_elem = std::min_element(array.begin() + n, array.end());
auto * nth_elem = std::min_element(array.begin() + n, array.end());
result[indices[i]] = static_cast<Float64>(array[n - 1]) + (h - n) * (static_cast<Float64>(*nth_elem) - array[n - 1]);
prev_n = n - 1;

View File

@ -191,7 +191,7 @@ public:
/// TODO: After implementation of "versioning aggregate function state",
/// change the serialization format.
Element elem;
memset(&elem, 0, sizeof(elem));
memset(&elem, 0, sizeof(elem)); /// NOLINT(bugprone-undefined-memory-manipulation)
elem = samples[i];
DB::transformEndianness<std::endian::little>(elem);

View File

@ -23,7 +23,7 @@ struct SingleValueDataBase
/// For example argMin holds 1 of these (for the result), while keeping a template for the value
static constexpr UInt32 MAX_STORAGE_SIZE = 64;
virtual ~SingleValueDataBase() { }
virtual ~SingleValueDataBase() = default;
virtual bool has() const = 0;
virtual void insertResultInto(IColumn &) const = 0;
virtual void write(WriteBuffer &, const ISerialization &) const = 0;

View File

@ -39,8 +39,8 @@ public:
/// This method will convert all the SingleLevelSet to TwoLevelSet in parallel if the hashsets are not all singlelevel or not all twolevel.
static void parallelizeMergePrepare(const std::vector<UniqExactSet *> & data_vec, ThreadPool & thread_pool)
{
unsigned long single_level_set_num = 0;
unsigned long all_single_hash_size = 0;
UInt64 single_level_set_num = 0;
UInt64 all_single_hash_size = 0;
for (auto ele : data_vec)
{

View File

@ -27,6 +27,9 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size)
auto initialize = [&]() mutable
{
if (context)
return true;
shared_context = Context::createShared();
context = Context::createGlobal(shared_context.get());
context->makeGlobalContext();

View File

@ -81,7 +81,8 @@ void getAggregateFunctionNameAndParametersArray(
ParserExpressionList params_parser(false);
ASTPtr args_ast = parseQuery(params_parser,
parameters_str.data(), parameters_str.data() + parameters_str.size(),
"parameters of aggregate function in " + error_context, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
"parameters of aggregate function in " + error_context,
0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS);
if (args_ast->children.empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect list of parameters to aggregate function {}",

View File

@ -776,6 +776,7 @@ struct IdentifierResolveScope
std::unordered_map<QueryTreeNodePtr, TableExpressionData> table_expression_node_to_data;
QueryTreeNodePtrWithHashSet nullable_group_by_keys;
QueryTreeNodePtrWithHashMap<QueryTreeNodePtr> nullable_join_columns;
/// Use identifier lookup to result cache
bool use_identifier_lookup_to_result_cache = true;
@ -1276,7 +1277,11 @@ private:
return {};
}
static void convertJoinedColumnTypeToNullIfNeeded(QueryTreeNodePtr & resolved_identifier, const JoinKind & join_kind, std::optional<JoinTableSide> resolved_side)
static QueryTreeNodePtr convertJoinedColumnTypeToNullIfNeeded(
const QueryTreeNodePtr & resolved_identifier,
const JoinKind & join_kind,
std::optional<JoinTableSide> resolved_side,
IdentifierResolveScope & scope)
{
if (resolved_identifier->getNodeType() == QueryTreeNodeType::COLUMN &&
JoinCommon::canBecomeNullable(resolved_identifier->getResultType()) &&
@ -1284,9 +1289,20 @@ private:
(isLeft(join_kind) && resolved_side && *resolved_side == JoinTableSide::Right) ||
(isRight(join_kind) && resolved_side && *resolved_side == JoinTableSide::Left)))
{
auto & resolved_column = resolved_identifier->as<ColumnNode &>();
resolved_column.setColumnType(makeNullableOrLowCardinalityNullable(resolved_column.getColumnType()));
auto nullable_resolved_identifier = resolved_identifier->clone();
auto & resolved_column = nullable_resolved_identifier->as<ColumnNode &>();
auto new_result_type = makeNullableOrLowCardinalityNullable(resolved_column.getColumnType());
resolved_column.setColumnType(new_result_type);
if (resolved_column.hasExpression())
{
auto & resolved_expression = resolved_column.getExpression();
if (!resolved_expression->getResultType()->equals(*new_result_type))
resolved_expression = buildCastFunction(resolved_expression, new_result_type, scope.context, true);
}
scope.nullable_join_columns[nullable_resolved_identifier] = resolved_identifier;
return nullable_resolved_identifier;
}
return nullptr;
}
/// Resolve identifier functions
@ -3258,6 +3274,32 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromTableExpression(const Id
return {};
}
QueryTreeNodePtr checkIsMissedObjectJSONSubcolumn(const QueryTreeNodePtr & left_resolved_identifier,
const QueryTreeNodePtr & right_resolved_identifier)
{
if (left_resolved_identifier && right_resolved_identifier && left_resolved_identifier->getNodeType() == QueryTreeNodeType::CONSTANT
&& right_resolved_identifier->getNodeType() == QueryTreeNodeType::CONSTANT)
{
auto & left_resolved_column = left_resolved_identifier->as<ConstantNode &>();
auto & right_resolved_column = right_resolved_identifier->as<ConstantNode &>();
if (left_resolved_column.getValueStringRepresentation() == "NULL" && right_resolved_column.getValueStringRepresentation() == "NULL")
return left_resolved_identifier;
}
else if (left_resolved_identifier && left_resolved_identifier->getNodeType() == QueryTreeNodeType::CONSTANT)
{
auto & left_resolved_column = left_resolved_identifier->as<ConstantNode &>();
if (left_resolved_column.getValueStringRepresentation() == "NULL")
return left_resolved_identifier;
}
else if (right_resolved_identifier && right_resolved_identifier->getNodeType() == QueryTreeNodeType::CONSTANT)
{
auto & right_resolved_column = right_resolved_identifier->as<ConstantNode &>();
if (right_resolved_column.getValueStringRepresentation() == "NULL")
return right_resolved_identifier;
}
return {};
}
QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromJoin(const IdentifierLookup & identifier_lookup,
const QueryTreeNodePtr & table_expression_node,
IdentifierResolveScope & scope)
@ -3358,28 +3400,8 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromJoin(const IdentifierLoo
/// If columns from left or right table were missed Object(Nullable('json')) subcolumns, they will be replaced
/// to ConstantNode(NULL), which can't be cast to ColumnNode, so we resolve it here.
if (left_resolved_identifier && right_resolved_identifier && left_resolved_identifier->getNodeType() == QueryTreeNodeType::CONSTANT
&& right_resolved_identifier->getNodeType() == QueryTreeNodeType::CONSTANT)
{
auto & left_resolved_column = left_resolved_identifier->as<ConstantNode &>();
auto & right_resolved_column = right_resolved_identifier->as<ConstantNode &>();
if (left_resolved_column.getValueStringRepresentation() == "NULL" && right_resolved_column.getValueStringRepresentation() == "NULL")
return left_resolved_identifier;
}
else if (left_resolved_identifier && left_resolved_identifier->getNodeType() == QueryTreeNodeType::CONSTANT)
{
resolved_side = JoinTableSide::Left;
auto & left_resolved_column = left_resolved_identifier->as<ConstantNode &>();
if (left_resolved_column.getValueStringRepresentation() == "NULL")
return left_resolved_identifier;
}
else if (right_resolved_identifier && right_resolved_identifier->getNodeType() == QueryTreeNodeType::CONSTANT)
{
resolved_side = JoinTableSide::Right;
auto & right_resolved_column = right_resolved_identifier->as<ConstantNode &>();
if (right_resolved_column.getValueStringRepresentation() == "NULL")
return right_resolved_identifier;
}
if (auto missed_subcolumn_identifier = checkIsMissedObjectJSONSubcolumn(left_resolved_identifier, right_resolved_identifier))
return missed_subcolumn_identifier;
if (left_resolved_identifier && right_resolved_identifier)
{
@ -3521,8 +3543,9 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromJoin(const IdentifierLoo
if (scope.join_use_nulls)
{
resolved_identifier = resolved_identifier->clone();
convertJoinedColumnTypeToNullIfNeeded(resolved_identifier, join_kind, resolved_side);
auto nullable_resolved_identifier = convertJoinedColumnTypeToNullIfNeeded(resolved_identifier, join_kind, resolved_side, scope);
if (nullable_resolved_identifier)
resolved_identifier = nullable_resolved_identifier;
}
return resolved_identifier;
@ -4402,6 +4425,28 @@ QueryAnalyzer::QueryTreeNodesWithNames QueryAnalyzer::resolveUnqualifiedMatcher(
const auto & join_using_column_nodes_list = join_using_column_node.getExpressionOrThrow()->as<ListNode &>();
const auto & join_using_column_nodes = join_using_column_nodes_list.getNodes();
/** If column doesn't exists in the table, then do not match column from USING clause.
* Example: SELECT a + 1 AS id, * FROM (SELECT 1 AS a) AS t1 JOIN (SELECT 2 AS id) AS t2 USING (id);
* In this case `id` is not present in the left table expression,
* so asterisk should return `id` from the right table expression.
*/
auto is_column_from_parent_scope = [&scope](const QueryTreeNodePtr & using_node_from_table)
{
const auto & using_column_from_table = using_node_from_table->as<ColumnNode &>();
auto table_expression_data_it = scope.table_expression_node_to_data.find(using_column_from_table.getColumnSource());
if (table_expression_data_it != scope.table_expression_node_to_data.end())
{
const auto & table_expression_data = table_expression_data_it->second;
const auto & column_name = using_column_from_table.getColumnName();
return !table_expression_data.column_name_to_column_node.contains(column_name);
}
return false;
};
if (is_column_from_parent_scope(join_using_column_nodes.at(0)) ||
is_column_from_parent_scope(join_using_column_nodes.at(1)))
continue;
QueryTreeNodePtr matched_column_node;
if (isRight(join_node->getKind()))
@ -4523,7 +4568,15 @@ ProjectionNames QueryAnalyzer::resolveMatcher(QueryTreeNodePtr & matcher_node, I
for (auto & [node, node_name] : matched_expression_nodes_with_names)
{
auto join_identifier_side = getColumnSideFromJoinTree(node, *nearest_scope_join_node);
convertJoinedColumnTypeToNullIfNeeded(node, nearest_scope_join_node->getKind(), join_identifier_side);
auto projection_name_it = node_to_projection_name.find(node);
auto nullable_node = convertJoinedColumnTypeToNullIfNeeded(node, nearest_scope_join_node->getKind(), join_identifier_side, scope);
if (nullable_node)
{
node = nullable_node;
/// Set the same projection name for new nullable node
if (projection_name_it != node_to_projection_name.end())
node_to_projection_name.emplace(node, projection_name_it->second);
}
}
}
}
@ -7304,8 +7357,64 @@ void QueryAnalyzer::resolveJoin(QueryTreeNodePtr & join_node, IdentifierResolveS
join_using_identifiers.insert(identifier_full_name);
const auto & settings = scope.context->getSettingsRef();
/** While resolving JOIN USING identifier, try to resolve identifier from parent subquery projection.
* Example: SELECT a + 1 AS b FROM (SELECT 1 AS a) t1 JOIN (SELECT 2 AS b) USING b
* In this case `b` is not in the left table expression, but it is in the parent subquery projection.
*/
auto try_resolve_identifier_from_query_projection = [this](const String & identifier_full_name_,
const QueryTreeNodePtr & left_table_expression,
const IdentifierResolveScope & scope_) -> QueryTreeNodePtr
{
const QueryNode * query_node = scope_.scope_node ? scope_.scope_node->as<QueryNode>() : nullptr;
if (!query_node)
return nullptr;
const auto & projection_list = query_node->getProjection();
for (const auto & projection_node : projection_list.getNodes())
{
if (projection_node->hasAlias() && identifier_full_name_ == projection_node->getAlias())
{
auto left_subquery = std::make_shared<QueryNode>(query_node->getMutableContext());
left_subquery->getProjection().getNodes().push_back(projection_node->clone());
left_subquery->getJoinTree() = left_table_expression;
IdentifierResolveScope left_subquery_scope(left_subquery, nullptr /*parent_scope*/);
resolveQuery(left_subquery, left_subquery_scope);
const auto & resolved_nodes = left_subquery->getProjection().getNodes();
if (resolved_nodes.size() == 1)
{
/// Create ColumnNode with expression from parent projection
return std::make_shared<ColumnNode>(
NameAndTypePair{identifier_full_name_, resolved_nodes.front()->getResultType()},
resolved_nodes.front(), left_table_expression);
}
}
}
return nullptr;
};
QueryTreeNodePtr result_left_table_expression = nullptr;
/** With `analyzer_compatibility_join_using_top_level_identifier` alias in projection has higher priority than column from left table.
* But if aliased expression cannot be resolved from left table, we get UNKNOW_IDENTIFIER error,
* despite the fact that column from USING could be resolved from left table.
* It's compatibility with a default behavior for old analyzer.
*/
if (settings.analyzer_compatibility_join_using_top_level_identifier)
result_left_table_expression = try_resolve_identifier_from_query_projection(identifier_full_name, join_node_typed.getLeftTableExpression(), scope);
IdentifierLookup identifier_lookup{identifier_node->getIdentifier(), IdentifierLookupContext::EXPRESSION};
auto result_left_table_expression = tryResolveIdentifierFromJoinTreeNode(identifier_lookup, join_node_typed.getLeftTableExpression(), scope);
if (!result_left_table_expression)
result_left_table_expression = tryResolveIdentifierFromJoinTreeNode(identifier_lookup, join_node_typed.getLeftTableExpression(), scope);
/// Here we may try to resolve identifier from projection in case it's not resolved from left table expression
/// and analyzer_compatibility_join_using_top_level_identifier is disabled.
/// For now we do not do this, because not all corner cases are clear.
/// if (!settings.analyzer_compatibility_join_using_top_level_identifier && !result_left_table_expression)
/// result_left_table_expression = try_resolve_identifier_from_query_projection(identifier_full_name, join_node_typed.getLeftTableExpression(), scope);
if (!result_left_table_expression)
throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER,
"JOIN {} using identifier '{}' cannot be resolved from left table expression. In scope {}",
@ -7448,6 +7557,29 @@ void QueryAnalyzer::resolveQueryJoinTreeNode(QueryTreeNodePtr & join_tree_node,
scope.table_expressions_in_resolve_process.erase(join_tree_node.get());
}
class ReplaceColumnsVisitor : public InDepthQueryTreeVisitor<ReplaceColumnsVisitor>
{
public:
explicit ReplaceColumnsVisitor(const QueryTreeNodePtrWithHashMap<QueryTreeNodePtr> & replacement_map_, const ContextPtr & context_)
: replacement_map(replacement_map_)
, context(context_)
{}
void visitImpl(QueryTreeNodePtr & node)
{
if (auto it = replacement_map.find(node); it != replacement_map.end())
node = it->second;
if (auto * function_node = node->as<FunctionNode>())
rerunFunctionResolve(function_node, context);
}
bool shouldTraverseTopToBottom() const { return false; }
private:
const QueryTreeNodePtrWithHashMap<QueryTreeNodePtr> & replacement_map;
const ContextPtr & context;
};
/** Resolve query.
* This function modifies query node during resolve. It is caller responsibility to clone query node before resolve
* if it is needed for later use.
@ -7635,21 +7767,23 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier
scope.scope_node->formatASTForErrorMessage());
}
if (query_node_typed.getPrewhere())
if (auto & prewhere_node = query_node_typed.getPrewhere())
{
/** Expression in PREWHERE with JOIN should not be modified by join_use_nulls.
* Example: SELECT * FROM t1 JOIN t2 USING (id) PREWHERE a = 1
* Column `a` should be resolved from table and should not change its type to Nullable.
*/
bool join_use_nulls = scope.join_use_nulls;
bool use_identifier_lookup_to_result_cache = scope.use_identifier_lookup_to_result_cache;
scope.join_use_nulls = false;
scope.use_identifier_lookup_to_result_cache = false;
resolveExpressionNode(prewhere_node, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/);
resolveExpressionNode(query_node_typed.getPrewhere(), scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/);
scope.join_use_nulls = join_use_nulls;
scope.use_identifier_lookup_to_result_cache = use_identifier_lookup_to_result_cache;
if (scope.join_use_nulls)
{
/** Expression in PREWHERE with JOIN should not be modified by join_use_nulls.
* Example: SELECT * FROM t1 JOIN t2 USING (id) PREWHERE b = 1
* Column `a` should be resolved from table and should not change its type to Nullable.
* More complicated example when column is somewhere inside an expression:
* SELECT a + 1 as b FROM t1 JOIN t2 USING (id) PREWHERE b = 1
* expression `a + 1 as b` in projection and in PREWHERE should have different `a`.
*/
prewhere_node = prewhere_node->clone();
ReplaceColumnsVisitor replace_visitor(scope.nullable_join_columns, scope.context);
replace_visitor.visit(prewhere_node);
}
}
if (query_node_typed.getWhere())

View File

@ -77,7 +77,7 @@ public:
* Available expression columns are extracted from table expression.
* Table expression node must have query, union, table, table function type.
*/
QueryAnalysisPass(QueryTreeNodePtr table_expression_, bool only_analyze_ = false);
explicit QueryAnalysisPass(QueryTreeNodePtr table_expression_, bool only_analyze_ = false);
String getName() override
{

View File

@ -54,10 +54,10 @@ public:
if (!constant_node)
return;
const auto & constant_value_literal = constant_node->getValue();
if (!isInt64OrUInt64FieldType(constant_value_literal.getType()))
if (auto constant_type = constant_node->getResultType(); !isNativeInteger(constant_type))
return;
const auto & constant_value_literal = constant_node->getValue();
if (getSettings().aggregate_functions_null_for_empty)
return;

View File

@ -1,6 +1,7 @@
#include <Backups/BackupCoordinationRemote.h>
#include <base/hex.h>
#include <boost/algorithm/string/split.hpp>
#include <Access/Common/AccessEntityType.h>
#include <Backups/BackupCoordinationReplicatedAccess.h>

View File

@ -25,7 +25,7 @@ String BackupInfo::toString() const
BackupInfo BackupInfo::fromString(const String & str)
{
ParserIdentifierWithOptionalParameters parser;
ASTPtr ast = parseQuery(parser, str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
ASTPtr ast = parseQuery(parser, str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS);
return fromAST(*ast);
}

View File

@ -26,7 +26,7 @@ struct BackupSettings
String password;
/// S3 storage class.
String s3_storage_class = "";
String s3_storage_class;
/// If this is set to true then only create queries will be written to backup,
/// without the data of tables.

View File

@ -327,7 +327,7 @@ public:
metric_active_threads = CurrentMetrics::RestoreThreadsActive;
metric_active_threads = CurrentMetrics::RestoreThreadsScheduled;
max_threads = num_restore_threads;
use_queue = (thread_pool_id != ThreadPoolId::RESTORE);
use_queue = true;
break;
}
}

View File

@ -80,7 +80,7 @@ public:
/// Returns names of entries stored in a specified directory in the backup.
/// If `directory` is empty or '/' the functions returns entries in the backup's root.
virtual Strings listFiles(const String & directory, bool recursive = false) const = 0;
virtual Strings listFiles(const String & directory, bool recursive) const = 0;
/// Checks if a specified directory contains any files.
/// The function returns the same as `!listFiles(directory).empty()`.
@ -108,11 +108,9 @@ public:
virtual std::unique_ptr<SeekableReadBuffer> readFile(const SizeAndChecksum & size_and_checksum) const = 0;
/// Copies a file from the backup to a specified destination disk. Returns the number of bytes written.
virtual size_t copyFileToDisk(const String & file_name, DiskPtr destination_disk, const String & destination_path,
WriteMode write_mode = WriteMode::Rewrite) const = 0;
virtual size_t copyFileToDisk(const String & file_name, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) const = 0;
virtual size_t copyFileToDisk(const SizeAndChecksum & size_and_checksum, DiskPtr destination_disk, const String & destination_path,
WriteMode write_mode = WriteMode::Rewrite) const = 0;
virtual size_t copyFileToDisk(const SizeAndChecksum & size_and_checksum, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) const = 0;
/// Puts a new entry to the backup.
virtual void writeFile(const BackupFileInfo & file_info, BackupEntryPtr entry) = 0;

View File

@ -101,10 +101,12 @@ RestorerFromBackup::RestorerFromBackup(
RestorerFromBackup::~RestorerFromBackup()
{
if (!futures.empty())
/// If an exception occurs we can come here to the destructor having some tasks still unfinished.
/// We have to wait until they finish.
if (getNumFutures() > 0)
{
LOG_ERROR(log, "RestorerFromBackup must not be destroyed while {} tasks are still running", futures.size());
chassert(false && "RestorerFromBackup must not be destroyed while some tasks are still running");
LOG_INFO(log, "Waiting for {} tasks to finish", getNumFutures());
waitFutures();
}
}
@ -271,7 +273,7 @@ void RestorerFromBackup::findRootPathsInBackup()
root_paths_in_backup.push_back(root_path);
/// Add shard-related part to the root path.
Strings shards_in_backup = backup->listFiles(root_path / "shards");
Strings shards_in_backup = backup->listFiles(root_path / "shards", /*recursive*/ false);
if (shards_in_backup.empty())
{
if (restore_settings.shard_num_in_backup > 1)
@ -293,7 +295,7 @@ void RestorerFromBackup::findRootPathsInBackup()
}
/// Add replica-related part to the root path.
Strings replicas_in_backup = backup->listFiles(root_path / "replicas");
Strings replicas_in_backup = backup->listFiles(root_path / "replicas", /*recursive*/ false);
if (replicas_in_backup.empty())
{
if (restore_settings.replica_num_in_backup > 1)
@ -422,7 +424,7 @@ void RestorerFromBackup::findTableInBackupImpl(const QualifiedTableName & table_
readStringUntilEOF(create_query_str, *read_buffer);
read_buffer.reset();
ParserCreateQuery create_parser;
ASTPtr create_table_query = parseQuery(create_parser, create_query_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
ASTPtr create_table_query = parseQuery(create_parser, create_query_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS);
applyCustomStoragePolicy(create_table_query);
renameDatabaseAndTableNameInCreateQuery(create_table_query, renaming_map, context->getGlobalContext());
String create_table_query_str = serializeAST(*create_table_query);
@ -512,7 +514,7 @@ void RestorerFromBackup::findDatabaseInBackupImpl(const String & database_name_i
if (!metadata_path && !try_metadata_path.empty() && backup->fileExists(try_metadata_path))
metadata_path = try_metadata_path;
Strings file_names = backup->listFiles(try_tables_metadata_path);
Strings file_names = backup->listFiles(try_tables_metadata_path, /*recursive*/ false);
for (const String & file_name : file_names)
{
if (!file_name.ends_with(".sql"))
@ -532,7 +534,7 @@ void RestorerFromBackup::findDatabaseInBackupImpl(const String & database_name_i
readStringUntilEOF(create_query_str, *read_buffer);
read_buffer.reset();
ParserCreateQuery create_parser;
ASTPtr create_database_query = parseQuery(create_parser, create_query_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
ASTPtr create_database_query = parseQuery(create_parser, create_query_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS);
renameDatabaseAndTableNameInCreateQuery(create_database_query, renaming_map, context->getGlobalContext());
String create_database_query_str = serializeAST(*create_database_query);
@ -573,7 +575,7 @@ void RestorerFromBackup::findEverythingInBackup(const std::set<String> & except_
for (const auto & root_path_in_backup : root_paths_in_backup)
{
Strings file_names = backup->listFiles(root_path_in_backup / "metadata");
Strings file_names = backup->listFiles(root_path_in_backup / "metadata", /*recursive*/ false);
for (String & file_name : file_names)
{
if (file_name.ends_with(".sql"))

Some files were not shown because too many files have changed in this diff Show More