Merge branch 'master' into query_plan_for_merge

Alexander Gololobov 2024-09-05 13:32:12 +02:00 committed by GitHub
commit e1b2952a60
188 changed files with 5038 additions and 804 deletions

View File

@ -66,13 +66,11 @@ TRAP(gethostbyname)
TRAP(gethostbyname2)
TRAP(gethostent)
TRAP(getlogin)
TRAP(getmntent)
TRAP(getnetbyaddr)
TRAP(getnetbyname)
TRAP(getnetent)
TRAP(getnetgrent)
TRAP(getnetgrent_r)
TRAP(getopt)
TRAP(getopt_long)
TRAP(getopt_long_only)
TRAP(getpass)
@ -133,7 +131,6 @@ TRAP(nrand48)
TRAP(__ppc_get_timebase_freq)
TRAP(ptsname)
TRAP(putchar_unlocked)
TRAP(putenv)
TRAP(pututline)
TRAP(pututxline)
TRAP(putwchar_unlocked)
@ -148,7 +145,6 @@ TRAP(sethostent)
TRAP(sethostid)
TRAP(setkey)
//TRAP(setlocale) // Used by replxx at startup
TRAP(setlogmask)
TRAP(setnetent)
TRAP(setnetgrent)
TRAP(setprotoent)
@ -203,7 +199,6 @@ TRAP(lgammal)
TRAP(nftw)
TRAP(nl_langinfo)
TRAP(putc_unlocked)
TRAP(rand)
/** In the current POSIX.1 specification (POSIX.1-2008), readdir() is not required to be thread-safe. However, in modern
* implementations (including the glibc implementation), concurrent calls to readdir() that specify different directory streams
* are thread-safe. In cases where multiple threads must read from the same directory stream, using readdir() with external
@ -288,4 +283,14 @@ TRAP(tss_get)
TRAP(tss_set)
TRAP(tss_delete)
#ifndef USE_MUSL
/// These produce duplicate symbol errors when statically linking with musl.
/// Maybe we can remove them from the musl fork.
TRAP(getopt)
TRAP(putenv)
TRAP(setlogmask)
TRAP(rand)
TRAP(getmntent)
#endif
#endif

View File

@ -48,25 +48,17 @@ std::string PathImpl::currentImpl()
std::string PathImpl::homeImpl()
{
std::string path;
#if defined(_POSIX_C_SOURCE) || defined(_BSD_SOURCE) || defined(_POSIX_C_SOURCE)
size_t buf_size = 1024; // Same as glibc use for getpwuid
std::vector<char> buf(buf_size);
struct passwd res;
struct passwd* pwd = nullptr;
getpwuid_r(getuid(), &res, buf.data(), buf_size, &pwd);
#else
struct passwd* pwd = getpwuid(getuid());
#endif
if (pwd)
path = pwd->pw_dir;
else
{
#if defined(_POSIX_C_SOURCE) || defined(_BSD_SOURCE) || defined(_POSIX_C_SOURCE)
getpwuid_r(getuid(), &res, buf.data(), buf_size, &pwd);
#else
pwd = getpwuid(geteuid());
#endif
if (pwd)
path = pwd->pw_dir;
else
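
A minimal standalone sketch (not the Poco code itself) of the same `getpwuid_r` pattern used above, with the buffer-growth retry that `getpwuid_r` may need when the initial 1024-byte buffer is too small; the function name is hypothetical.

```cpp
#include <pwd.h>
#include <unistd.h>
#include <cerrno>
#include <iostream>
#include <string>
#include <vector>

std::string currentUserHome()
{
    std::vector<char> buf(1024);    // same starting size as the snippet above
    passwd storage;
    passwd * result = nullptr;
    /// getpwuid_r reports ERANGE when the supplied buffer cannot hold the entry.
    while (getpwuid_r(getuid(), &storage, buf.data(), buf.size(), &result) == ERANGE)
        buf.resize(buf.size() * 2);
    return result ? result->pw_dir : std::string{};
}

int main()
{
    std::cout << currentUserHome() << '\n';
}
```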

View File

@ -18,4 +18,4 @@ target_compile_options (_poco_util
-Wno-zero-as-null-pointer-constant
)
target_include_directories (_poco_util SYSTEM PUBLIC "include")
target_link_libraries (_poco_util PUBLIC Poco::JSON Poco::XML)
target_link_libraries (_poco_util PUBLIC Poco::JSON Poco::XML Poco::Net)

View File

@ -241,6 +241,20 @@ namespace Util
/// If the value contains references to other properties (${<property>}), these
/// are expanded.
std::string getHost(const std::string & key) const;
/// Returns the string value of the host property with the given name.
/// Throws a NotFoundException if the key does not exist.
/// Throws a SyntaxException if the property is not a valid host (IP address or domain).
/// If the value contains references to other properties (${<property>}), these
/// are expanded.
std::string getHost(const std::string & key, const std::string & defaultValue) const;
/// If a property with the given key exists, returns the host property's string value,
/// otherwise returns the given default value.
/// Throws a SyntaxException if the property is not a valid host (IP address or domain).
/// If the value contains references to other properties (${<property>}), these
/// are expanded.
virtual void setString(const std::string & key, const std::string & value);
/// Sets the property with the given key to the given value.
/// An already existing value for the key is overwritten.
@ -339,12 +353,35 @@ namespace Util
static bool parseBool(const std::string & value);
void setRawWithEvent(const std::string & key, std::string value);
static void checkHostValidity(const std::string & value);
/// Throws a SyntaxException if the value is not a valid host (IP address or domain).
virtual ~AbstractConfiguration();
private:
std::string internalExpand(const std::string & value) const;
std::string uncheckedExpand(const std::string & value) const;
static bool isValidIPv4Address(const std::string & value);
/// An IPv4 address is considered valid if it is "0.0.0.0" or one accepted
/// by inet_aton() or inet_addr().
static bool isValidIPv6Address(const std::string & value);
/// An IPv6 address is considered valid if it is "::" or one accepted
/// by inet_pton() with the AF_INET6 flag
/// (in this case it may have a scope id and may be surrounded by '[', ']').
static bool isValidDomainName(const std::string & value);
/// <domain> ::= <subdomain> [ "." ]
/// <subdomain> ::= <label> | <subdomain> "." <label>
/// <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
/// <ldh-str> ::= <let-dig-hyp> | <let-dig-hyp> <ldh-str>
/// <let-dig-hyp> ::= <let-dig> | "-"
/// <let-dig> ::= <letter> | <digit>
/// <letter> ::= any one of the 52 alphabetic characters A through Z in
/// upper case and a through z in lower case
/// <digit> ::= any one of the ten digits 0 through 9
AbstractConfiguration(const AbstractConfiguration &);
AbstractConfiguration & operator=(const AbstractConfiguration &);
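
A short usage sketch of the `getHost()` accessors documented above, assuming the patched headers together with the stock `Poco::Util::MapConfiguration` backend; the keys and the deliberately invalid value are hypothetical.

```cpp
#include <iostream>
#include <Poco/AutoPtr.h>
#include <Poco/Exception.h>
#include <Poco/Util/MapConfiguration.h>

int main()
{
    Poco::AutoPtr<Poco::Util::MapConfiguration> config = new Poco::Util::MapConfiguration;
    config->setString("listen_host", "127.0.0.1");
    config->setString("bad_host", "not a host!");

    std::cout << config->getHost("listen_host") << '\n';       // valid IPv4, returned as-is
    std::cout << config->getHost("backup_host", "::") << '\n'; // key missing: default is validated and returned

    try
    {
        config->getHost("bad_host");                           // fails checkHostValidity()
    }
    catch (const Poco::SyntaxException & e)
    {
        std::cout << "rejected: " << e.displayText() << '\n';
    }
}
```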

View File

@ -18,6 +18,7 @@
#include "Poco/NumberParser.h"
#include "Poco/NumberFormatter.h"
#include "Poco/String.h"
#include "Poco/Net/IPAddressImpl.h"
using Poco::Mutex;
@ -263,6 +264,41 @@ bool AbstractConfiguration::getBool(const std::string& key, bool defaultValue) c
}
std::string AbstractConfiguration::getHost(const std::string& key) const
{
Mutex::ScopedLock lock(_mutex);
std::string value;
if (getRaw(key, value))
{
std::string expandedValue = internalExpand(value);
checkHostValidity(expandedValue);
return expandedValue;
}
else
throw NotFoundException(key);
}
std::string AbstractConfiguration::getHost(const std::string& key, const std::string& defaultValue) const
{
Mutex::ScopedLock lock(_mutex);
std::string value;
if (getRaw(key, value))
{
std::string expandedValue = internalExpand(value);
checkHostValidity(expandedValue);
return expandedValue;
}
else
{
checkHostValidity(defaultValue);
return defaultValue;
}
}
void AbstractConfiguration::setString(const std::string& key, const std::string& value)
{
setRawWithEvent(key, value);
@ -529,4 +565,68 @@ void AbstractConfiguration::setRawWithEvent(const std::string& key, std::string
}
void AbstractConfiguration::checkHostValidity(const std::string& value)
{
if (!isValidIPv4Address(value) && !isValidIPv6Address(value) && !isValidDomainName(value))
{
throw SyntaxException("Property is not a valid host name", value);
}
}
bool AbstractConfiguration::isValidIPv4Address(const std::string& value)
{
using Poco::Net::Impl::IPv4AddressImpl;
IPv4AddressImpl empty4 = IPv4AddressImpl();
IPv4AddressImpl ipAddress = IPv4AddressImpl::parse(value);
return ipAddress != empty4 || value == "0.0.0.0";
}
bool AbstractConfiguration::isValidIPv6Address(const std::string& value)
{
#if defined(POCO_HAVE_IPv6)
using Poco::Net::Impl::IPv6AddressImpl;
IPv6AddressImpl empty6 = IPv6AddressImpl();
IPv6AddressImpl ipAddress = IPv6AddressImpl::parse(value);
return ipAddress != empty6 || value == "::";
#else
return false;
#endif
}
bool AbstractConfiguration::isValidDomainName(const std::string& value)
{
if (value.empty() || value == "." || value.length() > 253)
return false;
int labelLength = 0;
char oldChar = 0;
for (char ch : value)
{
if (ch == '.')
{
if (labelLength == 0 || labelLength > 63 || oldChar == '-')
return false;
labelLength = 0;
}
else if (isalnum(ch) || ch == '-')
{
if (labelLength == 0 && (ch == '-' || isdigit(ch)))
return false;
++labelLength;
}
else
{
return false;
}
oldChar = ch;
}
return oldChar == '.' || (labelLength > 0 && labelLength <= 63 && oldChar != '-');
}
} } // namespace Poco::Util

View File

@ -140,6 +140,12 @@ if (CMAKE_CROSSCOMPILING)
message (STATUS "CROSS COMPILING SET LLVM HOST TRIPLE ${LLVM_HOST_TRIPLE}")
endif()
# llvm-project/llvm/cmake/config-ix.cmake does a weird thing: it defines _LARGEFILE64_SOURCE,
# then checks if lseek64() function exists, then undefines _LARGEFILE64_SOURCE.
# Then the actual code that uses this function *doesn't* define _LARGEFILE64_SOURCE, so lseek64()
# may not exist and compilation fails. This happens with musl.
add_compile_definitions("_LARGEFILE64_SOURCE")
add_subdirectory ("${LLVM_SOURCE_DIR}" "${LLVM_BINARY_DIR}")
set_directory_properties (PROPERTIES
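
A minimal sketch of what the comment above describes, assuming a libc (such as musl) that only declares `lseek64()` when `_LARGEFILE64_SOURCE` is defined: LLVM's probe compiles roughly the snippet below with the macro defined, but the code that later calls `lseek64()` does not define it, hence the global `add_compile_definitions`.

```cpp
// Roughly what llvm/cmake/config-ix.cmake probes for (a sketch, not the exact check).
#define _LARGEFILE64_SOURCE   // must precede any libc header
#include <fcntl.h>
#include <unistd.h>

int main()
{
    // Without the macro (as in the code that actually uses the probe result),
    // musl does not declare lseek64() and compilation fails.
    return lseek64(open("/dev/null", O_RDONLY), 0, SEEK_SET) == 0 ? 0 : 1;
}
```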

contrib/sysroot vendored

@ -1 +1 @@
Subproject commit cc385041b226d1fc28ead14dbab5d40a5f821dd8
Subproject commit 5be834147d5b5dd77ca2b821f356982029320513

View File

@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \
# lts / testing / prestable / etc
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
ARG VERSION="24.8.2.3"
ARG VERSION="24.8.3.59"
ARG PACKAGES="clickhouse-keeper"
ARG DIRECT_DOWNLOAD_URLS=""

View File

@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \
# lts / testing / prestable / etc
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
ARG VERSION="24.8.2.3"
ARG VERSION="24.8.3.59"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
ARG DIRECT_DOWNLOAD_URLS=""

View File

@ -28,7 +28,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
ARG VERSION="24.8.2.3"
ARG VERSION="24.8.3.59"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
#docker-official-library:off

View File

@ -0,0 +1,32 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v24.3.10.33-lts (37b6502ebf0) FIXME as compared to v24.3.9.5-lts (a939270465e)
#### Improvement
* Backported in [#68870](https://github.com/ClickHouse/ClickHouse/issues/68870): Make allow_experimental_analyzer be controlled by the initiator for distributed queries. This ensures compatibility and correctness during operations in mixed version clusters. [#65777](https://github.com/ClickHouse/ClickHouse/pull/65777) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Backported in [#69095](https://github.com/ClickHouse/ClickHouse/issues/69095): Support for the Spanish language in the embedded dictionaries. [#69035](https://github.com/ClickHouse/ClickHouse/pull/69035) ([Vasily Okunev](https://github.com/VOkunev)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Backported in [#68995](https://github.com/ClickHouse/ClickHouse/issues/68995): Fix the upper bound of the function `fromModifiedJulianDay`. It was supposed to be `9999-12-31` but was mistakenly set to `9999-01-01`. [#67583](https://github.com/ClickHouse/ClickHouse/pull/67583) ([PHO](https://github.com/depressed-pho)).
* Backported in [#68844](https://github.com/ClickHouse/ClickHouse/issues/68844): Fixed crash in Parquet filtering when data types in the file substantially differ from requested types (e.g. `... FROM file('a.parquet', Parquet, 'x String')`, but the file has `x Int64`). Without this fix, use `input_format_parquet_filter_push_down = 0` as a workaround. [#68131](https://github.com/ClickHouse/ClickHouse/pull/68131) ([Michael Kolupaev](https://github.com/al13n321)).
* Backported in [#68881](https://github.com/ClickHouse/ClickHouse/issues/68881): Fixes [#50868](https://github.com/ClickHouse/ClickHouse/issues/50868). Small DateTime64 constant values returned by a nested subquery inside a distributed query were wrongly transformed to Nulls, thus causing errors and possible incorrect query results. [#68323](https://github.com/ClickHouse/ClickHouse/pull/68323) ([Shankar](https://github.com/shiyer7474)).
* Backported in [#69054](https://github.com/ClickHouse/ClickHouse/issues/69054): Added back virtual columns ` _table` and `_database` to distributed tables. They were available until version 24.3. [#68672](https://github.com/ClickHouse/ClickHouse/pull/68672) ([Anton Popov](https://github.com/CurtizJ)).
* Backported in [#68856](https://github.com/ClickHouse/ClickHouse/issues/68856): Fix possible error `Size of permutation (0) is less than required (...)` during Variant column permutation. [#68681](https://github.com/ClickHouse/ClickHouse/pull/68681) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#69152](https://github.com/ClickHouse/ClickHouse/issues/69152): Fix possible wrong result during anyHeavy state merge. [#68950](https://github.com/ClickHouse/ClickHouse/pull/68950) ([Raúl Marín](https://github.com/Algunenano)).
* Backported in [#69112](https://github.com/ClickHouse/ClickHouse/issues/69112): Fix logical error when we have empty async insert. [#69080](https://github.com/ClickHouse/ClickHouse/pull/69080) ([Han Fei](https://github.com/hanfei1991)).
#### NO CL CATEGORY
* Backported in [#68938](https://github.com/ClickHouse/ClickHouse/issues/68938):. [#68897](https://github.com/ClickHouse/ClickHouse/pull/68897) ([Alexander Gololobov](https://github.com/davenger)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Backported in [#68826](https://github.com/ClickHouse/ClickHouse/issues/68826): Turn off fault injection for insert in `01396_inactive_replica_cleanup_nodes_zookeeper`. [#68715](https://github.com/ClickHouse/ClickHouse/pull/68715) ([alesapin](https://github.com/alesapin)).
* Backported in [#68754](https://github.com/ClickHouse/ClickHouse/issues/68754): To make patch release possible from every commit on release branch, package_debug build is required and must not be skipped. [#68750](https://github.com/ClickHouse/ClickHouse/pull/68750) ([Max K.](https://github.com/maxknv)).
* Backported in [#69044](https://github.com/ClickHouse/ClickHouse/issues/69044): Fix 01114_database_atomic flakiness. [#68930](https://github.com/ClickHouse/ClickHouse/pull/68930) ([Raúl Marín](https://github.com/Algunenano)).

View File

@ -0,0 +1,29 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v24.5.7.31-stable (6c185e9aec1) FIXME as compared to v24.5.6.45-stable (bdca8604c29)
#### Bug Fix (user-visible misbehavior in an official stable release)
* Backported in [#68564](https://github.com/ClickHouse/ClickHouse/issues/68564): Fix indexHint function case found by fuzzer. [#66286](https://github.com/ClickHouse/ClickHouse/pull/66286) ([Anton Popov](https://github.com/CurtizJ)).
* Backported in [#68996](https://github.com/ClickHouse/ClickHouse/issues/68996): Fix the upper bound of the function `fromModifiedJulianDay`. It was supposed to be `9999-12-31` but was mistakenly set to `9999-01-01`. [#67583](https://github.com/ClickHouse/ClickHouse/pull/67583) ([PHO](https://github.com/depressed-pho)).
* Backported in [#68865](https://github.com/ClickHouse/ClickHouse/issues/68865): Fixed crash in Parquet filtering when data types in the file substantially differ from requested types (e.g. `... FROM file('a.parquet', Parquet, 'x String')`, but the file has `x Int64`). Without this fix, use `input_format_parquet_filter_push_down = 0` as a workaround. [#68131](https://github.com/ClickHouse/ClickHouse/pull/68131) ([Michael Kolupaev](https://github.com/al13n321)).
* Backported in [#69004](https://github.com/ClickHouse/ClickHouse/issues/69004): After https://github.com/ClickHouse/ClickHouse/pull/61984 `schema_inference_make_columns_nullable=0` still can make columns `Nullable` in Parquet/Arrow formats. The change was backward incompatible and users noticed the changes in the behaviour. This PR makes `schema_inference_make_columns_nullable=0` to work as before (no Nullable columns will be inferred) and introduces new value `auto` for this setting that will make columns `Nullable` only if data has information about nullability. [#68298](https://github.com/ClickHouse/ClickHouse/pull/68298) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#68882](https://github.com/ClickHouse/ClickHouse/issues/68882): Fixes [#50868](https://github.com/ClickHouse/ClickHouse/issues/50868). Small DateTime64 constant values returned by a nested subquery inside a distributed query were wrongly transformed to Nulls, thus causing errors and possible incorrect query results. [#68323](https://github.com/ClickHouse/ClickHouse/pull/68323) ([Shankar](https://github.com/shiyer7474)).
* Backported in [#69023](https://github.com/ClickHouse/ClickHouse/issues/69023): Added back virtual columns ` _table` and `_database` to distributed tables. They were available until version 24.3. [#68672](https://github.com/ClickHouse/ClickHouse/pull/68672) ([Anton Popov](https://github.com/CurtizJ)).
* Backported in [#68858](https://github.com/ClickHouse/ClickHouse/issues/68858): Fix possible error `Size of permutation (0) is less than required (...)` during Variant column permutation. [#68681](https://github.com/ClickHouse/ClickHouse/pull/68681) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#68784](https://github.com/ClickHouse/ClickHouse/issues/68784): Fix issue with materialized constant keys when hashing maps with arrays as keys in functions `sipHash(64/128)Keyed`. [#68731](https://github.com/ClickHouse/ClickHouse/pull/68731) ([Salvatore Mesoraca](https://github.com/aiven-sal)).
* Backported in [#69154](https://github.com/ClickHouse/ClickHouse/issues/69154): Fix possible wrong result during anyHeavy state merge. [#68950](https://github.com/ClickHouse/ClickHouse/pull/68950) ([Raúl Marín](https://github.com/Algunenano)).
#### NO CL CATEGORY
* Backported in [#68940](https://github.com/ClickHouse/ClickHouse/issues/68940):. [#68897](https://github.com/ClickHouse/ClickHouse/pull/68897) ([Alexander Gololobov](https://github.com/davenger)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Backported in [#68828](https://github.com/ClickHouse/ClickHouse/issues/68828): Turn off fault injection for insert in `01396_inactive_replica_cleanup_nodes_zookeeper`. [#68715](https://github.com/ClickHouse/ClickHouse/pull/68715) ([alesapin](https://github.com/alesapin)).
* Backported in [#69046](https://github.com/ClickHouse/ClickHouse/issues/69046): Fix 01114_database_atomic flakiness. [#68930](https://github.com/ClickHouse/ClickHouse/pull/68930) ([Raúl Marín](https://github.com/Algunenano)).

View File

@ -0,0 +1,29 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v24.6.5.30-stable (e6e196c92d6) FIXME as compared to v24.6.4.42-stable (c534bb4b4dd)
#### Bug Fix (user-visible misbehavior in an official stable release)
* Backported in [#68969](https://github.com/ClickHouse/ClickHouse/issues/68969): Fix the upper bound of the function `fromModifiedJulianDay`. It was supposed to be `9999-12-31` but was mistakenly set to `9999-01-01`. [#67583](https://github.com/ClickHouse/ClickHouse/pull/67583) ([PHO](https://github.com/depressed-pho)).
* Backported in [#68814](https://github.com/ClickHouse/ClickHouse/issues/68814): Fixed crash in Parquet filtering when data types in the file substantially differ from requested types (e.g. `... FROM file('a.parquet', Parquet, 'x String')`, but the file has `x Int64`). Without this fix, use `input_format_parquet_filter_push_down = 0` as a workaround. [#68131](https://github.com/ClickHouse/ClickHouse/pull/68131) ([Michael Kolupaev](https://github.com/al13n321)).
* Backported in [#69005](https://github.com/ClickHouse/ClickHouse/issues/69005): After https://github.com/ClickHouse/ClickHouse/pull/61984 `schema_inference_make_columns_nullable=0` still can make columns `Nullable` in Parquet/Arrow formats. The change was backward incompatible and users noticed the changes in the behaviour. This PR makes `schema_inference_make_columns_nullable=0` to work as before (no Nullable columns will be inferred) and introduces new value `auto` for this setting that will make columns `Nullable` only if data has information about nullability. [#68298](https://github.com/ClickHouse/ClickHouse/pull/68298) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#68883](https://github.com/ClickHouse/ClickHouse/issues/68883): Fixes [#50868](https://github.com/ClickHouse/ClickHouse/issues/50868). Small DateTime64 constant values returned by a nested subquery inside a distributed query were wrongly transformed to Nulls, thus causing errors and possible incorrect query results. [#68323](https://github.com/ClickHouse/ClickHouse/pull/68323) ([Shankar](https://github.com/shiyer7474)).
* Backported in [#69025](https://github.com/ClickHouse/ClickHouse/issues/69025): Added back virtual columns ` _table` and `_database` to distributed tables. They were available until version 24.3. [#68672](https://github.com/ClickHouse/ClickHouse/pull/68672) ([Anton Popov](https://github.com/CurtizJ)).
* Backported in [#68860](https://github.com/ClickHouse/ClickHouse/issues/68860): Fix possible error `Size of permutation (0) is less than required (...)` during Variant column permutation. [#68681](https://github.com/ClickHouse/ClickHouse/pull/68681) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#68786](https://github.com/ClickHouse/ClickHouse/issues/68786): Fix issue with materialized constant keys when hashing maps with arrays as keys in functions `sipHash(64/128)Keyed`. [#68731](https://github.com/ClickHouse/ClickHouse/pull/68731) ([Salvatore Mesoraca](https://github.com/aiven-sal)).
* Backported in [#69156](https://github.com/ClickHouse/ClickHouse/issues/69156): Fix possible wrong result during anyHeavy state merge. [#68950](https://github.com/ClickHouse/ClickHouse/pull/68950) ([Raúl Marín](https://github.com/Algunenano)).
* Backported in [#69116](https://github.com/ClickHouse/ClickHouse/issues/69116): Fix logical error when we have empty async insert. [#69080](https://github.com/ClickHouse/ClickHouse/pull/69080) ([Han Fei](https://github.com/hanfei1991)).
#### NO CL CATEGORY
* Backported in [#68942](https://github.com/ClickHouse/ClickHouse/issues/68942):. [#68897](https://github.com/ClickHouse/ClickHouse/pull/68897) ([Alexander Gololobov](https://github.com/davenger)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Backported in [#68830](https://github.com/ClickHouse/ClickHouse/issues/68830): Turn off fault injection for insert in `01396_inactive_replica_cleanup_nodes_zookeeper`. [#68715](https://github.com/ClickHouse/ClickHouse/pull/68715) ([alesapin](https://github.com/alesapin)).
* Backported in [#69048](https://github.com/ClickHouse/ClickHouse/issues/69048): Fix 01114_database_atomic flakiness. [#68930](https://github.com/ClickHouse/ClickHouse/pull/68930) ([Raúl Marín](https://github.com/Algunenano)).

View File

@ -0,0 +1,50 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v24.8.3.59-lts (e729b9fa40e) FIXME as compared to v24.8.2.3-lts (b54f79ed323)
#### New Feature
* Backported in [#68710](https://github.com/ClickHouse/ClickHouse/issues/68710): Query cache entries can now be dropped by tag. For example, the query cache entry created by `SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'abc'` can now be dropped by `SYSTEM DROP QUERY CACHE TAG 'abc'` (or of course just: `SYSTEM DROP QUERY CACHE` which will clear the entire query cache). [#68477](https://github.com/ClickHouse/ClickHouse/pull/68477) ([Michał Tabaszewski](https://github.com/pinsvin00)).
#### Improvement
* Backported in [#69097](https://github.com/ClickHouse/ClickHouse/issues/69097): Support for the Spanish language in the embedded dictionaries. [#69035](https://github.com/ClickHouse/ClickHouse/pull/69035) ([Vasily Okunev](https://github.com/VOkunev)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Backported in [#68973](https://github.com/ClickHouse/ClickHouse/issues/68973): Fix the upper bound of the function `fromModifiedJulianDay`. It was supposed to be `9999-12-31` but was mistakenly set to `9999-01-01`. [#67583](https://github.com/ClickHouse/ClickHouse/pull/67583) ([PHO](https://github.com/depressed-pho)).
* Backported in [#68818](https://github.com/ClickHouse/ClickHouse/issues/68818): Fixed crash in Parquet filtering when data types in the file substantially differ from requested types (e.g. `... FROM file('a.parquet', Parquet, 'x String')`, but the file has `x Int64`). Without this fix, use `input_format_parquet_filter_push_down = 0` as a workaround. [#68131](https://github.com/ClickHouse/ClickHouse/pull/68131) ([Michael Kolupaev](https://github.com/al13n321)).
* Backported in [#68893](https://github.com/ClickHouse/ClickHouse/issues/68893): After https://github.com/ClickHouse/ClickHouse/pull/61984 `schema_inference_make_columns_nullable=0` still can make columns `Nullable` in Parquet/Arrow formats. The change was backward incompatible and users noticed the changes in the behaviour. This PR makes `schema_inference_make_columns_nullable=0` to work as before (no Nullable columns will be inferred) and introduces new value `auto` for this setting that will make columns `Nullable` only if data has information about nullability. [#68298](https://github.com/ClickHouse/ClickHouse/pull/68298) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#68721](https://github.com/ClickHouse/ClickHouse/issues/68721): Fixes [#50868](https://github.com/ClickHouse/ClickHouse/issues/50868). Small DateTime64 constant values returned by a nested subquery inside a distributed query were wrongly transformed to Nulls, thus causing errors and possible incorrect query results. [#68323](https://github.com/ClickHouse/ClickHouse/pull/68323) ([Shankar](https://github.com/shiyer7474)).
* Backported in [#69029](https://github.com/ClickHouse/ClickHouse/issues/69029): Added back virtual columns ` _table` and `_database` to distributed tables. They were available until version 24.3. [#68672](https://github.com/ClickHouse/ClickHouse/pull/68672) ([Anton Popov](https://github.com/CurtizJ)).
* Backported in [#68864](https://github.com/ClickHouse/ClickHouse/issues/68864): Fix possible error `Size of permutation (0) is less than required (...)` during Variant column permutation. [#68681](https://github.com/ClickHouse/ClickHouse/pull/68681) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#68854](https://github.com/ClickHouse/ClickHouse/issues/68854): Fix possible error `DB::Exception: Block structure mismatch in joined block stream: different columns:` with new JSON column. [#68686](https://github.com/ClickHouse/ClickHouse/pull/68686) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#68790](https://github.com/ClickHouse/ClickHouse/issues/68790): Fix issue with materialized constant keys when hashing maps with arrays as keys in functions `sipHash(64/128)Keyed`. [#68731](https://github.com/ClickHouse/ClickHouse/pull/68731) ([Salvatore Mesoraca](https://github.com/aiven-sal)).
* Backported in [#69108](https://github.com/ClickHouse/ClickHouse/issues/69108): TODO. [#68744](https://github.com/ClickHouse/ClickHouse/pull/68744) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Backported in [#68850](https://github.com/ClickHouse/ClickHouse/issues/68850): Fix resolving dynamic subcolumns from subqueries in analyzer. [#68824](https://github.com/ClickHouse/ClickHouse/pull/68824) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#68911](https://github.com/ClickHouse/ClickHouse/issues/68911): Fix complex types metadata parsing in DeltaLake. Closes [#68739](https://github.com/ClickHouse/ClickHouse/issues/68739). [#68836](https://github.com/ClickHouse/ClickHouse/pull/68836) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Backported in [#69160](https://github.com/ClickHouse/ClickHouse/issues/69160): Fix possible wrong result during anyHeavy state merge. [#68950](https://github.com/ClickHouse/ClickHouse/pull/68950) ([Raúl Marín](https://github.com/Algunenano)).
* Backported in [#69072](https://github.com/ClickHouse/ClickHouse/issues/69072): Fixed writing to Materialized Views with enabled setting `optimize_functions_to_subcolumns`. [#68951](https://github.com/ClickHouse/ClickHouse/pull/68951) ([Anton Popov](https://github.com/CurtizJ)).
* Backported in [#69016](https://github.com/ClickHouse/ClickHouse/issues/69016): Don't use serializations cache in const Dynamic column methods. It could lead to use of an uninitialized value or even a race condition during aggregations. [#68953](https://github.com/ClickHouse/ClickHouse/pull/68953) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#69120](https://github.com/ClickHouse/ClickHouse/issues/69120): Fix logical error when we have empty async insert. [#69080](https://github.com/ClickHouse/ClickHouse/pull/69080) ([Han Fei](https://github.com/hanfei1991)).
#### NO CL CATEGORY
* Backported in [#68947](https://github.com/ClickHouse/ClickHouse/issues/68947):. [#68897](https://github.com/ClickHouse/ClickHouse/pull/68897) ([Alexander Gololobov](https://github.com/davenger)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Backported in [#68704](https://github.com/ClickHouse/ClickHouse/issues/68704): Fix enumerating dynamic subcolumns. [#68582](https://github.com/ClickHouse/ClickHouse/pull/68582) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#69000](https://github.com/ClickHouse/ClickHouse/issues/69000): Prioritizing of virtual columns in hive partitioning. [#68606](https://github.com/ClickHouse/ClickHouse/pull/68606) ([Yarik Briukhovetskyi](https://github.com/yariks5s)).
* Backported in [#68799](https://github.com/ClickHouse/ClickHouse/issues/68799): CI: Disable SQLLogic job. [#68654](https://github.com/ClickHouse/ClickHouse/pull/68654) ([Max K.](https://github.com/maxknv)).
* Backported in [#68834](https://github.com/ClickHouse/ClickHouse/issues/68834): Turn off fault injection for insert in `01396_inactive_replica_cleanup_nodes_zookeeper`. [#68715](https://github.com/ClickHouse/ClickHouse/pull/68715) ([alesapin](https://github.com/alesapin)).
* Backported in [#68781](https://github.com/ClickHouse/ClickHouse/issues/68781): Fix flaky test 00989_parallel_parts_loading. [#68737](https://github.com/ClickHouse/ClickHouse/pull/68737) ([alesapin](https://github.com/alesapin)).
* Backported in [#68762](https://github.com/ClickHouse/ClickHouse/issues/68762): To make patch release possible from every commit on release branch, package_debug build is required and must not be skipped. [#68750](https://github.com/ClickHouse/ClickHouse/pull/68750) ([Max K.](https://github.com/maxknv)).
* Backported in [#68810](https://github.com/ClickHouse/ClickHouse/issues/68810): Try to disable rerun check if job triggered manually. [#68751](https://github.com/ClickHouse/ClickHouse/pull/68751) ([Max K.](https://github.com/maxknv)).
* Backported in [#68962](https://github.com/ClickHouse/ClickHouse/issues/68962): Fix 2477 timeout. [#68752](https://github.com/ClickHouse/ClickHouse/pull/68752) ([jsc0218](https://github.com/jsc0218)).
* Backported in [#68977](https://github.com/ClickHouse/ClickHouse/issues/68977): Check setting use_json_alias_for_old_object_type in runtime. [#68793](https://github.com/ClickHouse/ClickHouse/pull/68793) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#68852](https://github.com/ClickHouse/ClickHouse/issues/68852): Make dynamic structure selection more consistent. [#68802](https://github.com/ClickHouse/ClickHouse/pull/68802) ([Kruglov Pavel](https://github.com/Avogar)).
* Backported in [#69052](https://github.com/ClickHouse/ClickHouse/issues/69052): Fix 01114_database_atomic flakiness. [#68930](https://github.com/ClickHouse/ClickHouse/pull/68930) ([Raúl Marín](https://github.com/Algunenano)).

View File

@ -111,13 +111,14 @@ ANN indexes are built during column insertion and merge. As a result, `INSERT` a
tables. ANN indexes are ideally used only with immutable or rarely changed data, that is, when there are far more read requests than write requests.
ANN indexes support these queries:
ANN indexes support this type of query:
``` sql
WITH [...] AS reference_vector
SELECT *
FROM table
[WHERE ...]
ORDER BY Distance(vectors, Point)
WHERE ... -- WHERE clause is optional
ORDER BY Distance(vectors, reference_vector)
LIMIT N
```

View File

@ -3226,7 +3226,7 @@ Default value: `0`.
## lightweight_deletes_sync {#lightweight_deletes_sync}
The same as 'mutation_sync', but controls only execution of lightweight deletes.
The same as [`mutations_sync`](#mutations_sync), but controls only execution of lightweight deletes.
Possible values:

View File

@ -499,7 +499,7 @@ Required parameters:
- `type` — `encrypted`. Otherwise the encrypted disk is not created.
- `disk` — Type of disk for data storage.
- `key` — The key for encryption and decryption. Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md). You can use the `key_hex` parameter to encode the key in hexadecimal form.
You can specify multiple keys using the `id` attribute (see example above).
You can specify multiple keys using the `id` attribute (see example below).
Optional parameters:

View File

@ -0,0 +1,44 @@
---
slug: /en/sql-reference/aggregate-functions/reference/distinctdynamictypes
sidebar_position: 215
---
# distinctDynamicTypes
Calculates the list of distinct data types stored in a [Dynamic](../../data-types/dynamic.md) column.
**Syntax**
```sql
distinctDynamicTypes(dynamic)
```
**Arguments**
- `dynamic` — [Dynamic](../../data-types/dynamic.md) column.
**Returned Value**
- The sorted list of data type names [Array(String)](../../data-types/array.md).
**Example**
Query:
```sql
DROP TABLE IF EXISTS test_dynamic;
CREATE TABLE test_dynamic(d Dynamic) ENGINE = Memory;
INSERT INTO test_dynamic VALUES (42), (NULL), ('Hello'), ([1, 2, 3]), ('2020-01-01'), (map(1, 2)), (43), ([4, 5]), (NULL), ('World'), (map(3, 4))
```
```sql
SELECT distinctDynamicTypes(d) FROM test_dynamic;
```
Result:
```reference
┌─distinctDynamicTypes(d)──────────────────────────────────────┐
│ ['Array(Int64)','Date','Int64','Map(UInt8, UInt8)','String'] │
└──────────────────────────────────────────────────────────────┘
```

View File

@ -0,0 +1,125 @@
---
slug: /en/sql-reference/aggregate-functions/reference/distinctjsonpaths
sidebar_position: 216
---
# distinctJSONPaths
Calculates the list of distinct paths stored in a [JSON](../../data-types/newjson.md) column.
**Syntax**
```sql
distinctJSONPaths(json)
```
**Arguments**
- `json` — [JSON](../../data-types/newjson.md) column.
**Returned Value**
- The sorted list of paths [Array(String)](../../data-types/array.md).
**Example**
Query:
```sql
DROP TABLE IF EXISTS test_json;
CREATE TABLE test_json(json JSON) ENGINE = Memory;
INSERT INTO test_json VALUES ('{"a" : 42, "b" : "Hello"}'), ('{"b" : [1, 2, 3], "c" : {"d" : {"e" : "2020-01-01"}}}'), ('{"a" : 43, "c" : {"d" : {"f" : [{"g" : 42}]}}}')
```
```sql
SELECT distinctJSONPaths(json) FROM test_json;
```
Result:
```reference
┌─distinctJSONPaths(json)───┐
│ ['a','b','c.d.e','c.d.f'] │
└───────────────────────────┘
```
# distinctJSONPathsAndTypes
Calculates the list of distinct paths and their types stored in a [JSON](../../data-types/newjson.md) column.
**Syntax**
```sql
distinctJSONPathsAndTypes(json)
```
**Arguments**
- `json` — [JSON](../../data-types/newjson.md) column.
**Returned Value**
- The sorted map of paths and types [Map(String, Array(String))](../../data-types/map.md).
**Example**
Query:
```sql
DROP TABLE IF EXISTS test_json;
CREATE TABLE test_json(json JSON) ENGINE = Memory;
INSERT INTO test_json VALUES ('{"a" : 42, "b" : "Hello"}'), ('{"b" : [1, 2, 3], "c" : {"d" : {"e" : "2020-01-01"}}}'), ('{"a" : 43, "c" : {"d" : {"f" : [{"g" : 42}]}}}')
```
```sql
SELECT distinctJSONPathsAndTypes(json) FROM test_json;
```
Result:
```reference
┌─distinctJSONPathsAndTypes(json)───────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ {'a':['Int64'],'b':['Array(Nullable(Int64))','String'],'c.d.e':['Date'],'c.d.f':['Array(JSON(max_dynamic_types=16, max_dynamic_paths=256))']} │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
```
**Note**
If the JSON declaration contains paths with specified types, these paths will always be included in the result of the `distinctJSONPaths`/`distinctJSONPathsAndTypes` functions, even if the input data does not have values for these paths.
```sql
DROP TABLE IF EXISTS test_json;
CREATE TABLE test_json(json JSON(a UInt32)) ENGINE = Memory;
INSERT INTO test_json VALUES ('{"b" : "Hello"}'), ('{"b" : "World", "c" : [1, 2, 3]}');
```
```sql
SELECT json FROM test_json;
```
```text
┌─json──────────────────────────────────┐
│ {"a":0,"b":"Hello"} │
│ {"a":0,"b":"World","c":["1","2","3"]} │
└───────────────────────────────────────┘
```
```sql
SELECT distinctJSONPaths(json) FROM test_json;
```
```text
┌─distinctJSONPaths(json)─┐
│ ['a','b','c'] │
└─────────────────────────┘
```
```sql
SELECT distinctJSONPathsAndTypes(json) FROM test_json;
```
```text
┌─distinctJSONPathsAndTypes(json)────────────────────────────────┐
│ {'a':['UInt32'],'b':['String'],'c':['Array(Nullable(Int64))']} │
└────────────────────────────────────────────────────────────────┘
```

View File

@ -505,7 +505,130 @@ As we can see, ClickHouse kept the most frequent paths `a`, `b` and `c` and move
## Introspection functions
There are several functions that can help to inspect the content of the JSON column: [JSONAllPaths](../functions/json-functions.md#jsonallpaths), [JSONAllPathsWithTypes](../functions/json-functions.md#jsonallpathswithtypes), [JSONDynamicPaths](../functions/json-functions.md#jsondynamicpaths), [JSONDynamicPathsWithTypes](../functions/json-functions.md#jsondynamicpathswithtypes), [JSONSharedDataPaths](../functions/json-functions.md#jsonshareddatapaths), [JSONSharedDataPathsWithTypes](../functions/json-functions.md#jsonshareddatapathswithtypes).
There are several functions that can help to inspect the content of the JSON column: [JSONAllPaths](../functions/json-functions.md#jsonallpaths), [JSONAllPathsWithTypes](../functions/json-functions.md#jsonallpathswithtypes), [JSONDynamicPaths](../functions/json-functions.md#jsondynamicpaths), [JSONDynamicPathsWithTypes](../functions/json-functions.md#jsondynamicpathswithtypes), [JSONSharedDataPaths](../functions/json-functions.md#jsonshareddatapaths), [JSONSharedDataPathsWithTypes](../functions/json-functions.md#jsonshareddatapathswithtypes), [distinctDynamicTypes](../aggregate-functions/reference/distinctdynamictypes.md), [distinctJSONPaths and distinctJSONPathsAndTypes](../aggregate-functions/reference/distinctjsonpaths.md)
**Examples**
Let's investigate the content of the [GH Archive](https://www.gharchive.org/) dataset for the date `2020-01-01`:
```sql
SELECT arrayJoin(distinctJSONPaths(json)) FROM s3('s3://clickhouse-public-datasets/gharchive/original/2020-01-01-*.json.gz', JSONAsObject)
```
```text
┌─arrayJoin(distinctJSONPaths(json))─────────────────────────┐
│ actor.avatar_url │
│ actor.display_login │
│ actor.gravatar_id │
│ actor.id │
│ actor.login │
│ actor.url │
│ created_at │
│ id │
│ org.avatar_url │
│ org.gravatar_id │
│ org.id │
│ org.login │
│ org.url │
│ payload.action │
│ payload.before │
│ payload.comment._links.html.href │
│ payload.comment._links.pull_request.href │
│ payload.comment._links.self.href │
│ payload.comment.author_association │
│ payload.comment.body │
│ payload.comment.commit_id │
│ payload.comment.created_at │
│ payload.comment.diff_hunk │
│ payload.comment.html_url │
│ payload.comment.id │
│ payload.comment.in_reply_to_id │
│ payload.comment.issue_url │
│ payload.comment.line │
│ payload.comment.node_id │
│ payload.comment.original_commit_id │
│ payload.comment.original_position │
│ payload.comment.path │
│ payload.comment.position │
│ payload.comment.pull_request_review_id │
...
│ payload.release.node_id │
│ payload.release.prerelease │
│ payload.release.published_at │
│ payload.release.tag_name │
│ payload.release.tarball_url │
│ payload.release.target_commitish │
│ payload.release.upload_url │
│ payload.release.url │
│ payload.release.zipball_url │
│ payload.size │
│ public │
│ repo.id │
│ repo.name │
│ repo.url │
│ type │
└─arrayJoin(distinctJSONPaths(json))─────────────────────────┘
```
```sql
SELECT arrayJoin(distinctJSONPathsAndTypes(json)) FROM s3('s3://clickhouse-public-datasets/gharchive/original/2020-01-01-*.json.gz', JSONAsObject) SETTINGS date_time_input_format='best_effort'
```
```text
┌─arrayJoin(distinctJSONPathsAndTypes(json))──────────────────┐
│ ('actor.avatar_url',['String']) │
│ ('actor.display_login',['String']) │
│ ('actor.gravatar_id',['String']) │
│ ('actor.id',['Int64']) │
│ ('actor.login',['String']) │
│ ('actor.url',['String']) │
│ ('created_at',['DateTime']) │
│ ('id',['String']) │
│ ('org.avatar_url',['String']) │
│ ('org.gravatar_id',['String']) │
│ ('org.id',['Int64']) │
│ ('org.login',['String']) │
│ ('org.url',['String']) │
│ ('payload.action',['String']) │
│ ('payload.before',['String']) │
│ ('payload.comment._links.html.href',['String']) │
│ ('payload.comment._links.pull_request.href',['String']) │
│ ('payload.comment._links.self.href',['String']) │
│ ('payload.comment.author_association',['String']) │
│ ('payload.comment.body',['String']) │
│ ('payload.comment.commit_id',['String']) │
│ ('payload.comment.created_at',['DateTime']) │
│ ('payload.comment.diff_hunk',['String']) │
│ ('payload.comment.html_url',['String']) │
│ ('payload.comment.id',['Int64']) │
│ ('payload.comment.in_reply_to_id',['Int64']) │
│ ('payload.comment.issue_url',['String']) │
│ ('payload.comment.line',['Int64']) │
│ ('payload.comment.node_id',['String']) │
│ ('payload.comment.original_commit_id',['String']) │
│ ('payload.comment.original_position',['Int64']) │
│ ('payload.comment.path',['String']) │
│ ('payload.comment.position',['Int64']) │
│ ('payload.comment.pull_request_review_id',['Int64']) │
...
│ ('payload.release.node_id',['String']) │
│ ('payload.release.prerelease',['Bool']) │
│ ('payload.release.published_at',['DateTime']) │
│ ('payload.release.tag_name',['String']) │
│ ('payload.release.tarball_url',['String']) │
│ ('payload.release.target_commitish',['String']) │
│ ('payload.release.upload_url',['String']) │
│ ('payload.release.url',['String']) │
│ ('payload.release.zipball_url',['String']) │
│ ('payload.size',['Int64']) │
│ ('public',['Bool']) │
│ ('repo.id',['Int64']) │
│ ('repo.name',['String']) │
│ ('repo.url',['String']) │
│ ('type',['String']) │
└─arrayJoin(distinctJSONPathsAndTypes(json))──────────────────┘
```
## Tips for better usage of the JSON type

View File

@ -2035,6 +2035,7 @@ Query:
SELECT arrayZip(['a', 'b', 'c'], [5, 2, 1]);
```
Result:
``` text
@ -2043,6 +2044,43 @@ Result:
└──────────────────────────────────────┘
```
## arrayZipUnaligned
Combines multiple arrays into a single array, allowing for unaligned arrays. The resulting array contains the corresponding elements of the source arrays grouped into tuples in the listed order of arguments.
**Syntax**
``` sql
arrayZipUnaligned(arr1, arr2, ..., arrN)
```
**Arguments**
- `arrN` — [Array](../data-types/array.md).
The function can take any number of arrays of different types.
**Returned value**
- Array with elements from the source arrays grouped into [tuples](../data-types/tuple.md). Data types in the tuple are the same as types of the input arrays and in the same order as arrays are passed. [Array](../data-types/array.md). If the arrays have different sizes, the shorter arrays will be padded with `null` values.
**Example**
Query:
``` sql
SELECT arrayZipUnaligned(['a'], [1, 2, 3]);
```
Result:
``` text
┌─arrayZipUnaligned(['a'], [1, 2, 3])─┐
│ [('a',1),(NULL,2),(NULL,3)] │
└─────────────────────────────────────┘
```
## arrayAUC
Calculate AUC (Area Under the Curve, which is a concept in machine learning, see more details: <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>).

View File

@ -8,14 +8,14 @@ slug: /en/guides/developer/transactional
This is transactional (ACID) if the inserted rows are packed and inserted as a single block (see Notes):
- Atomic: an INSERT succeeds or is rejected as a whole: if a confirmation is sent to the client, then all rows were inserted; if an error is sent to the client, then no rows were inserted.
- Consistent: if there are no table constraints violated, then all rows in an INSERT are inserted and the INSERT succeeds; if constraints are violated, then no rows are inserted.
- Isolated: concurrent clients observe a consistent snapshot of the table - the state of the table either as it was before the INSERT attempt, or after the successful INSERT; no partial state is seen
- Isolated: concurrent clients observe a consistent snapshot of the table - the state of the table either as it was before the INSERT attempt, or after the successful INSERT; no partial state is seen. Clients inside of another transaction have [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation), while clients outside of a transaction have [read uncommitted](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Read_uncommitted) isolation level.
- Durable: a successful INSERT is written to the filesystem before answering to the client, on a single replica or multiple replicas (controlled by the `insert_quorum` setting), and ClickHouse can ask the OS to sync the filesystem data on the storage media (controlled by the `fsync_after_insert` setting).
- INSERT into multiple tables with one statement is possible if materialized views are involved (the INSERT from the client is to a table which has associated materialized views).
## Case 2: INSERT into multiple partitions, of one table, of the MergeTree* family
Same as Case 1 above, with this detail:
- If table has many partitions and INSERT covers many partitions - then insertion into every partition is transactional on its own
- If table has many partitions and INSERT covers many partitions, then insertion into every partition is transactional on its own
## Case 3: INSERT into one distributed table of the MergeTree* family
@ -38,7 +38,7 @@ Same as Case 1 above, with this detail:
- the insert format is column-based (like Native, Parquet, ORC, etc) and the data contains only one block of data
- the size of the inserted block in general may depend on many settings (for example: `max_block_size`, `max_insert_block_size`, `min_insert_block_size_rows`, `min_insert_block_size_bytes`, `preferred_block_size_bytes`, etc)
- if the client did not receive an answer from the server, the client does not know if the transaction succeeded, and it can repeat the transaction, using exactly-once insertion properties
- ClickHouse is using MVCC with snapshot isolation internally
- ClickHouse is using [MVCC](https://en.wikipedia.org/wiki/Multiversion_concurrency_control) with [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation) internally for concurrent transactions
- all ACID properties are valid even in the case of server kill/crash
- either insert_quorum into different AZ or fsync should be enabled to ensure durable inserts in the typical setup
- "consistency" in ACID terms does not cover the semantics of distributed systems, see https://jepsen.io/consistency which is controlled by different settings (select_sequential_consistency)
@ -260,7 +260,7 @@ FROM mergetree_table
### Transactions introspection
You can inspect transactions by querying the `system.transactions` table, but note that you cannot query that
table from a session that is in a transaction - open a second `clickhouse client` session to query that table.
table from a session that is in a transaction. Open a second `clickhouse client` session to query that table.
```sql
SELECT *

View File

@ -0,0 +1,161 @@
#include <unordered_set>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesBinaryEncoding.h>
#include <Columns/ColumnDynamic.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/FactoryHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int TOO_LARGE_ARRAY_SIZE;
}
struct AggregateFunctionDistinctDynamicTypesData
{
constexpr static size_t MAX_ARRAY_SIZE = 0xFFFFFF;
std::unordered_set<String> data;
void add(const String & type)
{
data.insert(type);
}
void merge(const AggregateFunctionDistinctDynamicTypesData & other)
{
data.insert(other.data.begin(), other.data.end());
}
void serialize(WriteBuffer & buf) const
{
writeVarUInt(data.size(), buf);
for (const auto & type : data)
writeStringBinary(type, buf);
}
void deserialize(ReadBuffer & buf)
{
size_t size;
readVarUInt(size, buf);
if (size > MAX_ARRAY_SIZE)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size (maximum: {}): {}", MAX_ARRAY_SIZE, size);
data.reserve(size);
String type;
for (size_t i = 0; i != size; ++i)
{
readStringBinary(type, buf);
data.insert(type);
}
}
void insertResultInto(IColumn & column)
{
/// Insert types in sorted order for better output.
auto & array_column = assert_cast<ColumnArray &>(column);
auto & string_column = assert_cast<ColumnString &>(array_column.getData());
std::vector<String> sorted_data(data.begin(), data.end());
std::sort(sorted_data.begin(), sorted_data.end());
for (const auto & type : sorted_data)
string_column.insertData(type.data(), type.size());
array_column.getOffsets().push_back(string_column.size());
}
};
/// Calculates the list of distinct data types in Dynamic column.
class AggregateFunctionDistinctDynamicTypes final : public IAggregateFunctionDataHelper<AggregateFunctionDistinctDynamicTypesData, AggregateFunctionDistinctDynamicTypes>
{
public:
explicit AggregateFunctionDistinctDynamicTypes(const DataTypes & argument_types_)
: IAggregateFunctionDataHelper<AggregateFunctionDistinctDynamicTypesData, AggregateFunctionDistinctDynamicTypes>(argument_types_, {}, std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()))
{
}
String getName() const override { return "distinctDynamicTypes"; }
bool allocatesMemoryInArena() const override { return false; }
void ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
const auto & dynamic_column = assert_cast<const ColumnDynamic & >(*columns[0]);
if (dynamic_column.isNullAt(row_num))
return;
data(place).add(dynamic_column.getTypeNameAt(row_num));
}
void ALWAYS_INLINE addBatchSinglePlace(
size_t row_begin, size_t row_end, AggregateDataPtr __restrict place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos)
const override
{
if (if_argument_pos >= 0 || row_begin != 0 || row_end != columns[0]->size())
IAggregateFunctionDataHelper::addBatchSinglePlace(row_begin, row_end, place, columns, arena, if_argument_pos);
/// Optimization for case when we add all rows from the column into single place.
/// In this case we can avoid iterating over all rows because we can get all types
/// in Dynamic column in a more efficient way.
else
assert_cast<const ColumnDynamic & >(*columns[0]).getAllTypeNamesInto(data(place).data);
}
void addManyDefaults(
AggregateDataPtr __restrict /*place*/,
const IColumn ** /*columns*/,
size_t /*length*/,
Arena * /*arena*/) const override
{
/// Default value for Dynamic is NULL, so nothing to add.
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
data(place).merge(data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
data(place).insertResultInto(to);
}
};
AggregateFunctionPtr createAggregateFunctionDistinctDynamicTypes(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
assertNoParameters(name, parameters);
if (argument_types.size() != 1)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Incorrect number of arguments for aggregate function {}. Expected single argument with type Dynamic, got {} arguments", name, argument_types.size());
if (!isDynamic(argument_types[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument for aggregate function {}. Expected type Dynamic", argument_types[0]->getName(), name);
return std::make_shared<AggregateFunctionDistinctDynamicTypes>(argument_types);
}
void registerAggregateFunctionDistinctDynamicTypes(AggregateFunctionFactory & factory)
{
factory.registerFunction("distinctDynamicTypes", createAggregateFunctionDistinctDynamicTypes);
}
}

View File

@ -0,0 +1,350 @@
#include <unordered_set>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypesBinaryEncoding.h>
#include <Columns/ColumnDynamic.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnMap.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/FactoryHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int TOO_LARGE_ARRAY_SIZE;
}
constexpr static size_t DISTINCT_JSON_PATHS_MAX_ARRAY_SIZE = 0xFFFFFF;
struct AggregateFunctionDistinctJSONPathsData
{
static constexpr auto name = "distinctJSONPaths";
std::unordered_set<String> data;
void add(const ColumnObject & column, size_t row_num, const std::unordered_map<String, String> &)
{
for (const auto & [path, _] : column.getTypedPaths())
data.insert(path);
for (const auto & [path, dynamic_column] : column.getDynamicPathsPtrs())
{
/// Add path from dynamic paths only if it's not NULL in this row.
if (!dynamic_column->isNullAt(row_num))
data.insert(path);
}
/// Iterate over paths in shared data in this row.
const auto [shared_data_paths, _] = column.getSharedDataPathsAndValues();
const auto & shared_data_offsets = column.getSharedDataOffsets();
const size_t start = shared_data_offsets[static_cast<ssize_t>(row_num) - 1];
const size_t end = shared_data_offsets[static_cast<ssize_t>(row_num)];
for (size_t i = start; i != end; ++i)
data.insert(shared_data_paths->getDataAt(i).toString());
}
void addWholeColumn(const ColumnObject & column, const std::unordered_map<String, String> &)
{
for (const auto & [path, _] : column.getTypedPaths())
data.insert(path);
for (const auto & [path, dynamic_column] : column.getDynamicPathsPtrs())
{
/// Add dynamic path only if it has at least one non-null value.
/// getNumberOfDefaultRows for Dynamic column is O(1).
if (dynamic_column->getNumberOfDefaultRows() != dynamic_column->size())
data.insert(path);
}
/// Iterate over all paths in shared data.
const auto [shared_data_paths, _] = column.getSharedDataPathsAndValues();
for (size_t i = 0; i != shared_data_paths->size(); ++i)
data.insert(shared_data_paths->getDataAt(i).toString());
}
void merge(const AggregateFunctionDistinctJSONPathsData & other)
{
data.insert(other.data.begin(), other.data.end());
}
void serialize(WriteBuffer & buf) const
{
writeVarUInt(data.size(), buf);
for (const auto & path : data)
writeStringBinary(path, buf);
}
void deserialize(ReadBuffer & buf)
{
size_t size;
readVarUInt(size, buf);
if (size > DISTINCT_JSON_PATHS_MAX_ARRAY_SIZE)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size (maximum: {}): {}", DISTINCT_JSON_PATHS_MAX_ARRAY_SIZE, size);
String path;
for (size_t i = 0; i != size; ++i)
{
readStringBinary(path, buf);
data.insert(path);
}
}
void insertResultInto(IColumn & column)
{
/// Insert paths in sorted order for better output.
auto & array_column = assert_cast<ColumnArray &>(column);
auto & string_column = assert_cast<ColumnString &>(array_column.getData());
std::vector<String> sorted_data(data.begin(), data.end());
std::sort(sorted_data.begin(), sorted_data.end());
for (const auto & path : sorted_data)
string_column.insertData(path.data(), path.size());
array_column.getOffsets().push_back(string_column.size());
}
static DataTypePtr getResultType()
{
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
}
};
struct AggregateFunctionDistinctJSONPathsAndTypesData
{
static constexpr auto name = "distinctJSONPathsAndTypes";
std::unordered_map<String, std::unordered_set<String>> data;
void add(const ColumnObject & column, size_t row_num, const std::unordered_map<String, String> & typed_paths_type_names)
{
for (const auto & [path, _] : column.getTypedPaths())
data[path].insert(typed_paths_type_names.at(path));
for (const auto & [path, dynamic_column] : column.getDynamicPathsPtrs())
{
if (!dynamic_column->isNullAt(row_num))
data[path].insert(dynamic_column->getTypeNameAt(row_num));
}
/// Iterate over paths in shared data in this row and decode the data types.
const auto [shared_data_paths, shared_data_values] = column.getSharedDataPathsAndValues();
const auto & shared_data_offsets = column.getSharedDataOffsets();
const size_t start = shared_data_offsets[static_cast<ssize_t>(row_num) - 1];
const size_t end = shared_data_offsets[static_cast<ssize_t>(row_num)];
for (size_t i = start; i != end; ++i)
{
auto path = shared_data_paths->getDataAt(i).toString();
auto value = shared_data_values->getDataAt(i);
ReadBufferFromMemory buf(value.data, value.size);
auto type = decodeDataType(buf);
/// We should not have Nulls here but let's check just in case.
chassert(!isNothing(type));
data[path].insert(type->getName());
}
}
void addWholeColumn(const ColumnObject & column, const std::unordered_map<String, String> & typed_paths_type_names)
{
for (const auto & [path, _] : column.getTypedPaths())
data[path].insert(typed_paths_type_names.at(path));
for (const auto & [path, dynamic_column] : column.getDynamicPathsPtrs())
{
/// Add dynamic path only if it has at least one non-null value.
/// getNumberOfDefaultRows for Dynamic column is O(1).
if (dynamic_column->getNumberOfDefaultRows() != dynamic_column->size())
dynamic_column->getAllTypeNamesInto(data[path]);
}
/// Iterate over all paths in shared data and decode the data types.
const auto [shared_data_paths, shared_data_values] = column.getSharedDataPathsAndValues();
for (size_t i = 0; i != shared_data_paths->size(); ++i)
{
auto path = shared_data_paths->getDataAt(i).toString();
auto value = shared_data_values->getDataAt(i);
ReadBufferFromMemory buf(value.data, value.size);
auto type = decodeDataType(buf);
/// We should not have Nulls here but let's check just in case.
chassert(!isNothing(type));
data[path].insert(type->getName());
}
}
void merge(const AggregateFunctionDistinctJSONPathsAndTypesData & other)
{
for (const auto & [path, types] : other.data)
data[path].insert(types.begin(), types.end());
}
void serialize(WriteBuffer & buf) const
{
writeVarUInt(data.size(), buf);
for (const auto & [path, types] : data)
{
writeStringBinary(path, buf);
writeVarUInt(types.size(), buf);
for (const auto & type : types)
writeStringBinary(type, buf);
}
}
void deserialize(ReadBuffer & buf)
{
size_t paths_size, types_size;
readVarUInt(paths_size, buf);
if (paths_size > DISTINCT_JSON_PATHS_MAX_ARRAY_SIZE)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size for paths (maximum: {}): {}", DISTINCT_JSON_PATHS_MAX_ARRAY_SIZE, paths_size);
data.reserve(paths_size);
String path, type;
for (size_t i = 0; i != paths_size; ++i)
{
readStringBinary(path, buf);
readVarUInt(types_size, buf);
if (types_size > DISTINCT_JSON_PATHS_MAX_ARRAY_SIZE)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size for types (maximum: {}): {}", DISTINCT_JSON_PATHS_MAX_ARRAY_SIZE, types_size);
data[path].reserve(types_size);
for (size_t j = 0; j != types_size; ++j)
{
readStringBinary(type, buf);
data[path].insert(type);
}
}
}
void insertResultInto(IColumn & column)
{
/// Insert sorted paths and types for better output.
auto & array_column = assert_cast<ColumnMap &>(column).getNestedColumn();
auto & tuple_column = assert_cast<ColumnTuple &>(array_column.getData());
auto & key_column = assert_cast<ColumnString &>(tuple_column.getColumn(0));
auto & value_column = assert_cast<ColumnArray &>(tuple_column.getColumn(1));
auto & value_column_data = assert_cast<ColumnString &>(value_column.getData());
std::vector<std::pair<String, std::vector<String>>> sorted_data;
sorted_data.reserve(data.size());
for (const auto & [path, types] : data)
{
std::vector<String> sorted_types(types.begin(), types.end());
std::sort(sorted_types.begin(), sorted_types.end());
sorted_data.emplace_back(path, std::move(sorted_types));
}
std::sort(sorted_data.begin(), sorted_data.end());
for (const auto & [path, types] : sorted_data)
{
key_column.insertData(path.data(), path.size());
for (const auto & type : types)
value_column_data.insertData(type.data(), type.size());
value_column.getOffsets().push_back(value_column_data.size());
}
array_column.getOffsets().push_back(key_column.size());
}
static DataTypePtr getResultType()
{
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()));
}
};
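Both aggregate states travel between servers in the same length-prefixed framing. A sketch of the layout produced by serialize() above, where varUInt is ClickHouse's variable-length integer and stringBinary is a varUInt length followed by raw bytes:
/// Serialized state of distinctJSONPathsAndTypes (distinctJSONPaths is the
/// same layout minus the nested type lists):
///
///   varUInt   paths_size
///   repeated paths_size times:
///     stringBinary  path
///     varUInt       types_size
///     repeated types_size times:
///       stringBinary  type_name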
/// Calculates the list of distinct paths or pairs (path, type) in a JSON column.
template <typename Data>
class AggregateFunctionDistinctJSONPathsAndTypes final : public IAggregateFunctionDataHelper<Data, AggregateFunctionDistinctJSONPathsAndTypes<Data>>
{
public:
explicit AggregateFunctionDistinctJSONPathsAndTypes(const DataTypes & argument_types_)
: IAggregateFunctionDataHelper<Data, AggregateFunctionDistinctJSONPathsAndTypes<Data>>(
argument_types_, {}, Data::getResultType())
{
const auto & typed_paths_types = assert_cast<const DataTypeObject &>(*argument_types_[0]).getTypedPaths();
typed_paths_type_names.reserve(typed_paths_types.size());
for (const auto & [path, type] : typed_paths_types)
typed_paths_type_names[path] = type->getName();
}
String getName() const override { return Data::name; }
bool allocatesMemoryInArena() const override { return false; }
void ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
const auto & object_column = assert_cast<const ColumnObject & >(*columns[0]);
this->data(place).add(object_column, row_num, typed_paths_type_names);
}
void ALWAYS_INLINE addBatchSinglePlace(
size_t row_begin, size_t row_end, AggregateDataPtr __restrict place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos)
const override
{
if (if_argument_pos >= 0 || row_begin != 0 || row_end != columns[0]->size())
IAggregateFunctionDataHelper<Data, AggregateFunctionDistinctJSONPathsAndTypes<Data>>::addBatchSinglePlace(row_begin, row_end, place, columns, arena, if_argument_pos);
/// Optimization for the case when we add all rows from the column into a single place.
/// In this case we can avoid iterating over all rows because we can get all paths
/// and types in the JSON column in a more efficient way.
else
this->data(place).addWholeColumn(assert_cast<const ColumnObject & >(*columns[0]), typed_paths_type_names);
}
void addManyDefaults(
AggregateDataPtr __restrict /*place*/,
const IColumn ** /*columns*/,
size_t /*length*/,
Arena * /*arena*/) const override
{
/// Default value for JSON is empty object, so nothing to add.
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
this->data(place).insertResultInto(to);
}
private:
std::unordered_map<String, String> typed_paths_type_names;
};
template <typename Data>
AggregateFunctionPtr createAggregateFunctionDistinctJSONPathsAndTypes(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
assertNoParameters(name, parameters);
if (argument_types.size() != 1)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Incorrect number of arguments for aggregate function {}. Expected single argument with type JSON, got {} arguments", name, argument_types.size());
if (!isObject(argument_types[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument for aggregate function {}. Expected type JSON", argument_types[0]->getName(), name);
return std::make_shared<AggregateFunctionDistinctJSONPathsAndTypes<Data>>(argument_types);
}
void registerAggregateFunctionDistinctJSONPathsAndTypes(AggregateFunctionFactory & factory)
{
factory.registerFunction("distinctJSONPaths", createAggregateFunctionDistinctJSONPathsAndTypes<AggregateFunctionDistinctJSONPathsData>);
factory.registerFunction("distinctJSONPathsAndTypes", createAggregateFunctionDistinctJSONPathsAndTypes<AggregateFunctionDistinctJSONPathsAndTypesData>);
}
}

View File

@ -89,6 +89,8 @@ void registerAggregateFunctionAnalysisOfVariance(AggregateFunctionFactory &);
void registerAggregateFunctionFlameGraph(AggregateFunctionFactory &);
void registerAggregateFunctionKolmogorovSmirnovTest(AggregateFunctionFactory & factory);
void registerAggregateFunctionLargestTriangleThreeBuckets(AggregateFunctionFactory & factory);
void registerAggregateFunctionDistinctDynamicTypes(AggregateFunctionFactory & factory);
void registerAggregateFunctionDistinctJSONPathsAndTypes(AggregateFunctionFactory & factory);
class AggregateFunctionCombinatorFactory;
void registerAggregateFunctionCombinatorIf(AggregateFunctionCombinatorFactory &);
@ -191,6 +193,8 @@ void registerAggregateFunctions()
registerAggregateFunctionFlameGraph(factory);
registerAggregateFunctionKolmogorovSmirnovTest(factory);
registerAggregateFunctionLargestTriangleThreeBuckets(factory);
registerAggregateFunctionDistinctDynamicTypes(factory);
registerAggregateFunctionDistinctJSONPathsAndTypes(factory);
registerWindowFunctions(factory);
}

View File

@ -2699,14 +2699,6 @@ bool ClientBase::processMultiQueryFromFile(const String & file_name)
ReadBufferFromFile in(file_name);
readStringUntilEOF(queries_from_file, in);
if (!getClientConfiguration().has("log_comment"))
{
Settings settings = client_context->getSettingsCopy();
/// NOTE: cannot use even weakly_canonical() since it fails for /dev/stdin due to resolving of "pipe:[X]"
settings.log_comment = fs::absolute(fs::path(file_name));
client_context->setSettings(settings);
}
return executeMultiQuery(queries_from_file);
}

View File

@ -979,6 +979,41 @@ ColumnPtr ColumnDynamic::compress() const
});
}
String ColumnDynamic::getTypeNameAt(size_t row_num) const
{
const auto & variant_col = getVariantColumn();
const size_t discr = variant_col.globalDiscriminatorAt(row_num);
if (discr == ColumnVariant::NULL_DISCRIMINATOR)
return "";
if (discr == getSharedVariantDiscriminator())
{
const auto value = getSharedVariant().getDataAt(variant_col.offsetAt(row_num));
ReadBufferFromMemory buf(value.data, value.size);
return decodeDataType(buf)->getName();
}
return variant_info.variant_names[discr];
}
void ColumnDynamic::getAllTypeNamesInto(std::unordered_set<String> & names) const
{
auto shared_variant_discr = getSharedVariantDiscriminator();
for (size_t i = 0; i != variant_info.variant_names.size(); ++i)
{
if (i != shared_variant_discr && !variant_column_ptr->getVariantByGlobalDiscriminator(i).empty())
names.insert(variant_info.variant_names[i]);
}
const auto & shared_variant = getSharedVariant();
for (size_t i = 0; i != shared_variant.size(); ++i)
{
const auto value = shared_variant.getDataAt(i);
ReadBufferFromMemory buf(value.data, value.size);
names.insert(decodeDataType(buf)->getName());
}
}
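Both helpers decode shared-variant cells the same way because every value stored in the shared variant carries its data type encoded in front of the value bytes. Schematically (simplified; the real prefix is ClickHouse's binary type encoding, not shown here):
/// Layout of one shared-variant cell (simplified):
///
///   [ binary-encoded DataType ][ value serialized in that type ]
///
/// getTypeNameAt(row)   decodes the prefix of that row's cell only;
/// getAllTypeNamesInto  decodes the prefix of every cell in the shared variant.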
void ColumnDynamic::prepareForSquashing(const Columns & source_columns)
{
if (source_columns.empty())

View File

@ -430,6 +430,9 @@ public:
const SerializationPtr & getVariantSerialization(const DataTypePtr & variant_type) { return getVariantSerialization(variant_type, variant_type->getName()); }
String getTypeNameAt(size_t row_num) const;
void getAllTypeNamesInto(std::unordered_set<String> & names) const;
private:
void createVariantInfo(const DataTypePtr & variant_type);

View File

@ -0,0 +1,69 @@
#include <Poco/AutoPtr.h>
#include <Poco/DOM/DOMParser.h>
#include <Poco/Util/XMLConfiguration.h>
#include <gtest/gtest.h>
TEST(Common, ConfigHostValidation)
{
std::string xml(R"CONFIG(<clickhouse>
<IPv4_1>0.0.0.0</IPv4_1>
<IPv4_2>192.168.0.1</IPv4_2>
<IPv4_3>127.0.0.1</IPv4_3>
<IPv4_4>255.255.255.255</IPv4_4>
<IPv6_1>2001:0db8:85a3:0000:0000:8a2e:0370:7334</IPv6_1>
<IPv6_2>2001:DB8::8a2e:370:7334</IPv6_2>
<IPv6_3>::1</IPv6_3>
<IPv6_4>::</IPv6_4>
<Domain_1>www.example.com.</Domain_1>
<Domain_2>a.co</Domain_2>
<Domain_3>localhost</Domain_3>
<Domain_4>xn--fiqs8s.xn--fiqz9s</Domain_4>
<IPv4_Invalid_1>192.168.1.256</IPv4_Invalid_1>
<IPv4_Invalid_2>192.168.1.1.1</IPv4_Invalid_2>
<IPv4_Invalid_3>192.168.1.99999999999999999999</IPv4_Invalid_3>
<IPv4_Invalid_4>192.168.1.a</IPv4_Invalid_4>
<IPv6_Invalid_1>2001:0db8:85a3:::8a2e:0370:7334</IPv6_Invalid_1>
<IPv6_Invalid_2>1200::AB00:1234::2552:7777:1313</IPv6_Invalid_2>
<IPv6_Invalid_3>1200::AB00:1234:Q000:2552:7777:1313</IPv6_Invalid_3>
<IPv6_Invalid_4>1200:AB00:1234:2552:7777:1313:FFFF</IPv6_Invalid_4>
<Domain_Invalid_1>example.com..</Domain_Invalid_1>
<Domain_Invalid_2>5example.com</Domain_Invalid_2>
<Domain_Invalid_3>example.com-</Domain_Invalid_3>
<Domain_Invalid_4>exa_mple.com</Domain_Invalid_4>
</clickhouse>)CONFIG");
Poco::XML::DOMParser dom_parser;
Poco::AutoPtr<Poco::XML::Document> document = dom_parser.parseString(xml);
Poco::AutoPtr<Poco::Util::XMLConfiguration> config = new Poco::Util::XMLConfiguration(document);
EXPECT_NO_THROW(config->getHost("IPv4_1"));
EXPECT_NO_THROW(config->getHost("IPv4_2"));
EXPECT_NO_THROW(config->getHost("IPv4_3"));
EXPECT_NO_THROW(config->getHost("IPv4_4"));
EXPECT_NO_THROW(config->getHost("IPv6_1"));
EXPECT_NO_THROW(config->getHost("IPv6_2"));
EXPECT_NO_THROW(config->getHost("IPv6_3"));
EXPECT_NO_THROW(config->getHost("IPv6_4"));
EXPECT_NO_THROW(config->getHost("Domain_1"));
EXPECT_NO_THROW(config->getHost("Domain_2"));
EXPECT_NO_THROW(config->getHost("Domain_3"));
EXPECT_NO_THROW(config->getHost("Domain_4"));
EXPECT_THROW(config->getHost("IPv4_Invalid_1"), Poco::SyntaxException);
EXPECT_THROW(config->getHost("IPv4_Invalid_2"), Poco::SyntaxException);
EXPECT_THROW(config->getHost("IPv4_Invalid_3"), Poco::SyntaxException);
EXPECT_THROW(config->getHost("IPv4_Invalid_4"), Poco::SyntaxException);
EXPECT_THROW(config->getHost("IPv6_Invalid_1"), Poco::SyntaxException);
EXPECT_THROW(config->getHost("IPv6_Invalid_2"), Poco::SyntaxException);
EXPECT_THROW(config->getHost("IPv6_Invalid_3"), Poco::SyntaxException);
EXPECT_THROW(config->getHost("IPv6_Invalid_4"), Poco::SyntaxException);
EXPECT_THROW(config->getHost("Domain_Invalid_1"), Poco::SyntaxException);
EXPECT_THROW(config->getHost("Domain_Invalid_2"), Poco::SyntaxException);
EXPECT_THROW(config->getHost("Domain_Invalid_3"), Poco::SyntaxException);
EXPECT_THROW(config->getHost("Domain_Invalid_4"), Poco::SyntaxException);
}
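The validation behind getHost() is not part of this diff; what follows is a minimal sketch of a validator consistent with the cases above, using Poco::Net::IPAddress for the address part. The domain rule used here (labels start with a letter, end with a letter or digit, contain only letters, digits, and hyphens, and a single trailing dot is allowed) is an assumption reverse-engineered from the test data:
#include <Poco/Net/IPAddress.h>
#include <cctype>
#include <cstddef>
#include <string>
bool isValidHostSketch(const std::string & host)
{
    Poco::Net::IPAddress addr;
    if (Poco::Net::IPAddress::tryParse(host, addr))
        return true; /// valid IPv4 or IPv6
    /// Domain: dot-separated labels; one trailing dot is allowed.
    std::string s = host;
    if (!s.empty() && s.back() == '.')
        s.pop_back();
    if (s.empty())
        return false;
    size_t label_start = 0;
    while (label_start <= s.size())
    {
        const size_t dot = s.find('.', label_start);
        const size_t label_end = dot == std::string::npos ? s.size() : dot;
        if (label_end == label_start)
            return false; /// empty label, e.g. "example.com.."
        if (!std::isalpha(static_cast<unsigned char>(s[label_start])))
            return false; /// "5example.com"
        if (!std::isalnum(static_cast<unsigned char>(s[label_end - 1])))
            return false; /// "example.com-"
        for (size_t i = label_start; i < label_end; ++i)
        {
            const char c = s[i];
            if (!std::isalnum(static_cast<unsigned char>(c)) && c != '-')
                return false; /// "exa_mple.com"
        }
        if (dot == std::string::npos)
            break;
        label_start = dot + 1;
    }
    return true;
}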

View File

@ -17,11 +17,12 @@
#include <Core/ExternalTable.h>
#include <Core/Settings.h>
#include <Poco/Net/MessageHeader.h>
#include <Parsers/ASTNameTypePair.h>
#include <Parsers/IdentifierQuotingStyle.h>
#include <Parsers/ParserCreateQuery.h>
#include <Parsers/parseQuery.h>
#include <base/scope_guard.h>
#include <Poco/Net/MessageHeader.h>
namespace DB
@ -85,7 +86,15 @@ void BaseExternalTable::parseStructureFromStructureField(const std::string & arg
/// We use `formatWithPossiblyHidingSensitiveData` instead of `getColumnNameWithoutAlias` because `column->type` is an ASTFunction.
/// `getColumnNameWithoutAlias` would return the name of the function with `(arguments)` even if the argument list is empty.
if (column)
structure.emplace_back(column->name, column->type->formatWithPossiblyHidingSensitiveData(0, true, true, false));
structure.emplace_back(
column->name,
column->type->formatWithPossiblyHidingSensitiveData(
/*max_length=*/0,
/*one_line=*/true,
/*show_secrets=*/true,
/*print_pretty_type_names=*/false,
/*always_quote_identifiers=*/false,
/*identifier_quoting_style=*/IdentifierQuotingStyle::Backticks));
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Error while parsing table structure: expected column definition, got {}", child->formatForErrorMessage());
}
@ -102,7 +111,15 @@ void BaseExternalTable::parseStructureFromTypesField(const std::string & argumen
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Error while parsing table structure: {}", error);
for (size_t i = 0; i < type_list_raw->children.size(); ++i)
structure.emplace_back("_" + toString(i + 1), type_list_raw->children[i]->formatWithPossiblyHidingSensitiveData(0, true, true, false));
structure.emplace_back(
"_" + toString(i + 1),
type_list_raw->children[i]->formatWithPossiblyHidingSensitiveData(
/*max_length=*/0,
/*one_line=*/true,
/*show_secrets=*/true,
/*print_pretty_type_names=*/false,
/*always_quote_identifiers=*/false,
/*identifier_quoting_style=*/IdentifierQuotingStyle::Backticks));
}
void BaseExternalTable::initSampleBlock()

View File

@ -1296,6 +1296,9 @@ class IColumn;
M(Bool, precise_float_parsing, false, "Prefer more precise (but slower) float parsing algorithm", 0) \
M(DateTimeOverflowBehavior, date_time_overflow_behavior, "ignore", "Overflow mode for Date, Date32, DateTime, DateTime64 types. Possible values: 'ignore', 'throw', 'saturate'.", 0) \
M(Bool, validate_experimental_and_suspicious_types_inside_nested_types, true, "Validate usage of experimental and suspicious types inside nested types like Array/Map/Tuple", 0) \
\
M(Bool, output_format_always_quote_identifiers, false, "Always quote identifiers", 0) \
M(IdentifierQuotingStyle, output_format_identifier_quoting_style, IdentifierQuotingStyle::Backticks, "Set the quoting style for identifiers", 0) \
// End of FORMAT_FACTORY_SETTINGS

View File

@ -75,6 +75,8 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."},
{"create_if_not_exists", false, false, "New setting."},
{"allow_materialized_view_with_bad_select", true, true, "Support (but not enable yet) stricter validation in CREATE MATERIALIZED VIEW"},
{"output_format_always_quote_identifiers", false, false, "New setting."},
{"output_format_identifier_quoting_style", "Backticks", "Backticks", "New setting."}
}
},
{"24.8",

View File

@ -244,4 +244,10 @@ IMPLEMENT_SETTING_ENUM(
GroupArrayActionWhenLimitReached,
ErrorCodes::BAD_ARGUMENTS,
{{"throw", GroupArrayActionWhenLimitReached::THROW}, {"discard", GroupArrayActionWhenLimitReached::DISCARD}})
IMPLEMENT_SETTING_ENUM(IdentifierQuotingStyle, ErrorCodes::BAD_ARGUMENTS,
{{"None", IdentifierQuotingStyle::None},
{"Backticks", IdentifierQuotingStyle::Backticks},
{"DoubleQuotes", IdentifierQuotingStyle::DoubleQuotes},
{"BackticksMySQL", IdentifierQuotingStyle::BackticksMySQL}})
}

View File

@ -10,6 +10,7 @@
#include <Formats/FormatSettings.h>
#include <IO/ReadSettings.h>
#include <Parsers/ASTSQLSecurity.h>
#include <Parsers/IdentifierQuotingStyle.h>
#include <QueryPipeline/SizeLimits.h>
#include <Common/ShellCommandSettings.h>
@ -351,6 +352,8 @@ DECLARE_SETTING_ENUM_WITH_RENAME(DateTimeOverflowBehavior, FormatSettings::DateT
DECLARE_SETTING_ENUM(SQLSecurityType)
DECLARE_SETTING_ENUM(IdentifierQuotingStyle)
enum class GroupArrayActionWhenLimitReached : uint8_t
{
THROW,

View File

@ -210,7 +210,7 @@ namespace
{
UInt64 stringToMaxThreads(const String & str)
{
if (startsWith(str, "auto"))
if (startsWith(str, "auto") || startsWith(str, "'auto"))
return 0;
return parseFromString<UInt64>(str);
}
@ -237,7 +237,8 @@ SettingFieldMaxThreads & SettingFieldMaxThreads::operator=(const Field & f)
String SettingFieldMaxThreads::toString() const
{
if (is_auto)
return "auto(" + ::DB::toString(value) + ")";
/// Removing quotes here will introduce an incompatibility between replicas with different versions.
return "'auto(" + ::DB::toString(value) + ")'";
else
return ::DB::toString(value);
}
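The parsing and serialization rules above have to round-trip across versions: old replicas send auto(N) bare, new ones send it single-quoted, and both forms must be accepted. A self-contained restatement of the accept rule (isAutoMaxThreads is a hypothetical stand-in for the anonymous-namespace helper):
#include <cassert>
#include <string>
/// Any string beginning with "auto" or "'auto" means
/// "derive the number of threads automatically" (0).
static bool isAutoMaxThreads(const std::string & s)
{
    return s.rfind("auto", 0) == 0 || s.rfind("'auto", 0) == 0;
}
int main()
{
    assert(isAutoMaxThreads("auto(16)"));    /// old, unquoted serialization
    assert(isAutoMaxThreads("'auto(16)'"));  /// new, quoted serialization
    assert(!isAutoMaxThreads("8"));          /// explicit thread count
}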

View File

@ -519,10 +519,10 @@ static DataTypePtr createJSON(const ASTPtr & arguments)
if (!context)
context = Context::getGlobalContextInstance();
if (context->getSettingsRef().use_json_alias_for_old_object_type)
if (context->getSettingsRef().allow_experimental_object_type && context->getSettingsRef().use_json_alias_for_old_object_type)
{
if (arguments && !arguments->children.empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Experimental Object type doesn't support any arguments. If you want to use new JSON type, set setting allow_experimental_json_type = 1");
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Experimental Object type doesn't support any arguments. If you want to use new JSON type, set settings allow_experimental_json_type = 1 and use_json_alias_for_old_object_type = 0");
return std::make_shared<DataTypeObjectDeprecated>("JSON", false);
}

View File

@ -302,8 +302,12 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet
/// Try to determine the type of value inside quotes
auto type = tryInferDataTypeForSingleField(data, format_settings);
/// If we couldn't infer any type or it's a number and csv.try_infer_numbers_from_strings = 0, we determine it as a string.
if (!type || (format_settings.csv.try_infer_strings_from_quoted_tuples && isTuple(type)) || (!format_settings.csv.try_infer_numbers_from_strings && isNumber(type)))
/// Return String type if one of the following conditions applies:
/// - we couldn't infer any type
/// - it's a number and csv.try_infer_numbers_from_strings = 0
/// - it's a tuple and csv.try_infer_strings_from_quoted_tuples = 1
/// - it's a Bool type (we don't allow reading bool values from strings)
if (!type || (format_settings.csv.try_infer_strings_from_quoted_tuples && isTuple(type)) || (!format_settings.csv.try_infer_numbers_from_strings && isNumber(type)) || isBool(type))
return std::make_shared<DataTypeString>();
return type;

View File

@ -4134,6 +4134,29 @@ private:
};
}
/// Create wrapper only if we support this conversion.
WrapperType createWrapperIfCanConvert(const DataTypePtr & from, const DataTypePtr & to) const
{
try
{
/// We could avoid try/catch here by implementing a check that two types can be cast,
/// but it requires quite a lot of work. For now, let's simply use try/catch.
/// First, check that we can create a wrapper.
WrapperType wrapper = prepareUnpackDictionaries(from, to);
/// Second, check that we can perform the conversion on a column with a default value
/// (we cannot just use an empty column, because some checks happen only while iterating over rows).
auto test_col = from->createColumn();
test_col->insertDefault();
ColumnsWithTypeAndName column_from = {{test_col->getPtr(), from, "" }};
wrapper(column_from, to, nullptr, 1);
return wrapper;
}
catch (const Exception &)
{
return {};
}
}
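Stripped of the ClickHouse types, the probe has a simple shape: build the operation, run it once on a minimal input, and return an empty std::function when anything throws, so the caller can treat the conversion as unsupported. A hedged restatement of that shape, not the actual API:
#include <functional>
using Op = std::function<int(int)>;
/// Probe-and-fallback: run the candidate once on a trivial input and return
/// an empty wrapper if it throws, so callers can substitute a default/NULL
/// value instead of failing the whole cast.
Op makeOpIfSupported(Op candidate)
{
    try
    {
        candidate(0);
        return candidate;
    }
    catch (...)
    {
        return {};
    }
}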
WrapperType createVariantToColumnWrapper(const DataTypeVariant & from_variant, const DataTypePtr & to_type) const
{
const auto & variant_types = from_variant.getVariants();
@ -4142,7 +4165,19 @@ private:
/// Create conversion wrapper for each variant.
for (const auto & variant_type : variant_types)
variant_wrappers.push_back(prepareUnpackDictionaries(variant_type, to_type));
{
WrapperType wrapper;
if (cast_type == CastType::accurateOrNull)
{
/// Create wrapper only if we support conversion from variant to the resulting type.
wrapper = createWrapperIfCanConvert(variant_type, to_type);
}
else
{
wrapper = prepareUnpackDictionaries(variant_type, to_type);
}
variant_wrappers.push_back(wrapper);
}
return [variant_wrappers, variant_types, to_type]
(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr
@ -4157,7 +4192,11 @@ private:
auto variant_col = column_variant.getVariantPtrByGlobalDiscriminator(i);
ColumnsWithTypeAndName variant = {{variant_col, variant_types[i], "" }};
const auto & variant_wrapper = variant_wrappers[i];
casted_variant_columns.push_back(variant_wrapper(variant, result_type, nullptr, variant_col->size()));
ColumnPtr casted_variant;
/// Check if we have wrapper for this variant.
if (variant_wrapper)
casted_variant = variant_wrapper(variant, result_type, nullptr, variant_col->size());
casted_variant_columns.push_back(std::move(casted_variant));
}
/// Second, construct resulting column from casted variant columns according to discriminators.
@ -4167,7 +4206,7 @@ private:
for (size_t i = 0; i != input_rows_count; ++i)
{
auto global_discr = column_variant.globalDiscriminatorByLocal(local_discriminators[i]);
if (global_discr == ColumnVariant::NULL_DISCRIMINATOR)
if (global_discr == ColumnVariant::NULL_DISCRIMINATOR || !casted_variant_columns[global_discr])
res->insertDefault();
else
res->insertFrom(*casted_variant_columns[global_discr], column_variant.offsetAt(i));
@ -4357,10 +4396,27 @@ private:
casted_variant_columns.reserve(variant_types.size());
for (size_t i = 0; i != variant_types.size(); ++i)
{
/// Skip shared variant, it will be processed later.
if (i == column_dynamic.getSharedVariantDiscriminator())
{
casted_variant_columns.push_back(nullptr);
continue;
}
const auto & variant_col = variant_column.getVariantPtrByGlobalDiscriminator(i);
ColumnsWithTypeAndName variant = {{variant_col, variant_types[i], ""}};
auto variant_wrapper = prepareUnpackDictionaries(variant_types[i], result_type);
casted_variant_columns.push_back(variant_wrapper(variant, result_type, nullptr, variant_col->size()));
WrapperType variant_wrapper;
if (cast_type == CastType::accurateOrNull)
/// Create wrapper only if we support conversion from variant to the resulting type.
variant_wrapper = createWrapperIfCanConvert(variant_types[i], result_type);
else
variant_wrapper = prepareUnpackDictionaries(variant_types[i], result_type);
ColumnPtr casted_variant;
/// Check if we have wrapper for this variant.
if (variant_wrapper)
casted_variant = variant_wrapper(variant, result_type, nullptr, variant_col->size());
casted_variant_columns.push_back(casted_variant);
}
/// Second, collect all variants stored in shared variant and cast them to result type.
@ -4416,8 +4472,18 @@ private:
for (size_t i = 0; i != variant_types_from_shared_variant.size(); ++i)
{
ColumnsWithTypeAndName variant = {{variant_columns_from_shared_variant[i]->getPtr(), variant_types_from_shared_variant[i], ""}};
auto variant_wrapper = prepareUnpackDictionaries(variant_types_from_shared_variant[i], result_type);
casted_shared_variant_columns.push_back(variant_wrapper(variant, result_type, nullptr, variant_columns_from_shared_variant[i]->size()));
WrapperType variant_wrapper;
if (cast_type == CastType::accurateOrNull)
/// Create wrapper only if we support conversion from variant to the resulting type.
variant_wrapper = createWrapperIfCanConvert(variant_types_from_shared_variant[i], result_type);
else
variant_wrapper = prepareUnpackDictionaries(variant_types_from_shared_variant[i], result_type);
ColumnPtr casted_variant;
/// Check if we have wrapper for this variant.
if (variant_wrapper)
casted_variant = variant_wrapper(variant, result_type, nullptr, variant_columns_from_shared_variant[i]->size());
casted_shared_variant_columns.push_back(casted_variant);
}
/// Construct result column from all casted variants.
@ -4427,11 +4493,23 @@ private:
{
auto global_discr = variant_column.globalDiscriminatorByLocal(local_discriminators[i]);
if (global_discr == ColumnVariant::NULL_DISCRIMINATOR)
{
res->insertDefault();
}
else if (global_discr == shared_variant_discr)
{
if (casted_shared_variant_columns[shared_variant_indexes[i]])
res->insertFrom(*casted_shared_variant_columns[shared_variant_indexes[i]], shared_variant_offsets[i]);
else
res->insertDefault();
}
else
{
if (casted_variant_columns[global_discr])
res->insertFrom(*casted_variant_columns[global_discr], offsets[i]);
else
res->insertDefault();
}
}
return res;
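The assembly loops above (for Variant and for Dynamic alike) reduce to: for each row, find the variant it belongs to; if that variant got no wrapper, emit a default value (NULL under accurateOrNull); otherwise copy the pre-cast value by offset. A standalone model with plain vectors:
#include <cstddef>
#include <optional>
#include <vector>
constexpr int NULL_DISCRIMINATOR = -1;
/// rows[i]: which variant row i belongs to (or NULL_DISCRIMINATOR);
/// offsets[i]: position of row i inside its variant's column;
/// casted[v]: pre-cast values of variant v, or std::nullopt meaning
/// "no wrapper was created for this variant".
std::vector<std::optional<int>> assemble(
    const std::vector<int> & rows,
    const std::vector<size_t> & offsets,
    const std::vector<std::optional<std::vector<int>>> & casted)
{
    std::vector<std::optional<int>> res;
    res.reserve(rows.size());
    for (size_t i = 0; i < rows.size(); ++i)
    {
        const int d = rows[i];
        if (d == NULL_DISCRIMINATOR || !casted[d])
            res.push_back(std::nullopt);              /// insertDefault()
        else
            res.push_back((*casted[d])[offsets[i]]);  /// insertFrom(...)
    }
    return res;
}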

View File

@ -1,7 +1,8 @@
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnTuple.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <IO/WriteHelpers.h>
@ -19,16 +20,15 @@ namespace ErrorCodes
}
/// arrayZip(['a', 'b', 'c'], ['d', 'e', 'f']) = [('a', 'd'), ('b', 'e'), ('c', 'f')]
/// arrayZipUnaligned(['a', 'b', 'c'], ['d', 'e']) = [('a', 'd'), ('b', 'e'), ('c', null)]
template <bool allow_unaligned>
class FunctionArrayZip : public IFunction
{
public:
static constexpr auto name = "arrayZip";
static constexpr auto name = allow_unaligned ? "arrayZipUnaligned" : "arrayZip";
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionArrayZip>(); }
String getName() const override
{
return name;
}
String getName() const override { return name; }
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
@ -39,8 +39,11 @@ public:
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (arguments.empty())
throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION,
"Function {} needs at least one argument; passed {}." , getName(), arguments.size());
throw Exception(
ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION,
"Function {} needs at least one argument; passed {}.",
getName(),
arguments.size());
DataTypes arguments_types;
for (size_t index = 0; index < arguments.size(); ++index)
@ -48,56 +51,142 @@ public:
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[index].type.get());
if (!array_type)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument {} of function {} must be array. Found {} instead.",
toString(index + 1), getName(), arguments[0].type->getName());
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Argument {} of function {} must be array. Found {} instead.",
toString(index + 1),
getName(),
arguments[0].type->getName());
arguments_types.emplace_back(array_type->getNestedType());
auto nested_type = array_type->getNestedType();
if constexpr (allow_unaligned)
nested_type = makeNullable(nested_type);
arguments_types.emplace_back(nested_type);
}
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(arguments_types));
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
ColumnPtr
executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override
{
size_t num_arguments = arguments.size();
ColumnPtr first_array_column;
Columns holders(num_arguments);
Columns tuple_columns(num_arguments);
bool has_unaligned = false;
size_t unaligned_index = 0;
for (size_t i = 0; i < num_arguments; ++i)
{
/// Constant columns cannot be inside tuple. It's only possible to have constant tuple as a whole.
ColumnPtr holder = arguments[i].column->convertToFullColumnIfConst();
holders[i] = holder;
const ColumnArray * column_array = checkAndGetColumn<ColumnArray>(holder.get());
if (!column_array)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Argument {} of function {} must be array. Found column {} instead.",
i + 1, getName(), holder->getName());
if (i == 0)
{
first_array_column = holder;
}
else if (!column_array->hasEqualOffsets(static_cast<const ColumnArray &>(*first_array_column)))
{
throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
"The argument 1 and argument {} of function {} have different array sizes",
i + 1, getName());
}
throw Exception(
ErrorCodes::ILLEGAL_COLUMN,
"Argument {} of function {} must be array. Found column {} instead.",
i + 1,
getName(),
holder->getName());
tuple_columns[i] = column_array->getDataPtr();
if (i && !column_array->hasEqualOffsets(static_cast<const ColumnArray &>(*holders[0])))
{
has_unaligned = true;
unaligned_index = i;
}
}
if constexpr (!allow_unaligned)
{
if (has_unaligned)
throw Exception(
ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
"The argument 1 and argument {} of function {} have different array sizes",
unaligned_index + 1,
getName());
else
return ColumnArray::create(
ColumnTuple::create(tuple_columns), static_cast<const ColumnArray &>(*first_array_column).getOffsetsPtr());
ColumnTuple::create(std::move(tuple_columns)), static_cast<const ColumnArray &>(*holders[0]).getOffsetsPtr());
}
else
return executeUnaligned(holders, tuple_columns, input_rows_count, has_unaligned);
}
private:
ColumnPtr executeUnaligned(const Columns & holders, Columns & tuple_columns, size_t input_rows_count, bool has_unaligned) const
{
std::vector<const ColumnArray *> array_columns(holders.size());
for (size_t i = 0; i < holders.size(); ++i)
array_columns[i] = checkAndGetColumn<ColumnArray>(holders[i].get());
for (auto & tuple_column : tuple_columns)
tuple_column = makeNullable(tuple_column);
if (!has_unaligned)
return ColumnArray::create(ColumnTuple::create(std::move(tuple_columns)), array_columns[0]->getOffsetsPtr());
MutableColumns res_tuple_columns(tuple_columns.size());
for (size_t i = 0; i < tuple_columns.size(); ++i)
{
res_tuple_columns[i] = tuple_columns[i]->cloneEmpty();
res_tuple_columns[i]->reserve(tuple_columns[i]->size());
}
auto res_offsets_column = ColumnArray::ColumnOffsets::create(input_rows_count);
auto & res_offsets = assert_cast<ColumnArray::ColumnOffsets &>(*res_offsets_column).getData();
size_t curr_offset = 0;
for (size_t row_i = 0; row_i < input_rows_count; ++row_i)
{
size_t max_size = 0;
for (size_t arg_i = 0; arg_i < holders.size(); ++arg_i)
{
const auto * array_column = array_columns[arg_i];
const auto & offsets = array_column->getOffsets();
size_t array_offset = offsets[row_i - 1];
size_t array_size = offsets[row_i] - array_offset;
res_tuple_columns[arg_i]->insertRangeFrom(*tuple_columns[arg_i], array_offset, array_size);
max_size = std::max(max_size, array_size);
}
for (size_t arg_i = 0; arg_i < holders.size(); ++arg_i)
{
const auto * array_column = array_columns[arg_i];
const auto & offsets = array_column->getOffsets();
size_t array_offset = offsets[row_i - 1];
size_t array_size = offsets[row_i] - array_offset;
res_tuple_columns[arg_i]->insertManyDefaults(max_size - array_size);
}
curr_offset += max_size;
res_offsets[row_i] = curr_offset;
}
return ColumnArray::create(ColumnTuple::create(std::move(res_tuple_columns)), std::move(res_offsets_column));
}
};
REGISTER_FUNCTION(ArrayZip)
{
factory.registerFunction<FunctionArrayZip>();
factory.registerFunction<FunctionArrayZip<false>>(
{.description = R"(
Combines multiple arrays into a single array. The resulting array contains the corresponding elements of the source arrays grouped into tuples in the listed order of arguments.
)",
.categories{"String"}});
factory.registerFunction<FunctionArrayZip<true>>(
{.description = R"(
Combines multiple arrays into a single array, allowing for unaligned arrays. The resulting array contains the corresponding elements of the source arrays grouped into tuples in the listed order of arguments.
If the arrays have different sizes, the shorter arrays will be padded with `null` values.
)",
.categories{"String"}}
);
}
}
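The padding rule implemented by executeUnaligned() is easiest to see on plain containers: every row is extended to the longest argument's length, with missing positions filled by nulls. A minimal sketch with std::optional standing in for Nullable:
#include <algorithm>
#include <cstddef>
#include <optional>
#include <tuple>
#include <vector>
using Row = std::vector<std::optional<int>>;
/// Zip two rows, padding the shorter one with nulls up to the longer length,
/// mirroring the rule executeUnaligned() applies per array offset range.
std::vector<std::tuple<std::optional<int>, std::optional<int>>>
zipUnaligned(const Row & a, const Row & b)
{
    const size_t max_size = std::max(a.size(), b.size());
    std::vector<std::tuple<std::optional<int>, std::optional<int>>> res;
    res.reserve(max_size);
    for (size_t i = 0; i < max_size; ++i)
        res.emplace_back(
            i < a.size() ? a[i] : std::optional<int>{},
            i < b.size() ? b[i] : std::optional<int>{});
    return res;
}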

View File

@ -396,7 +396,7 @@ const ActionsDAG::Node * ActionsDAG::tryFindInOutputs(const std::string & name)
return nullptr;
}
ActionsDAG::NodeRawConstPtrs ActionsDAG::findInOutpus(const Names & names) const
ActionsDAG::NodeRawConstPtrs ActionsDAG::findInOutputs(const Names & names) const
{
NodeRawConstPtrs required_nodes;
required_nodes.reserve(names.size());
@ -524,7 +524,7 @@ void ActionsDAG::removeUnusedActions(const NameSet & required_names, bool allow_
void ActionsDAG::removeUnusedActions(const Names & required_names, bool allow_remove_inputs, bool allow_constant_folding)
{
auto required_nodes = findInOutpus(required_names);
auto required_nodes = findInOutputs(required_names);
outputs.swap(required_nodes);
removeUnusedActions(allow_remove_inputs, allow_constant_folding);
}

View File

@ -156,7 +156,7 @@ public:
const Node * tryFindInOutputs(const std::string & name) const;
/// Same, but for the list of names.
NodeRawConstPtrs findInOutpus(const Names & names) const;
NodeRawConstPtrs findInOutputs(const Names & names) const;
/// Find first node with the same name in output nodes and replace it.
/// If was not found, add node to outputs end.
@ -436,7 +436,7 @@ public:
/// Returns a list of nodes representing atomic predicates.
static NodeRawConstPtrs extractConjunctionAtoms(const Node * predicate);
/// Get a list of nodes. For every node, check if it can be compused using allowed subset of inputs.
/// Get a list of nodes. For every node, check if it can be computed using allowed subset of inputs.
/// Returns only those nodes from the list which can be computed.
static NodeRawConstPtrs filterNodesByAllowedInputs(
NodeRawConstPtrs nodes,

View File

@ -33,6 +33,8 @@
#include <Common/SensitiveDataMasker.h>
#include <Common/SipHash.h>
#include <Common/logger_useful.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTIdentifier.h>
namespace CurrentMetrics
{
@ -308,16 +310,32 @@ void AsynchronousInsertQueue::preprocessInsertQuery(const ASTPtr & query, const
/* no_squash */ false,
/* no_destination */ false,
/* async_insert */ false);
auto table = interpreter.getTable(insert_query);
auto sample_block = InterpreterInsertQuery::getSampleBlock(insert_query, table, table->getInMemoryMetadataPtr(), query_context);
if (!FormatFactory::instance().isInputFormat(insert_query.format))
{
if (insert_query.format.empty() && insert_query.infile)
{
const auto & in_file_node = insert_query.infile->as<ASTLiteral &>();
const auto in_file = in_file_node.value.safeGet<std::string>();
const auto in_file_format = FormatFactory::instance().getFormatFromFileName(in_file);
if (!FormatFactory::instance().isInputFormat(in_file_format))
throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown input INFILE format {}", in_file_format);
}
else
throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown input format {}", insert_query.format);
}
/// For table functions we check access while executing
/// InterpreterInsertQuery::getTable() -> ITableFunction::execute().
if (insert_query.table_id)
query_context->checkAccess(AccessType::INSERT, insert_query.table_id, sample_block.getNames());
insert_query.columns = std::make_shared<ASTExpressionList>();
for (const auto & column : sample_block)
insert_query.columns->children.push_back(std::make_shared<ASTIdentifier>(column.name));
}
AsynchronousInsertQueue::PushResult
@ -696,6 +714,17 @@ catch (...)
tryLogCurrentException("AsynchronousInsertQueue", "Failed to add elements to AsynchronousInsertLog");
}
void convertBlockToHeader(Block & block, const Block & header)
{
auto converting_dag = ActionsDAG::makeConvertingActions(
block.getColumnsWithTypeAndName(),
header.getColumnsWithTypeAndName(),
ActionsDAG::MatchColumnsMode::Name);
auto converting_actions = std::make_shared<ExpressionActions>(std::move(converting_dag));
converting_actions->execute(block);
}
String serializeQuery(const IAST & query, size_t max_length)
{
return query.hasSecretParts()
@ -791,6 +820,61 @@ try
if (async_insert_log)
log_elements.reserve(data->entries.size());
auto add_entry_to_asynchronous_insert_log = [&, query_by_format = NameToNameMap{}](
const InsertData::EntryPtr & entry,
const String & parsing_exception,
size_t num_rows,
size_t num_bytes) mutable
{
if (!async_insert_log)
return;
AsynchronousInsertLogElement elem;
elem.event_time = timeInSeconds(entry->create_time);
elem.event_time_microseconds = timeInMicroseconds(entry->create_time);
elem.database = query_database;
elem.table = query_table;
elem.format = entry->format;
elem.query_id = entry->query_id;
elem.bytes = num_bytes;
elem.rows = num_rows;
elem.exception = parsing_exception;
elem.data_kind = entry->chunk.getDataKind();
elem.timeout_milliseconds = data->timeout_ms.count();
elem.flush_query_id = insert_query_id;
auto get_query_by_format = [&](const String & format) -> const String &
{
auto [it, inserted] = query_by_format.try_emplace(format);
if (!inserted)
return it->second;
auto query = key.query->clone();
assert_cast<ASTInsertQuery &>(*query).format = format;
it->second = serializeQuery(*query, insert_context->getSettingsRef().log_queries_cut_to_length);
return it->second;
};
if (entry->chunk.getDataKind() == DataKind::Parsed)
elem.query_for_logging = key.query_str;
else
elem.query_for_logging = get_query_by_format(entry->format);
/// If there was a parsing error,
/// the entry won't be flushed anyway,
/// so add the log element immediately.
if (!elem.exception.empty())
{
elem.status = AsynchronousInsertLogElement::ParsingError;
async_insert_log->add(std::move(elem));
}
else
{
elem.status = AsynchronousInsertLogElement::Ok;
log_elements.push_back(std::move(elem));
}
};
try
{
interpreter = std::make_unique<InterpreterInsertQuery>(
@ -819,49 +903,20 @@ try
catch (...)
{
logExceptionBeforeStart(query_for_logging, insert_context, key.query, query_span, start_watch.elapsedMilliseconds());
if (async_insert_log)
{
for (const auto & entry : data->entries)
add_entry_to_asynchronous_insert_log(entry, /*parsing_exception=*/ "", /*num_rows=*/ 0, entry->chunk.byteSize());
auto exception = getCurrentExceptionMessage(false);
auto flush_time = std::chrono::system_clock::now();
appendElementsToLogSafe(*async_insert_log, std::move(log_elements), flush_time, exception);
}
throw;
}
auto add_entry_to_asynchronous_insert_log = [&](const auto & entry,
const auto & entry_query_for_logging,
const auto & exception,
size_t num_rows,
size_t num_bytes,
Milliseconds timeout_ms)
{
if (!async_insert_log)
return;
AsynchronousInsertLogElement elem;
elem.event_time = timeInSeconds(entry->create_time);
elem.event_time_microseconds = timeInMicroseconds(entry->create_time);
elem.query_for_logging = entry_query_for_logging;
elem.database = query_database;
elem.table = query_table;
elem.format = entry->format;
elem.query_id = entry->query_id;
elem.bytes = num_bytes;
elem.rows = num_rows;
elem.exception = exception;
elem.data_kind = entry->chunk.getDataKind();
elem.timeout_milliseconds = timeout_ms.count();
elem.flush_query_id = insert_query_id;
/// If there was a parsing error,
/// the entry won't be flushed anyway,
/// so add the log element immediately.
if (!elem.exception.empty())
{
elem.status = AsynchronousInsertLogElement::ParsingError;
async_insert_log->add(std::move(elem));
}
else
{
log_elements.push_back(elem);
}
};
auto finish_entries = [&]
auto finish_entries = [&](size_t num_rows, size_t num_bytes)
{
for (const auto & entry : data->entries)
{
@ -874,20 +929,7 @@ try
auto flush_time = std::chrono::system_clock::now();
appendElementsToLogSafe(*async_insert_log, std::move(log_elements), flush_time, "");
}
};
Chunk chunk;
auto header = pipeline.getHeader();
if (key.data_kind == DataKind::Parsed)
chunk = processEntriesWithParsing(key, data, header, insert_context, log, add_entry_to_asynchronous_insert_log);
else
chunk = processPreprocessedEntries(key, data, header, insert_context, add_entry_to_asynchronous_insert_log);
ProfileEvents::increment(ProfileEvents::AsyncInsertRows, chunk.getNumRows());
auto log_and_add_finish_to_query_log = [&](size_t num_rows, size_t num_bytes)
{
LOG_DEBUG(log, "Flushed {} rows, {} bytes for query '{}'", num_rows, num_bytes, key.query_str);
queue_shard_flush_time_history.updateWithCurrentTime();
@ -896,16 +938,24 @@ try
query_log_elem, insert_context, key.query, pipeline, pulling_pipeline, query_span, QueryCache::Usage::None, internal);
};
try
{
Chunk chunk;
auto header = pipeline.getHeader();
if (key.data_kind == DataKind::Parsed)
chunk = processEntriesWithParsing(key, data, header, insert_context, log, add_entry_to_asynchronous_insert_log);
else
chunk = processPreprocessedEntries(data, header, add_entry_to_asynchronous_insert_log);
ProfileEvents::increment(ProfileEvents::AsyncInsertRows, chunk.getNumRows());
if (chunk.getNumRows() == 0)
{
finish_entries();
log_and_add_finish_to_query_log(0, 0);
finish_entries(/*num_rows=*/ 0, /*num_bytes=*/ 0);
return;
}
try
{
size_t num_rows = chunk.getNumRows();
size_t num_bytes = chunk.bytes();
@ -915,7 +965,7 @@ try
CompletedPipelineExecutor completed_executor(pipeline);
completed_executor.execute();
log_and_add_finish_to_query_log(num_rows, num_bytes);
finish_entries(num_rows, num_bytes);
}
catch (...)
{
@ -929,8 +979,6 @@ try
}
throw;
}
finish_entries();
}
catch (const Exception & e)
{
@ -991,7 +1039,6 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(
StreamingFormatExecutor executor(header, format, std::move(on_error), std::move(adding_defaults_transform));
auto chunk_info = std::make_shared<AsyncInsertInfo>();
auto query_for_logging = serializeQuery(*key.query, insert_context->getSettingsRef().log_queries_cut_to_length);
for (const auto & entry : data->entries)
{
@ -1009,7 +1056,8 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(
size_t num_rows = executor.execute(*buffer);
total_rows += num_rows;
/// for some reason, client can pass zero rows and bytes to server.
/// For some reason, client can pass zero rows and bytes to server.
/// We don't update offsets in this case, because we assume every insert has some rows during dedup
/// but we have nothing to deduplicate for this insert.
if (num_rows > 0)
@ -1018,8 +1066,7 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(
chunk_info->tokens.push_back(entry->async_dedup_token);
}
add_to_async_insert_log(entry, query_for_logging, current_exception, num_rows, num_bytes, data->timeout_ms);
add_to_async_insert_log(entry, current_exception, num_rows, num_bytes);
current_exception.clear();
entry->resetChunk();
}
@ -1031,30 +1078,14 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(
template <typename LogFunc>
Chunk AsynchronousInsertQueue::processPreprocessedEntries(
const InsertQuery & key,
const InsertDataPtr & data,
const Block & header,
const ContextPtr & insert_context,
LogFunc && add_to_async_insert_log)
{
size_t total_rows = 0;
auto chunk_info = std::make_shared<AsyncInsertInfo>();
auto result_columns = header.cloneEmptyColumns();
std::unordered_map<String, String> format_to_query;
auto get_query_by_format = [&](const String & format) -> const String &
{
auto [it, inserted] = format_to_query.try_emplace(format);
if (!inserted)
return it->second;
auto query = key.query->clone();
assert_cast<ASTInsertQuery &>(*query).format = format;
it->second = serializeQuery(*query, insert_context->getSettingsRef().log_queries_cut_to_length);
return it->second;
};
for (const auto & entry : data->entries)
{
const auto * block = entry->chunk.asBlock();
@ -1062,23 +1093,26 @@ Chunk AsynchronousInsertQueue::processPreprocessedEntries(
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Expected entry with data kind Preprocessed. Got: {}", entry->chunk.getDataKind());
auto columns = block->getColumns();
Block block_to_insert = *block;
if (!isCompatibleHeader(block_to_insert, header))
convertBlockToHeader(block_to_insert, header);
auto columns = block_to_insert.getColumns();
for (size_t i = 0, s = columns.size(); i < s; ++i)
result_columns[i]->insertRangeFrom(*columns[i], 0, columns[i]->size());
total_rows += block->rows();
/// for some reason, client can pass zero rows and bytes to server.
total_rows += block_to_insert.rows();
/// For some reason, client can pass zero rows and bytes to server.
/// We don't update offsets in this case, because we assume every insert has some rows during dedup,
/// but we have nothing to deduplicate for this insert.
if (block->rows())
if (block_to_insert.rows() > 0)
{
chunk_info->offsets.push_back(total_rows);
chunk_info->tokens.push_back(entry->async_dedup_token);
}
const auto & query_for_logging = get_query_by_format(entry->format);
add_to_async_insert_log(entry, query_for_logging, "", block->rows(), block->bytes(), data->timeout_ms);
add_to_async_insert_log(entry, /*parsing_exception=*/ "", block_to_insert.rows(), block_to_insert.bytes());
entry->resetChunk();
}

View File

@ -288,10 +288,8 @@ private:
template <typename LogFunc>
static Chunk processPreprocessedEntries(
const InsertQuery & key,
const InsertDataPtr & data,
const Block & header,
const ContextPtr & insert_context,
LogFunc && add_to_async_insert_log);
template <typename E>

View File

@ -33,25 +33,19 @@ public:
}
};
/// Duplicate of code from StringUtils.h. Copied here for less dependencies.
static bool startsWith(const std::string & s, const char * prefix)
{
return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix));
}
Lemmatizers::Lemmatizers(const Poco::Util::AbstractConfiguration & config)
{
String prefix = "lemmatizers";
Poco::Util::AbstractConfiguration::Keys keys;
const String prefix = "lemmatizers";
if (!config.has(prefix))
throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "No lemmatizers specified in server config on prefix '{}'", prefix);
return;
Poco::Util::AbstractConfiguration::Keys keys;
config.keys(prefix, keys);
for (const auto & key : keys)
{
if (startsWith(key, "lemmatizer"))
if (key.starts_with("lemmatizer"))
{
const auto & lemm_name = config.getString(prefix + "." + key + ".lang", "");
const auto & lemm_path = config.getString(prefix + "." + key + ".path", "");
@ -81,13 +75,13 @@ Lemmatizers::LemmPtr Lemmatizers::getLemmatizer(const String & name)
if (paths.find(name) != paths.end())
{
if (!std::filesystem::exists(paths[name]))
throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Incorrect path to lemmatizer: {}", paths[name]);
throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Path to lemmatizer does not exist: {}", paths[name]);
lemmatizers[name] = std::make_shared<Lemmatizer>(paths[name]);
return lemmatizers[name];
}
throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Lemmatizer named: '{}' is not found", name);
throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Lemmatizer with the name '{}' was not found in the configuration", name);
}
}

View File

@ -147,6 +147,7 @@ ColumnDependencies getAllColumnDependencies(
bool isStorageTouchedByMutations(
MergeTreeData::DataPartPtr source_part,
MergeTreeData::MutationsSnapshotPtr mutations_snapshot,
const StorageMetadataPtr & metadata_snapshot,
const std::vector<MutationCommand> & commands,
ContextPtr context)
@ -154,7 +155,7 @@ bool isStorageTouchedByMutations(
if (commands.empty())
return false;
auto storage_from_part = std::make_shared<StorageFromMergeTreeDataPart>(source_part);
auto storage_from_part = std::make_shared<StorageFromMergeTreeDataPart>(source_part, mutations_snapshot);
bool all_commands_can_be_skipped = true;
for (const auto & command : commands)
@ -285,8 +286,13 @@ MutationsInterpreter::Source::Source(StoragePtr storage_) : storage(std::move(st
{
}
MutationsInterpreter::Source::Source(MergeTreeData & storage_, MergeTreeData::DataPartPtr source_part_)
: data(&storage_), part(std::move(source_part_))
MutationsInterpreter::Source::Source(
MergeTreeData & storage_,
MergeTreeData::DataPartPtr source_part_,
AlterConversionsPtr alter_conversions_)
: data(&storage_)
, part(std::move(source_part_))
, alter_conversions(std::move(alter_conversions_))
{
}
@ -386,13 +392,14 @@ MutationsInterpreter::MutationsInterpreter(
MutationsInterpreter::MutationsInterpreter(
MergeTreeData & storage_,
MergeTreeData::DataPartPtr source_part_,
AlterConversionsPtr alter_conversions_,
StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_,
Names available_columns_,
ContextPtr context_,
Settings settings_)
: MutationsInterpreter(
Source(storage_, std::move(source_part_)),
Source(storage_, std::move(source_part_), std::move(alter_conversions_)),
std::move(metadata_snapshot_), std::move(commands_),
std::move(available_columns_), std::move(context_), std::move(settings_))
{
@ -1218,8 +1225,10 @@ void MutationsInterpreter::Source::read(
createReadFromPartStep(
MergeTreeSequentialSourceType::Mutation,
plan,
*data, storage_snapshot,
*data,
storage_snapshot,
part,
alter_conversions,
required_columns,
nullptr,
apply_deleted_mask_,

View File

@ -20,6 +20,7 @@ using QueryPipelineBuilderPtr = std::unique_ptr<QueryPipelineBuilder>;
/// Return false if the data isn't going to be changed by mutations.
bool isStorageTouchedByMutations(
MergeTreeData::DataPartPtr source_part,
MergeTreeData::MutationsSnapshotPtr mutations_snapshot,
const StorageMetadataPtr & metadata_snapshot,
const std::vector<MutationCommand> & commands,
ContextPtr context
@ -70,6 +71,7 @@ public:
MutationsInterpreter(
MergeTreeData & storage_,
MergeTreeData::DataPartPtr source_part_,
AlterConversionsPtr alter_conversions_,
StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_,
Names available_columns_,
@ -137,7 +139,7 @@ public:
bool can_execute_) const;
explicit Source(StoragePtr storage_);
Source(MergeTreeData & storage_, MergeTreeData::DataPartPtr source_part_);
Source(MergeTreeData & storage_, MergeTreeData::DataPartPtr source_part_, AlterConversionsPtr alter_conversions_);
private:
StoragePtr storage;
@ -145,6 +147,7 @@ public:
/// Special case for *MergeTree.
MergeTreeData * data = nullptr;
MergeTreeData::DataPartPtr part;
AlterConversionsPtr alter_conversions;
};
private:

View File

@ -30,6 +30,8 @@ ColumnsDescription ProcessorProfileLogElement::getColumnsDescription()
{"id", std::make_shared<DataTypeUInt64>(), "ID of processor."},
{"parent_ids", std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>()), "Parent processors IDs."},
{"plan_step", std::make_shared<DataTypeUInt64>(), "ID of the query plan step which created this processor. The value is zero if the processor was not added from any step."},
{"plan_step_name", std::make_shared<DataTypeString>(), "Name of the query plan step which created this processor. The value is empty if the processor was not added from any step."},
{"plan_step_description", std::make_shared<DataTypeString>(), "Description of the query plan step which created this processor. The value is empty if the processor was not added from any step."},
{"plan_group", std::make_shared<DataTypeUInt64>(), "Group of the processor if it was created by query plan step. A group is a logical partitioning of processors added from the same query plan step. Group is used only for beautifying the result of EXPLAIN PIPELINE result."},
{"initial_query_id", std::make_shared<DataTypeString>(), "ID of the initial query (for distributed query execution)."},
@ -64,6 +66,8 @@ void ProcessorProfileLogElement::appendToBlock(MutableColumns & columns) const
}
columns[i++]->insert(plan_step);
columns[i++]->insert(plan_step_name);
columns[i++]->insert(plan_step_description);
columns[i++]->insert(plan_group);
columns[i++]->insertData(initial_query_id.data(), initial_query_id.size());
columns[i++]->insertData(query_id.data(), query_id.size());

View File

@ -19,6 +19,8 @@ struct ProcessorProfileLogElement
UInt64 plan_step{};
UInt64 plan_group{};
String plan_step_name;
String plan_step_description;
String initial_query_id;
String query_id;

View File

@ -478,6 +478,8 @@ void logQueryFinish(
processor_elem.parent_ids = std::move(parents);
processor_elem.plan_step = reinterpret_cast<std::uintptr_t>(processor->getQueryPlanStep());
processor_elem.plan_step_name = processor->getPlanStepName();
processor_elem.plan_step_description = processor->getPlanStepDescription();
processor_elem.plan_group = processor->getQueryPlanStepGroup();
processor_elem.processor_name = processor->getName();
@ -793,7 +795,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
/// Verify that AST formatting is consistent:
/// If you format AST, parse it back, and format it again, you get the same string.
String formatted1 = ast->formatWithPossiblyHidingSensitiveData(0, true, true, false);
String formatted1 = ast->formatWithPossiblyHidingSensitiveData(0, true, true, false, false, IdentifierQuotingStyle::Backticks);
/// The query can become more verbose after formatting, so:
size_t new_max_query_size = max_query_size > 0 ? (1000 + 2 * max_query_size) : 0;
@ -818,7 +820,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
chassert(ast2);
String formatted2 = ast2->formatWithPossiblyHidingSensitiveData(0, true, true, false);
String formatted2 = ast2->formatWithPossiblyHidingSensitiveData(0, true, true, false, false, IdentifierQuotingStyle::Backticks);
if (formatted1 != formatted2)
throw Exception(ErrorCodes::LOGICAL_ERROR,

View File

@ -26,7 +26,12 @@ inline String format(const SecretHidingFormatSettings & settings)
&& settings.ctx->getAccess()->isGranted(AccessType::displaySecretsInShowAndSelect);
return settings.query.formatWithPossiblyHidingSensitiveData(
settings.max_length, settings.one_line, show_secrets, settings.ctx->getSettingsRef().print_pretty_type_names);
settings.max_length,
settings.one_line,
show_secrets,
settings.ctx->getSettingsRef().print_pretty_type_names,
settings.ctx->getSettingsRef().output_format_always_quote_identifiers,
settings.ctx->getSettingsRef().output_format_identifier_quoting_style);
}
}

View File

@ -66,8 +66,8 @@ void ASTColumnDeclaration::formatImpl(const FormatSettings & format_settings, Fo
{
frame.need_parens = false;
/// We have to always backquote column names to avoid ambiguity with INDEX and other declarations in CREATE query.
format_settings.ostr << backQuote(name);
/// We have to always quote column names to avoid ambiguity with INDEX and other declarations in CREATE query.
format_settings.quoteIdentifier(name);
if (type)
{

View File

@ -35,7 +35,7 @@ void ASTDictionaryAttributeDeclaration::formatImpl(const FormatSettings & settin
{
frame.need_parens = false;
settings.ostr << backQuote(name);
settings.quoteIdentifier(name);
if (type)
{

View File

@ -79,7 +79,7 @@ void ASTIndexDeclaration::formatImpl(const FormatSettings & s, FormatState & sta
}
else
{
s.ostr << backQuoteIfNeed(name);
s.writeIdentifier(name);
s.ostr << " ";
expr->formatImpl(s, state, frame);
}

View File

@ -17,7 +17,7 @@ ASTPtr ASTProjectionDeclaration::clone() const
void ASTProjectionDeclaration::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << backQuoteIfNeed(name);
settings.writeIdentifier(name);
std::string indent_str = settings.one_line ? "" : std::string(4u * frame.indent, ' ');
std::string nl_or_nothing = settings.one_line ? "" : "\n";
settings.ostr << settings.nl_or_ws << indent_str << "(" << nl_or_nothing;

View File

@ -22,10 +22,8 @@ ASTPtr ASTTableOverride::clone() const
return res;
}
void ASTTableOverride::formatImpl(const FormatSettings & settings_, FormatState & state, FormatStateStacked frame) const
void ASTTableOverride::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
FormatSettings settings = settings_;
settings.always_quote_identifiers = true;
String nl_or_nothing = settings.one_line ? "" : "\n";
String nl_or_ws = settings.one_line ? " " : "\n";
String hl_keyword = settings.hilite ? hilite_keyword : "";

View File

@ -165,12 +165,21 @@ size_t IAST::checkDepthImpl(size_t max_depth) const
return res;
}
String IAST::formatWithPossiblyHidingSensitiveData(size_t max_length, bool one_line, bool show_secrets, bool print_pretty_type_names) const
String IAST::formatWithPossiblyHidingSensitiveData(
size_t max_length,
bool one_line,
bool show_secrets,
bool print_pretty_type_names,
bool always_quote_identifiers,
IdentifierQuotingStyle identifier_quoting_style) const
{
WriteBufferFromOwnString buf;
FormatSettings settings(buf, one_line);
settings.show_secrets = show_secrets;
settings.print_pretty_type_names = print_pretty_type_names;
settings.always_quote_identifiers = always_quote_identifiers;
settings.identifier_quoting_style = identifier_quoting_style;
format(settings);
return wipeSensitiveDataAndCutToLength(buf.str(), max_length);
}
@ -248,6 +257,34 @@ void IAST::FormatSettings::writeIdentifier(const String & name) const
}
}
void IAST::FormatSettings::quoteIdentifier(const String & name) const
{
switch (identifier_quoting_style)
{
case IdentifierQuotingStyle::None:
{
writeBackQuotedString(name, ostr);
break;
}
case IdentifierQuotingStyle::Backticks:
{
writeBackQuotedString(name, ostr);
break;
}
case IdentifierQuotingStyle::DoubleQuotes:
{
writeDoubleQuotedString(name, ostr);
break;
}
case IdentifierQuotingStyle::BackticksMySQL:
{
writeBackQuotedStringMySQL(name, ostr);
break;
}
}
}
void IAST::dumpTree(WriteBuffer & ostr, size_t indent) const
{
String indent_str(indent, '-');

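A compact sketch of the dispatch implemented by IAST::FormatSettings::quoteIdentifier above, assuming a simplified enum and plain string helpers instead of the real write*QuotedString functions. Per the switch, IdentifierQuotingStyle::None still falls back to backticks, because quoteIdentifier is used where quoting is mandatory regardless of settings.

#include <iostream>
#include <string>

enum class IdentifierQuotingStyle { None, Backticks, DoubleQuotes, BackticksMySQL };

std::string quoteIdentifier(const std::string & name, IdentifierQuotingStyle style)
{
    switch (style)
    {
        /// `None` still quotes with backticks: this helper is only called where
        /// quoting is mandatory (e.g. column declarations).
        case IdentifierQuotingStyle::None:
        case IdentifierQuotingStyle::Backticks:
            return "`" + name + "`";
        case IdentifierQuotingStyle::DoubleQuotes:
            return "\"" + name + "\"";
        case IdentifierQuotingStyle::BackticksMySQL:
            return "`" + name + "`";  /// the real helper differs only in how embedded backticks are escaped
    }
    return name;
}

int main()
{
    std::cout << quoteIdentifier("created", IdentifierQuotingStyle::DoubleQuotes) << '\n';  // "created"
    std::cout << quoteIdentifier("created", IdentifierQuotingStyle::Backticks) << '\n';     // `created`
}
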
View File

@ -238,6 +238,9 @@ public:
}
void writeIdentifier(const String & name) const;
// Quote identifier `name` even when `always_quote_identifiers` is false.
// If `identifier_quoting_style` is `IdentifierQuotingStyle::None`, quote it with `IdentifierQuotingStyle::Backticks`
void quoteIdentifier(const String & name) const;
};
/// State. For example, a set of nodes can be remembered, which we already walk through.
@ -278,7 +281,13 @@ public:
/// Secrets are displayed regarding show_secrets, then SensitiveDataMasker is applied.
/// You can use Interpreters/formatWithPossiblyHidingSecrets.h for convenience.
String formatWithPossiblyHidingSensitiveData(size_t max_length, bool one_line, bool show_secrets, bool print_pretty_type_names) const;
String formatWithPossiblyHidingSensitiveData(
size_t max_length,
bool one_line,
bool show_secrets,
bool print_pretty_type_names,
bool always_quote_identifiers,
IdentifierQuotingStyle identifier_quoting_style) const;
/** formatForLogging and formatForErrorMessage always hide secrets. This inconsistent
* behaviour is due to the fact such functions are called from Client which knows nothing about
@ -287,12 +296,12 @@ public:
*/
String formatForLogging(size_t max_length = 0) const
{
return formatWithPossiblyHidingSensitiveData(max_length, true, false, false);
return formatWithPossiblyHidingSensitiveData(max_length, true, false, false, false, IdentifierQuotingStyle::Backticks);
}
String formatForErrorMessage() const
{
return formatWithPossiblyHidingSensitiveData(0, true, false, false);
return formatWithPossiblyHidingSensitiveData(0, true, false, false, false, IdentifierQuotingStyle::Backticks);
}
virtual bool hasSecretParts() const { return childrenHaveSecretParts(); }

View File

@ -1,6 +1,8 @@
#include <Common/typeid_cast.h>
#include <Parsers/ParserStringAndSubstitution.h>
#include <Parsers/ParserAlterQuery.h>
#include <Parsers/ASTAlterQuery.h>
#include <Parsers/ASTColumnDeclaration.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/CommonParsers.h>
#include <Parsers/ExpressionElementParsers.h>
#include <Parsers/ExpressionListParsers.h>
@ -9,14 +11,19 @@
#include <Parsers/ParserRefreshStrategy.h>
#include <Parsers/ParserSelectWithUnionQuery.h>
#include <Parsers/ParserSetQuery.h>
#include <Parsers/ASTAlterQuery.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ParserStringAndSubstitution.h>
#include <Parsers/parseDatabaseAndTableName.h>
#include <Common/typeid_cast.h>
namespace DB
{
namespace ErrorCodes
{
extern const int SYNTAX_ERROR;
}
bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
auto command = std::make_shared<ASTAlterCommand>();
@ -122,7 +129,6 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
ParserCompoundIdentifier parser_name;
ParserStringLiteral parser_string_literal;
ParserStringAndSubstitution parser_string_and_substituion;
ParserIdentifier parser_remove_property;
ParserCompoundColumnDeclaration parser_col_decl;
ParserIndexDeclaration parser_idx_decl;
ParserStatisticsDeclaration parser_stat_decl;
@ -725,8 +731,21 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
if (!parser_modify_col_decl.parse(pos, command_col_decl, expected))
return false;
auto check_no_type = [&](const std::string_view keyword)
{
const auto & column_decl = command_col_decl->as<const ASTColumnDeclaration &>();
if (!column_decl.children.empty() || column_decl.null_modifier.has_value() || !column_decl.default_specifier.empty()
|| column_decl.ephemeral_default || column_decl.primary_key_specifier)
{
throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot specify column properties before '{}'", keyword);
}
};
if (s_remove.ignore(pos, expected))
{
check_no_type(s_remove.getName());
if (s_default.ignore(pos, expected))
command->remove_property = toStringView(Keyword::DEFAULT);
else if (s_materialized.ignore(pos, expected))
@ -746,11 +765,15 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
}
else if (s_modify_setting.ignore(pos, expected))
{
check_no_type(s_modify_setting.getName());
if (!parser_settings.parse(pos, command_settings_changes, expected))
return false;
}
else if (s_reset_setting.ignore(pos, expected))
{
check_no_type(s_reset_setting.getName());
if (!parser_reset_setting.parse(pos, command_settings_resets, expected))
return false;
}
@ -765,6 +788,11 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
}
}
command->type = ASTAlterCommand::MODIFY_COLUMN;
/// Make sure that type is not populated when REMOVE/MODIFY SETTING/RESET SETTING is used, because we wouldn't modify the type, which can be confusing
chassert(
nullptr == command_col_decl->as<const ASTColumnDeclaration &>().type
|| (command->remove_property.empty() && nullptr == command_settings_changes && nullptr == command_settings_resets));
}
else if (s_modify_order_by.ignore(pos, expected))
{

View File

@ -231,31 +231,31 @@ INSTANTIATE_TEST_SUITE_P(ParserCreateDatabaseQuery, ParserTest,
},
{
"CREATE DATABASE db ENGINE=MaterializeMySQL('addr:port', 'db', 'user', 'pw') TABLE OVERRIDE `tbl`\n(PARTITION BY toYYYYMM(created))",
"CREATE DATABASE db\nENGINE = MaterializeMySQL('addr:port', 'db', 'user', 'pw')\nTABLE OVERRIDE `tbl`\n(\n PARTITION BY toYYYYMM(`created`)\n)"
"CREATE DATABASE db\nENGINE = MaterializeMySQL('addr:port', 'db', 'user', 'pw')\nTABLE OVERRIDE tbl\n(\n PARTITION BY toYYYYMM(created)\n)"
},
{
"CREATE DATABASE db ENGINE=Foo TABLE OVERRIDE `tbl` (), TABLE OVERRIDE a (COLUMNS (_created DateTime MATERIALIZED now())), TABLE OVERRIDE b (PARTITION BY rand())",
"CREATE DATABASE db\nENGINE = Foo\nTABLE OVERRIDE `tbl`\n(\n\n),\nTABLE OVERRIDE `a`\n(\n COLUMNS\n (\n `_created` DateTime MATERIALIZED now()\n )\n),\nTABLE OVERRIDE `b`\n(\n PARTITION BY rand()\n)"
"CREATE DATABASE db\nENGINE = Foo\nTABLE OVERRIDE tbl\n(\n\n),\nTABLE OVERRIDE a\n(\n COLUMNS\n (\n `_created` DateTime MATERIALIZED now()\n )\n),\nTABLE OVERRIDE b\n(\n PARTITION BY rand()\n)"
},
{
"CREATE DATABASE db ENGINE=MaterializeMySQL('addr:port', 'db', 'user', 'pw') TABLE OVERRIDE tbl (COLUMNS (id UUID) PARTITION BY toYYYYMM(created))",
"CREATE DATABASE db\nENGINE = MaterializeMySQL('addr:port', 'db', 'user', 'pw')\nTABLE OVERRIDE `tbl`\n(\n COLUMNS\n (\n `id` UUID\n )\n PARTITION BY toYYYYMM(`created`)\n)"
"CREATE DATABASE db\nENGINE = MaterializeMySQL('addr:port', 'db', 'user', 'pw')\nTABLE OVERRIDE tbl\n(\n COLUMNS\n (\n `id` UUID\n )\n PARTITION BY toYYYYMM(created)\n)"
},
{
"CREATE DATABASE db TABLE OVERRIDE tbl (COLUMNS (INDEX foo foo TYPE minmax GRANULARITY 1) PARTITION BY if(_staged = 1, 'staging', toYYYYMM(created)))",
"CREATE DATABASE db\nTABLE OVERRIDE `tbl`\n(\n COLUMNS\n (\n INDEX foo `foo` TYPE minmax GRANULARITY 1\n )\n PARTITION BY if(`_staged` = 1, 'staging', toYYYYMM(`created`))\n)"
"CREATE DATABASE db\nTABLE OVERRIDE tbl\n(\n COLUMNS\n (\n INDEX foo foo TYPE minmax GRANULARITY 1\n )\n PARTITION BY if(_staged = 1, 'staging', toYYYYMM(created))\n)"
},
{
"CREATE DATABASE db TABLE OVERRIDE t1 (TTL inserted + INTERVAL 1 MONTH DELETE), TABLE OVERRIDE t2 (TTL `inserted` + INTERVAL 2 MONTH DELETE)",
"CREATE DATABASE db\nTABLE OVERRIDE `t1`\n(\n TTL `inserted` + toIntervalMonth(1)\n),\nTABLE OVERRIDE `t2`\n(\n TTL `inserted` + toIntervalMonth(2)\n)"
"CREATE DATABASE db\nTABLE OVERRIDE t1\n(\n TTL inserted + toIntervalMonth(1)\n),\nTABLE OVERRIDE t2\n(\n TTL inserted + toIntervalMonth(2)\n)"
},
{
"CREATE DATABASE db ENGINE = MaterializeMySQL('127.0.0.1:3306', 'db', 'root', 'pw') SETTINGS allows_query_when_mysql_lost = 1 TABLE OVERRIDE tab3 (COLUMNS (_staged UInt8 MATERIALIZED 1) PARTITION BY (c3) TTL c3 + INTERVAL 10 minute), TABLE OVERRIDE tab5 (PARTITION BY (c3) TTL c3 + INTERVAL 10 minute)",
"CREATE DATABASE db\nENGINE = MaterializeMySQL('127.0.0.1:3306', 'db', 'root', 'pw')\nSETTINGS allows_query_when_mysql_lost = 1\nTABLE OVERRIDE `tab3`\n(\n COLUMNS\n (\n `_staged` UInt8 MATERIALIZED 1\n )\n PARTITION BY `c3`\n TTL `c3` + toIntervalMinute(10)\n),\nTABLE OVERRIDE `tab5`\n(\n PARTITION BY `c3`\n TTL `c3` + toIntervalMinute(10)\n)"
"CREATE DATABASE db\nENGINE = MaterializeMySQL('127.0.0.1:3306', 'db', 'root', 'pw')\nSETTINGS allows_query_when_mysql_lost = 1\nTABLE OVERRIDE tab3\n(\n COLUMNS\n (\n `_staged` UInt8 MATERIALIZED 1\n )\n PARTITION BY c3\n TTL c3 + toIntervalMinute(10)\n),\nTABLE OVERRIDE tab5\n(\n PARTITION BY c3\n TTL c3 + toIntervalMinute(10)\n)"
},
{
"CREATE DATABASE db TABLE OVERRIDE tbl (PARTITION BY toYYYYMM(created) COLUMNS (created DateTime CODEC(Delta)))",
"CREATE DATABASE db\nTABLE OVERRIDE `tbl`\n(\n COLUMNS\n (\n `created` DateTime CODEC(Delta)\n )\n PARTITION BY toYYYYMM(`created`)\n)"
"CREATE DATABASE db\nTABLE OVERRIDE tbl\n(\n COLUMNS\n (\n `created` DateTime CODEC(Delta)\n )\n PARTITION BY toYYYYMM(created)\n)"
},
{
"CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1",
@ -267,11 +267,11 @@ INSTANTIATE_TEST_SUITE_P(ParserCreateDatabaseQuery, ParserTest,
},
{
"CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1, b = 2 TABLE OVERRIDE a (ORDER BY (id, version))",
"CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2\nTABLE OVERRIDE `a`\n(\n ORDER BY (`id`, `version`)\n)"
"CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2\nTABLE OVERRIDE a\n(\n ORDER BY (id, version)\n)"
},
{
"CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1, b = 2 COMMENT 'db comment' TABLE OVERRIDE a (ORDER BY (id, version))",
"CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2\nTABLE OVERRIDE `a`\n(\n ORDER BY (`id`, `version`)\n)\nCOMMENT 'db comment'"
"CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2\nTABLE OVERRIDE a\n(\n ORDER BY (id, version)\n)\nCOMMENT 'db comment'"
}
})));

View File

@ -1287,7 +1287,8 @@ void Planner::buildPlanForUnionNode()
for (const auto & query_node : union_queries_nodes)
{
Planner query_planner(query_node, select_query_options);
Planner query_planner(query_node, select_query_options, planner_context->getGlobalPlannerContext());
query_planner.buildQueryPlanIfNeeded();
for (const auto & row_policy : query_planner.getUsedRowPolicies())
used_row_policies.insert(row_policy);

View File

@ -494,6 +494,12 @@ JoinClausesAndActions buildJoinClausesAndActions(
necessary_names.push_back(name);
};
bool is_join_with_special_storage = false;
if (const auto * right_table_node = join_node.getRightTableExpression()->as<TableNode>())
{
is_join_with_special_storage = dynamic_cast<const StorageJoin *>(right_table_node->getStorage().get());
}
for (auto & join_clause : result.join_clauses)
{
const auto & left_filter_condition_nodes = join_clause.getLeftFilterConditionNodes();
@ -561,7 +567,7 @@ JoinClausesAndActions buildJoinClausesAndActions(
if (!left_key_node->result_type->equals(*common_type))
left_key_node = &left_join_actions.addCast(*left_key_node, common_type, {});
if (!right_key_node->result_type->equals(*common_type))
if (!is_join_with_special_storage && !right_key_node->result_type->equals(*common_type))
right_key_node = &right_join_actions.addCast(*right_key_node, common_type, {});
}

View File

@ -1,5 +1,6 @@
#include <iostream>
#include <Processors/IProcessor.h>
#include <Processors/QueryPlan/IQueryPlanStep.h>
#include <Common/logger_useful.h>
#include <IO/WriteHelpers.h>
@ -9,6 +10,17 @@
namespace DB
{
void IProcessor::setQueryPlanStep(IQueryPlanStep * step, size_t group)
{
query_plan_step = step;
query_plan_step_group = group;
if (step)
{
plan_step_name = step->getName();
plan_step_description = step->getStepDescription();
}
}
void IProcessor::cancel() noexcept
{

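The new setQueryPlanStep copies the step's name and description into the processor at assignment time, so they can later be read (for example when filling processors_profile_log) without going back through the IQueryPlanStep pointer; that motivation is an inference here, not stated in the change. A simplified mirror of the pattern, with toy types:

#include <cstddef>
#include <string>

struct Step
{
    std::string name;
    std::string description;
};

class ProcessorSketch
{
public:
    void setQueryPlanStep(const Step * step, size_t group = 0)
    {
        query_plan_step = step;
        query_plan_step_group = group;
        if (step)
        {
            plan_step_name = step->name;              /// cached copies, readable later
            plan_step_description = step->description;
        }
    }

    const std::string & getPlanStepName() const { return plan_step_name; }
    const std::string & getPlanStepDescription() const { return plan_step_description; }

private:
    const Step * query_plan_step = nullptr;
    size_t query_plan_step_group = 0;
    std::string plan_step_name;
    std::string plan_step_description;
};
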
View File

@ -311,14 +311,12 @@ public:
constexpr static size_t NO_STREAM = std::numeric_limits<size_t>::max();
/// Step of QueryPlan from which processor was created.
void setQueryPlanStep(IQueryPlanStep * step, size_t group = 0)
{
query_plan_step = step;
query_plan_step_group = group;
}
void setQueryPlanStep(IQueryPlanStep * step, size_t group = 0);
IQueryPlanStep * getQueryPlanStep() const { return query_plan_step; }
size_t getQueryPlanStepGroup() const { return query_plan_step_group; }
const String & getPlanStepName() const { return plan_step_name; }
const String & getPlanStepDescription() const { return plan_step_description; }
uint64_t getElapsedNs() const { return elapsed_ns; }
uint64_t getInputWaitElapsedNs() const { return input_wait_elapsed_ns; }
@ -410,6 +408,8 @@ private:
IQueryPlanStep * query_plan_step = nullptr;
size_t query_plan_step_group = 0;
String plan_step_name;
String plan_step_description;
};

View File

@ -757,7 +757,7 @@ std::optional<String> optimizeUseAggregateProjections(QueryPlan::Node & node, Qu
projection_reading = reader.readFromParts(
/* parts = */ {},
/* alter_conversions = */ {},
reading->getMutationsSnapshot()->cloneEmpty(),
best_candidate->dag.getRequiredColumnsNames(),
proj_snapshot,
projection_query_info,

View File

@ -199,7 +199,7 @@ std::optional<String> optimizeUseNormalProjections(Stack & stack, QueryPlan::Nod
auto projection_reading = reader.readFromParts(
/*parts=*/ {},
/*alter_conversions=*/ {},
reading->getMutationsSnapshot()->cloneEmpty(),
required_columns,
proj_snapshot,
query_info_copy,

View File

@ -41,12 +41,19 @@ bool canUseProjectionForReadingStep(ReadFromMergeTree * reading)
if (reading->readsInOrder())
return false;
const auto & query_settings = reading->getContext()->getSettingsRef();
// Currently projections don't support deduplication when moving parts between shards.
if (reading->getContext()->getSettingsRef().allow_experimental_query_deduplication)
if (query_settings.allow_experimental_query_deduplication)
return false;
// Currently projections don't support settings which implicitly modify aggregate functions.
if (reading->getContext()->getSettingsRef().aggregate_functions_null_for_empty)
if (query_settings.aggregate_functions_null_for_empty)
return false;
/// Don't use projections if there are mutations to apply,
/// because we need to apply them to the original data.
if (query_settings.apply_mutations_on_fly && reading->getMutationsSnapshot()->hasDataMutations())
return false;
return true;
@ -215,20 +222,15 @@ bool analyzeProjectionCandidate(
{
MergeTreeData::DataPartsVector projection_parts;
MergeTreeData::DataPartsVector normal_parts;
std::vector<AlterConversionsPtr> alter_conversions;
for (const auto & part_with_ranges : parts_with_ranges)
{
const auto & created_projections = part_with_ranges.data_part->getProjectionParts();
auto it = created_projections.find(candidate.projection->name);
if (it != created_projections.end() && !it->second->is_broken)
{
projection_parts.push_back(it->second);
}
else
{
normal_parts.push_back(part_with_ranges.data_part);
alter_conversions.push_back(part_with_ranges.alter_conversions);
}
}
if (projection_parts.empty())
@ -241,6 +243,7 @@ bool analyzeProjectionCandidate(
auto projection_result_ptr = reader.estimateNumMarksToRead(
std::move(projection_parts),
reading.getMutationsSnapshot()->cloneEmpty(),
required_column_names,
candidate.projection->metadata,
projection_query_info,
@ -254,7 +257,7 @@ bool analyzeProjectionCandidate(
if (!normal_parts.empty())
{
/// TODO: We can reuse existing analysis_result by filtering out projection parts
auto normal_result_ptr = reading.selectRangesToRead(std::move(normal_parts), std::move(alter_conversions));
auto normal_result_ptr = reading.selectRangesToRead(std::move(normal_parts));
if (normal_result_ptr->selected_marks != 0)
{

View File

@ -155,7 +155,7 @@ bool isPartitionKeySuitsGroupByKey(
return false;
/// We are interested only in calculations required to obtain group by keys (and not aggregate function arguments for example).
auto key_nodes = group_by_actions.findInOutpus(aggregating.getParams().keys);
auto key_nodes = group_by_actions.findInOutputs(aggregating.getParams().keys);
auto group_by_key_actions = ActionsDAG::cloneSubDAG(key_nodes, /*remove_aliases=*/ true);
const auto & gb_key_required_columns = group_by_key_actions.getRequiredColumnsNames();

View File

@ -229,7 +229,6 @@ public:
{
ranges_in_data_parts.emplace_back(
initial_ranges_in_data_parts[part_index].data_part,
initial_ranges_in_data_parts[part_index].alter_conversions,
initial_ranges_in_data_parts[part_index].part_index_in_query,
MarkRanges{mark_range});
part_index_to_initial_ranges_in_data_parts_index[it->second] = part_index;

View File

@ -173,7 +173,10 @@ Pipe ReadFromMemoryStorageStep::makePipe()
for (size_t stream = 0; stream < num_streams; ++stream)
{
pipes.emplace_back(std::make_shared<MemorySource>(columns_to_read, storage_snapshot, current_data, parallel_execution_index));
auto source = std::make_shared<MemorySource>(columns_to_read, storage_snapshot, current_data, parallel_execution_index);
if (stream == 0)
source->addTotalRowsApprox(snapshot_data.rows_approx);
pipes.emplace_back(std::move(source));
}
return Pipe::unitePipes(std::move(pipes));
}

View File

@ -266,7 +266,7 @@ void ReadFromMergeTree::AnalysisResult::checkLimits(const Settings & settings, c
ReadFromMergeTree::ReadFromMergeTree(
MergeTreeData::DataPartsVector parts_,
std::vector<AlterConversionsPtr> alter_conversions_,
MergeTreeData::MutationsSnapshotPtr mutations_,
Names all_column_names_,
const MergeTreeData & data_,
const SelectQueryInfo & query_info_,
@ -283,7 +283,7 @@ ReadFromMergeTree::ReadFromMergeTree(
query_info_.prewhere_info)}, all_column_names_, query_info_, storage_snapshot_, context_)
, reader_settings(getMergeTreeReaderSettings(context_, query_info_))
, prepared_parts(std::move(parts_))
, alter_conversions_for_parts(std::move(alter_conversions_))
, mutations_snapshot(std::move(mutations_))
, all_column_names(std::move(all_column_names_))
, data(data_)
, actions_settings(ExpressionActionsSettings::fromContext(context_))
@ -372,6 +372,7 @@ Pipe ReadFromMergeTree::readFromPoolParallelReplicas(
auto pool = std::make_shared<MergeTreeReadPoolParallelReplicas>(
std::move(extension),
std::move(parts_with_range),
mutations_snapshot,
shared_virtual_fields,
storage_snapshot,
prewhere_info,
@ -453,6 +454,7 @@ Pipe ReadFromMergeTree::readFromPool(
{
pool = std::make_shared<MergeTreePrefetchedReadPool>(
std::move(parts_with_range),
mutations_snapshot,
shared_virtual_fields,
storage_snapshot,
prewhere_info,
@ -466,6 +468,7 @@ Pipe ReadFromMergeTree::readFromPool(
{
pool = std::make_shared<MergeTreeReadPool>(
std::move(parts_with_range),
mutations_snapshot,
shared_virtual_fields,
storage_snapshot,
prewhere_info,
@ -554,6 +557,7 @@ Pipe ReadFromMergeTree::readInOrder(
std::move(extension),
mode,
parts_with_ranges,
mutations_snapshot,
shared_virtual_fields,
storage_snapshot,
prewhere_info,
@ -569,6 +573,7 @@ Pipe ReadFromMergeTree::readInOrder(
has_limit_below_one_block,
read_type,
parts_with_ranges,
mutations_snapshot,
shared_virtual_fields,
storage_snapshot,
prewhere_info,
@ -1038,7 +1043,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsWithOrder(
}
ranges_to_get_from_part = split_ranges(ranges_to_get_from_part, input_order_info->direction);
new_parts.emplace_back(part.data_part, part.alter_conversions, part.part_index_in_query, std::move(ranges_to_get_from_part));
new_parts.emplace_back(part.data_part, part.part_index_in_query, std::move(ranges_to_get_from_part));
}
splitted_parts_and_ranges.emplace_back(std::move(new_parts));
@ -1265,7 +1270,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
RangesInDataParts new_parts;
for (auto part_it = parts_to_merge_ranges[range_index]; part_it != parts_to_merge_ranges[range_index + 1]; ++part_it)
new_parts.emplace_back(part_it->data_part, part_it->alter_conversions, part_it->part_index_in_query, part_it->ranges);
new_parts.emplace_back(part_it->data_part, part_it->part_index_in_query, part_it->ranges);
if (new_parts.empty())
continue;
@ -1378,15 +1383,14 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead(bool find_exact_ranges) const
{
return selectRangesToRead(prepared_parts, alter_conversions_for_parts, find_exact_ranges);
return selectRangesToRead(prepared_parts, find_exact_ranges);
}
ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
MergeTreeData::DataPartsVector parts, std::vector<AlterConversionsPtr> alter_conversions, bool find_exact_ranges) const
ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead(MergeTreeData::DataPartsVector parts, bool find_exact_ranges) const
{
return selectRangesToRead(
std::move(parts),
std::move(alter_conversions),
mutations_snapshot,
storage_snapshot->metadata,
query_info,
context,
@ -1404,9 +1408,11 @@ static void buildIndexes(
const ActionsDAG * filter_actions_dag,
const MergeTreeData & data,
const MergeTreeData::DataPartsVector & parts,
const MergeTreeData::MutationsSnapshotPtr & mutations_snapshot,
const ContextPtr & context,
const SelectQueryInfo & query_info,
const StorageMetadataPtr & metadata_snapshot)
const StorageMetadataPtr & metadata_snapshot,
const LoggerPtr & log)
{
indexes.reset();
@ -1432,19 +1438,21 @@ static void buildIndexes(
indexes->partition_pruner.emplace(metadata_snapshot, filter_actions_dag, context, false /* strict */);
}
indexes->part_values
= MergeTreeDataSelectExecutor::filterPartsByVirtualColumns(metadata_snapshot, data, parts, filter_actions_dag, context);
indexes->part_values = MergeTreeDataSelectExecutor::filterPartsByVirtualColumns(metadata_snapshot, data, parts, filter_actions_dag, context);
MergeTreeDataSelectExecutor::buildKeyConditionFromPartOffset(indexes->part_offset_condition, filter_actions_dag, context);
indexes->use_skip_indexes = settings.use_skip_indexes;
bool final = query_info.isFinal();
if (final && !settings.use_skip_indexes_if_final)
if (query_info.isFinal() && !settings.use_skip_indexes_if_final)
indexes->use_skip_indexes = false;
if (!indexes->use_skip_indexes)
return;
const auto & all_indexes = metadata_snapshot->getSecondaryIndices();
if (all_indexes.empty())
return;
std::unordered_set<std::string> ignored_index_names;
if (settings.ignore_data_skipping_indices.changed)
@ -1469,15 +1477,34 @@ static void buildIndexes(
throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Cannot parse ignore_data_skipping_indices ('{}')", indices);
}
auto all_updated_columns = mutations_snapshot->getAllUpdatedColumns();
UsefulSkipIndexes skip_indexes;
using Key = std::pair<String, size_t>;
std::map<Key, size_t> merged;
for (const auto & index : metadata_snapshot->getSecondaryIndices())
{
if (!ignored_index_names.contains(index.name))
for (const auto & index : all_indexes)
{
if (ignored_index_names.contains(index.name))
continue;
auto index_helper = MergeTreeIndexFactory::instance().get(index);
if (!all_updated_columns.empty())
{
auto required_columns = index_helper->getColumnsRequiredForIndexCalc();
auto it = std::ranges::find_if(required_columns, [&](const auto & column_name)
{
return all_updated_columns.contains(column_name);
});
if (it != required_columns.end())
{
LOG_TRACE(log, "Index {} is not used because it depends on column {} which will be updated on fly", index.name, *it);
continue;
}
}
if (index_helper->isMergeable())
{
auto [it, inserted] = merged.emplace(Key{index_helper->index.type, index_helper->getGranularity()}, skip_indexes.merged_indices.size());
@ -1488,10 +1515,11 @@ static void buildIndexes(
}
skip_indexes.merged_indices[it->second].addIndex(index_helper);
continue;
}
else
{
MergeTreeIndexConditionPtr condition;
if (index_helper->isVectorSimilarityIndex())
{
#if USE_USEARCH
@ -1500,17 +1528,18 @@ static void buildIndexes(
#endif
if (const auto * legacy_vector_similarity_index = typeid_cast<const MergeTreeIndexLegacyVectorSimilarity *>(index_helper.get()))
condition = legacy_vector_similarity_index->createIndexCondition(query_info, context);
if (!condition)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown vector search index {}", index_helper->index.name);
}
else
{
condition = index_helper->createIndexCondition(filter_actions_dag, context);
}
if (!condition->alwaysUnknownOrTrue())
skip_indexes.useful_indices.emplace_back(index_helper, condition);
}
}
}
// move minmax indices to first positions, so they will be applied first as cheapest ones
std::stable_sort(begin(skip_indexes.useful_indices), end(skip_indexes.useful_indices), [](const auto & l, const auto & r)
@ -1547,15 +1576,17 @@ void ReadFromMergeTree::applyFilters(ActionDAGNodes added_filter_nodes)
query_info.filter_actions_dag.get(),
data,
prepared_parts,
mutations_snapshot,
context,
query_info,
storage_snapshot->metadata);
storage_snapshot->metadata,
log);
}
}
ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
MergeTreeData::MutationsSnapshotPtr mutations_snapshot,
const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info_,
ContextPtr context_,
@ -1586,7 +1617,7 @@ ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
const Names & primary_key_column_names = primary_key.column_names;
if (!indexes)
buildIndexes(indexes, query_info_.filter_actions_dag.get(), data, parts, context_, query_info_, metadata_snapshot);
buildIndexes(indexes, query_info_.filter_actions_dag.get(), data, parts, mutations_snapshot, context_, query_info_, metadata_snapshot, log);
if (indexes->part_values && indexes->part_values->empty())
return std::make_shared<AnalysisResult>(std::move(result));
@ -1617,10 +1648,9 @@ ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
{
MergeTreeDataSelectExecutor::filterPartsByPartition(
parts,
indexes->partition_pruner,
indexes->minmax_idx_condition,
parts,
alter_conversions,
indexes->part_values,
metadata_snapshot,
data,
@ -1649,7 +1679,6 @@ ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
auto reader_settings = getMergeTreeReaderSettings(context_, query_info_);
result.parts_with_ranges = MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipIndexes(
std::move(parts),
std::move(alter_conversions),
metadata_snapshot,
context_,
indexes->key_condition,

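The buildIndexes() change above skips any secondary index whose required columns intersect the set of columns updated by on-fly mutations. The same predicate in isolation, with plain strings standing in for the real index and mutations-snapshot types:

#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

/// A skip index is unusable if any column it needs will be rewritten by an on-fly mutation.
bool indexUsableWithOnFlyMutations(
    const std::vector<std::string> & columns_required_for_index,
    const std::unordered_set<std::string> & all_updated_columns)
{
    return std::none_of(
        columns_required_for_index.begin(),
        columns_required_for_index.end(),
        [&](const std::string & column) { return all_updated_columns.contains(column); });
}

int main()
{
    std::unordered_set<std::string> updated{"value"};
    std::cout << indexUsableWithOnFlyMutations({"key"}, updated) << '\n';           // 1: index is kept
    std::cout << indexUsableWithOnFlyMutations({"key", "value"}, updated) << '\n';  // 0: index is skipped
}
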
View File

@ -110,7 +110,7 @@ public:
ReadFromMergeTree(
MergeTreeData::DataPartsVector parts_,
std::vector<AlterConversionsPtr> alter_conversions_,
MergeTreeData::MutationsSnapshotPtr mutations_snapshot_,
Names all_column_names_,
const MergeTreeData & data_,
const SelectQueryInfo & query_info_,
@ -154,7 +154,7 @@ public:
static AnalysisResultPtr selectRangesToRead(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
MergeTreeData::MutationsSnapshotPtr mutations_snapshot,
const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info,
ContextPtr context,
@ -166,8 +166,7 @@ public:
std::optional<Indexes> & indexes,
bool find_exact_ranges);
AnalysisResultPtr selectRangesToRead(
MergeTreeData::DataPartsVector parts, std::vector<AlterConversionsPtr> alter_conversions, bool find_exact_ranges = false) const;
AnalysisResultPtr selectRangesToRead(MergeTreeData::DataPartsVector parts, bool find_exact_ranges = false) const;
AnalysisResultPtr selectRangesToRead(bool find_exact_ranges = false) const;
@ -188,7 +187,7 @@ public:
void setAnalyzedResult(AnalysisResultPtr analyzed_result_ptr_) { analyzed_result_ptr = std::move(analyzed_result_ptr_); }
const MergeTreeData::DataPartsVector & getParts() const { return prepared_parts; }
const std::vector<AlterConversionsPtr> & getAlterConvertionsForParts() const { return alter_conversions_for_parts; }
MergeTreeData::MutationsSnapshotPtr getMutationsSnapshot() const { return mutations_snapshot; }
const MergeTreeData & getMergeTreeData() const { return data; }
size_t getMaxBlockSize() const { return block_size.max_block_size_rows; }
@ -209,7 +208,7 @@ private:
MergeTreeReaderSettings reader_settings;
MergeTreeData::DataPartsVector prepared_parts;
std::vector<AlterConversionsPtr> alter_conversions_for_parts;
MergeTreeData::MutationsSnapshotPtr mutations_snapshot;
Names all_column_names;

View File

@ -83,7 +83,11 @@ void WriteBufferFromHTTPServerResponse::finishSendHeaders()
return;
if (!headers_started_sending)
{
if (compression_method != CompressionMethod::None)
response.set("Content-Encoding", toContentEncodingName(compression_method));
startSendHeaders();
}
writeHeaderSummary();
writeExceptionCode();
@ -105,7 +109,13 @@ void WriteBufferFromHTTPServerResponse::nextImpl()
initialized = true;
if (compression_method != CompressionMethod::None)
{
/// If we've already sent headers, just send the `Content-Encoding` down the socket directly
if (headers_started_sending)
socketSendStr("Content-Encoding: " + toContentEncodingName(compression_method) + "\r\n");
else
response.set("Content-Encoding", toContentEncodingName(compression_method));
}
startSendHeaders();
finishSendHeaders();
@ -177,8 +187,12 @@ void WriteBufferFromHTTPServerResponse::finalizeImpl()
/// If no body data just send header
startSendHeaders();
/// `finalizeImpl` must be idempotent, so set `initialized` here to not send stuff twice
if (!initialized && offset() && compression_method != CompressionMethod::None)
{
initialized = true;
socketSendStr("Content-Encoding: " + toContentEncodingName(compression_method) + "\r\n");
}
finishSendHeaders();
}

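The three call sites changed above follow one rule for announcing compression: if the header section has not started going out yet, set Content-Encoding on the response object; once headers have started, write the raw header line to the socket. A simplified sketch of that rule (FakeResponse and the printed "socket write" are stand-ins, not the real Poco/socket API):

#include <iostream>
#include <string>

struct FakeResponse
{
    void set(const std::string & key, const std::string & value) { std::cout << "response header: " << key << ": " << value << '\n'; }
};

void sendContentEncoding(FakeResponse & response, bool headers_started_sending, const std::string & encoding)
{
    if (encoding.empty())  /// stands in for CompressionMethod::None: nothing to announce
        return;

    if (headers_started_sending)
        std::cout << "raw socket write: Content-Encoding: " << encoding << "\\r\\n" << '\n';
    else
        response.set("Content-Encoding", encoding);
}

int main()
{
    FakeResponse response;
    sendContentEncoding(response, /*headers_started_sending=*/ false, "gzip");
    sendContentEncoding(response, /*headers_started_sending=*/ true, "gzip");
}
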
View File

@ -271,6 +271,7 @@ public:
/// Return true if the trivial count query could be optimized without reading the data at all
/// in totalRows() or totalRowsByPartitionPredicate() methods or with optimized reading in read() method.
/// 'storage_snapshot' may be nullptr.
virtual bool supportsTrivialCountOptimization(const StorageSnapshotPtr & /*storage_snapshot*/, ContextPtr /*query_context*/) const
{
return false;

View File

@ -9,9 +9,15 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
bool AlterConversions::supportsMutationCommandType(MutationCommand::Type t)
bool AlterConversions::isSupportedDataMutation(MutationCommand::Type)
{
return t == MutationCommand::Type::RENAME_COLUMN;
/// Currently there are no such mutations. See setting 'apply_mutations_on_fly'.
return false;
}
bool AlterConversions::isSupportedMetadataMutation(MutationCommand::Type type)
{
return type == MutationCommand::Type::RENAME_COLUMN;
}
void AlterConversions::addMutationCommand(const MutationCommand & command)

View File

@ -1,8 +1,8 @@
#pragma once
#include <Storages/MutationCommands.h>
#include <string>
#include <unordered_map>
#include <Interpreters/Context_fwd.h>
#include <Storages/StorageInMemoryMetadata.h>
namespace DB
@ -11,11 +11,17 @@ namespace DB
/// Alter conversions which should be applied on the fly for a part.
/// Built from the most recent mutation commands for the part.
/// Now only ALTER RENAME COLUMN is applied.
class AlterConversions : private boost::noncopyable
class AlterConversions : private WithContext, boost::noncopyable
{
public:
AlterConversions() = default;
AlterConversions(StorageMetadataPtr metadata_snapshot_, ContextPtr context_)
: WithContext(context_)
, metadata_snapshot(std::move(metadata_snapshot_))
{
}
struct RenamePair
{
std::string rename_to;
@ -34,11 +40,13 @@ public:
/// Get column old name before rename (lookup by key in rename_map)
std::string getColumnOldName(const std::string & new_name) const;
static bool supportsMutationCommandType(MutationCommand::Type);
static bool isSupportedDataMutation(MutationCommand::Type type);
static bool isSupportedMetadataMutation(MutationCommand::Type type);
private:
/// Rename map new_name -> old_name.
std::vector<RenamePair> rename_map;
StorageMetadataPtr metadata_snapshot;
};
using AlterConversionsPtr = std::shared_ptr<const AlterConversions>;

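The comments above describe rename_map as a list of new_name -> old_name pairs and getColumnOldName() as a lookup by the new name. A small sketch of that lookup; the `rename_from` field and the fallback behaviour for unknown names are assumptions, since only `rename_to` is visible in this hunk:

#include <iostream>
#include <string>
#include <vector>

struct RenamePair
{
    std::string rename_to;
    std::string rename_from;  /// assumed field name
};

/// Mirrors getColumnOldName(): given the current (new) column name, find the name
/// the column had before the pending rename was applied.
std::string getColumnOldName(const std::vector<RenamePair> & rename_map, const std::string & new_name)
{
    for (const auto & pair : rename_map)
        if (pair.rename_to == new_name)
            return pair.rename_from;
    return new_name;  /// assumed: no pending rename for this column
}

int main()
{
    std::vector<RenamePair> rename_map{{"user_id", "uid"}};
    std::cout << getColumnOldName(rename_map, "user_id") << '\n';  // uid
    std::cout << getColumnOldName(rename_map, "value") << '\n';    // value
}
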
View File

@ -2052,6 +2052,7 @@ DataPartStoragePtr IMergeTreeDataPart::makeCloneInDetached(const String & prefix
IDataPartStorage::ClonePartParams params
{
.copy_instead_of_hardlink = isStoredOnRemoteDiskWithZeroCopySupport() && storage.supportsReplication() && storage_settings->allow_remote_fs_zero_copy_replication,
.keep_metadata_version = prefix == "covered-by-broken",
.make_source_readonly = true,
.external_transaction = disk_transaction
};

View File

@ -291,6 +291,14 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() const
if (enabledBlockOffsetColumn(global_ctx))
addGatheringColumn(global_ctx, BlockOffsetColumn::name, BlockOffsetColumn::type);
MergeTreeData::IMutationsSnapshot::Params params
{
.metadata_version = global_ctx->metadata_snapshot->getMetadataVersion(),
.min_part_metadata_version = MergeTreeData::getMinMetadataVersion(global_ctx->future_part->parts),
};
auto mutations_snapshot = global_ctx->data->getMutationsSnapshot(params);
SerializationInfo::Settings info_settings =
{
.ratio_of_defaults_for_sparse = global_ctx->data->getSettings()->ratio_of_defaults_for_sparse_serialization,
@ -298,10 +306,12 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() const
};
SerializationInfoByName infos(global_ctx->storage_columns, info_settings);
global_ctx->alter_conversions.reserve(global_ctx->future_part->parts.size());
for (const auto & part : global_ctx->future_part->parts)
{
global_ctx->new_data_part->ttl_infos.update(part->ttl_infos);
if (global_ctx->metadata_snapshot->hasAnyTTL() && !part->checkAllTTLCalculated(global_ctx->metadata_snapshot))
{
LOG_INFO(ctx->log, "Some TTL values were not calculated for part {}. Will calculate them forcefully during merge.", part->name);
@ -322,6 +332,8 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() const
infos.add(part_infos);
}
global_ctx->alter_conversions.push_back(MergeTreeData::getAlterConversionsForPart(part, mutations_snapshot, global_ctx->metadata_snapshot, global_ctx->context));
}
const auto & local_part_min_ttl = global_ctx->new_data_part->ttl_infos.part_min_ttl;
@ -879,7 +891,7 @@ MergeTask::VerticalMergeRuntimeContext::PreparedColumnPipeline MergeTask::Vertic
{
/// Read from all parts
std::vector<QueryPlanPtr> plans;
for (const auto & part : global_ctx->future_part->parts)
for (size_t part_num = 0; part_num < global_ctx->future_part->parts.size(); ++part_num)
{
auto plan_for_part = std::make_unique<QueryPlan>();
createReadFromPartStep(
@ -887,7 +899,8 @@ MergeTask::VerticalMergeRuntimeContext::PreparedColumnPipeline MergeTask::Vertic
*plan_for_part,
*global_ctx->data,
global_ctx->storage_snapshot,
part,
global_ctx->future_part->parts[part_num],
global_ctx->alter_conversions[part_num],
Names{column_name},
global_ctx->input_rows_filtered,
/*apply_deleted_mask=*/ true,
@ -1570,7 +1583,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() const
/// Read from all parts
std::vector<QueryPlanPtr> plans;
for (const auto & part : global_ctx->future_part->parts)
for (size_t i = 0; i < global_ctx->future_part->parts.size(); ++i)
{
if (part->getMarksCount() == 0)
LOG_TRACE(ctx->log, "Part {} is empty", part->name);
@ -1581,7 +1594,8 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() const
*plan_for_part,
*global_ctx->data,
global_ctx->storage_snapshot,
part,
global_ctx->future_part->parts[i],
global_ctx->alter_conversions[i],
global_ctx->merging_columns.getNames(),
global_ctx->input_rows_filtered,
/*apply_deleted_mask=*/ true,

View File

@ -166,6 +166,7 @@ private:
StorageSnapshotPtr storage_snapshot{nullptr};
StorageMetadataPtr metadata_snapshot{nullptr};
FutureMergedMutatedPartPtr future_part{nullptr};
std::vector<AlterConversionsPtr> alter_conversions;
/// This will be either nullptr or new_data_part, so raw pointer is ok.
IMergeTreeDataPart * parent_part{nullptr};
ContextPtr context{nullptr};

View File

@ -7156,11 +7156,16 @@ UInt64 MergeTreeData::estimateNumberOfRowsToRead(
ContextPtr query_context, const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info) const
{
const auto & snapshot_data = assert_cast<const MergeTreeData::SnapshotData &>(*storage_snapshot->data);
const auto & parts = snapshot_data.parts;
MergeTreeDataSelectExecutor reader(*this);
auto result_ptr = reader.estimateNumMarksToRead(
parts, {}, storage_snapshot->metadata, query_info, query_context, query_context->getSettingsRef().max_threads);
snapshot_data.parts,
snapshot_data.mutations_snapshot,
storage_snapshot->metadata->getColumns().getAll().getNames(),
storage_snapshot->metadata,
query_info,
query_context,
query_context->getSettingsRef().max_threads);
UInt64 total_rows = result_ptr->selected_rows;
if (query_info.trivial_limit > 0 && query_info.trivial_limit < total_rows)
@ -8174,11 +8179,15 @@ bool MergeTreeData::canUsePolymorphicParts(const MergeTreeSettings & settings, S
return true;
}
AlterConversionsPtr MergeTreeData::getAlterConversionsForPart(MergeTreeDataPartPtr part) const
AlterConversionsPtr MergeTreeData::getAlterConversionsForPart(
const MergeTreeDataPartPtr & part,
const MutationsSnapshotPtr & mutations,
const StorageMetadataPtr & metadata,
const ContextPtr & query_context)
{
auto commands = getAlterMutationCommandsForPart(part);
auto commands = mutations->getAlterMutationCommandsForPart(part);
auto result = std::make_shared<AlterConversions>(metadata, query_context);
auto result = std::make_shared<AlterConversions>();
for (const auto & command : commands | std::views::reverse)
result->addMutationCommand(command);
@ -8470,9 +8479,28 @@ void MergeTreeData::updateObjectColumns(const DataPartPtr & part, const DataPart
DB::updateObjectColumns(object_columns, columns, part->getColumns());
}
bool MergeTreeData::supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const
bool MergeTreeData::supportsTrivialCountOptimization(const StorageSnapshotPtr & storage_snapshot, ContextPtr query_context) const
{
return !hasLightweightDeletedMask();
if (hasLightweightDeletedMask())
return false;
if (!storage_snapshot)
return !query_context->getSettingsRef().apply_mutations_on_fly;
const auto & snapshot_data = assert_cast<const MergeTreeData::SnapshotData &>(*storage_snapshot->data);
return !snapshot_data.mutations_snapshot->hasDataMutations();
}
Int64 MergeTreeData::getMinMetadataVersion(const DataPartsVector & parts)
{
Int64 version = -1;
for (const auto & part : parts)
{
Int64 part_version = part->getMetadataVersion();
if (version == -1 || part_version < version)
version = part_version;
}
return version;
}
StorageSnapshotPtr MergeTreeData::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const
@ -8486,10 +8514,14 @@ StorageSnapshotPtr MergeTreeData::getStorageSnapshot(const StorageMetadataPtr &
object_columns_copy = object_columns;
}
snapshot_data->alter_conversions.reserve(snapshot_data->parts.size());
for (const auto & part : snapshot_data->parts)
snapshot_data->alter_conversions.push_back(getAlterConversionsForPart(part));
IMutationsSnapshot::Params params
{
.metadata_version = metadata_snapshot->getMetadataVersion(),
.min_part_metadata_version = getMinMetadataVersion(snapshot_data->parts),
.need_data_mutations = query_context->getSettingsRef().apply_mutations_on_fly,
};
snapshot_data->mutations_snapshot = getMutationsSnapshot(params);
return std::make_shared<StorageSnapshot>(*this, metadata_snapshot, std::move(object_columns_copy), std::move(snapshot_data));
}
@ -8707,28 +8739,57 @@ void MergeTreeData::verifySortingKey(const KeyDescription & sorting_key)
}
}
bool updateAlterConversionsMutations(const MutationCommands & commands, std::atomic<ssize_t> & alter_conversions_mutations, bool remove)
static void updateMutationsCounters(
Int64 & num_data_mutations_to_apply,
Int64 & num_metadata_mutations_to_apply,
const MutationCommands & commands,
Int64 increment)
{
if (num_data_mutations_to_apply < 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "On-fly data mutations counter is negative ({})", num_data_mutations_to_apply);
if (num_metadata_mutations_to_apply < 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "On-fly metadata mutations counter is negative ({})", num_metadata_mutations_to_apply);
bool has_data_mutation = false;
bool has_metadata_mutation = false;
for (const auto & command : commands)
{
if (AlterConversions::supportsMutationCommandType(command.type))
if (!has_data_mutation && AlterConversions::isSupportedDataMutation(command.type))
{
if (remove)
num_data_mutations_to_apply += increment;
has_data_mutation = true;
if (num_data_mutations_to_apply < 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "On-fly data mutations counter is negative ({})", num_data_mutations_to_apply);
}
if (!has_metadata_mutation && AlterConversions::isSupportedMetadataMutation(command.type))
{
--alter_conversions_mutations;
if (alter_conversions_mutations < 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "On-fly mutations counter is negative ({})", alter_conversions_mutations);
num_metadata_mutations_to_apply += increment;
has_metadata_mutation = true;
if (num_metadata_mutations_to_apply < 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "On-fly metadata mutations counter is negative ({})", num_metadata_mutations_to_apply);
}
else
}
}
void incrementMutationsCounters(
Int64 & num_data_mutations_to_apply,
Int64 & num_metadata_mutations_to_apply,
const MutationCommands & commands)
{
if (alter_conversions_mutations < 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "On-fly mutations counter is negative ({})", alter_conversions_mutations);
++alter_conversions_mutations;
updateMutationsCounters(num_data_mutations_to_apply, num_metadata_mutations_to_apply, commands, 1);
}
return true;
}
}
return false;
void decrementMutationsCounters(
Int64 & num_data_mutations_to_apply,
Int64 & num_metadata_mutations_to_apply,
const MutationCommands & commands)
{
updateMutationsCounters(num_data_mutations_to_apply, num_metadata_mutations_to_apply, commands, -1);
}
}

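incrementMutationsCounters() and decrementMutationsCounters() above both delegate to updateMutationsCounters() with +1/-1, and each commands list contributes at most one to each counter (guarded by the has_*_mutation flags). A toy, self-contained version of the same bookkeeping, assuming a simplified command type instead of MutationCommands:

#include <iostream>
#include <vector>

enum class ToyCommandType { RENAME_COLUMN, UPDATE };

bool isSupportedDataMutation(ToyCommandType) { return false; }  /// currently none, as in AlterConversions
bool isSupportedMetadataMutation(ToyCommandType t) { return t == ToyCommandType::RENAME_COLUMN; }

void updateCounters(long & num_data, long & num_metadata, const std::vector<ToyCommandType> & commands, long increment)
{
    bool counted_data = false;
    bool counted_metadata = false;
    for (auto command : commands)
    {
        if (!counted_data && isSupportedDataMutation(command)) { num_data += increment; counted_data = true; }
        if (!counted_metadata && isSupportedMetadataMutation(command)) { num_metadata += increment; counted_metadata = true; }
    }
}

int main()
{
    long num_data = 0, num_metadata = 0;
    std::vector<ToyCommandType> commands{ToyCommandType::RENAME_COLUMN};

    updateCounters(num_data, num_metadata, commands, +1);  /// mutation registered
    std::cout << num_data << ' ' << num_metadata << '\n';  /// prints: 0 1
    updateCounters(num_data, num_metadata, commands, -1);  /// mutation finished
    std::cout << num_data << ' ' << num_metadata << '\n';  /// prints: 0 0
}
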
View File

@ -443,14 +443,53 @@ public:
bool areAsynchronousInsertsEnabled() const override;
bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override;
bool supportsTrivialCountOptimization(const StorageSnapshotPtr & storage_snapshot, ContextPtr query_context) const override;
/// A snapshot of pending mutations that weren't applied to some of the parts yet
/// and should be applied on the fly (i.e. when reading from the part).
/// Mutations not supported by AlterConversions (see isSupportedDataMutation() and isSupportedMetadataMutation()) can be omitted.
struct IMutationsSnapshot
{
/// Contains info that doesn't depend on state of mutations.
struct Params
{
Int64 metadata_version = -1;
Int64 min_part_metadata_version = -1;
bool need_data_mutations = false;
};
/// Contains info that depends on state of mutations.
struct Info
{
Int64 num_data_mutations = 0;
Int64 num_metadata_mutations = 0;
};
Params params;
Info info;
IMutationsSnapshot() = default;
IMutationsSnapshot(Params params_, Info info_): params(std::move(params_)), info(std::move(info_)) {}
/// Returns mutation commands that are required to be applied to the `part`.
/// @return list of mutation commands, in *reverse* order (newest to oldest)
virtual MutationCommands getAlterMutationCommandsForPart(const DataPartPtr & part) const = 0;
virtual std::shared_ptr<IMutationsSnapshot> cloneEmpty() const = 0;
virtual NameSet getAllUpdatedColumns() const = 0;
bool hasDataMutations() const { return params.need_data_mutations && info.num_data_mutations > 0; }
virtual ~IMutationsSnapshot() = default;
};
using MutationsSnapshotPtr = std::shared_ptr<const IMutationsSnapshot>;
/// Snapshot for MergeTree contains the current set of data parts
/// at the moment of the start of query.
/// and mutations required to be applied at the moment of the start of query.
struct SnapshotData : public StorageSnapshot::Data
{
DataPartsVector parts;
std::vector<AlterConversionsPtr> alter_conversions;
MutationsSnapshotPtr mutations_snapshot;
};
StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const override;
@ -929,8 +968,18 @@ public:
Disks getDisks() const { return getStoragePolicy()->getDisks(); }
/// Returns a snapshot of mutations that probably will be applied on the fly to parts during reading.
virtual MutationsSnapshotPtr getMutationsSnapshot(const IMutationsSnapshot::Params & params) const = 0;
/// Returns the minimum version of metadata among parts.
static Int64 getMinMetadataVersion(const DataPartsVector & parts);
/// Return alter conversions for part which must be applied on fly.
AlterConversionsPtr getAlterConversionsForPart(MergeTreeDataPartPtr part) const;
static AlterConversionsPtr getAlterConversionsForPart(
const MergeTreeDataPartPtr & part,
const MutationsSnapshotPtr & mutations,
const StorageMetadataPtr & metadata,
const ContextPtr & query_context);
/// Returns destination disk or volume for the TTL rule according to current storage policy.
SpacePtr getDestinationForMoveTTL(const TTLDescription & move_ttl) const;
@ -1450,13 +1499,6 @@ protected:
/// mechanisms for parts locking
virtual bool partIsAssignedToBackgroundOperation(const DataPartPtr & part) const = 0;
/// Return pending mutations that weren't applied to `part` yet and should be applied on the fly
/// (i.e. when reading from the part). Mutations not supported by AlterConversions
/// (supportsMutationCommandType()) can be omitted.
///
/// @return list of mutations, in *reverse* order (newest to oldest)
virtual MutationCommands getAlterMutationCommandsForPart(const DataPartPtr & part) const = 0;
struct PartBackupEntries
{
String part_name;
@ -1731,7 +1773,14 @@ struct CurrentlySubmergingEmergingTagger
};
/// Look at MutationCommands and, if they contain mutations relevant for AlterConversions, update the counters.
bool updateAlterConversionsMutations(const MutationCommands & commands, std::atomic<ssize_t> & alter_conversions_mutations, bool remove);
void incrementMutationsCounters(
Int64 & num_data_mutations_to_apply,
Int64 & num_metadata_mutations_to_apply,
const MutationCommands & commands);
void decrementMutationsCounters(
Int64 & num_data_mutations_to_apply,
Int64 & num_metadata_mutations_to_apply,
const MutationCommands & commands);
}

View File

@ -133,12 +133,10 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read(
bool enable_parallel_reading) const
{
const auto & snapshot_data = assert_cast<const MergeTreeData::SnapshotData &>(*storage_snapshot->data);
const auto & parts = snapshot_data.parts;
const auto & alter_conversions = snapshot_data.alter_conversions;
auto step = readFromParts(
parts,
alter_conversions,
snapshot_data.parts,
snapshot_data.mutations_snapshot,
column_names_to_return,
storage_snapshot,
query_info,
@ -500,10 +498,9 @@ std::optional<std::unordered_set<String>> MergeTreeDataSelectExecutor::filterPar
}
void MergeTreeDataSelectExecutor::filterPartsByPartition(
MergeTreeData::DataPartsVector & parts,
const std::optional<PartitionPruner> & partition_pruner,
const std::optional<KeyCondition> & minmax_idx_condition,
MergeTreeData::DataPartsVector & parts,
std::vector<AlterConversionsPtr> & alter_conversions,
const std::optional<std::unordered_set<String>> & part_values,
const StorageMetadataPtr & metadata_snapshot,
const MergeTreeData & data,
@ -512,8 +509,6 @@ void MergeTreeDataSelectExecutor::filterPartsByPartition(
LoggerPtr log,
ReadFromMergeTree::IndexStats & index_stats)
{
chassert(alter_conversions.empty() || parts.size() == alter_conversions.size());
const Settings & settings = context->getSettingsRef();
DataTypes minmax_columns_types;
@ -537,7 +532,6 @@ void MergeTreeDataSelectExecutor::filterPartsByPartition(
if (query_context->getSettingsRef().allow_experimental_query_deduplication)
selectPartsToReadWithUUIDFilter(
parts,
alter_conversions,
part_values,
data.getPinnedPartUUIDs(),
minmax_idx_condition,
@ -550,7 +544,6 @@ void MergeTreeDataSelectExecutor::filterPartsByPartition(
else
selectPartsToRead(
parts,
alter_conversions,
part_values,
minmax_idx_condition,
minmax_columns_types,
@ -589,7 +582,6 @@ void MergeTreeDataSelectExecutor::filterPartsByPartition(
RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipIndexes(
MergeTreeData::DataPartsVector && parts,
std::vector<AlterConversionsPtr> && alter_conversions,
StorageMetadataPtr metadata_snapshot,
const ContextPtr & context,
const KeyCondition & key_condition,
@ -602,8 +594,6 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd
bool use_skip_indexes,
bool find_exact_ranges)
{
chassert(alter_conversions.empty() || parts.size() == alter_conversions.size());
RangesInDataParts parts_with_ranges;
parts_with_ranges.resize(parts.size());
const Settings & settings = context->getSettingsRef();
@ -662,11 +652,8 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd
auto process_part = [&](size_t part_index)
{
auto & part = parts[part_index];
auto alter_conversions_for_part = !alter_conversions.empty()
? alter_conversions[part_index]
: std::make_shared<AlterConversions>();
RangesInDataPart ranges(part, alter_conversions_for_part, part_index);
RangesInDataPart ranges(part, part_index);
size_t total_marks_count = part->index_granularity.getMarksCountWithoutFinal();
if (metadata_snapshot->hasPrimaryKey() || part_offset_condition)
@ -904,6 +891,7 @@ std::shared_ptr<QueryIdHolder> MergeTreeDataSelectExecutor::checkLimits(
ReadFromMergeTree::AnalysisResultPtr MergeTreeDataSelectExecutor::estimateNumMarksToRead(
MergeTreeData::DataPartsVector parts,
MergeTreeData::MutationsSnapshotPtr mutations_snapshot,
const Names & column_names_to_return,
const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info,
@ -916,11 +904,9 @@ ReadFromMergeTree::AnalysisResultPtr MergeTreeDataSelectExecutor::estimateNumMar
return std::make_shared<ReadFromMergeTree::AnalysisResult>();
std::optional<ReadFromMergeTree::Indexes> indexes;
/// NOTE: We don't need alter_conversions because the returned analysis_result is only used for:
/// 1. estimate the number of rows to read; 2. projection reading, which doesn't have alter_conversions.
return ReadFromMergeTree::selectRangesToRead(
std::move(parts),
/*alter_conversions=*/{},
mutations_snapshot,
metadata_snapshot,
query_info,
context,
@ -935,7 +921,7 @@ ReadFromMergeTree::AnalysisResultPtr MergeTreeDataSelectExecutor::estimateNumMar
QueryPlanStepPtr MergeTreeDataSelectExecutor::readFromParts(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
MergeTreeData::MutationsSnapshotPtr mutations_snapshot,
const Names & column_names_to_return,
const StorageSnapshotPtr & storage_snapshot,
const SelectQueryInfo & query_info,
@ -957,7 +943,7 @@ QueryPlanStepPtr MergeTreeDataSelectExecutor::readFromParts(
return std::make_unique<ReadFromMergeTree>(
std::move(parts),
std::move(alter_conversions),
std::move(mutations_snapshot),
column_names_to_return,
data,
query_info,
@ -1554,7 +1540,6 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingMergedIndex(
void MergeTreeDataSelectExecutor::selectPartsToRead(
MergeTreeData::DataPartsVector & parts,
std::vector<AlterConversionsPtr> & alter_conversions,
const std::optional<std::unordered_set<String>> & part_values,
const std::optional<KeyCondition> & minmax_idx_condition,
const DataTypes & minmax_columns_types,
@ -1563,14 +1548,11 @@ void MergeTreeDataSelectExecutor::selectPartsToRead(
PartFilterCounters & counters)
{
MergeTreeData::DataPartsVector prev_parts;
std::vector<AlterConversionsPtr> prev_conversions;
std::swap(prev_parts, parts);
std::swap(prev_conversions, alter_conversions);
for (size_t i = 0; i < prev_parts.size(); ++i)
for (const auto & part_or_projection : prev_parts)
{
const auto * part = prev_parts[i]->isProjectionPart() ? prev_parts[i]->getParentPart() : prev_parts[i].get();
const auto * part = part_or_projection->isProjectionPart() ? part_or_projection->getParentPart() : part_or_projection.get();
if (part_values && part_values->find(part->name) == part_values->end())
continue;
@ -1607,15 +1589,12 @@ void MergeTreeDataSelectExecutor::selectPartsToRead(
counters.num_parts_after_partition_pruner += 1;
counters.num_granules_after_partition_pruner += num_granules;
parts.push_back(prev_parts[i]);
if (!prev_conversions.empty())
alter_conversions.push_back(prev_conversions[i]);
parts.push_back(part_or_projection);
}
}
void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
MergeTreeData::DataPartsVector & parts,
std::vector<AlterConversionsPtr> & alter_conversions,
const std::optional<std::unordered_set<String>> & part_values,
MergeTreeData::PinnedPartUUIDsPtr pinned_part_uuids,
const std::optional<KeyCondition> & minmax_idx_condition,
@ -1628,22 +1607,17 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
{
/// process_parts prepares the parts that have to be read for the query,
/// returns false if duplicate part UUIDs have been encountered
auto select_parts = [&] (
MergeTreeData::DataPartsVector & selected_parts,
std::vector<AlterConversionsPtr> & selected_conversions) -> bool
auto select_parts = [&](MergeTreeData::DataPartsVector & selected_parts) -> bool
{
auto ignored_part_uuids = query_context->getIgnoredPartUUIDs();
std::unordered_set<UUID> temp_part_uuids;
MergeTreeData::DataPartsVector prev_parts;
std::vector<AlterConversionsPtr> prev_conversions;
std::swap(prev_parts, selected_parts);
std::swap(prev_conversions, selected_conversions);
for (size_t i = 0; i < prev_parts.size(); ++i)
for (const auto & part_or_projection : prev_parts)
{
const auto * part = prev_parts[i]->isProjectionPart() ? prev_parts[i]->getParentPart() : prev_parts[i].get();
const auto * part = part_or_projection->isProjectionPart() ? part_or_projection->getParentPart() : part_or_projection.get();
if (part_values && part_values->find(part->name) == part_values->end())
continue;
@ -1693,9 +1667,7 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
throw Exception(ErrorCodes::LOGICAL_ERROR, "Found a part with the same UUID on the same replica.");
}
selected_parts.push_back(prev_parts[i]);
if (!prev_conversions.empty())
selected_conversions.push_back(prev_conversions[i]);
selected_parts.push_back(part_or_projection);
}
if (!temp_part_uuids.empty())
@ -1714,7 +1686,7 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
};
/// Process parts that have to be read for a query.
auto needs_retry = !select_parts(parts, alter_conversions);
auto needs_retry = !select_parts(parts);
/// If any duplicated part UUIDs were met during the first step, try to ignore them in the second pass.
/// This may happen when `prefer_localhost_replica` is set and the "distributed" stage runs in the same process as the "remote" stage.
@ -1725,7 +1697,7 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
counters = PartFilterCounters();
/// Second attempt didn't help, throw an exception
if (!select_parts(parts, alter_conversions))
if (!select_parts(parts))
throw Exception(ErrorCodes::DUPLICATED_PART_UUIDS, "Found duplicate UUIDs while processing query.");
}
}
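The hunks above drop the parallel std::vector<AlterConversionsPtr> that used to be filtered in lockstep with the parts; the selection loops now just move the surviving parts, and alter conversions are resolved later from the mutations snapshot (see the read-pool changes further down). Below is a minimal, self-contained C++ sketch of the simplified filtering shape; Part and the keep predicate are illustrative stand-ins, not ClickHouse's actual types.

#include <functional>
#include <memory>
#include <vector>

struct Part { /* minimal stand-in for a data part */ };
using PartPtr = std::shared_ptr<const Part>;
using Parts = std::vector<PartPtr>;

/// Keep only the parts matching `keep`; no parallel per-part state is threaded through anymore.
void filterParts(Parts & parts, const std::function<bool(const Part &)> & keep)
{
    Parts prev;
    std::swap(prev, parts);            /// same swap-then-refill pattern as selectPartsToRead
    for (const auto & part : prev)
        if (keep(*part))
            parts.push_back(part);     /// previously this also had to copy prev_conversions[i]
}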

View File

@ -40,7 +40,7 @@ public:
/// The same as read, but with specified set of parts.
QueryPlanStepPtr readFromParts(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
MergeTreeData::MutationsSnapshotPtr mutations_snapshot,
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
const SelectQueryInfo & query_info,
@ -56,6 +56,7 @@ public:
/// This method is used to select best projection for table.
ReadFromMergeTree::AnalysisResultPtr estimateNumMarksToRead(
MergeTreeData::DataPartsVector parts,
MergeTreeData::MutationsSnapshotPtr mutations_snapshot,
const Names & column_names,
const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info,
@ -120,7 +121,6 @@ private:
/// as well as `max_block_number_to_read`.
static void selectPartsToRead(
MergeTreeData::DataPartsVector & parts,
std::vector<AlterConversionsPtr> & alter_conversions,
const std::optional<std::unordered_set<String>> & part_values,
const std::optional<KeyCondition> & minmax_idx_condition,
const DataTypes & minmax_columns_types,
@ -131,7 +131,6 @@ private:
/// Same as previous but also skip parts uuids if any to the query context, or skip parts which uuids marked as excluded.
static void selectPartsToReadWithUUIDFilter(
MergeTreeData::DataPartsVector & parts,
std::vector<AlterConversionsPtr> & alter_conversions,
const std::optional<std::unordered_set<String>> & part_values,
MergeTreeData::PinnedPartUUIDsPtr pinned_part_uuids,
const std::optional<KeyCondition> & minmax_idx_condition,
@ -175,10 +174,9 @@ public:
/// Filter parts using minmax index and partition key.
static void filterPartsByPartition(
MergeTreeData::DataPartsVector & parts,
const std::optional<PartitionPruner> & partition_pruner,
const std::optional<KeyCondition> & minmax_idx_condition,
MergeTreeData::DataPartsVector & parts,
std::vector<AlterConversionsPtr> & alter_conversions,
const std::optional<std::unordered_set<String>> & part_values,
const StorageMetadataPtr & metadata_snapshot,
const MergeTreeData & data,
@ -192,7 +190,6 @@ public:
/// If 'check_limits = true' it will throw exception if the amount of data exceed the limits from settings.
static RangesInDataParts filterPartsByPrimaryKeyAndSkipIndexes(
MergeTreeData::DataPartsVector && parts,
std::vector<AlterConversionsPtr> && alter_conversions,
StorageMetadataPtr metadata_snapshot,
const ContextPtr & context,
const KeyCondition & key_condition,

View File

@ -50,7 +50,7 @@ UInt64 MergeTreeMutationEntry::parseFileName(const String & file_name_)
MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskPtr disk_, const String & path_prefix_, UInt64 tmp_number,
const TransactionID & tid_, const WriteSettings & settings)
: create_time(time(nullptr))
, commands(std::move(commands_))
, commands(std::make_shared<MutationCommands>(std::move(commands_)))
, disk(std::move(disk_))
, path_prefix(path_prefix_)
, file_name("tmp_mutation_" + toString(tmp_number) + ".txt")
@ -63,7 +63,7 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskP
*out << "format version: 1\n"
<< "create time: " << LocalDateTime(create_time, DateLUT::serverTimezoneInstance()) << "\n";
*out << "commands: ";
commands.writeText(*out, /* with_pure_metadata_commands = */ false);
commands->writeText(*out, /* with_pure_metadata_commands = */ false);
*out << "\n";
if (tid.isPrehistoric())
{
@ -116,7 +116,8 @@ void MergeTreeMutationEntry::writeCSN(CSN csn_)
}
MergeTreeMutationEntry::MergeTreeMutationEntry(DiskPtr disk_, const String & path_prefix_, const String & file_name_)
: disk(std::move(disk_))
: commands(std::make_shared<MutationCommands>())
, disk(std::move(disk_))
, path_prefix(path_prefix_)
, file_name(file_name_)
, is_temp(false)
@ -133,7 +134,7 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(DiskPtr disk_, const String & pat
create_time_dt.hour(), create_time_dt.minute(), create_time_dt.second());
*buf >> "commands: ";
commands.readText(*buf);
commands->readText(*buf);
*buf >> "\n";
if (buf->eof())
@ -177,7 +178,7 @@ std::shared_ptr<const IBackupEntry> MergeTreeMutationEntry::backup() const
out << "block number: " << block_number << "\n";
out << "commands: ";
commands.writeText(out, /* with_pure_metadata_commands = */ false);
commands->writeText(out, /* with_pure_metadata_commands = */ false);
out << "\n";
return std::make_shared<BackupEntryFromMemory>(out.str());

View File

@ -16,7 +16,7 @@ class IBackupEntry;
struct MergeTreeMutationEntry
{
time_t create_time = 0;
MutationCommands commands;
std::shared_ptr<MutationCommands> commands;
DiskPtr disk;
String path_prefix;
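Since commands is now a std::shared_ptr<MutationCommands>, the entry can hand the same command list to other consumers (for example a mutations snapshot) without copying, and the serialization calls above only change from "." to "->". A minimal sketch of that ownership pattern, using a simplified Commands stand-in rather than the real MutationCommands type:

#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Commands : std::vector<std::string>
{
    void writeText(std::ostream & out) const
    {
        for (const auto & c : *this)
            out << c << '\n';
    }
};

struct Entry
{
    std::shared_ptr<Commands> commands;

    explicit Entry(Commands commands_)
        : commands(std::make_shared<Commands>(std::move(commands_)))  /// same construction shape as above
    {
    }
};

int main()
{
    Entry entry(Commands{{"DELETE WHERE x = 1", "UPDATE y = 2 WHERE 1"}});
    auto shared_view = entry.commands;   /// e.g. a snapshot keeping the commands alive without a copy
    entry.commands->writeText(std::cout);
    std::cout << "refcount: " << shared_view.use_count() << '\n';
}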

View File

@ -85,6 +85,7 @@ MergeTreeReadTask::Readers MergeTreePrefetchedReadPool::PrefetchedReaders::get()
MergeTreePrefetchedReadPool::MergeTreePrefetchedReadPool(
RangesInDataParts && parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
@ -95,6 +96,7 @@ MergeTreePrefetchedReadPool::MergeTreePrefetchedReadPool(
const ContextPtr & context_)
: MergeTreeReadPoolBase(
std::move(parts_),
std::move(mutations_snapshot_),
std::move(shared_virtual_fields_),
storage_snapshot_,
prewhere_info_,
@ -103,7 +105,6 @@ MergeTreePrefetchedReadPool::MergeTreePrefetchedReadPool(
column_names_,
settings_,
context_)
, WithContext(context_)
, prefetch_threadpool(getContext()->getPrefetchThreadpool())
, log(getLogger("MergeTreePrefetchedReadPool(" + (parts_ranges.empty() ? "" : parts_ranges.front().data_part->storage.getStorageID().getNameForLogs()) + ")"))
{

View File

@ -14,11 +14,12 @@ using MergeTreeReaderPtr = std::unique_ptr<IMergeTreeReader>;
/// A class which is responsible for creating read tasks
/// which are later taken by readers via getTask method.
/// Does prefetching for the read tasks it creates.
class MergeTreePrefetchedReadPool : public MergeTreeReadPoolBase, private WithContext
class MergeTreePrefetchedReadPool : public MergeTreeReadPoolBase
{
public:
MergeTreePrefetchedReadPool(
RangesInDataParts && parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,

View File

@ -29,6 +29,7 @@ extern const int BAD_ARGUMENTS;
MergeTreeReadPool::MergeTreeReadPool(
RangesInDataParts && parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
@ -39,6 +40,7 @@ MergeTreeReadPool::MergeTreeReadPool(
const ContextPtr & context_)
: MergeTreeReadPoolBase(
std::move(parts_),
std::move(mutations_snapshot_),
std::move(shared_virtual_fields_),
storage_snapshot_,
prewhere_info_,

View File

@ -26,6 +26,7 @@ public:
MergeTreeReadPool(
RangesInDataParts && parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,

View File

@ -4,9 +4,6 @@
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <cmath>
namespace DB
{
@ -17,6 +14,7 @@ namespace ErrorCodes
MergeTreeReadPoolBase::MergeTreeReadPoolBase(
RangesInDataParts && parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
@ -25,7 +23,9 @@ MergeTreeReadPoolBase::MergeTreeReadPoolBase(
const Names & column_names_,
const PoolSettings & pool_settings_,
const ContextPtr & context_)
: parts_ranges(std::move(parts_))
: WithContext(context_)
, parts_ranges(std::move(parts_))
, mutations_snapshot(std::move(mutations_snapshot_))
, shared_virtual_fields(std::move(shared_virtual_fields_))
, storage_snapshot(storage_snapshot_)
, prewhere_info(prewhere_info_)
@ -120,9 +120,9 @@ void MergeTreeReadPoolBase::fillPerPartInfos(const Settings & settings)
}
read_task_info.part_index_in_query = part_with_ranges.part_index_in_query;
read_task_info.alter_conversions = part_with_ranges.alter_conversions;
read_task_info.alter_conversions = MergeTreeData::getAlterConversionsForPart(part_with_ranges.data_part, mutations_snapshot, storage_snapshot->metadata, getContext());
LoadedMergeTreeDataPartInfoForReader part_info(part_with_ranges.data_part, part_with_ranges.alter_conversions);
LoadedMergeTreeDataPartInfoForReader part_info(part_with_ranges.data_part, read_task_info.alter_conversions);
read_task_info.task_columns = getReadTaskColumns(
part_info,
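With the pool now storing a MutationsSnapshotPtr and deriving from WithContext, each part's AlterConversions is computed lazily when the per-part read tasks are filled, instead of being precomputed and carried next to every RangesInDataPart. A self-contained sketch of that "shared snapshot, per-item resolution" shape; Snapshot, Conversions and resolveFor are simplified stand-ins, not the real MergeTreeData API:

#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Conversions { std::string renamed_from; };

struct Snapshot
{
    /// Pretend the snapshot knows which mutations apply to a part by name.
    Conversions resolveFor(const std::string & part_name) const
    {
        return Conversions{part_name + "_old"};
    }
};

struct PartRange { std::string part_name; };

struct ReadTaskInfo
{
    std::string part_name;
    std::shared_ptr<const Conversions> alter_conversions;
};

class ReadPoolBase
{
public:
    ReadPoolBase(std::vector<PartRange> parts_, std::shared_ptr<const Snapshot> snapshot_)
        : parts(std::move(parts_)), mutations_snapshot(std::move(snapshot_)) {}

    std::vector<ReadTaskInfo> fillPerPartInfos() const
    {
        std::vector<ReadTaskInfo> infos;
        for (const auto & range : parts)
        {
            ReadTaskInfo info;
            info.part_name = range.part_name;
            /// Resolved per part from the shared snapshot, mirroring getAlterConversionsForPart(...)
            info.alter_conversions = std::make_shared<Conversions>(mutations_snapshot->resolveFor(range.part_name));
            infos.push_back(std::move(info));
        }
        return infos;
    }

private:
    std::vector<PartRange> parts;
    std::shared_ptr<const Snapshot> mutations_snapshot;
};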

View File

@ -6,9 +6,11 @@
namespace DB
{
class MergeTreeReadPoolBase : public IMergeTreeReadPool
class MergeTreeReadPoolBase : public IMergeTreeReadPool, protected WithContext
{
public:
using MutationsSnapshotPtr = MergeTreeData::MutationsSnapshotPtr;
struct PoolSettings
{
size_t threads = 0;
@ -23,6 +25,7 @@ public:
MergeTreeReadPoolBase(
RangesInDataParts && parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
@ -37,6 +40,7 @@ public:
protected:
/// Initialized in constructor
const RangesInDataParts parts_ranges;
const MutationsSnapshotPtr mutations_snapshot;
const VirtualFields shared_virtual_fields;
const StorageSnapshotPtr storage_snapshot;
const PrewhereInfoPtr prewhere_info;

View File

@ -12,6 +12,7 @@ MergeTreeReadPoolInOrder::MergeTreeReadPoolInOrder(
bool has_limit_below_one_block_,
MergeTreeReadType read_type_,
RangesInDataParts parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
@ -22,6 +23,7 @@ MergeTreeReadPoolInOrder::MergeTreeReadPoolInOrder(
const ContextPtr & context_)
: MergeTreeReadPoolBase(
std::move(parts_),
std::move(mutations_snapshot_),
std::move(shared_virtual_fields_),
storage_snapshot_,
prewhere_info_,

View File

@ -11,6 +11,7 @@ public:
bool has_limit_below_one_block_,
MergeTreeReadType read_type_,
RangesInDataParts parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,

View File

@ -14,6 +14,7 @@ namespace ErrorCodes
MergeTreeReadPoolParallelReplicas::MergeTreeReadPoolParallelReplicas(
ParallelReadingExtension extension_,
RangesInDataParts && parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
@ -24,6 +25,7 @@ MergeTreeReadPoolParallelReplicas::MergeTreeReadPoolParallelReplicas(
const ContextPtr & context_)
: MergeTreeReadPoolBase(
std::move(parts_),
std::move(mutations_snapshot_),
std::move(shared_virtual_fields_),
storage_snapshot_,
prewhere_info_,

View File

@ -11,6 +11,7 @@ public:
MergeTreeReadPoolParallelReplicas(
ParallelReadingExtension extension_,
RangesInDataParts && parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,

View File

@ -13,6 +13,7 @@ MergeTreeReadPoolParallelReplicasInOrder::MergeTreeReadPoolParallelReplicasInOrd
ParallelReadingExtension extension_,
CoordinationMode mode_,
RangesInDataParts parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
@ -23,6 +24,7 @@ MergeTreeReadPoolParallelReplicasInOrder::MergeTreeReadPoolParallelReplicasInOrd
const ContextPtr & context_)
: MergeTreeReadPoolBase(
std::move(parts_),
std::move(mutations_snapshot_),
std::move(shared_virtual_fields_),
storage_snapshot_,
prewhere_info_,

View File

@ -12,6 +12,7 @@ public:
ParallelReadingExtension extension_,
CoordinationMode mode_,
RangesInDataParts parts_,
MutationsSnapshotPtr mutations_snapshot_,
VirtualFields shared_virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,

View File

@ -34,6 +34,7 @@ public:
const MergeTreeData & storage_,
const StorageSnapshotPtr & storage_snapshot_,
MergeTreeData::DataPartPtr data_part_,
AlterConversionsPtr alter_conversions_,
Names columns_to_read_,
std::optional<MarkRanges> mark_ranges_,
bool apply_deleted_mask,
@ -58,6 +59,9 @@ private:
/// Data part will not be removed if the pointer owns it
MergeTreeData::DataPartPtr data_part;
/// Alter and mutation commands that are required to be applied to the part on the fly.
AlterConversionsPtr alter_conversions;
/// Columns we have to read (each Block from read will contain them)
Names columns_to_read;
@ -87,6 +91,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource(
const MergeTreeData & storage_,
const StorageSnapshotPtr & storage_snapshot_,
MergeTreeData::DataPartPtr data_part_,
AlterConversionsPtr alter_conversions_,
Names columns_to_read_,
std::optional<MarkRanges> mark_ranges_,
bool apply_deleted_mask,
@ -96,6 +101,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource(
, storage(storage_)
, storage_snapshot(storage_snapshot_)
, data_part(std::move(data_part_))
, alter_conversions(std::move(alter_conversions_))
, columns_to_read(std::move(columns_to_read_))
, read_with_direct_io(read_with_direct_io_)
, mark_ranges(std::move(mark_ranges_))
@ -109,8 +115,6 @@ MergeTreeSequentialSource::MergeTreeSequentialSource(
LOG_DEBUG(log, "Reading {} marks from part {}, total {} rows starting from the beginning of the part",
data_part->getMarksCount(), data_part->name, data_part->rows_count);
auto alter_conversions = storage.getAlterConversionsForPart(data_part);
/// Note that we don't check for the presence of the collaborate_with_coordinator setting, because this source
/// is only used in background merges.
addTotalRowsApprox(data_part->rows_count);
@ -299,6 +303,7 @@ Pipe createMergeTreeSequentialSource(
const MergeTreeData & storage,
const StorageSnapshotPtr & storage_snapshot,
MergeTreeData::DataPartPtr data_part,
AlterConversionsPtr alter_conversions,
Names columns_to_read,
std::optional<MarkRanges> mark_ranges,
std::shared_ptr<std::atomic<size_t>> filtered_rows_count,
@ -315,7 +320,8 @@ Pipe createMergeTreeSequentialSource(
columns_to_read.emplace_back(RowExistsColumn::name);
auto column_part_source = std::make_shared<MergeTreeSequentialSource>(type,
storage, storage_snapshot, data_part, columns_to_read, std::move(mark_ranges),
storage, storage_snapshot, data_part, alter_conversions,
columns_to_read, std::move(mark_ranges),
/*apply_deleted_mask=*/ false, read_with_direct_io, prefetch);
Pipe pipe(std::move(column_part_source));
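The sequential source no longer asks the storage for alter conversions inside its constructor; the caller resolves them (typically from a mutations snapshot) and passes them in, which keeps background merges and mutations on the same resolution path as SELECT reads. A minimal dependency-injection sketch of that change, with Source and Conversions as simplified stand-ins:

#include <memory>
#include <string>
#include <utility>

struct Conversions { /* rename map, etc. */ };
using ConversionsPtr = std::shared_ptr<const Conversions>;

/// Before: the source pulled its own conversions from the storage.
/// After: the caller resolves them and injects them.
class SequentialSource
{
public:
    SequentialSource(std::string part_name_, ConversionsPtr alter_conversions_)
        : part_name(std::move(part_name_))
        , alter_conversions(std::move(alter_conversions_))   /// injected, not fetched
    {
    }

private:
    std::string part_name;
    ConversionsPtr alter_conversions;
};

SequentialSource makeSource(const std::string & part_name, const ConversionsPtr & conversions)
{
    /// The caller decides how the conversions were obtained (e.g. from a mutations snapshot).
    return SequentialSource(part_name, conversions);
}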
@ -346,6 +352,7 @@ public:
const MergeTreeData & storage_,
const StorageSnapshotPtr & storage_snapshot_,
MergeTreeData::DataPartPtr data_part_,
AlterConversionsPtr alter_conversions_,
Names columns_to_read_,
std::shared_ptr<std::atomic<size_t>> filtered_rows_count_,
bool apply_deleted_mask_,
@ -359,6 +366,7 @@ public:
, storage(storage_)
, storage_snapshot(storage_snapshot_)
, data_part(std::move(data_part_))
, alter_conversions(std::move(alter_conversions_))
, columns_to_read(std::move(columns_to_read_))
, filtered_rows_count(std::move(filtered_rows_count_))
, apply_deleted_mask(apply_deleted_mask_)
@ -405,6 +413,7 @@ public:
storage,
storage_snapshot,
data_part,
alter_conversions,
columns_to_read,
std::move(mark_ranges),
filtered_rows_count,
@ -420,6 +429,7 @@ private:
const MergeTreeData & storage;
const StorageSnapshotPtr storage_snapshot;
const MergeTreeData::DataPartPtr data_part;
const AlterConversionsPtr alter_conversions;
const Names columns_to_read;
const std::shared_ptr<std::atomic<size_t>> filtered_rows_count;
const bool apply_deleted_mask;
@ -436,6 +446,7 @@ void createReadFromPartStep(
const MergeTreeData & storage,
const StorageSnapshotPtr & storage_snapshot,
MergeTreeData::DataPartPtr data_part,
AlterConversionsPtr alter_conversions,
Names columns_to_read,
std::shared_ptr<std::atomic<size_t>> filtered_rows_count,
bool apply_deleted_mask,
@ -450,6 +461,7 @@ void createReadFromPartStep(
storage,
storage_snapshot,
std::move(data_part),
std::move(alter_conversions),
std::move(columns_to_read),
filtered_rows_count,
apply_deleted_mask,

View File

@ -21,6 +21,7 @@ Pipe createMergeTreeSequentialSource(
const MergeTreeData & storage,
const StorageSnapshotPtr & storage_snapshot,
MergeTreeData::DataPartPtr data_part,
AlterConversionsPtr alter_conversions,
Names columns_to_read,
std::optional<MarkRanges> mark_ranges,
std::shared_ptr<std::atomic<size_t>> filtered_rows_count,
@ -36,6 +37,7 @@ void createReadFromPartStep(
const MergeTreeData & storage,
const StorageSnapshotPtr & storage_snapshot,
MergeTreeData::DataPartPtr data_part,
AlterConversionsPtr alter_conversions,
Names columns_to_read,
std::shared_ptr<std::atomic<size_t>> filtered_rows_count,
bool apply_deleted_mask,

View File

@ -115,6 +115,7 @@ static UInt64 getExistingRowsCount(const Block & block)
static void splitAndModifyMutationCommands(
MergeTreeData::DataPartPtr part,
StorageMetadataPtr metadata_snapshot,
AlterConversionsPtr alter_conversions,
const MutationCommands & commands,
MutationCommands & for_interpreter,
MutationCommands & for_file_renames,
@ -180,8 +181,6 @@ static void splitAndModifyMutationCommands(
}
auto alter_conversions = part->storage.getAlterConversionsForPart(part);
/// We don't add renames from commands; instead we take them from rename_map.
/// It's important because the required renames depend not only on the part's data version (i.e. mutation version)
/// but also on the part's metadata version. Why do we have such logic only for renames? Because all other types of alter
@ -297,7 +296,6 @@ static void splitAndModifyMutationCommands(
}
}
auto alter_conversions = part->storage.getAlterConversionsForPart(part);
/// We don't add renames from commands; instead we take them from rename_map.
/// It's important because the required renames depend not only on the part's data version (i.e. mutation version)
/// but also on the part's metadata version. Why do we have such logic only for renames? Because all other types of alter
@ -2042,6 +2040,15 @@ bool MutateTask::prepare()
ctx->num_mutations = std::make_unique<CurrentMetrics::Increment>(CurrentMetrics::PartMutation);
MergeTreeData::IMutationsSnapshot::Params params
{
.metadata_version = ctx->metadata_snapshot->getMetadataVersion(),
.min_part_metadata_version = ctx->source_part->getMetadataVersion(),
};
auto mutations_snapshot = ctx->data->getMutationsSnapshot(params);
auto alter_conversions = MergeTreeData::getAlterConversionsForPart(ctx->source_part, mutations_snapshot, ctx->metadata_snapshot, ctx->context);
auto context_for_reading = Context::createCopy(ctx->context);
/// Allow mutations to work when force_index_by_date or force_primary_key is on.
@ -2056,7 +2063,7 @@ bool MutateTask::prepare()
ctx->commands_for_part.emplace_back(command);
if (ctx->source_part->isStoredOnDisk() && !isStorageTouchedByMutations(
ctx->source_part, ctx->metadata_snapshot, ctx->commands_for_part, context_for_reading))
ctx->source_part, mutations_snapshot, ctx->metadata_snapshot, ctx->commands_for_part, context_for_reading))
{
NameSet files_to_copy_instead_of_hardlinks;
auto settings_ptr = ctx->data->getSettings();
@ -2116,8 +2123,13 @@ bool MutateTask::prepare()
context_for_reading->setSetting("read_from_filesystem_cache_if_exists_otherwise_bypass_cache", 1);
MutationHelpers::splitAndModifyMutationCommands(
ctx->source_part, ctx->metadata_snapshot,
ctx->commands_for_part, ctx->for_interpreter, ctx->for_file_renames, ctx->log);
ctx->source_part,
ctx->metadata_snapshot,
alter_conversions,
ctx->commands_for_part,
ctx->for_interpreter,
ctx->for_file_renames,
ctx->log);
ctx->stage_progress = std::make_unique<MergeStageProgress>(1.0);
@ -2131,7 +2143,8 @@ bool MutateTask::prepare()
settings.apply_deleted_mask = false;
ctx->interpreter = std::make_unique<MutationsInterpreter>(
*ctx->data, ctx->source_part, ctx->metadata_snapshot, ctx->for_interpreter,
*ctx->data, ctx->source_part, alter_conversions,
ctx->metadata_snapshot, ctx->for_interpreter,
ctx->metadata_snapshot->getColumns().getNamesOfPhysical(), context_for_reading, settings);
ctx->materialized_indices = ctx->interpreter->grabMaterializedIndices();
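Taken together, the prepare() hunks above build a mutations snapshot from the metadata versions, derive the source part's alter conversions from it, and then pass those conversions explicitly to splitAndModifyMutationCommands and MutationsInterpreter. A condensed, self-contained sketch of that ordering with placeholder types; Params, Snapshot, Conversions and the helper names below are illustrative, not the real signatures:

#include <memory>
#include <string>
#include <vector>

struct Params { int metadata_version = 0; int min_part_metadata_version = 0; };
struct Conversions { std::vector<std::string> rename_map; };

struct Snapshot
{
    Params params;
    Conversions conversionsFor(int part_metadata_version) const
    {
        /// Only mutations between the part's metadata version and the table's are relevant.
        (void)part_metadata_version;
        return {};
    }
};

struct Part { int metadata_version = 0; };

/// Sketch of the prepare() ordering: snapshot first, conversions next, then both consumers get them.
void prepareMutation(const Part & source_part, int table_metadata_version)
{
    Params params;
    params.metadata_version = table_metadata_version;
    params.min_part_metadata_version = source_part.metadata_version;

    Snapshot snapshot{params};                                       /// getMutationsSnapshot(params)
    auto conversions = std::make_shared<Conversions>(
        snapshot.conversionsFor(source_part.metadata_version));      /// getAlterConversionsForPart(...)

    // splitCommands(source_part, conversions, ...);                 /// splitAndModifyMutationCommands(...)
    // runInterpreter(source_part, conversions, ...);                /// MutationsInterpreter(...)
    (void)conversions;
}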

View File

@ -42,7 +42,6 @@ struct RangesInDataPartsDescription: public std::deque<RangesInDataPartDescripti
struct RangesInDataPart
{
DataPartPtr data_part;
AlterConversionsPtr alter_conversions;
size_t part_index_in_query;
MarkRanges ranges;
MarkRanges exact_ranges;
@ -51,14 +50,13 @@ struct RangesInDataPart
RangesInDataPart(
const DataPartPtr & data_part_,
const AlterConversionsPtr & alter_conversions_,
const size_t part_index_in_query_,
const MarkRanges & ranges_ = MarkRanges{})
: data_part{data_part_}
, alter_conversions{alter_conversions_}
, part_index_in_query{part_index_in_query_}
, ranges{ranges_}
{}
{
}
RangesInDataPartDescription getDescription() const;

Some files were not shown because too many files have changed in this diff.